%% template.tex
%% from
%% bare_conf.tex
%% V1.4b
%% 2015/08/26
%% by Michael Shell
%% See:
%% http://www.michaelshell.org/
%% for current contact information.
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.8b or later) with an IEEE
%% conference paper.
%%
%% Support sites:
%% http://www.michaelshell.org/tex/ieeetran/
%% http://www.ctan.org/pkg/ieeetran
%% and
%% http://www.ieee.org/
%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE!
%% User assumes all risk.
%% In no event shall the IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.
%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including **
%% ** renaming them and changing author support contact information. **
%%*************************************************************************
% *** Authors should verify (and, if needed, correct) their LaTeX system ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. The IEEE's font choices and paper sizes can ***
% *** trigger bugs that do not appear when using other class files. ***
% The testflow support page is at:
% http://www.michaelshell.org/tex/testflow/
\documentclass[conference,final]{IEEEtran}
% Some Computer Society conferences also require the compsoc mode option,
% but others use the standard conference format.
%
% If IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it like:
% \documentclass[conference]{../sty/IEEEtran}
% Some very useful LaTeX packages include:
% (uncomment the ones you want to load)
% *** MISC UTILITY PACKAGES ***
%
%\usepackage{ifpdf}
% Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
% compilation based on whether the output is pdf or dvi.
% usage:
% \ifpdf
% % pdf code
% \else
% % dvi code
% \fi
% The latest version of ifpdf.sty can be obtained from:
% http://www.ctan.org/pkg/ifpdf
% Also, note that IEEEtran.cls V1.7 and later provides a builtin
% \ifCLASSINFOpdf conditional that works the same way.
% When switching from latex to pdflatex and vice-versa, the compiler may
% have to be run twice to clear warning/error messages.
% *** CITATION PACKAGES ***
%
%\usepackage{cite}
% cite.sty was written by Donald Arseneau
% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
% \cite{} output to follow that of the IEEE. Loading the cite package will
% result in citation numbers being automatically sorted and properly
% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
% \cite will automatically add leading space, if needed. Use cite.sty's
% noadjust option (cite.sty V3.8 and later) if you want to turn this off
% such as if a citation ever needs to be enclosed in parenthesis.
% cite.sty is already installed on most LaTeX systems. Be sure and use
% version 5.0 (2009-03-20) and later if using hyperref.sty.
% The latest version can be obtained at:
% http://www.ctan.org/pkg/cite
% The documentation is contained in the cite.sty file itself.
% *** GRAPHICS RELATED PACKAGES ***
%
\ifCLASSINFOpdf
% \usepackage[pdftex]{graphicx}
% declare the path(s) where your graphic files are
% \graphicspath{{../pdf/}{../jpeg/}}
% and their extensions so you won't have to specify these with
% every instance of \includegraphics
% \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\else
% or other class option (dvipsone, dvipdf, if not using dvips). graphicx
% will default to the driver specified in the system graphics.cfg if no
% driver is specified.
% \usepackage[dvips]{graphicx}
% declare the path(s) where your graphic files are
% \graphicspath{{../eps/}}
% and their extensions so you won't have to specify these with
% every instance of \includegraphics
% \DeclareGraphicsExtensions{.eps}
\fi
% graphicx was written by David Carlisle and Sebastian Rahtz. It is
% required if you want graphics, photos, etc. graphicx.sty is already
% installed on most LaTeX systems. The latest version and documentation
% can be obtained at:
% http://www.ctan.org/pkg/graphicx
% Another good source of documentation is "Using Imported Graphics in
% LaTeX2e" by Keith Reckdahl which can be found at:
% http://www.ctan.org/pkg/epslatex
%
% latex, and pdflatex in dvi mode, support graphics in encapsulated
% postscript (.eps) format. pdflatex in pdf mode supports graphics
% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
% that all non-photo figures use a vector format (.eps, .pdf, .mps) and
% not a bitmapped formats (.jpeg, .png). The IEEE frowns on bitmapped formats
% which can result in "jaggedy"/blurry rendering of lines and letters as
% well as large increases in file sizes.
%
% You can find documentation about the pdfTeX application at:
% http://www.tug.org/applications/pdftex
% *** MATH PACKAGES ***
%
%\usepackage{amsmath}
% A popular package from the American Mathematical Society that provides
% many useful and powerful commands for dealing with mathematics.
%
% Note that the amsmath package sets \interdisplaylinepenalty to 10000
% thus preventing page breaks from occurring within multiline equations. Use:
%\interdisplaylinepenalty=2500
% after loading amsmath to restore such page breaks as IEEEtran.cls normally
% does. amsmath.sty is already installed on most LaTeX systems. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/pkg/amsmath
% *** SPECIALIZED LIST PACKAGES ***
%
%\usepackage{algorithmic}
% algorithmic.sty was written by Peter Williams and Rogerio Brito.
% This package provides an algorithmic environment for describing algorithms.
% You can use the algorithmic environment in-text or within a figure
% environment to provide for a floating algorithm. Do NOT use the algorithm
% floating environment provided by algorithm.sty (by the same authors) or
% algorithm2e.sty (by Christophe Fiorio) as the IEEE does not use dedicated
% algorithm float types and packages that provide these will not provide
% correct IEEE style captions. The latest version and documentation of
% algorithmic.sty can be obtained at:
% http://www.ctan.org/pkg/algorithms
% Also of interest may be the (relatively newer and more customizable)
% algorithmicx.sty package by Szasz Janos:
% http://www.ctan.org/pkg/algorithmicx
% *** ALIGNMENT PACKAGES ***
%
%\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
% the standard LaTeX2e array and tabular environments to provide better
% appearance and additional user controls. As the default LaTeX2e table
% generation code is lacking to the point of almost being broken with
% respect to the quality of the end results, all users are strongly
% advised to use an enhanced (at the very least that provided by array.sty)
% set of table tools. array.sty is already installed on most systems. The
% latest version and documentation can be obtained at:
% http://www.ctan.org/pkg/array
% IEEEtran contains the IEEEeqnarray family of commands that can be used to
% generate multiline equations as well as matrices, tables, etc., of high
% quality.
% *** SUBFIGURE PACKAGES ***
%\ifCLASSOPTIONcompsoc
% \usepackage[caption=false,font=normalsize,labelfont=sf,textfont=sf]{subfig}
%\else
% \usepackage[caption=false,font=footnotesize]{subfig}
%\fi
% subfig.sty, written by Steven Douglas Cochran, is the modern replacement
% for subfigure.sty, the latter of which is no longer maintained and is
% incompatible with some LaTeX packages including fixltx2e. However,
% subfig.sty requires and automatically loads Axel Sommerfeldt's caption.sty
% which will override IEEEtran.cls' handling of captions and this will result
% in non-IEEE style figure/table captions. To prevent this problem, be sure
% and invoke subfig.sty's "caption=false" package option (available since
% subfig.sty version 1.3, 2005/06/28) as this will preserve IEEEtran.cls
% handling of captions.
% Note that the Computer Society format requires a larger sans serif font
% than the serif footnote size font used in traditional IEEE formatting
% and thus the need to invoke different subfig.sty package options depending
% on whether compsoc mode has been enabled.
%
% The latest version and documentation of subfig.sty can be obtained at:
% http://www.ctan.org/pkg/subfig
% *** FLOAT PACKAGES ***
%
%\usepackage{fixltx2e}
% fixltx2e, the successor to the earlier fix2col.sty, was written by
% Frank Mittelbach and David Carlisle. This package corrects a few problems
% in the LaTeX2e kernel, the most notable of which is that in current
% LaTeX2e releases, the ordering of single and double column floats is not
% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
% single column figure to be placed prior to an earlier double column
% figure.
% Be aware that LaTeX2e kernels dated 2015 and later have fixltx2e.sty's
% corrections already built into the system in which case a warning will
% be issued if an attempt is made to load fixltx2e.sty as it is no longer
% needed.
% The latest version and documentation can be found at:
% http://www.ctan.org/pkg/fixltx2e
%\usepackage{stfloats}
% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
% the ability to do double column floats at the bottom of the page as well
% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
% LaTeX2e). It also provides a command:
%\fnbelowfloat
% to enable the placement of footnotes below bottom floats (the standard
% LaTeX2e kernel puts them above bottom floats). This is an invasive package
% which rewrites many portions of the LaTeX2e float routines. It may not work
% with other packages that modify the LaTeX2e float routines. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/pkg/stfloats
% Do not use the stfloats baselinefloat ability as the IEEE does not allow
% \baselineskip to stretch. Authors submitting work to the IEEE should note
% that the IEEE rarely uses double column equations and that authors should try
% to avoid such use. Do not be tempted to use the cuted.sty or midfloat.sty
% packages (also by Sigitas Tolusis) as the IEEE does not format its papers in
% such ways.
% Do not attempt to use stfloats with fixltx2e as they are incompatible.
% Instead, use Morten Hogholm's dblfloatfix which combines the features
% of both fixltx2e and stfloats:
%
% \usepackage{dblfloatfix}
% The latest version can be found at:
% http://www.ctan.org/pkg/dblfloatfix
% *** PDF, URL AND HYPERLINK PACKAGES ***
%
%\usepackage{url}
% url.sty was written by Donald Arseneau. It provides better support for
% handling and breaking URLs. url.sty is already installed on most LaTeX
% systems. The latest version and documentation can be obtained at:
% http://www.ctan.org/pkg/url
% Basically, \url{my_url_here}.
% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex). ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.
% (Unless specifically asked to do so by the journal or conference you plan
% to submit to, of course. )
%% BEGIN MY ADDITIONS %%
\usepackage{graphicx}
% We will generate all images so they have a width \maxwidth. This means
% that they will get their normal width if they fit onto the page, but
% are scaled down if they would overflow the margins.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth
\else\Gin@nat@width\fi}
\makeatother
\let\Oldincludegraphics\includegraphics
\renewcommand{\includegraphics}[1]{\Oldincludegraphics[width=\maxwidth]{#1}}
\usepackage[unicode=true]{hyperref}
\hypersetup{
pdftitle={Statistical Learning - Final Project Report},
pdfborder={0 0 0},
breaklinks=true}
\urlstyle{same} % don't use monospace font for urls
% Pandoc toggle for numbering sections (defaults to be off)
\setcounter{secnumdepth}{0}
% Pandoc syntax highlighting
% Pandoc header
\usepackage{booktabs} \usepackage{listings} \usepackage[]{algorithm2e} \usepackage{amsmath} \usepackage{amssymb}
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
%% END MY ADDITIONS %%
\hyphenation{op-tical net-works semi-conduc-tor}
\begin{document}
%
% paper title
% Titles are generally capitalized except for words such as a, an, and, as,
% at, but, by, for, in, nor, of, on, or, the, to and up, which are usually
% not capitalized unless they are the first or last word of the title.
% Linebreaks \\ can be used within to get better formatting as desired.
% Do not put math or special symbols in the title.
\title{Statistical Learning - Final Project Report}
% author names and affiliations
% use a multiple column layout for up to three different
% affiliations
\author{
%% ---- classic IEEETrans wide authors' list ----------------
% -- end affiliation.wide
%% ----------------------------------------------------------
%% ---- classic IEEETrans one column per institution --------
%% -- beg if/affiliation.institution-columnar
\IEEEauthorblockN{
%% -- beg for/affiliation.institution.author
Carlos Perez-Guerra (118033990013),
%% -- beg for/affiliation.institution.author
%% -- end for/affiliation.institution.author
}
\IEEEauthorblockA{Department of CSE\\
Shanghai Jiaotong University
%% -- beg for/affiliation.institution.author
%% -- beg for/affiliation.institution.author
%% -- end for/affiliation.institution.author
}
%% -- end for/affiliation.institution
%% -- end if/affiliation.institution-columnar
%% ----------------------------------------------------------
%% ---- one column per author, classic/default IEEETrans ----
%% -- end if/affiliation.institution-columnar
%% ----------------------------------------------------------
}
% conference papers do not typically use \thanks and this command
% is locked out in conference mode. If really needed, such as for
% the acknowledgment of grants, issue a \IEEEoverridecommandlockouts
% after \documentclass
% for over three affiliations, or if they all won't fit within the width
% of the page, use this alternative format:
%
%\author{\IEEEauthorblockN{Michael Shell\IEEEauthorrefmark{1},
%Homer Simpson\IEEEauthorrefmark{2},
%James Kirk\IEEEauthorrefmark{3},
%Montgomery Scott\IEEEauthorrefmark{3} and
%Eldon Tyrell\IEEEauthorrefmark{4}}
%\IEEEauthorblockA{\IEEEauthorrefmark{1}School of Electrical and Computer Engineering\\
%Georgia Institute of Technology,
%Atlanta, Georgia 30332--0250\\ Email: see http://www.michaelshell.org/contact.html}
%\IEEEauthorblockA{\IEEEauthorrefmark{2}Twentieth Century Fox, Springfield, USA\\
%Email: [email protected]}
%\IEEEauthorblockA{\IEEEauthorrefmark{3}Starfleet Academy, San Francisco, California 96678-2391\\
%Telephone: (800) 555--1212, Fax: (888) 555--1212}
%\IEEEauthorblockA{\IEEEauthorrefmark{4}Tyrell Inc., 123 Replicant Street, Los Angeles, California 90210--4321}}
% use for special paper notices
%\IEEEspecialpapernotice{(Invited Paper)}
% make the title area
\maketitle
% As a general rule, do not put math, special symbols or citations
% in the abstract
\begin{abstract}
In this report, I examine the classification algorithms we discussed in
class as applied to the image training dataset. Specifically, I focus on
discriminative supervised classification algorithms, and then enhance
their power through the semi-supervised learning technique known as
pseudo-labeling. The best algorithm achieved a private score of 0.93891
(top 5\%, Fig. 1), albeit as a late submission, validating the methods
described herein.
\end{abstract}
% no keywords
% For peer review papers, you can put extra information on the cover
% page as needed:
% \ifCLASSOPTIONpeerreview
% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
% \fi
%
% For peerreview papers, this IEEEtran command inserts a page break and
% creates the second title. It will be ignored for other modes.
\IEEEpeerreviewmaketitle
\hypertarget{introduction-methodology}{%
\section{Introduction \& Methodology}\label{introduction-methodology}}
In keeping with the assignment guidelines, my choice of algorithms drew
from class discussion. The R programming language was chosen for this
task, since every algorithm discussed has an implementation in R by its
creators or affiliated researchers.

Due to the high dimension of the data, PCA was performed and its
components used as a basis for the input space; the number of retained
components, denoted \(p\), was therefore treated as an additional
parameter. All parameters for the algorithms in Table 1 were estimated
using 5-fold, 2-repetition cross-validation. Cross-validation was chosen
over bootstrap estimation due to the shape of the data, in which
\(n_k < p\). Initially, my investigation focused on supervised learning
techniques; the results of the most effective algorithms are summarised
in Table 1.
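For concreteness, the resampling scheme can be specified as follows.
This is a minimal sketch assuming the \texttt{caret} package drives the
cross-validation; \texttt{train\_x} and \texttt{train\_y} are
placeholder names for the PCA-projected predictors and their labels.
\begin{lstlisting}[language=R]
library(caret)
# 5-fold cross-validation, repeated twice
ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
# PLSDA shown as an example; other methods were tuned the same way
fit <- train(x = train_x, y = train_y, method = "pls",
             trControl = ctrl)
\end{lstlisting}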
\begin{table}[htbp]
\caption{\label{tab:supervised}
Results using some classic supervised learning algorithms}
\centering
\begin{tabular}{ *{4}{c} }
\hline
\multicolumn{1}{|p{1cm}|}{\centering Algorithm}
& \multicolumn{1}{|p{1.5cm}|}{\centering Parameters}
& \multicolumn{1}{|p{1.5cm}|}{\centering Accuracy \\ (Training, \\ 95\% CI)}
& \multicolumn{1}{|p{1.5cm}|}{\centering Accuracy \\ (Private \\ Leaderboard)} \\
\hline
HDRDA Ridge
& \multicolumn{1}{|p{1.5cm}|}{\centering $\lambda=0.5$, \\ $\gamma=0.5$ }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98339 \\ (0.9775, 0.98924) }
& 0.91346 \\
\hline
HDRDA Convex
& \multicolumn{1}{|p{1.5cm}|}{\centering $\lambda=0.5$, \\ $\gamma=0.5$ }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98122 \\ (0.9715, 0.9893) }
& 0.912471 \\
\hline
PLSDA
& \multicolumn{1}{|p{1.5cm}|}{\centering $n=15$, \\ $p=1000$ }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98122 \\ (0.9715, 0.9893) }
& 0.92655 \\
\hline
Logistic Regression
& \multicolumn{1}{|p{1.5cm}|}{\centering }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98122 \\ (0.9715, 0.9893) }
& 0.92115 \\
\hline
LDA
& \multicolumn{1}{|p{1.5cm}|}{\centering }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98122 \\ (0.9715, 0.9893) }
& 0.90576 \\
\hline
MDA
& \multicolumn{1}{|p{1.5cm}|}{\centering $n=10$ }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98122 \\ (0.9715, 0.9893) }
& 0.88791 \\
\hline
SVM - Linear
& \multicolumn{1}{|p{1.5cm}|}{\centering }
& \multicolumn{1}{|p{1.5cm}|}{\centering 0.98122 \\ (0.9715, 0.9893) }
& 0.85778 \\
\bottomrule
\end{tabular}
\end{table}
However, it soon became clear that further generalization power could be
achieved by using the unlabeled data set to impose constraints on the
estimator, thereby reducing its bias (semi-supervised learning). Using
the pseudo-labeling algorithm, which I implemented in R and which is
explained in detail below, resulted in an improvement in generalization
ability for all algorithms. Due to the nature of the pseudo-labeling
algorithm (training error increases with each iteration as bias is
smoothed out), cross-validation was not used. Results using
pseudo-labeling are summarised in Table 2.
\begin{table}[htbp]
\caption{\label{tab:pseudo}
Results using pseudo-labeling}
\centering
\begin{tabular}{ *{3}{c} }
\hline
\multicolumn{1}{|p{2cm}|}{\centering Algorithm}
& \multicolumn{1}{|p{2cm}|}{\centering Parameters}
& \multicolumn{1}{|p{2cm}|}{\centering Accuracy \\ (Private \\ Leaderboard)} \\
\hline
PLSDA & $n=15, p=1500$ & 0.92902 \\
\hline
\textbf{Least Squares Classifier} & $\lambda=0.1, p=1500$ & \textbf{0.93891} \\
\bottomrule
\end{tabular}
\end{table}
\begin{figure}
\centering
\includegraphics{../fig/kaggle_best.png}
\caption{Screenshot of my best prediction set in the Kaggle competition,
as required by the task description}
\end{figure}
\hypertarget{exploratory-analysis}{%
\section{Exploratory Analysis}\label{exploratory-analysis}}
A first glance at our data set reveals that we have almost as many
features as data points. There are 12 (balanced) classes in the training
set, and each predictor is real-valued, so each class is severely
undersampled. Hence we should treat this as a supervised
dimensionality-reduction problem: we wish to find a transformation that
maps the data to a lower-dimensional space while maintaining or
improving its separability.
A common technique for dimensionality reduction is to start with
Principal Component Analysis. PCA is an unsupervised technique that
consists of performing SVD on the covariance matrix of the centered and
scaled predictors. This is equivalent to projecting the input space onto
a new space whose basis is the set of eigenvectors, where the magnitude
of each eigenvalue reveals how much of the variance in the data is
explained by the corresponding eigenvector. PCA is an expensive
algorithm, but it is feasible for the size of our dataset (especially
since parallel implementations exist). The results are summarised in
Figs. 2, 3, and 4 and Table 3:
\begin{table}[htbp]
\caption{\label{tab:pca-variance}
Percentage of variance explained as number of principal components increases}
\centering
\begin{tabular}{rl}
\# PC's & \% of Variance explained\\
\hline
3 & 13\%\\
8 & 26\%\\
50 & 52\%\\
425 & 79\%\\
1000 & 90\%\\
1500 & 94\%\\
4094 & 100\%\\
\end{tabular}
\end{table}
PCA reveals that two predictors are redundant, and that most of the
variance is explained by 2000 components. It is very likely that most of
the near-zero components are simply perturbations or noise. Somewhere
between 1000 and 2000 components there exists an optimum at which
variance (and hence separability) is preserved while dimensionality is
reduced.
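A minimal base-R sketch of this analysis, where \texttt{X} is a
placeholder for the training predictor matrix:
\begin{lstlisting}[language=R]
# PCA via SVD on the centered and scaled predictors
pca <- prcomp(X, center = TRUE, scale. = TRUE)
# cumulative proportion of variance explained
cum_var <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
# smallest number of components explaining at least 90% of variance
which(cum_var >= 0.90)[1]
\end{lstlisting}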
\begin{figure}
\centering
\includegraphics{../fig/scree.jpg}
\caption{Scree plot revealing that most of the predictors are highly
correlated: the first 1000 components explain 90\% of the variance in
the data.}
\end{figure}
\begin{figure}
\centering
\includegraphics{../fig/pca_3d.jpg}
\caption{A 3D scatterplot of the inputs in PC space using the first 3
components.}
\end{figure}
\begin{figure}
\centering
\includegraphics{../fig/pca_pairs.jpg}
\caption{Pair scatterplots of the first 8 components (which explain 26\%
of the variance)}
\end{figure}
\hypertarget{discriminant-analysis}{%
\section{Discriminant Analysis}\label{discriminant-analysis}}
There exist many discriminant analysis approaches, all derived from the
maximum likelihood estimate of the log odds between two classes of data
assumed to be normally distributed. As required, I will briefly describe
their origin and formulation.

Recall that the MLEs for samples from a normal distribution are given
by:
\begin{equation}
\hat{\pi}_k = \frac{N_k}{N}
\end{equation}
\begin{equation}
\hat{\mu}_k = \frac{1}{N_k}\sum^{N_k}_{n=1}x_n
\end{equation}
\begin{equation}
\hat{\Sigma}_k = \frac{1}{N_k}\sum^{N_k}_{n=1}(x_n - \hat \mu_k)(x_n - \hat \mu_k)^T
\end{equation}
We can express the odds using Bayes' Rule:
\begin{equation}
\frac {\hat {\pi}_k \hat {f}_k(x)} {\hat {\pi}_j \hat {f}_j(x)} = 1
\end{equation}
where the equality of numerator and denominator occurs at the class
boundary. We can further assume that the classes are multivariate
Gaussian. Taking logs and doing some algebra, we obtain the QDA
discriminant:
\begin{equation}
\delta_{QDA}^{(k)}(x) = \log \hat \pi _k - \frac 1 2 \left[\log |\hat \Sigma _k| + (x - \hat \mu_k)^T\hat \Sigma _k ^{-1}(x - \hat \mu_k)\right]
\end{equation}
The LDA discriminant follows by replacing each class covariance with a
pooled covariance (a weighted sum of the covariance matrices of all
classes). Terms that do not depend on the class \(k\) then cancel out,
and we have:
\begin{equation}
\delta_{LDA}^{(k)}(x) = \log \hat {\pi} _k + \hat {\mu} _k ^T \hat {\Sigma} ^{-1} x - \frac 1 2 \hat {\mu} _k ^T \hat {\Sigma} ^{-1} \hat {\mu} _k
\end{equation}
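For concreteness, a minimal base-R sketch of evaluating this
discriminant from the estimates in equations (1)--(2) and the pooled
covariance (all object names are placeholders):
\begin{lstlisting}[language=R]
# LDA discriminant for one class at a point x
lda_discriminant <- function(x, pi_k, mu_k, sigma_inv) {
  log(pi_k) + drop(t(mu_k) %*% sigma_inv %*% x) -
    0.5 * drop(t(mu_k) %*% sigma_inv %*% mu_k)
}
# the predicted class is the k maximizing the discriminant
\end{lstlisting}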
Friedman (1989) proposed Regularized Discriminant Analysis, which is
essentially a refinement of QDA in which the pooled covariance matrix
\(\hat \Sigma\) is used to smooth \(\hat \Sigma _k\) through a parameter
\(\lambda\); the new covariance matrix is a convex combination:
\begin{equation}
\tilde{\Sigma}_k(\lambda) = \lambda \hat \Sigma _k + (1-\lambda) \hat \Sigma
\end{equation}
Lastly, Ramey et al. (2016) proposed HDRDA, whose formulation is:
\begin{equation}
\tilde{\Sigma}_{HDRDA}^{(k)}(\lambda, \gamma) = \alpha \tilde{\Sigma}_k(\lambda) + \gamma I
\end{equation}
In this case, if \(\alpha=1\) we recover Friedman's RDA with a ridge
shrinkage term on the eigenvalues, which the authors define as
\emph{HDRDA Ridge}, while if \(\alpha=1-\gamma\) we have a different
algorithm, which they define as \emph{HDRDA Convex}.
Hence a grid search over \(\lambda\) and \(\gamma\) is equivalent to
testing the data on LDA, QDA, and RDA, while also accounting for the
variance of high-dimensional estimates through shrinkage. The
cross-validated optimum of the accuracy surface was found at
\(\lambda = 0.5, \gamma=0.5, \alpha=1-\gamma\). The
\texttt{sparsediscrim} package (written by the authors of the paper) was
used.
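The regularized covariance in equations (7)--(8) is straightforward to
construct directly; a minimal base-R sketch follows (the
\texttt{sparsediscrim} package itself uses a more efficient formulation
suited to high dimensions):
\begin{lstlisting}[language=R]
# HDRDA-style regularized covariance for one class
hdrda_cov <- function(sigma_k, sigma_pooled, lambda, gamma,
                      convex = TRUE) {
  sigma_tilde <- lambda * sigma_k + (1 - lambda) * sigma_pooled
  alpha <- if (convex) 1 - gamma else 1  # Convex vs. Ridge variant
  alpha * sigma_tilde + gamma * diag(nrow(sigma_tilde))
}
\end{lstlisting}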
\begin{figure}
\centering
\includegraphics{../fig/accuracy_hdrda.png}
\caption{Parameter tuning for the HDRDA classifier}
\end{figure}
\hypertarget{mixture-discriminant-analysis}{%
\section{Mixture Discriminant
Analysis}\label{mixture-discriminant-analysis}}
While HDRDA performs well on the data, there is clearly still room for
improvement. Could the error be reduced by fitting a non-linear model?
The logical extension is MDA, first proposed by Hastie and Tibshirani
(1996), which models each class as a Gaussian mixture instead of a
single Gaussian. However, this model performed poorly, as can be seen in
Table 1 and Fig. 6.
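Fitting MDA is short in practice; a minimal sketch assuming the
\texttt{mda} package by the method's authors, where \texttt{train\_df}
is a placeholder data frame whose \texttt{class} column holds the
labels:
\begin{lstlisting}[language=R]
library(mda)
# n subclasses per class; accuracy degraded as n grew (Fig. 6)
fit <- mda(class ~ ., data = train_df, subclasses = 10)
pred <- predict(fit, newdata = test_df)
\end{lstlisting}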
\begin{figure}
\centering
\includegraphics{../fig/accuracy_mda.png}
\caption{Accuracy clearly decreases as the number of subclasses per
class grows for the MDA classifier.}
\end{figure}
\hypertarget{partial-least-squares-discriminant-analysis-plsda}{%
\section{Partial Least Squares Discriminant Analysis
(PLSDA)}\label{partial-least-squares-discriminant-analysis-plsda}}
Partial Least-Squares Discriminant Analysis (PLS-DA) is a supervised
multivariate dimensionality-reduction tool {[}15{]}, {[}2{]} that has
been popular in the field of chemometrics for well over two decades
(Gottfries et al. (1995), Ståhle and Wold (1987), Perez and Narasimhan
(2018)). Chemometrics data sets are characterized by large volume, a
large number of features, noise, and missing data (Barker and Rayens
(2003)). Because of the difficulty of obtaining labeled data and the
large number of features, usually \(n_k<p\), as in our case.
PLSDA can be thought of as the supervised counterpart of PCA: a
different approach to dimensionality reduction which, in contrast to
PCA, considers the correlation of each predictor with the labels. It is
similar to Fisher Discriminant Analysis, but allows for a feature space
larger than \(k-1\) dimensions, and is hence able to capture more
nuanced patterns in the data.
The way it works is not mysterious. In the two-class case, the labels
are encoded numerically as \{0,1\}, and then the classic partial least
squares algorithm is applied (Algorithm 1). The only parameter that
requires tuning is the number of components in the new space.
\begin{algorithm}
\SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
\Input{$X,y,k$}
\Output{$[\phi]_{\dim(X) \times k} $}
$k \triangleq$ Desired number of components
\Begin{
\tcc{Center and normalize each predictor to have zero mean and unit variance}
}
\For{$m=1,2,\ldots,k$}{
$\phi_{mj}=x_j^{(m-1)} \cdot y$\;
$z_m = \sum_{j=1}^{p} \phi_{mj}x_j$\;
$\hat{\theta}_m = \frac{z_m \cdot y}{z_m \cdot z_m}$\;
Orthogonalize each $x_j^{(m-1)}$ w.r.t $z_m$
}
\caption{The PLS Algorithm}
\end{algorithm}
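A direct base-R transcription of Algorithm 1, as a minimal sketch for
the two-class case with numeric labels:
\begin{lstlisting}[language=R]
# X: n x p predictor matrix, y: numeric {0,1} labels, k: components
pls_components <- function(X, y, k) {
  X <- scale(X)                  # zero mean, unit variance
  phi <- matrix(0, ncol(X), k)   # weight vectors, p x k
  Z <- matrix(0, nrow(X), k)     # derived components, n x k
  theta <- numeric(k)
  for (m in seq_len(k)) {
    phi[, m] <- drop(crossprod(X, y))  # phi_mj = x_j . y
    Z[, m] <- X %*% phi[, m]           # z_m = sum_j phi_mj x_j
    theta[m] <- sum(Z[, m] * y) / sum(Z[, m]^2)
    # orthogonalize each x_j with respect to z_m
    X <- X - Z[, m] %*% crossprod(Z[, m], X) / sum(Z[, m]^2)
  }
  list(weights = phi, scores = Z, theta = theta)
}
\end{lstlisting}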
In the multiclass case, the two-class case is extended by means of an
indicator matrix \(Y\). The two-class algorithm is applied to each
column of \(Y\), yielding \(k\) transformation matrices. The
discriminant is then calculated using a softmax criterion in each new
space.
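The indicator matrix is a standard one-hot encoding; in base R, assuming
a vector \texttt{labels} of class labels:
\begin{lstlisting}[language=R]
# one column per class, with a 1 marking each observation's class
Y <- model.matrix(~ factor(labels) - 1)
\end{lstlisting}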
The PLSDA algorithm enjoys widespread use for image analysis in the
Chemometrics community (Chevallier et al. (2006)).
In my comparative study, PLSDA had the greatest generalization ability
out of all supervised algorithms.
\begin{figure}
\centering
\includegraphics{../fig/accuracy_plsda.png}
\caption{PLS-DA accuracy with 95\% CI as the number of components is
increased. The optimum is found at 15 components.}
\end{figure}
\hypertarget{other-algorithms}{%
\section{Other algorithms}\label{other-algorithms}}
SVMs performed poorly on the dataset. Both linear and RBF kernels were
employed with hinge loss; the results were significantly worse than
those of the other methods discussed in this report. This is somewhat
surprising, considering that SVMs are more robust to noisy data, as the
boundaries depend only on the support vectors.

More generally, nonlinear models performed poorly on this data set: MDA
has already been mentioned, and neural nets were also attempted, with
disappointing results.
\hypertarget{pseudo-labeling}{%
\section{Pseudo-labeling}\label{pseudo-labeling}}
Pseudo-labeling, also known as self-training or Yarowsky's Algorithm, is
a commonly used technique for semi-supervised learning (Algorithm 2). A
classifier is first trained with the small amount of labeled data. The
classifier is then used to classify the unlabeled data. The classifier
is re-trained and the procedure repeated, namely the classifier uses its
own predictions to teach itself (Zhu (2006)).
\begin{algorithm}
\SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
\Input{classifier($\cdot,\cdot$), $y^{(0)}, X^{(0)}, X^{(u)}$, {max\_iter}}
\Output{model}
$y^{(0)} \triangleq$ Labels vector (labeled set)\;
$X^{(0)} \triangleq$ Predictor matrix (labeled set)\;
$X^{(u)} \triangleq$ Predictor matrix (unlabeled set)\;
\Begin{
\tcc{We first train a model using the classifier function and label the unlabeled set}
model := classifier($X^{(0)}, y^{(0)}$)\;
$\hat{y}^{(u)}$ := model.predict($X^{(u)}$)\;
}
$j := 0$\;
\While{$j <$ max\_iter}{
$j := j+1$\;
$X := \begin{bmatrix}
X^{(0)} \\
X^{(u)}
\end{bmatrix}$
\;
$\hat{y} := \begin{bmatrix}
y^{(0)} \\
\hat{y}^{(u)}
\end{bmatrix}
$\;
model := classifier($X, \hat y$)\;
$\hat{y}^{(u)}$ := model.predict($X^{(u)}$)\;
}
\caption{The Pseudo-Labeling Algorithm}
\end{algorithm}
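Its wrapper nature makes the algorithm short to implement; below is a
minimal R sketch of Algorithm 2, where \texttt{fit} and
\texttt{predict\_fn} are placeholders for any base classifier's training
and prediction functions:
\begin{lstlisting}[language=R]
pseudo_label <- function(fit, predict_fn, X0, y0, Xu,
                         max_iter = 10) {
  model <- fit(X0, y0)            # train on the labeled set
  yu <- predict_fn(model, Xu)     # pseudo-label the unlabeled set
  for (j in seq_len(max_iter)) {
    X <- rbind(X0, Xu)            # stack labeled and pseudo-labeled
    y <- c(y0, yu)
    model <- fit(X, y)            # retrain on the augmented set
    yu <- predict_fn(model, Xu)   # refresh the pseudo-labels
  }
  model
}
\end{lstlisting}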
Pseudo-labeling is a wrapper algorithm and is hard to analyze in
general. However, Abney (2004) provides an analytic proof of how the
algorithm essentially ``optimizes either likelihood or a closely related
objective function K''. Like many other semi-supervised learning
techniques, it is in common use in fields where labeled data is scarce
relative to test data. In my review of the literature, I found that test
set sizes are always significantly larger than training sets, and my own
experiments showed that pseudo-labeling does not have a strong impact
when the test set is similar in size to, or smaller than, the training
set. In our case, however, the test set is significantly larger than the
training set, and pseudo-labeling significantly improved accuracy on the
test set.
PLSDA showed a marked improvement over its supervised counterpart, as
seen in Table 2.
However, the algorithm that performed best of all was the least squares
classifier with L2 regularization. This algorithm is equivalent to
assigning each class a numeric value and then performing ridge
regression on the data set. To predict new values, the weights are
applied as in linear regression, and the class value closest to
\(\hat y_i\) is the predicted class. A grid search was performed over
\(p = 1000, 1100, \ldots, 2000\) and \(\lambda = 0, 0.1, \ldots, 1\).
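A minimal base-R sketch of this classifier, with the class codes and
object names as placeholders (for simplicity the intercept is also
penalized here):
\begin{lstlisting}[language=R]
# ridge-regularized least squares on numerically coded classes
ls_classifier <- function(X, y, lambda) {
  Xb <- cbind(1, X)  # add intercept column
  solve(crossprod(Xb) + lambda * diag(ncol(Xb)), crossprod(Xb, y))
}
predict_ls <- function(w, X, class_values) {
  yhat <- drop(cbind(1, X) %*% w)
  # assign each point the class whose numeric code is closest
  class_values[apply(abs(outer(yhat, class_values, "-")), 1,
                     which.min)]
}
\end{lstlisting}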
\hypertarget{conclusion-future-research-and-personal-thoughts}{%
\section{Conclusion, Future Research and Personal
Thoughts}\label{conclusion-future-research-and-personal-thoughts}}
The results show that for a data set with these characteristics, namely
a small proportion of labeled data points, high dimensionality, fewer
samples per class than features (\(n_k<p\)), noise, and collinearity, we
can use the sort of techniques that are standard in chemometrics and
biostatistics, where many data sets are of this nature. Discriminant
analysis, PLSDA, and logistic regression, the fundamental tools of
statistical learning theory, all yielded good results.

However, it is clear that semi-supervised learning techniques are very
useful in scenarios where unlabeled data far exceeds labeled data. Using
pseudo-labeling always reduced the generalization error. With more time,
I would have attempted other classifiers in conjunction with
pseudo-labeling, for example SVMs. Furthermore, there are many
semi-supervised learning techniques which I did not explore, and these
present clear opportunities for improvement.
One drawback of semi-supervised learning, though, is the inherently
serial nature of many of its algorithms. Much like the EM family, most
iterate until convergence using results from the previous iteration, and
hence do not lend themselves to parallel implementations. The increase
in running time is compounded by the fact that semi-supervised learning
uses several times more data. Having said that, one can still run many
different algorithms, each on its own core, but this essentially
requires containers, or finding a parallel implementation of the base
classifier if one exists; either poses a greater technical challenge for
the average practitioner.
Many properties of these algorithms have yet to be researched. A clear
example is pseudo-labeling: a very simple algorithm that is nonetheless
not well understood. In my own work, I observed that training error
increased with each iteration of the pseudo-labeling algorithm, but it
has not been rigorously proven whether that increase is bounded, and
hence whether the procedure converges; whether such convergence reaches
some sort of optimum; or whether the algorithm eventually overfits and
misses its optimum.
\hypertarget{bibliography}{%
\section*{Bibliography}\label{bibliography}}
\addcontentsline{toc}{section}{Bibliography}
\hypertarget{refs}{}
\leavevmode\hypertarget{ref-abney2004understanding}{}%
Abney, Steven. 2004. ``Understanding the Yarowsky Algorithm.''
\emph{Computational Linguistics} 30 (3): 365--95.
\leavevmode\hypertarget{ref-barker2003partial}{}%
Barker, Matthew, and William Rayens. 2003. ``Partial Least Squares for
Discrimination.'' \emph{Journal of Chemometrics: A Journal of the
Chemometrics Society} 17 (3): 166--73.
\leavevmode\hypertarget{ref-chevallier2006application}{}%
Chevallier, Sylvie, Dominique Bertrand, Achim Kohler, and Philippe
Courcoux. 2006. ``Application of PLS-DA in Multivariate Image
Analysis.'' \emph{Journal of Chemometrics: A Journal of the Chemometrics
Society} 20 (5): 221--29.
\leavevmode\hypertarget{ref-friedman1989regularized}{}%
Friedman, Jerome H. 1989. ``Regularized Discriminant Analysis.''
\emph{Journal of the American Statistical Association} 84 (405):
165--75.
\leavevmode\hypertarget{ref-gottfries1995diagnosis}{}%
Gottfries, Johan, Kaj Blennow, Anders Wallin, and CG Gottfries. 1995.
``Diagnosis of Dementias Using Partial Least Squares Discriminant
Analysis.'' \emph{Dementia and Geriatric Cognitive Disorders} 6 (2):
83--88.
\leavevmode\hypertarget{ref-hastie1996discriminant}{}%
Hastie, Trevor, and Robert Tibshirani. 1996. ``Discriminant Analysis by
Gaussian Mixtures.'' \emph{Journal of the Royal Statistical Society.
Series B (Methodological)}, 155--76.
\leavevmode\hypertarget{ref-Perez207225}{}%
Perez, Daniel Ruiz, and Giri Narasimhan. 2018. ``So You Think You Can
PLS-DA?'' \emph{bioRxiv}. \url{https://doi.org/10.1101/207225}.
\leavevmode\hypertarget{ref-ramey2016high}{}%
Ramey, John A, Caleb K Stein, Phil D Young, and Dean M Young. 2016.
``High-Dimensional Regularized Discriminant Analysis.'' \emph{arXiv
Preprint arXiv:1602.01182}.
\leavevmode\hypertarget{ref-staahle1987partial}{}%
Ståhle, Lars, and Svante Wold. 1987. ``Partial Least Squares Analysis
with Cross-Validation for the Two-Class Problem: A Monte Carlo Study.''
\emph{Journal of Chemometrics} 1 (3): 185--96.
\leavevmode\hypertarget{ref-zhu2006semi}{}%
Zhu, Xiaojin. 2006. ``Semi-Supervised Learning Literature Survey.''
\emph{Computer Science, University of Wisconsin-Madison} 2 (3): 4.
\end{document}