<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.3.336">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="dcterms.date" content="2023-05-15">
<title>Introduction to Deep Neural Networks</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<script src="Introduction_to_Deep_Learning_files/libs/clipboard/clipboard.min.js"></script>
<script src="Introduction_to_Deep_Learning_files/libs/quarto-html/quarto.js"></script>
<script src="Introduction_to_Deep_Learning_files/libs/quarto-html/popper.min.js"></script>
<script src="Introduction_to_Deep_Learning_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="Introduction_to_Deep_Learning_files/libs/quarto-html/anchor.min.js"></script>
<link href="Introduction_to_Deep_Learning_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="Introduction_to_Deep_Learning_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="Introduction_to_Deep_Learning_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="Introduction_to_Deep_Learning_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="Introduction_to_Deep_Learning_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#introduction-to-deep-neural-networks" id="toc-introduction-to-deep-neural-networks" class="nav-link active" data-scroll-target="#introduction-to-deep-neural-networks">Introduction to Deep Neural Networks</a>
<ul class="collapse">
<li><a href="#historical-background-and-key-milestones" id="toc-historical-background-and-key-milestones" class="nav-link" data-scroll-target="#historical-background-and-key-milestones">Historical Background and Key Milestones</a>
<ul class="collapse">
<li><a href="#the-rise-of-deep-learning" id="toc-the-rise-of-deep-learning" class="nav-link" data-scroll-target="#the-rise-of-deep-learning">The rise of Deep learning</a></li>
<li><a href="#the-early-history-of-artificial-neural-networksintelligence" id="toc-the-early-history-of-artificial-neural-networksintelligence" class="nav-link" data-scroll-target="#the-early-history-of-artificial-neural-networksintelligence">The early history of artificial [neural networks]/intelligence</a></li>
<li><a href="#comparison-with-traditional-machine-learning" id="toc-comparison-with-traditional-machine-learning" class="nav-link" data-scroll-target="#comparison-with-traditional-machine-learning">Comparison with Traditional Machine Learning</a></li>
</ul></li>
<li><a href="#artificial-neural-networks" id="toc-artificial-neural-networks" class="nav-link" data-scroll-target="#artificial-neural-networks">Artificial Neural Networks</a>
<ul class="collapse">
<li><a href="#the-perceptron-the-building-block" id="toc-the-perceptron-the-building-block" class="nav-link" data-scroll-target="#the-perceptron-the-building-block">The perceptron, the building block</a></li>
<li><a href="#neurons-and-activation-functions" id="toc-neurons-and-activation-functions" class="nav-link" data-scroll-target="#neurons-and-activation-functions">Neurons and Activation Functions</a></li>
<li><a href="#multilayer-perceptrons" id="toc-multilayer-perceptrons" class="nav-link" data-scroll-target="#multilayer-perceptrons">Multilayer perceptrons</a></li>
</ul></li>
<li><a href="#an-example" id="toc-an-example" class="nav-link" data-scroll-target="#an-example">An example</a>
<ul class="collapse">
<li><a href="#data-pre-processing" id="toc-data-pre-processing" class="nav-link" data-scroll-target="#data-pre-processing">Data pre-processing</a></li>
<li><a href="#training-a-neural-network" id="toc-training-a-neural-network" class="nav-link" data-scroll-target="#training-a-neural-network">Training a neural network</a></li>
<li><a href="#model-evaluation" id="toc-model-evaluation" class="nav-link" data-scroll-target="#model-evaluation">Model evaluation</a></li>
</ul></li>
</ul></li>
<li><a href="#some-mathematics-behind-ann" id="toc-some-mathematics-behind-ann" class="nav-link" data-scroll-target="#some-mathematics-behind-ann">Some mathematics behind ANN</a>
<ul class="collapse">
<li><a href="#a-guiding-example" id="toc-a-guiding-example" class="nav-link" data-scroll-target="#a-guiding-example">A guiding example</a>
<ul class="collapse">
<li><a href="#a-logistic-regression-ann" id="toc-a-logistic-regression-ann" class="nav-link" data-scroll-target="#a-logistic-regression-ann">A logistic regression ANN</a></li>
</ul></li>
<li><a href="#parametrizing-an-ann" id="toc-parametrizing-an-ann" class="nav-link" data-scroll-target="#parametrizing-an-ann">Parametrizing an ANN</a></li>
<li><a href="#compacting-notation" id="toc-compacting-notation" class="nav-link" data-scroll-target="#compacting-notation">Compacting notation</a>
<ul class="collapse">
<li><a href="#forward-propagation" id="toc-forward-propagation" class="nav-link" data-scroll-target="#forward-propagation">Forward propagation</a></li>
</ul></li>
<li><a href="#multiple-architectures-for-ann" id="toc-multiple-architectures-for-ann" class="nav-link" data-scroll-target="#multiple-architectures-for-ann">Multiple architectures for ANN</a>
<ul class="collapse">
<li><a href="#feed-forward-neural-networks" id="toc-feed-forward-neural-networks" class="nav-link" data-scroll-target="#feed-forward-neural-networks">Feed Forward Neural networks</a></li>
<li><a href="#the-number-of-output-units" id="toc-the-number-of-output-units" class="nav-link" data-scroll-target="#the-number-of-output-units">The number of output units</a></li>
</ul></li>
<li><a href="#a-loss-function-for-optimization" id="toc-a-loss-function-for-optimization" class="nav-link" data-scroll-target="#a-loss-function-for-optimization">A loss function for optimization</a></li>
<li><a href="#gradient-descent" id="toc-gradient-descent" class="nav-link" data-scroll-target="#gradient-descent">Gradient descent</a>
<ul class="collapse">
<li><a href="#initialization" id="toc-initialization" class="nav-link" data-scroll-target="#initialization">Initialization</a></li>
</ul></li>
<li><a href="#stochastic-gradient" id="toc-stochastic-gradient" class="nav-link" data-scroll-target="#stochastic-gradient">Stochastic Gradient</a></li>
</ul></li>
<li><a href="#references-and-resources" id="toc-references-and-resources" class="nav-link" data-scroll-target="#references-and-resources">References and resources</a></li>
</ul>
<div class="quarto-alternate-formats"><h2>Other Formats</h2><ul><li><a href="Introduction_to_Deep_Learning.pdf"><i class="bi bi-file-pdf"></i>PDF</a></li></ul></div></nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Introduction to Deep Neural Networks</h1>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Authors</div>
<div class="quarto-title-meta-contents">
<p>Esteban Vegas </p>
<p>Ferran Reverter </p>
<p>Alex Sanchez </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">May 15, 2023</p>
</div>
</div>
</div>
</header>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-1_8331f248d7e6948494d8d0dedf192baa">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">options</span>(<span class="at">width=</span><span class="dv">100</span>) </span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span>(<span class="sc">!</span><span class="fu">require</span>(<span class="st">"knitr"</span>)) <span class="fu">install.packages</span>(<span class="st">"knitr"</span>)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(<span class="st">"knitr"</span>)</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="co">#getOption("width")</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span>opts_chunk<span class="sc">$</span><span class="fu">set</span>(<span class="at">comment=</span><span class="cn">NA</span>,<span class="at">echo =</span> <span class="cn">TRUE</span>, <span class="at">cache=</span><span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<section id="introduction-to-deep-neural-networks" class="level1">
<h1>Introduction to Deep Neural Networks</h1>
<section id="historical-background-and-key-milestones" class="level2">
<h2 class="anchored" data-anchor-id="historical-background-and-key-milestones">Historical Background and Key Milestones</h2>
<p>Today, in April 2023, our world is convulsed by the explosion of Artificial Intelligence.</p>
<p>Although the field has been growing steadily, it is probably in the last months (weeks even), since ChatGPT arrived, that everybody has formed an opinion, or a fear, about the topic.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="https://bernardmarr.com/wp-content/uploads/2022/04/The-Dangers-Of-Not-Aligning-Artificial-Intelligence-With-Human-Values.jpg" title="The 5 Biggest Artificial Intelligence (AI) Trends In 2023, Bernard Marr" class="img-fluid figure-img"></p>
</figure>
</div>
<p>AI engines use statistical learning methods, such as machine learning algorithms, to make predictions based on large amounts of data.</p>
<p>Prediction is a fundamental capability of AI and is used in a wide range of applications, from recommendation systems to natural language processing to image recognition.</p>
<p>However, it is important to keep in mind that AI has far-reaching implications beyond its predictive capabilities, including ethical, social, and technological considerations that must be taken into account when developing and deploying AI systems.</p>
<section id="the-rise-of-deep-learning" class="level3">
<h3 class="anchored" data-anchor-id="the-rise-of-deep-learning">The rise of Deep learning</h3>
<p>Deep learning is a highly successful model in the field of AI, which has powered numerous applications in various domains. It has shown remarkable performance in tasks such as image recognition, natural language processing, and speech recognition.</p>
<p>Deep learning extends the basic principles of artificial neural networks by introducing more complex architectures and algorithms and, at the same time, by enabling machines to learn from large datasets by automatically identifying relevant patterns and features without explicit programming.</p>
<p>One key advantage of deep learning over traditional machine learning algorithms is its ability to handle high-dimensional and unstructured data such as images, videos, and audio.</p>
</section>
<section id="the-early-history-of-artificial-neural-networksintelligence" class="level3">
<h3 class="anchored" data-anchor-id="the-early-history-of-artificial-neural-networksintelligence">The early history of artificial [neural networks]/intelligence</h3>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/AIHistory1.jpg" class="img-fluid figure-img"></p>
<figcaption class="figure-caption">A Brief History of AI from 1940s till Today</figcaption>
</figure>
</div>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><a href="https://nerdyelectronics.com/a-quick-history-of-ai-ml-and-dl/"><img src="images/AIHistory2.jpg" class="img-fluid figure-img"></a></p>
<figcaption class="figure-caption">The origins of Deep learning and modern Artificial Intelligence can be traced back to the per4ceptron. Source: “A Quick History of AI, ML and DL”</figcaption>
</figure>
</div>
<p>The origins of AI, and hence of DL, can be traced back almost a century. While it is an interesting, even fascinating, history, we do not go into it here (see a summary in <a href="https://nerdyelectronics.com/a-quick-history-of-ai-ml-and-dl/" id="AIHistory">A Quick History of AI, ML and DL</a>).</p>
<p>There are, however, several milestones worth highlighting, because we will go through them to understand how a deep neural network works. These are:</p>
<ul>
<li><p>The <strong>Perceptron</strong> and the first <strong>Artificial Neural Network</strong> where the basic building block was introduced.</p></li>
<li><p>The <strong>Multilayered perceptron</strong> and back-propagation where complex architectures were suggested to improve the capabilities.</p></li>
<li><p><strong>Deep Neural Networks</strong>, with many hidden layers, and auto-tunability capabilities.</p></li>
</ul>
<p>In short, there has been a mathematical and a technological evolution that, at some point, brought together:</p>
<ul>
<li><p>The required theoretical background (DNN)</p></li>
<li><p>The required computational capabilities (GPU, HPC)</p></li>
<li><p>The required quantity of data (Big Data, Images, Social Networks)</p></li>
</ul>
<p>This has resulted in making artificial intelligence widely accessible to businesses, researchers, and the general public.</p>
<p><img src="images/WhyDLNow.png" class="img-fluid" style="width:100.0%" data-fig-align="center" alt="Why Deep Learning Now?"> Source: Alex Amini’s ‘MIT Introduction to Deep Learning’ course (introtodeeplearning.com)</p>
<p>Success stories such as</p>
<ul>
<li><p>the development of self-driving cars,</p></li>
<li><p>the use of AI in medical diagnosis, and</p></li>
<li><p>the creation of personalized recommendations in online shopping</p></li>
</ul>
<p>have also contributed to the widespread adoption of AI.</p>
</section>
<section id="comparison-with-traditional-machine-learning" class="level3">
<h3 class="anchored" data-anchor-id="comparison-with-traditional-machine-learning">Comparison with Traditional Machine Learning</h3>
<p>A reasonable question is: “<em>How are Artificial Intelligence, Machine Learning and Deep learning related</em>”?</p>
<p>A standard answer can be found in the image below, of which there are a myriad variations:</p>
<div class="cell" data-layout-align="center" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-2_01086ad8d94a6150ca19bcf3ba777a53">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">include_graphics</span>(<span class="st">"images/AI-ML-DL-1.jpg"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/AI-ML-DL-1.jpg" class="img-fluid figure-img" style="width:100.0%"></p>
</figure>
</div>
</div>
</div>
<p>We can keep, for instance, the following three definitions, which also have many variants:</p>
<ul>
<li><p>Artificial intelligence is the ability of a computer to perform tasks commonly associated with intelligent beings.</p></li>
<li><p>Machine learning is the study of algorithms that learn from examples and experience, instead of relying on hard-coded rules, and that make predictions on new data.</p></li>
<li><p>Deep learning is a sub-field of machine learning focusing on learning data representations as successive layers of increasingly meaningful representations.</p></li>
</ul>
<div class="cell" data-layout-align="center" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-3_a5fa9d639ae9d6458a58f33e44eb255c">
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">include_graphics</span>(<span class="st">"images/ML_vs_DL-2.png"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/ML_vs_DL-2.png" class="img-fluid figure-img" style="width:100.0%"></p>
</figure>
</div>
</div>
</div>
<p>We will come back to the difference between ML and DL, but two strengths of DL that differentiate it from ML are:</p>
<ul>
<li>DNNs combine feature extraction and classification in a way that does not require (or dramatically decreases) human intervention.</li>
<li>The power of DNNs resides in their ability to keep improving as more data become available, seemingly without reaching the plateaus that ML algorithms do.</li>
</ul>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/PerformanceVsAmountOfData.png" class="img-fluid figure-img"></p>
<figcaption class="figure-caption">An illustration of the performance comparison between deep learning (DL) and other machine learning (ML) algorithms, where DL modeling from large amounts of data can increase the performance</figcaption>
</figure>
</div>
<p><strong>Deep learning is having a strong impact</strong></p>
<ul>
<li><p>Near-human-level image classification</p></li>
<li><p>Near-human-level speech transcription</p></li>
<li><p>Near-human-level handwriting transcription</p></li>
<li><p>Dramatically improved machine translation</p></li>
<li><p>Dramatically improved text-to-speech conversion</p></li>
<li><p>Digital assistants such as Google Assistant and Amazon Alexa</p></li>
<li><p>Near-human-level autonomous driving</p></li>
<li><p>Improved ad targeting, as used by Google, Baidu, or Bing</p></li>
<li><p>Improved search results on the web</p></li>
<li><p>Ability to answer natural language questions</p></li>
<li><p>Superhuman Go playing</p></li>
</ul>
<p>According to <span class="citation" data-cites="chollet2022">(<a href="#ref-chollet2022" role="doc-biblioref"><strong>chollet2022?</strong></a>)</span> … “<em>we shouldn’t believe the short-term hype, but should believe in the long-term vision. It may take a while for AI to be deployed to its true potential—a potential the full extent of which no one has yet dared to dream—but AI is coming, and it will transform our world in a fantastic way</em>”.</p>
<p>With this introduction in place, we can move on to the building block of neural networks, the perceptron.</p>
</section>
</section>
<section id="artificial-neural-networks" class="level2">
<h2 class="anchored" data-anchor-id="artificial-neural-networks">Artificial Neural Networks</h2>
<section id="the-perceptron-the-building-block" class="level3">
<h3 class="anchored" data-anchor-id="the-perceptron-the-building-block">The perceptron, the building block</h3>
<p>The perceptron (one version of it, at least) was introduced by Rosenblatt as a mathematical model that might emulate a neuron.</p>
<p>The idea is: <em>how can we produce a model that, given some inputs and an appropriate set of examples, learns to produce the desired output</em>?</p>
<p>The first computational model of a neuron was proposed by Warren McCulloch (neuroscientist) and Walter Pitts (logician) in 1943.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><a href="https://towardsdatascience.com/mcculloch-pitts-model-5fdf65ac5dd1"><img src="images/MacCulloghPitts-Neuron.png" class="img-fluid figure-img"></a></p>
</figure>
</div>
<p>It may be divided into two parts. The first part, <span class="math inline">\(g\)</span>, takes an input (ahem, dendrite, ahem), performs an aggregation and, based on the aggregated value, the second part, <span class="math inline">\(f\)</span>, makes a decision. See <a href="https://towardsdatascience.com/mcculloch-pitts-model-5fdf65ac5dd1">the source of this picture</a> for an illustration of how this can be used to emulate logical operations such as AND, OR or NOT, but not XOR.</p>
<p>This first attempt to emulate neurons succeeded but with limitations:</p>
<ul>
<li><p>What about non-Boolean (say, real) inputs?</p></li>
<li><p>What if all inputs are not equal?</p></li>
<li><p>What if we want to assign more importance to some inputs?</p></li>
<li><p>What about functions which are not linearly separable? Say XOR function</p></li>
</ul>
<p>To overcome these limitations, Frank Rosenblatt, an American psychologist, proposed the classical perceptron model, the <em>artificial neuron</em>, in 1958. It is a more generalized computational model than the McCulloch-Pitts neuron, in which weights and thresholds can be learnt over time.</p>
<p>The perceptron proposed by Rosenblatt is very similar to an M-P neuron, but it takes a weighted sum of the inputs and sets the output to one only when the sum exceeds an arbitrary threshold (<strong><em>theta</em></strong>).</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><a href="https://towardsdatascience.com/perceptron-the-artificial-neuron-4d8c70d5cc8d"><img src="images/RosenblattPerceptron1.png" class="img-fluid figure-img"></a></p>
</figure>
</div>
<p>Additionally, instead of hand-coding the thresholding parameter <span class="math inline">\(\theta\)</span>, we add it as one of the inputs, with the weight <span class="math inline">\(w_0=-\theta\)</span>, as shown below, which makes it learnable.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><a href="https://towardsdatascience.com/perceptron-the-artificial-neuron-4d8c70d5cc8d"><img src="images/RosenblattPerceptron2.png" class="img-fluid figure-img"></a></p>
</figure>
</div>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><a href="https://towardsdatascience.com/perceptron-the-artificial-neuron-4d8c70d5cc8d"><img src="images/McCullaughVSRosenblattPerceptron.png" class="img-fluid figure-img"></a></p>
</figure>
</div>
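<p>As a minimal sketch of this decision rule (the weights and threshold below are hypothetical values chosen only to illustrate the computation, not Rosenblatt’s learning algorithm), the weighted-sum-and-threshold computation can be written in R as follows:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Rosenblatt-style perceptron decision rule: output 1 when the
# weighted sum of the inputs exceeds the threshold theta
perceptron &lt;- function(x, w, theta) {
  as.numeric(sum(w * x) &gt;= theta)
}

# Equivalently, the threshold can be absorbed as a learnable weight
# w0 = -theta attached to a constant input x0 = +1
perceptron_bias &lt;- function(x, w, w0) {
  as.numeric(w0 + sum(w * x) &gt;= 0)
}

# Hypothetical weights emulating a logical AND of two binary inputs
w &lt;- c(1, 1); theta &lt;- 1.5
perceptron(c(1, 1), w, theta)  # 1
perceptron(c(1, 0), w, theta)  # 0</code></pre></div>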
<p>Now, while this is an improvement (because both the weights and the threshold can be learned and the inputs can be real values) there is still a drawback in that a single perceptron can only be used to implement linearly separable functions.</p>
<p>Artificial Neural Networks improve on this by introducing <em>Activation Functions</em> which, eventually, can be non-linear.</p>
</section>
<section id="neurons-and-activation-functions" class="level3">
<h3 class="anchored" data-anchor-id="neurons-and-activation-functions">Neurons and Activation Functions</h3>
<p>An activation function is a function that is added into an artificial neuron in order to help it learn complex patterns in the data.</p>
<p>How do biological and artificial neurons compare?</p>
<p>Biological neurons are specialized cells in the central nervous system that transmit electrical and chemical signals to communicate with each other and the rest of the body.</p>
<p>On the other hand, artificial neurons are mathematical models used in neural networks to process information.</p>
<p>In both biological and artificial neurons, the <strong>activation function</strong> is what is responsible for <em>deciding whether the neuron activates or not based on the input it receives</em>.</p>
<ul>
<li>In the case of a biological neuron, the activation function is based on the release of neurotransmitters, which are chemical substances that transmit signals between nerve cells. When the electrical signal reaching the neuron exceeds a certain threshold, the neuron releases neurotransmitters, which are received by other neurons or cells in the body to continue the communication process.</li>
<li>On the other hand, in an artificial neuron, the activation function is a mathematical function applied to the neuron’s input to produce an output. Like in the biological neuron, this activation function decides whether the neuron activates or not based on the input it receives.</li>
</ul>
<div class="cell" data-layout-align="center" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-4_800080a34a3df52179bfe21e0239c583">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">include_graphics</span>(<span class="st">"images/ActivationFunction0.png"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/ActivationFunction0.png" class="img-fluid figure-img" style="width:100.0%"></p>
</figure>
</div>
</div>
</div>
<p><a href="https://towardsdatascience.com/everything-you-need-to-know-about-activation-functions-in-deep-learning-models-84ba9f82c253">Read more here about activation functions.</a></p>
<p>With all these inputs in mind we can now define an Artificial Neuron as a <em>computational unit</em> that:</p>
<ul>
<li>takes as input <span class="math inline">\(x=(x_0,x_1,x_2,x_3)\)</span> (<span class="math inline">\(x_0\)</span> = +1, called bias), and</li>
<li>outputs <span class="math inline">\(h_{\theta}(x) = f(\theta^\intercal x) = f(\sum_i \theta_ix_i)\)</span>,</li>
<li>where <span class="math inline">\(f:\mathbb{R}\mapsto \mathbb{R}\)</span> is called the <strong>activation function</strong>.</li>
</ul>
<p>The goal of the activation function is to provide the Neuron with <em>the capability of producing the required outputs</em>.</p>
<p>For instance, if the output has to be a probability, the activation function will only produce values between 0 and 1.</p>
<p>With this idea in mind activation functions are chosen from a set of pre-defined functions:</p>
<ul>
<li>the sigmoid function:</li>
</ul>
<p><span class="math display">\[
f(z)=\frac{1}{1+e^{-z}}
\]</span></p>
<ul>
<li>the hyperbolic tangent, or <code>tanh</code>, function:</li>
</ul>
<p><span class="math display">\[
f(z)=\frac{e^{z}-e^{-z}}{e^{z}+e^{-z}}
\]</span></p>
<p>The <code>tanh(z)</code> function is a re-scaled version of the sigmoid, and its output range is <span class="math inline">\([-1,1]\)</span> instead of <span class="math inline">\([0,1]\)</span>.</p>
<p>Two useful properties to recall are that:</p>
<ul>
<li><p><em>If</em> <span class="math inline">\(f(z)=1/(1+e^{-z})\)</span> is the sigmoid function, then its derivative is given by <span class="math inline">\(f'(z)=f(z)(1-f(z))\)</span>.</p></li>
<li><p><em>Similarly, if</em> <span class="math inline">\(f\)</span> is the <code>tanh</code> function, then its derivative is given by <span class="math inline">\(f'(z)=1-(f(z))^2\)</span>.</p></li>
<li><p>In modern neural networks, the default recommendation is to use the <em>rectified linear unit</em> or ReLU defined by the activation function <span class="math inline">\(f(z)=\max\{0,z\}\)</span>.</p></li>
</ul>
<p>This function remains very close to a linear one, in the sense that it is a piece-wise linear function with two linear pieces.</p>
<p>Because rectified linear units are nearly linear, they preserve many of the properties that make linear models easy to optimize with gradient based methods.</p>
<p>They also preserve many of the properties that make linear models generalize well.</p>
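<p>As a small illustrative sketch, the activation functions above and the derivative identities just stated can be written directly in R:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Common activation functions
sigmoid  &lt;- function(z) 1 / (1 + exp(-z))
tanh_act &lt;- function(z) (exp(z) - exp(-z)) / (exp(z) + exp(-z))  # same as base tanh()
relu     &lt;- function(z) pmax(0, z)

# Derivatives expressed through the function values themselves
sigmoid_prime &lt;- function(z) sigmoid(z) * (1 - sigmoid(z))
tanh_prime    &lt;- function(z) 1 - tanh_act(z)^2

z &lt;- c(-2, 0, 2)
sigmoid(z)    # values in (0, 1)
tanh_act(z)   # values in (-1, 1)
relu(z)       # 0 for negative inputs, identity otherwise</code></pre></div>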
<p><a href="https://medium.com/@shrutijadon/survey-on-activation-functions-for-deep-learning-9689331ba092"><img src="images/ActivationFunctions.png" class="img-fluid"></a>.</p>
<p><strong>Putting it all together</strong> we have the following schematic representation of an artificial neuron, where <span class="math inline">\(\Sigma=\left\langle w_{j}, x\right\rangle+ b_{j}\)</span> and <span class="math inline">\(\left\langle w_{j}, x\right\rangle\)</span> represents the dot product between the vectors <span class="math inline">\(w\)</span> and <span class="math inline">\(x\)</span>.</p>
<p><img src="images/ArtificialNeuron.png" class="img-fluid"></p>
</section>
<section id="multilayer-perceptrons" class="level3">
<h3 class="anchored" data-anchor-id="multilayer-perceptrons">Multilayer perceptrons</h3>
<p>A multilayer perceptron (or Artificial neural network) is a structure composed by <em>several hidden layers of neurons</em> where the output of a neuron of a layer becomes the input of a neuron of the next layer.</p>
<p>Moreover, the output of a neuron can also be the input of a neuron of the same layer or of a neuron of a previous layer (this is the case for recurrent neural networks). On the last layer, called the output layer, we may apply an activation function different from the ones used in the hidden layers, depending on the type of problem at hand: regression or classification.</p>
<p>The Figure below represents a neural network with three input variables, one output variable, and two hidden layers.</p>
<div class="cell" data-layout-align="center" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-5_28206c0d978a94d687d78789a46d5f5b">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>knitr<span class="sc">::</span><span class="fu">include_graphics</span>(<span class="st">"images/MultiLayer1.png"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/MultiLayer1.png" class="img-fluid figure-img" style="width:100.0%"></p>
</figure>
</div>
</div>
</div>
<p>Multilayer perceptrons have a basic architecture, since each unit (or neuron) of a layer is linked to all the units of the next layer but has no link with the neurons of the same layer.</p>
<p>The parameters of the architecture are:</p>
<ul>
<li>the number of hidden layers and</li>
<li>the number of neurons in each layer.</li>
</ul>
<p>The activation functions are also chosen by the user. For the output layer, as mentioned previously, the activation function is generally different from the one used on the hidden layers. For example:</p>
<ul>
<li>For regression, we apply no activation function on the output layer.</li>
<li>For binary classification, the output gives a prediction of <span class="math inline">\(\mathbb{P}(Y=1 / X)\)</span>; since this value is in <span class="math inline">\([0,1]\)</span>, the sigmoid activation function is generally used.</li>
<li>For multi-class classification, the output layer contains one neuron per class <span class="math inline">\(i\)</span>, giving a prediction of <span class="math inline">\(\mathbb{P}(Y=i / X)\)</span>. The sum of all these values has to be equal to 1 (a small R sketch of the softmax function follows this list).
<ul>
<li>A common choice for multi-class ANN is the soft-max activation function: <span class="math display">\[
\operatorname{softmax}(z)_{i}=\frac{\exp \left(z_{i}\right)}{\sum_{j} \exp \left(z_{j}\right)}
\]</span></li>
</ul></li>
</ul>
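<p>A minimal R version of the softmax formula above (the score vector is an arbitrary example):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Softmax activation: turns a vector of scores z into class probabilities
# (subtracting max(z) first is a standard numerical-stability trick)
softmax &lt;- function(z) {
  e &lt;- exp(z - max(z))
  e / sum(e)
}

z &lt;- c(2.0, 1.0, 0.1)   # hypothetical scores for three classes
p &lt;- softmax(z)
p        # class probabilities
sum(p)   # 1</code></pre></div>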
</section>
</section>
<section id="an-example" class="level2">
<h2 class="anchored" data-anchor-id="an-example">An example</h2>
<p>In this example we train and use a “shallow neural network”, so called in contrast with “deep neural networks”.</p>
<p>We will use the <code>neuralnet</code> R package, which is not intended to work with deep neural networks, to build a simple neural network to predict if a type of stock pays dividends or not.</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-6_e7d8d19cae4e176e8eb3116f65d74c59">
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> (<span class="sc">!</span><span class="fu">require</span>(neuralnet)) </span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">install.packages</span>(<span class="st">"neuralnet"</span>, <span class="at">dep=</span><span class="cn">TRUE</span>)</span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> (<span class="sc">!</span><span class="fu">require</span>(caret)) </span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">install.packages</span>(<span class="st">"caret"</span>, <span class="at">dep=</span><span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The data for the example are the <code>dividendinfo.csv</code> dataset, available from: <a href="https://github.com/MGCodesandStats/datasets" class="uri">https://github.com/MGCodesandStats/datasets</a></p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-7_c53a098d15c8c0b026d3a4627d23ed10">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>mydata <span class="ot"><-</span> <span class="fu">read.csv</span>(<span class="st">"https://raw.githubusercontent.com/MGCodesandStats/datasets/master/dividendinfo.csv"</span>)</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="fu">str</span>(mydata)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>'data.frame': 200 obs. of 6 variables:
$ dividend : int 0 1 1 0 1 1 1 0 1 1 ...
$ fcfps : num 2.75 4.96 2.78 0.43 2.94 3.9 1.09 2.32 2.5 4.46 ...
$ earnings_growth: num -19.25 0.83 1.09 12.97 2.44 ...
$ de : num 1.11 1.09 0.19 1.7 1.83 0.46 2.32 3.34 3.15 3.33 ...
$ mcap : int 545 630 562 388 684 621 656 351 658 330 ...
$ current_ratio : num 0.924 1.469 1.976 1.942 2.487 ...</code></pre>
</div>
</div>
<section id="data-pre-processing" class="level3">
<h3 class="anchored" data-anchor-id="data-pre-processing">Data pre-processing</h3>
<p>One of the most important procedures when forming a neural network is data normalization. This involves adjusting the data to a common scale so as to accurately compare predicted and actual values. Failure to normalize the data will typically result in the prediction value remaining the same across all observations, regardless of the input values.</p>
<p>We can do this in two ways in R:</p>
<ul>
<li>Scale the data frame automatically using the <code>scale</code> function in R (a quick sketch of this option follows the list)</li>
<li>Transform the data using a max-min normalization technique</li>
</ul>
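<p>For completeness, the first option, standardization with base R’s <code>scale</code> function, would look like the following sketch (it maps each column to zero mean and unit variance rather than to the <span class="math inline">\([0,1]\)</span> interval, and is not used in the rest of the example):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Alternative pre-processing: column-wise standardization with scale()
scaledData &lt;- as.data.frame(scale(mydata))
summary(scaledData$fcfps)   # each column now has mean 0 and unit variance</code></pre></div>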
<p>In this example we implement the max-min normalization technique.</p>
<p>See <a href="https://vitalflux.com/data-science-scale-normalize-numeric-data-using-r/">this link</a> for further details on how to use the normalization function.</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-8_fe7b6cba34ae9d4b663dba217359ce37">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>normalize <span class="ot"><-</span> <span class="cf">function</span>(x) {</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">return</span> ((x <span class="sc">-</span> <span class="fu">min</span>(x)) <span class="sc">/</span> (<span class="fu">max</span>(x) <span class="sc">-</span> <span class="fu">min</span>(x)))</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a>}</span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a>normData <span class="ot"><-</span> <span class="fu">as.data.frame</span>(<span class="fu">lapply</span>(mydata, normalize))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>As usual, the dataset is split into a training set and a test set. The training set contains a random selection of an (arbitrary) 66% of the observations.</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-9_ed0a379596fbf8f767f606748135ceb4">
<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>perc2Train <span class="ot"><-</span> <span class="dv">2</span><span class="sc">/</span><span class="dv">3</span></span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a>ssize <span class="ot"><-</span> <span class="fu">nrow</span>(normData)</span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">12345</span>)</span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a>data_rows <span class="ot"><-</span> <span class="fu">floor</span>(perc2Train <span class="sc">*</span>ssize)</span>
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a>train_indices <span class="ot"><-</span> <span class="fu">sample</span>(<span class="fu">c</span>(<span class="dv">1</span><span class="sc">:</span>ssize), data_rows)</span>
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a>trainset <span class="ot"><-</span> normData[train_indices,]</span>
<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a>testset <span class="ot"><-</span> normData[<span class="sc">-</span>train_indices,]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The <code>trainset</code> set will be used to train the network and the <code>testset</code> set will be used to evaluate it.</p>
</section>
<section id="training-a-neural-network" class="level3">
<h3 class="anchored" data-anchor-id="training-a-neural-network">Training a neural network</h3>
<p>Setting the parameters of a neural network requires experience and an understanding of their meaning; even so, different parameter settings can lead to similar results.</p>
<p>We create a simple NN with two hidden layers, with 3 and 2 neurons respectively. This is specified in the <code>hidden</code> parameter. For other parameters see <a href="https://www.rdocumentation.org/packages/neuralnet/versions/1.44.2/topics/neuralnet">the package help</a>.</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-10_3196d64e764d46738396cfa10e10d37e">
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Neural Network</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(neuralnet)</span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>nn <span class="ot"><-</span> <span class="fu">neuralnet</span>(dividend <span class="sc">~</span> fcfps <span class="sc">+</span> earnings_growth <span class="sc">+</span> de <span class="sc">+</span> mcap <span class="sc">+</span> current_ratio, </span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a> <span class="at">data=</span>trainset, </span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a> <span class="at">hidden=</span><span class="fu">c</span>(<span class="dv">3</span>,<span class="dv">2</span>), </span>
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a> <span class="at">linear.output=</span><span class="cn">FALSE</span>, </span>
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a> <span class="at">threshold=</span><span class="fl">0.01</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The output of the procedure is a neural network with estimated weights.</p>
<p>This can be seen with a <code>plot</code> function (including the <code>rep</code> argument).</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-11_c418fb678bd43d0ae037d2ea2ea74487">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plot</span>(nn, <span class="at">rep =</span> <span class="st">"best"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="Introduction_to_Deep_Learning_files/figure-html/unnamed-chunk-11-1.png" class="img-fluid" width="768"></p>
</div>
</div>
<p>The object <code>nn</code> contains information about the weights and the results, although it is not particularly clear or useful.</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-12_ceb450f7f422f42fe4f731ae29d2fbde">
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">summary</span>(nn)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> Length Class Mode
call 6 -none- call
response 133 -none- numeric
covariate 665 -none- numeric
model.list 2 -none- list
err.fct 1 -none- function
act.fct 1 -none- function
linear.output 1 -none- logical
data 6 data.frame list
exclude 0 -none- NULL
net.result 1 -none- list
weights 1 -none- list
generalized.weights 1 -none- list
startweights 1 -none- list
result.matrix 32 -none- numeric </code></pre>
</div>
<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a>nn<span class="sc">$</span>result.matrix</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [,1]
error 5.096531e-01
reached.threshold 9.874263e-03
steps 1.798000e+04
Intercept.to.1layhid1 -1.243872e+00
fcfps.to.1layhid1 -1.349137e-01
earnings_growth.to.1layhid1 3.151554e+00
de.to.1layhid1 -5.249806e+00
mcap.to.1layhid1 9.908495e-01
current_ratio.to.1layhid1 6.527535e+00
Intercept.to.1layhid2 1.660208e+00
fcfps.to.1layhid2 -2.401517e-01
earnings_growth.to.1layhid2 -1.385771e+00
de.to.1layhid2 7.682849e-01
mcap.to.1layhid2 -4.058053e+00
current_ratio.to.1layhid2 -2.855816e+00
Intercept.to.1layhid3 2.982002e+00
fcfps.to.1layhid3 -2.877651e+00
earnings_growth.to.1layhid3 -6.957763e-02
de.to.1layhid3 -2.965334e+00
mcap.to.1layhid3 -5.034300e+00
current_ratio.to.1layhid3 -1.086037e+00
Intercept.to.2layhid1 9.282087e-02
1layhid1.to.2layhid1 -2.341614e+00
1layhid2.to.2layhid1 3.001315e+00
1layhid3.to.2layhid1 5.107051e+00
Intercept.to.2layhid2 -4.188729e-02
1layhid1.to.2layhid2 3.029232e+00
1layhid2.to.2layhid2 -4.732821e+00
1layhid3.to.2layhid2 -9.017001e+00
Intercept.to.dividend -3.761263e-01
2layhid1.to.dividend -3.054146e+02
2layhid2.to.dividend 1.494655e+02</code></pre>
</div>
</div>
</section>
<section id="model-evaluation" class="level3">
<h3 class="anchored" data-anchor-id="model-evaluation">Model evaluation</h3>
<p>A prediction for each value in the <code>testset</code> dataset can be built with the <code>compute</code> function.</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-13_fe97e18bbd60341717805da9f2826aa8">
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="co">#Test the resulting output</span></span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a>temp_test <span class="ot"><-</span> <span class="fu">subset</span>(testset, <span class="at">select =</span></span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">c</span>(<span class="st">"fcfps"</span>,<span class="st">"earnings_growth"</span>, </span>
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a> <span class="st">"de"</span>, <span class="st">"mcap"</span>, <span class="st">"current_ratio"</span>))</span>
<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(temp_test)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> fcfps earnings_growth de mcap current_ratio
9 0.4929006 0.52417860 0.7862595 0.79741379 0.662994637
19 0.8722110 0.89705139 0.5190840 0.31465517 0.631284474
22 0.0811359 0.68272957 0.4554707 0.05747126 0.000785556
26 0.4077079 0.07649537 0.6310433 0.70977011 0.379642293
27 0.4279919 0.70362258 0.1882952 0.30603448 0.628283435
29 0.3509128 0.74203875 0.6030534 0.53017241 0.543404499</code></pre>
</div>
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>nn.results <span class="ot"><-</span> <span class="fu">compute</span>(nn, temp_test)</span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a>results <span class="ot"><-</span> <span class="fu">data.frame</span>(<span class="at">actual =</span> </span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> testset<span class="sc">$</span>dividend, </span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a> <span class="at">prediction =</span> nn.results<span class="sc">$</span>net.result)</span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(results)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> actual prediction
9 1 1.000000e+00
19 1 1.000000e+00
22 0 5.442517e-133
26 0 6.801894e-35
27 1 4.548179e-10
29 1 1.000000e+00</code></pre>
</div>
</div>
<p>A confusion matrix can be built to evaluate the predictive ability of the network:</p>
<div class="cell" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-14_44f6e014830a50889f87c8e460c604ac">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>roundedresults<span class="ot"><-</span><span class="fu">sapply</span>(results,round,<span class="at">digits=</span><span class="dv">0</span>)</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>roundedresultsdf<span class="ot">=</span><span class="fu">data.frame</span>(roundedresults)</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="fu">attach</span>(roundedresultsdf)</span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a>confMat<span class="ot"><-</span> caret<span class="sc">::</span><span class="fu">confusionMatrix</span>(<span class="fu">table</span>(actual, prediction))</span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a>confMat</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>Confusion Matrix and Statistics
prediction
actual 0 1
0 34 2
1 6 25
Accuracy : 0.8806
95% CI : (0.7782, 0.947)
No Information Rate : 0.597
P-Value [Acc > NIR] : 3.405e-07
Kappa : 0.7577
Mcnemar's Test P-Value : 0.2888
Sensitivity : 0.8500
Specificity : 0.9259
Pos Pred Value : 0.9444
Neg Pred Value : 0.8065
Prevalence : 0.5970
Detection Rate : 0.5075
Detection Prevalence : 0.5373
Balanced Accuracy : 0.8880
'Positive' Class : 0
</code></pre>
</div>
</div>
</section>
</section>
</section>
<section id="some-mathematics-behind-ann" class="level1">
<h1>Some mathematics behind ANN</h1>
<ul>
<li><p>An ANN is a predictive model whose properties and behaviour can be mathematically characterized.</p></li>
<li><p>In practice this means:</p>
<ul>
<li>The ANN acts by composing a series of linear and non-linear (activation) functions.</li>
<li>These are characterized by their <em>weights</em> and <em>biases</em>, that need to be <em>learnt</em>.</li>
</ul></li>
<li><p><em>Training</em> the network is done by</p>
<ul>
<li>Selecting an appropriate (convex) loss function,</li>
<li>Finding the weights that minimize the total <em>cost</em> function (avg. loss).</li>
</ul></li>
<li><p>This is usually done using some iterative optimization procedure such as <em>gradient descent</em> (a toy sketch is shown after this list).</p></li>
<li><p>This requires evaluating derivatives at a huge number of points.</p>
<ul>
<li>Such a high number of evaluations may be reduced by <em>Stochastic Gradient Descent</em>.</li>
<li>The evaluation of derivatives is simplified thanks to <em>Backpropagation</em>.</li>
</ul></li>
</ul>
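<p>As a toy illustration of the gradient-descent idea (nothing specific to neural networks; the cost function and learning rate are made up for the example), plain gradient descent on a one-parameter quadratic cost looks like this in R:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Plain gradient descent on the toy cost C(w) = (w - 3)^2,
# whose gradient is C'(w) = 2 * (w - 3)
gradient_descent &lt;- function(w0, lr = 0.1, steps = 50) {
  w &lt;- w0
  for (i in 1:steps) {
    grad &lt;- 2 * (w - 3)
    w &lt;- w - lr * grad   # move against the gradient
  }
  w
}

gradient_descent(w0 = 0)   # converges towards the minimizer w = 3</code></pre></div>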
<section id="a-guiding-example" class="level2">
<h2 class="anchored" data-anchor-id="a-guiding-example">A guiding example</h2>
<p>We will use a concrete model to explain the concepts, which can be easily generalized to more neurons and layers.</p>
<p>Consider the following simple ANN:</p>
<div class="cell" data-layout-align="center" data-hash="Introduction_to_Deep_Learning_cache/html/unnamed-chunk-15_1d5464df5247e552b6e1fdae3dc15893">
<div class="cell-output-display">
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/nn.jpg" class="img-fluid figure-img" style="width:100.0%"></p>
</figure>
</div>
</div>
</div>
<ul>
<li>The circles labelled +1 are called bias units, and correspond to the intercept, here named <em>bias</em> term.</li>
<li>The leftmost layer of the network is called the <em>input layer</em>.</li>
<li>The rightmost layer is the <em>output</em> layer (which, in this example, has only one node).</li>
<li>The middle layer(s) is(are) called the <em>hidden layer(s)</em>, because its values are not observed in the training set.</li>
</ul>
<p>So our example network has:</p>
<ul>
<li>The input layer with 3 input units (not counting the bias unit),</li>
<li>1 hidden layer with 3 hidden units, and</li>
<li>The output layer with 1 output unit.</li>
</ul>
<section id="a-logistic-regression-ann" class="level3">
<h3 class="anchored" data-anchor-id="a-logistic-regression-ann">A logistic regression ANN</h3>
<p>This ANN can be seen as a device to perform a logistic regression:</p>
<ul>
<li><p>From input layer to layer 2: non-linear transformation <span class="math inline">\(\rightarrow\)</span> new set of complex features.</p></li>
<li><p>From layer 2 to the output layer, a sigmoid activation function is used to produce the following output from the set of <em>complex features</em>.</p></li>
</ul>
<p><span class="math display">\[
\mbox{The output is: }h_{\theta}(x)=\frac{1}{1+e^{-\theta^\intercal x}}
\]</span></p>
<p>Recall that the logistic regression model is:</p>
<p><span class="math display">\[
\log\frac{p(Y=1|x)}{1-p(Y=1|x)}=\theta^\intercal x
\]</span></p>
<p>Exponentiating both sides, we have:</p>
<p><span class="math display">\[
\frac{p(Y=1|x)}{1-p(Y=1|x)}=e^{\theta^\intercal x}
\]</span></p>
<p>Solving for <span class="math inline">\(p(Y=1|x)\)</span>: <span class="math display">\[
p(Y=1|x)=\frac{e^{\theta^\intercal x}}{1+e^{\theta^\intercal x}}=\frac{1}{1+e^{-\theta^\intercal x}}
\]</span></p>
<p>That is, <em>when the activation function of the output node is the sigmoid activation function, the output coincides with a logistic regression on the complex features</em>.</p>
<ul>
<li>And, <span class="math inline">\(h_{\theta}(x)\)</span>, the output of the ANN, estimates <span class="math inline">\(p(Y=1|x)\)</span>.</li>
</ul>
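<p>As a small numerical sketch (not part of the original analysis; the values of <code>theta</code> and <code>x</code> below are made up), we can check in R that a sigmoid output unit reproduces exactly the logistic regression probability:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Illustrative sketch: a sigmoid output unit is a logistic regression
# on its inputs. 'theta' (bias + weights) and 'x' are made-up values.
sigmoid &lt;- function(z) 1 / (1 + exp(-z))

theta &lt;- c(0.5, -1.2, 0.8, 2.0)   # theta_0 (bias), theta_1, theta_2, theta_3
x     &lt;- c(1, 0.3, -0.7, 1.5)     # leading 1 plays the role of the bias unit

h &lt;- sigmoid(sum(theta * x))      # output of the ANN, h_theta(x)
p &lt;- plogis(sum(theta * x))       # logistic regression probability p(Y=1|x)
all.equal(h, p)                   # TRUE: the two coincide</code></pre></div>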
</section>
</section>
<section id="parametrizing-an-ann" class="level2">
<h2 class="anchored" data-anchor-id="parametrizing-an-ann">Parametrizing an ANN</h2>
<ul>
<li><p>Let <span class="math inline">\(n_l\)</span> denote the number of layers in our network, thus <span class="math inline">\(n_l=3\)</span> in our example.</p></li>
<li><p>Label layer <span class="math inline">\(l\)</span> as <span class="math inline">\(L_l\)</span>, so layer <span class="math inline">\(L_1\)</span> is the input layer, and layer <span class="math inline">\(L_{n_l}=L_3\)</span> the output layer.</p></li>
<li><p>Our neural network has parameters: <span class="math inline">\(\Theta=(\Theta^{(1)},\Theta^{(2)})\)</span>, where we will write <span class="math inline">\(\theta^{(l)}_{ij}\)</span> to denote the parameter (or weight) associated with the connection between unit <span class="math inline">\(j\)</span> in layer <span class="math inline">\(l\)</span>, and unit <span class="math inline">\(i\)</span> in layer <span class="math inline">\(l+1\)</span>.</p></li>
<li><p>Thus, in our example, we have:</p>
<ul>
<li><span class="math inline">\(\Theta^{(1)}\in\mathbb{R}^{3\times 4}\)</span>, and</li>
<li><span class="math inline">\(\Theta^{(2)}\in\mathbb{R}^{1\times 4}\)</span>.</li>
</ul></li>
</ul>
<p>Note that bias units don’t have inputs or connections going into them, since they always output the value +1.</p>
<p>We also let <span class="math inline">\(s_l\)</span> denote the number of nodes in layer <span class="math inline">\(l\)</span> (not counting the bias unit).</p>
<p>Now, write <span class="math inline">\(a^{(l)}_i\)</span> to denote the activation (meaning output value) of unit <span class="math inline">\(i\)</span> in layer <span class="math inline">\(l\)</span>.</p>
<p>For <span class="math inline">\(l=1\)</span>, we also use <span class="math inline">\(a^{(1)}_i=x_i\)</span> to denote the <span class="math inline">\(i\)</span>-th input.</p>
<p>Given a fixed setting of the parameters <span class="math inline">\(\Theta\)</span>, our neural network defines a model <span class="math inline">\(h_{\Theta}(x)\)</span> that outputs a real number.</p>
<p>We can now see <em>how these weights are used to produce the output</em>, which is given by:</p>
<p><span class="math display">\[\begin{eqnarray}
a_1^{(2)}&=&f(\theta_{10}^{(1)}+\theta_{11}^{(1)}x_1+\theta_{12}^{(1)}x_2+\theta_{13}^{(1)}x_3)\\
a_2^{(2)}&=&f(\theta_{20}^{(1)}+\theta_{21}^{(1)}x_1+\theta_{22}^{(1)}x_2+\theta_{23}^{(1)}x_3)\\
a_3^{(2)}&=&f(\theta_{30}^{(1)}+\theta_{31}^{(1)}x_1+\theta_{32}^{(1)}x_2+\theta_{33}^{(1)}x_3)\\
h_{\Theta}(x)&=&a_1^{(3)}=f(\theta_{10}^{(2)}+\theta_{11}^{(2)}a_1^{(2)}+\theta_{12}^{(2)}a_2^{(2)}+\theta_{13}^{(2)}a_3^{(2)})
\end{eqnarray}\]</span></p>
<p>Now, letting <span class="math inline">\(z_i^{(l)}\)</span> denote the total weighted sum of inputs to unit <span class="math inline">\(i\)</span> in layer <span class="math inline">\(l\)</span>, including the bias term <span class="math display">\[
z_i^{(2)}=\theta_{i0}^{(1)}+\theta_{i1}^{(1)}x_1+\theta_{i2}^{(1)}x_2+\theta_{i3}^{(1)}x_3,
\]</span> the output becomes: <span class="math inline">\(a_i^{(l)}=f(z_i^{(l)})\)</span>.</p>
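<p>A minimal R sketch of these four equations for the 3&ndash;3&ndash;1 example network is shown below (illustrative only; the entries of <code>Theta1</code> and <code>Theta2</code> are arbitrary made-up weights):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Illustrative forward pass for the 3-3-1 example network.
# Theta1 (3 x 4) and Theta2 (1 x 4) hold made-up weights; column 1 is the bias.
f &lt;- function(z) 1 / (1 + exp(-z))           # sigmoid activation

set.seed(1)
Theta1 &lt;- matrix(runif(12, -1, 1), nrow = 3) # theta^(1)_ij, i = 1..3, j = 0..3
Theta2 &lt;- matrix(runif(4,  -1, 1), nrow = 1) # theta^(2)_1j, j = 0..3
x &lt;- c(0.2, -0.5, 1.0)                       # one input vector (x1, x2, x3)

a2 &lt;- f(Theta1 %*% c(1, x))                  # a^(2): the three hidden activations
h  &lt;- f(Theta2 %*% c(1, a2))                 # h_Theta(x) = a^(3), the output
h</code></pre></div>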
</section>
<section id="compacting-notation" class="level2">
<h2 class="anchored" data-anchor-id="compacting-notation">Compacting notation</h2>
<ul>
<li><p>Note that this easily lends itself to a more compact notation.</p></li>
<li><p>Extending the activation function <span class="math inline">\(f(\cdot)\)</span> to apply to vectors in an element-wise fashion: <span class="math display">\[
f([z_1,z_2,z_3]) = [f(z_1), f(z_2),f(z_3)],
\]</span></p></li>
</ul>
<p>then we can write the previous equations more compactly as:</p>
<span class="math display">\[\begin{eqnarray}
z^{(2)}&=&\Theta^{(1)}x\nonumber\\
a^{(2)}&=&f(z^{(2)})\nonumber\\
z^{(3)}&=&\Theta^{(2)}a^{(2)}\nonumber\\
h_{\Theta}(x)&=&a^{(3)}=f(z^{(3)})\nonumber
\end{eqnarray}\]</span>
<ul>
<li><p>More generally, recalling that we also use <span class="math inline">\(a^{(1)}=x\)</span> to denote the values from the input layer,</p></li>
<li><p>then given layer <span class="math inline">\(l\)</span>’s activations <span class="math inline">\(a^{(l)}\)</span>, we can compute layer <span class="math inline">\(l+1\)</span>’s activations <span class="math inline">\(a^{(l+1)}\)</span> as:</p></li>
</ul>
<span class="math display">\[\begin{eqnarray}
z^{(l+1)}&=&\Theta^{(l)}a^{(l)}\\
a^{(l+1)}&=&f(z^{(l+1)})
\end{eqnarray}\]</span>
<p>This can be used to provide a matrix representation for the weighted sum of inputs of all neurons:</p>
<p><span class="math display">\[
z^{(l+1)}=
\begin{bmatrix}
z_1^{(l+1)}\\
z_2^{(l+1)}\\
\vdots\\
z_{s_{l+1}}^{(l+1)}
\end{bmatrix}=
\begin{bmatrix}
\theta_{10}^{(l)} &amp; \theta_{11}^{(l)} &amp; \theta_{12}^{(l)} &amp; \cdots &amp; \theta_{1s_{l}}^{(l)}\\
\theta_{20}^{(l)} &amp; \theta_{21}^{(l)} &amp; \theta_{22}^{(l)} &amp; \cdots &amp; \theta_{2s_{l}}^{(l)}\\
\vdots &amp; \vdots &amp; \vdots &amp; \ddots &amp; \vdots\\
\theta_{s_{l+1}0}^{(l)} &amp; \theta_{s_{l+1}1}^{(l)} &amp; \theta_{s_{l+1}2}^{(l)} &amp; \cdots &amp; \theta_{s_{l+1}s_{l}}^{(l)}\\
\end{bmatrix}
\cdot\begin{bmatrix}
1\\
a_1^{(l)}\\
a_2^{(l)}\\
\vdots\\
a_{s_l}^{(l)}
\end{bmatrix}
\]</span></p>
<p>The activation is then:</p>
<p><span class="math display">\[
a^{(l+1)}=
\begin{bmatrix}
a_1^{(l+1)}\\
a_2^{(l+1)}\\
\vdots\\
a_{s_{l+1}}^{(l+1)}
\end{bmatrix}=f(z^{(l+1)})=\begin{bmatrix}
f(z_1^{(l+1)})\\
f(z_2^{(l+1)})\\
\vdots\\
f(z_{s_{l+1}}^{(l+1)})
\end{bmatrix}
\]</span></p>
<section id="forward-propagation" class="level3">
<h3 class="anchored" data-anchor-id="forward-propagation">Forward propagation</h3>
<ul>
<li><p>By organizing our parameters in matrices and using matrix-vector operations, we can take advantage of fast linear algebra routines to quickly perform calculations in our network.</p></li>
<li><p>This process is called <em>forward propagation</em>.</p></li>
</ul>
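<p>As a minimal sketch of this idea (assuming sigmoid activations; the weight matrices below are filled with arbitrary values), forward propagation over any number of densely connected layers reduces to a short loop over the matrices <code>Thetas[[l]]</code> using the layer-wise recursion above:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Forward-propagation sketch: each Thetas[[l]] maps layer l to layer l+1,
# and its first column multiplies the bias unit (+1). Weights are made-up.
f &lt;- function(z) 1 / (1 + exp(-z))

forward &lt;- function(Thetas, x) {
  a &lt;- x
  for (Theta in Thetas) {
    z &lt;- Theta %*% c(1, a)      # z^(l+1) = Theta^(l) [1; a^(l)]
    a &lt;- f(z)                   # a^(l+1) = f(z^(l+1))
  }
  a                             # h_Theta(x) = a^(n_l)
}

set.seed(2)
Thetas &lt;- list(matrix(runif(12, -1, 1), 3, 4),  # 3 inputs -&gt; 3 hidden units
               matrix(runif(4,  -1, 1), 1, 4))  # 3 hidden units -&gt; 1 output
forward(Thetas, c(0.2, -0.5, 1.0))</code></pre></div>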
</section>
</section>
<section id="multiple-architectures-for-ann" class="level2">
<h2 class="anchored" data-anchor-id="multiple-architectures-for-ann">Multiple architectures for ANN</h2>
<ul>
<li><p>We have so far focused on the single-hidden-layer neural network of the example. One can build neural networks with many distinct architectures (meaning patterns of connectivity between neurons), including ones with multiple hidden layers.</p></li>
<li><p>See <a href="https://www.asimovinstitute.org/neural-network-zoo/">here the Neural Network Zoo</a>.</p></li>
</ul>
<section id="feed-forward-neural-networks" class="level3">
<h3 class="anchored" data-anchor-id="feed-forward-neural-networks">Feed Forward Neural networks</h3>
<p>The most common choice is a <span class="math inline">\(n_l\)</span>-layered network where layer 1 is the input layer, layer <span class="math inline">\(n_l\)</span> is the output layer, and each layer <span class="math inline">\(l\)</span> is densely connected to layer <span class="math inline">\(l+1\)</span>.</p>
<p>In this setting, to compute the output of the network, we can successively compute all the activations in layer <span class="math inline">\(L_2\)</span>, then layer <span class="math inline">\(L_3\)</span>, and so on, up to layer <span class="math inline">\(L_{n_l}\)</span>, using the layer-wise equations above. This is one example of a <em>feed-forward neural network (FFNN)</em>, since the connectivity graph does not have any directed loops or cycles.</p>
</section>
<section id="the-number-of-output-units" class="level3">
<h3 class="anchored" data-anchor-id="the-number-of-output-units">The number of output units</h3>
<p>Neural networks can also have multiple output units.</p>
<p>For example, in the figure below we can see a network with two hidden layers, <span class="math inline">\(L_2\)</span> and <span class="math inline">\(L_3\)</span>, and four output units in layer <span class="math inline">\(L_4\)</span>; the bias units of each layer have been omitted.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/nn2.jpg" class="img-fluid figure-img" style="width:60.0%"></p>
<figcaption class="figure-caption">Neural network</figcaption>
</figure>
</div>
<p>To train this network, we would need training examples <span class="math inline">\((x^{(i)},y^{(i)})\)</span> where <span class="math inline">\(y^{(i)}\in\mathbb{R}^4\)</span>. This sort of network is useful if there are multiple outputs that you’re interested in predicting.</p>
<p>For example, in a medical diagnosis application, the vector <span class="math inline">\(x\)</span> might give the input features of a patient, and the different outputs <span class="math inline">\(y_i\)</span>’s might indicate presence or absence of different diseases.</p>
</section>
</section>
<section id="a-loss-function-for-optimization" class="level2">
<h2 class="anchored" data-anchor-id="a-loss-function-for-optimization">A loss function for optimization</h2>
<ul>
<li><p>In order to estimate the weights we will aim at minimizing an appropriate loss function.</p></li>
<li><p>A first idea may be to use <em>squared error loss</em> <span class="math display">\[
l(h_\theta(x),y)=(y-\frac{1}{1+e^{-\theta^\intercal x}})^2
\]</span></p></li>
<li><p>However, it turns out that <a href="https://towardsdatascience.com/why-not-mse-as-a-loss-function-for-logistic-regression-589816b5e03c"><em>this is not a convex problem</em></a>, which means that the squared error (MSE) loss is not appropriate here.</p></li>
<li><p>Alternatively, we use the <em>binary cross-entropy loss function</em>: <span class="math display">\[
l(h_\theta(x),y)=\left\{\begin{array}{ll}
-\log h_\theta(x) & \textrm{if }y=1\\
-\log(1-h_\theta(x))& \textrm{if }y=0
\end{array}\right.
\]</span></p></li>
<li><p>It can be written compactly as:</p></li>
</ul>
<p><span class="math display">\[
l(h_\theta(x),y)=-y\log h_\theta(x) - (1-y)\log(1-h_\theta(x))
\]</span></p>
<ul>
<li><p>Using cross-entropy loss, the cost function is of the form: <span class="math display">\[\begin{eqnarray*}
J(\theta)=-\frac{1}{n}\big[\sum_{i=1}^n&&(y^{(i)}\log h_\theta(x^{(i)})+\\ &&(1-y^{(i)})\log(1-h_\theta(x^{(i)}))\big]
\end{eqnarray*}\]</span></p></li>
<li><p>This is a convex optimization problem.</p></li>
<li><p>It is better to work with a <em>regularized version</em> of the cost function (we don’t regularize the bias units):</p></li>
</ul>
<span class="math display">\[\begin{eqnarray*}
J(\Theta)&=&-\frac{1}{n}\big[\sum_{i=1}^n \sum_{k=1}^K y_k^{(i)}\log( h_\theta(x^{(i)}))_k\\
&+&(1-y_k^{(i)})\log(1-(h_\theta(x^{(i)}))_k)\big]\\
&+&\lambda\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}
(\theta_{ji}^{(l)})^2
\end{eqnarray*}\]</span>
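<p>As an illustrative sketch (the labels and predicted probabilities below are made-up numbers), the unregularized cross-entropy cost can be computed in R as follows; the ridge-type penalty above would simply add <code>lambda * sum(theta^2)</code> over the non-bias weights:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Binary cross-entropy cost for made-up labels and predictions.
cross_entropy_cost &lt;- function(h, y) {
  -mean(y * log(h) + (1 - y) * log(1 - h))
}

y &lt;- c(1, 0, 1, 1, 0)             # observed labels y^(i)
h &lt;- c(0.9, 0.2, 0.7, 0.6, 0.1)   # network outputs h_theta(x^(i))
cross_entropy_cost(h, y)          # average loss over the n = 5 examples</code></pre></div>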
</section>
<section id="gradient-descent" class="level2">
<h2 class="anchored" data-anchor-id="gradient-descent">Gradient descent</h2>
<p>We saw in the previous section that training a network corresponds to choosing the parameters, that is, the weights and biases, that minimize the cost function.</p>
<p>The weights and biases take the form of matrices and vectors, but at this stage it is convenient to imagine them stored as a single vector that we call <span class="math inline">\(\theta\)</span>. Generally, we will suppose <span class="math inline">\(\theta\in\mathbb{R}^p\)</span>, and write the cost function as <span class="math inline">\(J(\theta)\)</span> to emphasize its dependence on the parameters. So Cost <span class="math inline">\(J: \mathbb{R}^p\rightarrow \mathbb{R}\)</span>.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/errorsurface.jpg" class="img-fluid figure-img" style="width:60.0%"></p>
<figcaption class="figure-caption">Error hyper-surface</figcaption>
</figure>
</div>
<p>We now introduce a classical method in optimization that is often referred to as steepest descent or gradient descent. The method proceeds iteratively, computing a sequence of vectors in <span class="math inline">\(\mathbb{R}^p\)</span> with the aim of converging to a vector that minimizes the cost function. Suppose that our current vector is <span class="math inline">\(\theta\)</span>. How should we choose a perturbation, <span class="math inline">\(\Delta\theta\)</span>, so that the next vector, <span class="math inline">\(\theta+\Delta\theta\)</span>, represents an improvement? If <span class="math inline">\(\Delta\theta\)</span> is small, then ignoring terms of order <span class="math inline">\(||\Delta\theta||^2\)</span>, a Taylor series expansion gives</p>
<p><span class="math display">\[
J(\theta+\Delta\theta)\approx J(\theta)+\sum_{i=1}^p\frac{\partial J(\theta)}{\partial\theta_i}\Delta\theta_i
\]</span> Here <span class="math inline">\(\frac{\partial J(\theta)}{\partial\theta_i}\)</span> denotes the partial derivative of the cost function with respect to the <span class="math inline">\(i\)</span>-th weight. For convenience, we will let <span class="math inline">\(\nabla J(\theta)\in\mathbb{R}^p\)</span> denote the vector of partial derivatives, known as the gradient, so that <span class="math display">\[\begin{equation}\label{g1}
\nabla J(\theta)=\big(\frac{\partial J(\theta)}{\partial\theta_1},...,\frac{\partial J(\theta)}{\partial\theta_p}\big)^\intercal
\end{equation}\]</span> Then, <span class="math display">\[\begin{equation}\label{g2}
J(\theta+\Delta\theta)\approx J(\theta)+\nabla J(\theta)^\intercal\Delta\theta
\end{equation}\]</span></p>
<p>Our aim is to reduce the value of the cost function. The relation (<span class="math inline">\(\ref{g2}\)</span>) motivates the idea of choosing <span class="math inline">\(\Delta\theta\)</span> to make <span class="math inline">\(\nabla J(\theta)^\intercal\Delta\theta\)</span> as negative as possible. We can address this problem via the Cauchy-Schwarz inequality, which states that for any <span class="math inline">\(f,g\in\mathbb{R}^p\)</span>, we have <span class="math inline">\(|f^\intercal g|\leq ||f||\cdot ||g||\)</span>. Moreover, the two sides are equal if and only if <span class="math inline">\(f\)</span> and <span class="math inline">\(g\)</span> are linearly dependent (meaning they are parallel).</p>
<p>So the most negative that <span class="math inline">\(f^\intercal g\)</span> can be is <span class="math inline">\(-||f||\cdot||g||\)</span>, which happens when <span class="math inline">\(f=-g\)</span>. Hence we should choose <span class="math inline">\(\Delta\theta\)</span> to lie in the direction of <span class="math inline">\(-\nabla J(\theta)\)</span>. Keeping in mind that (<span class="math inline">\(\ref{g2}\)</span>) is an approximation that is relevant only for small <span class="math inline">\(\Delta\theta\)</span>, we will limit ourselves to a small step in that direction. This leads to the update <span class="math display">\[\begin{equation}\label{g3}
\theta \rightarrow \theta-\eta\nabla J(\theta)
\end{equation}\]</span></p>
<p>Here <span class="math inline">\(\eta\)</span> is a small step size that, in this context, is known as the learning rate. This equation defines the steepest descent method. We choose an initial vector and iterate (<span class="math inline">\(\ref{g3}\)</span>) until some stopping criterion has been met, or until the number of iterations has exceeded the computational budget.</p>
<p>Repeat:</p>
<p><span class="math display">\[
\theta_j:=\theta_j-\eta\frac{\partial}{\partial\theta_j}J(\theta)
\qquad \textrm{(simultaneously updating all } \theta_j\textrm{),}
\]</span></p>
<p>where <span class="math inline">\(\eta\in (0,1]\)</span> denotes the learning rate.</p>
<p>We aim to minimize the cost function <span class="math display">\[
\underset{\theta}{\textrm{min }}J(\theta)
\]</span></p>
<p>In order to use gradient descent, we need to compute <span class="math inline">\(J(\theta)\)</span> and the partial derivative terms <span class="math display">\[
\frac{\partial}{\partial\theta_j}J(\theta)
\]</span></p>
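<p>A minimal sketch of this iteration (assuming the logistic regression model above, for which the gradient of the cross-entropy cost has the closed form <span class="math inline">\(\frac{1}{n}X^\intercal(h_\theta(X)-y)\)</span>; the simulated data, learning rate and number of iterations are arbitrary choices):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Gradient-descent sketch for logistic regression with cross-entropy cost.
# X and y are simulated; eta and the iteration count are arbitrary choices.
sigmoid &lt;- function(z) 1 / (1 + exp(-z))

set.seed(3)
n &lt;- 200
X &lt;- cbind(1, matrix(rnorm(n * 2), n, 2))          # bias column + 2 features
y &lt;- rbinom(n, 1, sigmoid(X %*% c(-0.5, 1, -2)))   # simulated labels

theta &lt;- rep(0, 3)                                 # initial parameters
eta   &lt;- 0.1                                       # learning rate
for (iter in 1:500) {
  h     &lt;- sigmoid(X %*% theta)                    # current predictions
  grad  &lt;- t(X) %*% (h - y) / n                    # gradient of J(theta)
  theta &lt;- theta - eta * grad                      # simultaneous update of all theta_j
}
theta</code></pre></div>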
<section id="initialization" class="level3">
<h3 class="anchored" data-anchor-id="initialization">Initialization</h3>
<p>The input data have to be normalized so that all features have approximately the same range. The biases can be initialized to 0. The weights, however, cannot all be initialized with the same value; otherwise, all the neurons of a hidden layer would behave identically. Perhaps the only property known with complete certainty is that the initial parameters need to break the symmetry between different units. We generally initialize the weights at random: the values <span class="math inline">\(\theta_{ij}^{(l)}\)</span> are i.i.d. Uniform on <span class="math inline">\([-c,c]\)</span>, for example with <span class="math inline">\(c= 1/\sqrt{N_l}\)</span> where <span class="math inline">\(N_l\)</span> is the size of the hidden layer <span class="math inline">\(l\)</span>. The weights are also sometimes initialized from a normal distribution <span class="math inline">\(N(0,0.01)\)</span>.</p>
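<p>A minimal sketch of such an initialization in R (the layer sizes are arbitrary, and the choice <span class="math inline">\(c=1/\sqrt{N_l}\)</span> follows the heuristic mentioned above):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># Illustrative initialization: biases set to 0, weights i.i.d. Uniform on [-c, c].
init_layer &lt;- function(n_in, n_out) {
  c_l &lt;- 1 / sqrt(n_in)                           # heuristic scale for this layer
  W   &lt;- matrix(runif(n_out * n_in, -c_l, c_l), n_out, n_in)
  b   &lt;- rep(0, n_out)                            # biases initialized to 0
  cbind(b, W)                                     # Theta^(l) = [bias | weights]
}

set.seed(4)
Theta1 &lt;- init_layer(3, 3)   # input layer (3 units) -&gt; hidden layer (3 units)
Theta2 &lt;- init_layer(3, 1)   # hidden layer -&gt; single output unit</code></pre></div>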
</section>
</section>
<section id="stochastic-gradient" class="level2">
<h2 class="anchored" data-anchor-id="stochastic-gradient">Stochastic Gradient</h2>
<p>Stochastic gradient is an algorithm for optimizing the cost function. When we have a large number of parameters and a large number of training points, computing the gradient vector (<span class="math inline">\(\ref{g1}\)</span>) at every iteration of the steepest descent method (<span class="math inline">\(\ref{g3}\)</span>) can be prohibitively expensive, because we have to sum across all training points (for instance in Big Data settings). A much cheaper alternative is to replace the mean of the individual gradients over all training points by the gradient at a single, randomly chosen, training point. This leads to the simplest form of what is called the stochastic gradient method. A single step may be summarized as</p>
<p><span class="math display">\[\begin{equation}\label{g4}
\theta \rightarrow \theta-\eta\nabla J(\theta;x^{(i)}),
\end{equation}\]</span> where the index <span class="math inline">\(i\)</span> is chosen uniformly at random from <span class="math inline">\(\{1,\dots,n\}\)</span>.</p>
<p>Notice that we have included <span class="math inline">\(x^{(i)}\)</span> in the notation <span class="math inline">\(J(\theta;x^{(i)})\)</span> to emphasize the dependence. In words, at each step, the stochastic gradient method uses one randomly chosen training point to represent the full training set. As the iteration proceeds, the method sees more training points. So there is some hope that this dramatic reduction in cost-per-iteration will be worthwhile overall. We note that, even for very small <span class="math inline">\(\eta\)</span>, the update (<span class="math inline">\(\ref{g4}\)</span>) is not guaranteed to reduce the overall cost function: we have traded the mean for a single sample. Hence, although the phrase stochastic gradient descent is widely used, we prefer the term <strong>stochastic gradient</strong>.</p>
<p>The version of the stochastic gradient method that we introduced in (<span class="math inline">\(\ref{g4}\)</span>) is the simplest of a large range of possibilities. In particular, the index <span class="math inline">\(i\)</span> in (<span class="math inline">\(\ref{g4}\)</span>) was chosen by sampling with replacement: after using a training point, it is returned to the training set and is just as likely as any other point to be chosen at the next step. An alternative is to sample without replacement; that is, to cycle through each of the <span class="math inline">\(n\)</span> training points in a random order. Performing <span class="math inline">\(n\)</span> steps in this manner, referred to as completing an <em>epoch</em>, may be summarized as follows: shuffle the integers <span class="math inline">\(\{1,\dots,n\}\)</span> into a random order <span class="math inline">\(\{k_1,\dots,k_n\}\)</span> and, for <span class="math inline">\(i\)</span> from 1 to <span class="math inline">\(n\)</span>, update <span class="math inline">\(\theta \rightarrow \theta-\eta\nabla J(\theta;x^{(k_i)})\)</span>.</p>
<p>If we regard the stochastic gradient method as approximating the mean over all training points by a single sample, then it is natural to consider a compromise where we use a small sample average. For some <span class="math inline">\(m \ll n\)</span> we could take steps of the following form: choose <span class="math inline">\(m\)</span> indices <span class="math inline">\(k_1,\dots,k_m\)</span> uniformly at random from <span class="math inline">\(\{1,\dots,n\}\)</span> and update <span class="math display">\[
\theta \rightarrow \theta-\frac{\eta}{m}\sum_{i=1}^m\nabla J(\theta;x^{(k_i)}).
\]</span></p>
<p>In this iteration, the set <span class="math inline">\(\{x^{(k_i)}\}_{i=1}^m\)</span> is known as a mini-batch. Because the stochastic gradient method is usually implemented within the context of a very large scale computation, algorithmic choices such as mini-batch size and the form of randomization are often driven by the requirements of high performance computing architectures. Also, it is, of course, possible to vary these choices, along with others, such as the learning rate, dynamically as the training progresses in an attempt to accelerate convergence.</p>
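<p>As a sketch of one epoch of mini-batch stochastic gradient (reusing the simulated <code>X</code> and <code>y</code> from the gradient-descent sketch above; the mini-batch size and learning rate are arbitrary choices):</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"># One epoch of mini-batch stochastic gradient for the logistic regression
# sketch above. X and y come from that sketch; m and eta are arbitrary.
sigmoid &lt;- function(z) 1 / (1 + exp(-z))
theta &lt;- rep(0, ncol(X))                    # start again from zero weights
m     &lt;- 10                                 # mini-batch size
eta   &lt;- 0.1                                # learning rate
idx   &lt;- sample(nrow(X))                    # shuffle: sampling without replacement
for (batch in split(idx, ceiling(seq_along(idx) / m))) {
  h     &lt;- sigmoid(X[batch, , drop = FALSE] %*% theta)   # predictions on the batch
  grad  &lt;- t(X[batch, , drop = FALSE]) %*% (h - y[batch]) / length(batch)
  theta &lt;- theta - eta * grad                            # update from this mini-batch
}
theta</code></pre></div>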
</section>
</section>
<section id="references-and-resources" class="level1">
<h1>References and resources</h1>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const toggleBodyColorMode = (bsSheetEl) => {
const mode = bsSheetEl.getAttribute("data-mode");
const bodyEl = window.document.querySelector("body");
if (mode === "dark") {
bodyEl.classList.add("quarto-dark");
bodyEl.classList.remove("quarto-light");
} else {
bodyEl.classList.add("quarto-light");
bodyEl.classList.remove("quarto-dark");
}
}
const toggleBodyColorPrimary = () => {
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
if (bsSheetEl) {
toggleBodyColorMode(bsSheetEl);
}
}
toggleBodyColorPrimary();
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
placement: 'right',
icon: icon
};
anchorJS.add('.anchored');
const isCodeAnnotation = (el) => {
for (const clz of el.classList) {
if (clz.startsWith('code-annotation-')) {
return true;
}
}
return false;
}
const clipboard = new window.ClipboardJS('.code-copy-button', {
text: function(trigger) {
const codeEl = trigger.previousElementSibling.cloneNode(true);
for (const childEl of codeEl.children) {
if (isCodeAnnotation(childEl)) {
childEl.remove();
}
}
return codeEl.innerText;
}
});
clipboard.on('success', function(e) {
// button target
const button = e.trigger;
// don't keep focus
button.blur();
// flash "checked"
button.classList.add('code-copy-button-checked');
var currentTitle = button.getAttribute("title");
button.setAttribute("title", "Copied!");
let tooltip;
if (window.bootstrap) {
button.setAttribute("data-bs-toggle", "tooltip");
button.setAttribute("data-bs-placement", "left");
button.setAttribute("data-bs-title", "Copied!");
tooltip = new bootstrap.Tooltip(button,
{ trigger: "manual",
customClass: "code-copy-button-tooltip",
offset: [0, -8]});
tooltip.show();
}
setTimeout(function() {
if (tooltip) {
tooltip.hide();
button.removeAttribute("data-bs-title");
button.removeAttribute("data-bs-toggle");
button.removeAttribute("data-bs-placement");
}
button.setAttribute("title", currentTitle);
button.classList.remove('code-copy-button-checked');
}, 1000);
// clear code selection
e.clearSelection();
});
function tippyHover(el, contentFn) {
const config = {
allowHTML: true,
content: contentFn,
maxWidth: 500,
delay: 100,
arrow: false,
appendTo: function(el) {
return el.parentElement;
},
interactive: true,
interactiveBorder: 10,
theme: 'quarto',
placement: 'bottom-start'
};
window.tippy(el, config);
}
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {