diff --git a/RohitsNotes/noteBits/noteBits.org b/RohitsNotes/noteBits/noteBits.org new file mode 100644 index 0000000..63da405 --- /dev/null +++ b/RohitsNotes/noteBits/noteBits.org @@ -0,0 +1,562 @@ +#+TITLE: Notes and Bits +#+SUBTITLE: A univ.ai compendium +#+AUTHOR: Rohit Goswami,\textsc{\scriptsize\ AMIChemE} +# This should not be altered +#+OPTIONS: toc:nil title:nil +# I need the footnotes to be inlined +#+STARTUP: fninline + +# +# LaTeX Stuff (from eisvogel https://raw.githubusercontent.com/Wandmalfarbe/pandoc-latex-template/master/eisvogel.tex) +# + +#+LATEX_COMPILER: xelatex +#+LATEX_CLASS: koma-article +#+LATEX_CLASS_OPTIONS: [12pt,a4paper,oneside,headinclude] + +#+LATEX_HEADER: \PassOptionsToPackage{unicode=true}{hyperref} +#+LATEX_HEADER: \PassOptionsToPackage{hyphens}{url} +#+LATEX_HEADER: \PassOptionsToPackage{dvipsnames,svgnames*,x11names*,table}{xcolor} + + +#+LATEX_HEADER: \usepackage{lmodern} +#+LATEX_HEADER: \usepackage{amssymb,amsmath} +#+LATEX_HEADER: \usepackage{physics} +#+LATEX_HEADER: \usepackage{ifxetex,ifluatex} +#+LATEX_HEADER: \usepackage{fixltx2e} % provides \textsubscript +#+LATEX_HEADER: \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex +#+LATEX_HEADER: \usepackage[T1]{fontenc} +#+LATEX_HEADER: \usepackage[utf8]{inputenc} +#+LATEX_HEADER: \usepackage{textcomp} % provides euro and other symbols +#+LATEX_HEADER: \else % if luatex or xelatex +#+LATEX_HEADER: \usepackage{unicode-math} +#+LATEX_HEADER: \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase} +#+LATEX_HEADER: \fi +#+LATEX_HEADER: % use upquote if available, for straight quotes in verbatim environments +#+LATEX_HEADER: \IfFileExists{upquote.sty}{\usepackage{upquote}}{} +#+LATEX_HEADER: % use microtype if available +#+LATEX_HEADER: \IfFileExists{microtype.sty}{% +#+LATEX_HEADER: \usepackage[]{microtype} +#+LATEX_HEADER: \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts +#+LATEX_HEADER: }{} +#+LATEX_HEADER: \IfFileExists{parskip.sty}{% +#+LATEX_HEADER: \usepackage{parskip} +#+LATEX_HEADER: }{% else +#+LATEX_HEADER: \setlength{\parindent}{0pt} +#+LATEX_HEADER: \setlength{\parskip}{6pt plus 2pt minus 1pt} +#+LATEX_HEADER: } +#+LATEX_HEADER: \usepackage{hyperref} +#+LATEX_HEADER: \hypersetup{ +#+LATEX_HEADER: pdftitle={ }, +#+LATEX_HEADER: pdfauthor={Author}, +#+LATEX_HEADER: pdfborder={0 0 0}, +#+LATEX_HEADER: breaklinks=true} +#+LATEX_HEADER: \urlstyle{same} % don't use monospace font for urls + +#+LATEX_HEADER: \usepackage{longtable,booktabs} +#+LATEX_HEADER: % Fix footnotes in tables (requires footnote package) +#+LATEX_HEADER: \IfFileExists{footnote.sty}{\usepackage{footnote}\makesavenoteenv{longtable}}{} +#+LATEX_HEADER: \usepackage{graphicx,grffile} +#+LATEX_HEADER: \makeatletter +#+LATEX_HEADER: \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} +#+LATEX_HEADER: \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} +#+LATEX_HEADER: \makeatother +#+LATEX_HEADER: % Scale images if necessary, so that they will not overflow the page +#+LATEX_HEADER: % margins by default, and it is still possible to overwrite the defaults +#+LATEX_HEADER: % using explicit options in \includegraphics[width, height, ...]{} +#+LATEX_HEADER: \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} +#+LATEX_HEADER: \setlength{\emergencystretch}{3em} % prevent overfull lines +#+LATEX_HEADER: \providecommand{\tightlist}{% +#+LATEX_HEADER: \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} +#+LATEX_HEADER: \setcounter{secnumdepth}{0} 
+#+LATEX_HEADER: % Redefines (sub)paragraphs to behave more like sections +#+LATEX_HEADER: \ifx\paragraph\undefined\else +#+LATEX_HEADER: \let\oldparagraph\paragraph +#+LATEX_HEADER: \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} +#+LATEX_HEADER: \fi +#+LATEX_HEADER: \ifx\subparagraph\undefined\else +#+LATEX_HEADER: \let\oldsubparagraph\subparagraph +#+LATEX_HEADER: \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} +#+LATEX_HEADER: \fi + +#+LATEX_HEADER: % Make use of float-package and set default placement for figures to H +#+LATEX_HEADER: \usepackage{float} +#+LATEX_HEADER: \floatplacement{figure}{H} + +#+LATEX_HEADER: \numberwithin{figure}{section} +#+LATEX_HEADER: \numberwithin{equation}{section} +#+LATEX_HEADER: \numberwithin{table}{section} +#+LATEX_HEADER: \makeatletter +#+LATEX_HEADER: \@ifpackageloaded{subfig}{}{\usepackage{subfig}} +#+LATEX_HEADER: \@ifpackageloaded{caption}{}{\usepackage{caption}} +#+LATEX_HEADER: \captionsetup[subfloat]{margin=0.5em} +#+LATEX_HEADER: \AtBeginDocument{% +#+LATEX_HEADER: \renewcommand*\figurename{Figure} +#+LATEX_HEADER: \renewcommand*\tablename{Table} +#+LATEX_HEADER: } +#+LATEX_HEADER: \AtBeginDocument{% +#+LATEX_HEADER: \renewcommand*\listfigurename{List of Figures} +#+LATEX_HEADER: \renewcommand*\listtablename{List of Tables} +#+LATEX_HEADER: } +#+LATEX_HEADER: \@ifpackageloaded{float}{}{\usepackage{float}} +#+LATEX_HEADER: \floatstyle{ruled} +#+LATEX_HEADER: \@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]} +#+LATEX_HEADER: \floatname{codelisting}{Listing} +#+LATEX_HEADER: \makeatother + +#+LATEX_HEADER: \usepackage[dvipsnames,svgnames*,x11names*,table]{xcolor} +#+LATEX_HEADER: \definecolor{listing-background}{HTML}{F7F7F7} +#+LATEX_HEADER: \definecolor{listing-rule}{HTML}{B3B2B3} +#+LATEX_HEADER: \definecolor{listing-numbers}{HTML}{B3B2B3} +#+LATEX_HEADER: \definecolor{listing-text-color}{HTML}{000000} +#+LATEX_HEADER: \definecolor{listing-keyword}{HTML}{435489} +#+LATEX_HEADER: \definecolor{listing-identifier}{HTML}{435489} +#+LATEX_HEADER: \definecolor{listing-string}{HTML}{00999A} +#+LATEX_HEADER: \definecolor{listing-comment}{HTML}{8E8E8E} +#+LATEX_HEADER: \definecolor{listing-javadoc-comment}{HTML}{006CA9} + +#+LATEX_HEADER: \usepackage{pagecolor} +#+LATEX_HEADER: \usepackage{afterpage} +#+LATEX_HEADER: \setcounter{tocdepth}{3} +#+LATEX_HEADER: \usepackage{setspace} +#+LATEX_HEADER: \setstretch{1.2} +#+LATEX_HEADER: \usepackage{csquotes} +#+LATEX_HEADER: \usepackage[font={small,it}]{caption} +#+LATEX_HEADER: \newcommand{\imglabel}[1]{\textbf{\textit{(#1)}}} +#+LATEX_HEADER: \definecolor{blockquote-border}{RGB}{221,221,221} +#+LATEX_HEADER: \definecolor{blockquote-text}{RGB}{119,119,119} +#+LATEX_HEADER: \usepackage{mdframed} +#+LATEX_HEADER: \newmdenv[rightline=false,bottomline=false,topline=false,linewidth=3pt,linecolor=blockquote-border,skipabove=\parskip]{customblockquote} +#+LATEX_HEADER: \renewenvironment{quote}{\begin{customblockquote}\list{}{\rightmargin=0em\leftmargin=0em}% +#+LATEX_HEADER: \item\relax\color{blockquote-text}\ignorespaces}{\unskip\unskip\endlist\end{customblockquote}} +#+LATEX_HEADER: \definecolor{heading-color}{RGB}{40,40,40} +#+LATEX_HEADER: \addtokomafont{section}{\color{heading-color}} +#+LATEX_HEADER: \usepackage{titling} +#+LATEX_HEADER: \renewcommand{\arraystretch}{1.3} % table spacing +#+LATEX_HEADER: \definecolor{table-row-color}{HTML}{F5F5F5} +#+LATEX_HEADER: \rowcolors{3}{}{table-row-color!100} + +#+LATEX_HEADER: % Reset rownum counter 
so that each table starts with the same row color
+#+LATEX_HEADER: \let\oldlongtable\longtable
+#+LATEX_HEADER: \let\endoldlongtable\endlongtable
+#+LATEX_HEADER: \renewenvironment{longtable}{\oldlongtable} {
+#+LATEX_HEADER: \endoldlongtable
+#+LATEX_HEADER: \global\rownum=0\relax}
+#+LATEX_HEADER: \setlength{\parindent}{0pt}
+#+LATEX_HEADER: \setlength{\parskip}{6pt plus 2pt minus 1pt}
+#+LATEX_HEADER: \setlength{\emergencystretch}{3em} % prevent overfull lines
+
+#+LATEX_HEADER: \usepackage{fancyhdr}
+#+LATEX_HEADER: \pagestyle{fancy}
+#+LATEX_HEADER: \fancyhead{}
+#+LATEX_HEADER: \fancyfoot{}
+#+LATEX_HEADER: \lhead{Notes and Bits}
+#+LATEX_HEADER: \chead{}
+#+LATEX_HEADER: \rhead{\today}
+#+LATEX_HEADER: \lfoot{Rohit Goswami}
+#+LATEX_HEADER: \cfoot{}
+#+LATEX_HEADER: \rfoot{\thepage}
+#+LATEX_HEADER: \renewcommand{\headrulewidth}{0.4pt}
+#+LATEX_HEADER: \renewcommand{\footrulewidth}{0.4pt}
+
+
+#+LATEX_HEADER: % When using the classes report, scrreprt, book,
+#+LATEX_HEADER: % scrbook or memoir, uncomment the following line.
+#+LATEX_HEADER: %\addtokomafont{chapter}{\color{heading-color}}
+
+
+# Nicer Fonts
+# #+LATEX_HEADER: \usepackage{xunicode}
+# #+LATEX_HEADER: \usepackage{xltxtra}
+# #+LATEX_HEADER: \usepackage[protrusion=true,final]{microtype}
+# #+LATEX_HEADER: \usepackage{mathspec}
+# #+LATEX_HEADER: \defaultfontfeatures{Mapping=tex-text}
+# #+LATEX_HEADER: \setromanfont[Ligatures={Common}, Numbers={OldStyle}]{Hoefler Text}
+# #+LATEX_HEADER: \setsansfont[Scale=0.9]{Helvetica Neue}
+# #+LATEX_HEADER: \setmonofont[Scale=0.8]{Courier}
+# #+LATEX_HEADER: \newfontfamily\scfont[Scale=1.2]{Minion Pro}
+
+#+LATEX_HEADER: \usepackage[default]{sourcesanspro}
+#+LATEX_HEADER: \usepackage{sourcecodepro}
+
+# Wider Text
+# #+LATEX_HEADER: \usepackage[textwidth=7in,textheight=9in]{geometry}
+#+LATEX_HEADER: \usepackage[margin=2.5cm,includehead=true,includefoot=true,centering]{geometry}
+
+\begin{titlepage}
+\newgeometry{left=6cm}
+\definecolor{titlepage-color}{HTML}{06386e}
+\newpagecolor{titlepage-color}\afterpage{\restorepagecolor}
+\newcommand{\colorRule}[3][black]{\textcolor[HTML]{#1}{\rule{#2}{#3}}}
+\begin{flushleft}
+\noindent
+\\[-1em]
+\color[HTML]{ffffff}
+\makebox[0pt][l]{\colorRule[ffffff]{1.3\textwidth}{1pt}}
+\par
+\noindent
+
+{ \setstretch{1.4}
+\vfill
+\noindent {\huge \textbf{\textsf{Notes and Bits}}}
+\vskip 1em
+{\Large \textsf{A univ.ai compendium}}
+\vskip 2em
+\noindent
+{\Large \textsf{\MakeUppercase{Rohit Goswami,\textsc{\scriptsize\ AMIChemE}}}
+\vfill
+}
+
+\textsf{\today}}
+\end{flushleft}
+\end{titlepage}
+\restoregeometry
+
+\tableofcontents
+\newpage
+
+# Start Here
+
+These notes are to be used in conjunction with the course materials, which
+have all been forked to my own GitHub account. The links below point to those
+forks; the notes themselves mainly capture important aspects of the
+discussion. Each topic listed has videos and slides in its repository.
+
+| Topic                                              | Repo                     |
+|----------------------------------------------------+--------------------------|
+| Regression (KNN and Linear)                        | [[https://github.com/univai-summerschool-2019/Regression][Regression]]               |
+| Selection and Regularization                       | [[https://github.com/univai-summerschool-2019/ValidationRegularization][ValidationRegularization]] |
+| Logistic Regression                                | [[https://github.com/univai-summerschool-2019/Classification][Classification]]           |
+| Classification Trees and Bagging                   | [[https://github.com/univai-summerschool-2019/Trees][Trees]]                    |
+| Enhanced Sampling (Boosting)                       | [[https://github.com/univai-summerschool-2019/Boosting][Boosting]]                 |
+| Perceptrons                                        | [[https://github.com/univai-summerschool-2019/ANN][ANN]]                      |
+| Neural Networks (Regularization and Optimization)  | [[https://github.com/univai-summerschool-2019/ANN2][ANN2]]                     |
+
+Furthermore, there are several topics covered as labs with papers.
+
+| Topic                         | Repo                   |
+|-------------------------------+------------------------|
+| Convolutional Neural Networks | CNN {[[https://github.com/univai-summerschool-2019/CNN1][One]], [[https://github.com/univai-summerschool-2019/CNN3][Three]], [[https://github.com/univai-summerschool-2019/CNN4][Four]]} |
+| Language Models               | Lang {[[https://github.com/univai-summerschool-2019/Lang1][One]], [[https://github.com/univai-summerschool-2019/Lang2][Two]], [[https://github.com/univai-summerschool-2019/Lang3][Three]]} |
+| RNNs (Ensembles)              | [[https://github.com/univai-summerschool-2019/CaptionHack][CaptionHack]]            |
+
+* Predictors
+The measurement error is irreducible. Even though the measurements are
+independent, we are able to sample over time. The basic concept is that the
+sampling distribution of each predictor can be estimated; treating it as a
+cumulative distribution lets us calculate the standard errors, from which
+confidence intervals may also be obtained.
+** Bootstrap
+This basically rests on a quasi-equilibrium assumption. It allows for the
+formulation of multiple pseudo-populations by resampling with replacement, and
+consequently:
+#+BEGIN_QUOTE
+Bootstrap aggregating, also called bagging, is a machine learning ensemble
+meta-algorithm designed to improve the stability and accuracy of machine
+learning algorithms used in statistical classification and regression.
+#+END_QUOTE
+The quoted text is [[https://en.wikipedia.org/wiki/Bootstrap_aggregating][from Wikipedia]].
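+
+A minimal sketch of the resampling idea, assuming a one-dimensional sample and
+the percentile method for the interval (both choices are illustrative, not
+fixed by the notes):
+#+BEGIN_SRC python
+import numpy as np
+
+rng = np.random.default_rng(42)
+data = rng.normal(loc=5.0, scale=2.0, size=200)  # stand-in measurements
+
+def bootstrap_ci(sample, stat=np.mean, n_boot=10_000, alpha=0.05):
+    """Standard error and CI for `stat` via resampling with replacement."""
+    n = len(sample)
+    stats = np.array([stat(rng.choice(sample, size=n, replace=True))
+                      for _ in range(n_boot)])
+    se = stats.std(ddof=1)  # bootstrap standard error
+    lo, hi = np.percentile(stats, [100 * alpha / 2, 100 * (1 - alpha / 2)])
+    return se, (lo, hi)
+
+se, (lo, hi) = bootstrap_ci(data)
+print(f"SE ~ {se:.3f}, 95% CI ~ ({lo:.3f}, {hi:.3f})")
+#+END_SRC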
+** Standard Errors
+** Hypothesis Testing
+This is the formal process through which we evaluate the validity of a
+statistical hypothesis by considering evidence for or against the hypothesis
+gathered by random sampling of the data.
+
+* Validation
+Essentially the takeaway is that though we can and will fit the training data
+ever more closely as we increase the polynomial degree, we will actually see a
+*decrease* in the test-set accuracy metric ($R^2$) as the degree is increased,
+due to overfitting.
+
+The large-sample rule of thumb (around 50 points or more) for data-sets
+without multiple modes is a good idea. Basically, don't always use the general
+$80:20$ split for train:test and $25\%$ for validation.
+
+The errors increase with the degree not because of error propagation, but
+because *the sensitivity has increased*: at higher degrees we have
+_overfitting_.
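+
+A short sketch of the degree sweep described above, using scikit-learn (the
+synthetic sine data and the particular degrees are illustrative assumptions):
+#+BEGIN_SRC python
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures
+
+rng = np.random.default_rng(0)
+x = np.sort(rng.uniform(-3, 3, 60))[:, None]
+y = np.sin(x).ravel() + rng.normal(0, 0.3, 60)  # noisy ground truth
+
+x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.2, random_state=0)
+for degree in (1, 3, 9, 15):
+    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
+    model.fit(x_tr, y_tr)
+    # Train R^2 keeps climbing with degree; test R^2 peaks and then falls.
+    print(degree, model.score(x_tr, y_tr), model.score(x_te, y_te))
+#+END_SRC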
+* Regularization
+Interestingly, using gradient techniques will not give us any better
+coefficients in this form; that is, the regularization is not achieved by
+optimization alone. So in effect, we need more than the loss function: we need
+the penalty function as well.
+
+To wit, when using regularization for linear cases, we are essentially using
+it to reduce the effective dimensionality of our space. That is, it is
+somewhat like feature selection or dimensionality reduction.
+** LASSO and Ridge
+From the geometry, it is more probable that the LASSO formulation will give us
+many exactly-zero coefficients (its constraint region is a square, and the
+solution most likely lands on a corner, which lies on an axis). The ridge is
+faster though. The LASSO is slower because it depends on the $L_1$ norm, which
+is not differentiable at zero (due to it being a modulus).
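+
+A quick empirical check of the sparsity claim, assuming scikit-learn and a
+synthetic problem where only a few features matter (the sizes and the penalty
+strength =alpha= are arbitrary choices):
+#+BEGIN_SRC python
+import numpy as np
+from sklearn.datasets import make_regression
+from sklearn.linear_model import Lasso, Ridge
+
+# 50 features, but only 5 are actually informative.
+X, y = make_regression(n_samples=200, n_features=50, n_informative=5,
+                       noise=5.0, random_state=0)
+
+lasso = Lasso(alpha=1.0).fit(X, y)
+ridge = Ridge(alpha=1.0).fit(X, y)
+print("exact zeros (LASSO):", int(np.sum(lasso.coef_ == 0.0)))  # many
+print("exact zeros (ridge):", int(np.sum(ridge.coef_ == 0.0)))  # usually none
+#+END_SRC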
+* Advanced Regression
+:PROPERTIES:
+:CATEGORY: Achuta
+:END:
+** Closed form regression
+From the Euclidean norm, we can use the inner-product form to get the
+derivative and hence the closed-form solution; ridge regression admits the
+same treatment.
+** Norms
+*** Euclidean Distance (L2)
+This is the $L_2$ norm and is basically the usual distance.
+*** L1 Norm
+This is just the modulus, or the absolute sum of the values.
+*** General Norms
+Given a p-norm, we take the sum of every element raised to the p^{th} power,
+and then raise the entirety of it to $\frac{1}{p}$:
+
+$$\|x\|_{p} = \left(\sum_{i} |x_{i}|^{p}\right)^{1/p}$$
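+
+A one-line numeric check of that definition (plain numpy; the values are
+arbitrary):
+#+BEGIN_SRC python
+import numpy as np
+
+def p_norm(x, p):
+    """(sum_i |x_i|^p)^(1/p) -- the general p-norm."""
+    return np.sum(np.abs(x) ** p) ** (1.0 / p)
+
+x = np.array([3.0, -4.0])
+print(p_norm(x, 1))  # 7.0, the L1 norm (absolute sum)
+print(p_norm(x, 2))  # 5.0, the L2 / Euclidean distance
+#+END_SRC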
+
+#+BEGIN_QUOTE
+Can't we apply a transform to ensure that the isocontour of our Ridge
+regression is actually on the axes as well? Just like the corners of the L1
+regularization fall on the axes.
+#+END_QUOTE
+
+achuta@ucla.edu
+* Logistic Regression
+:PROPERTIES:
+:CATEGORY: Pavlov
+:END:
+Logistic regression depends on predicting the $Y=1$ class, or the positive
+class. The softmax is the generalized (multi-class) logistic regression
+mapping function; the binary case is the sigmoid of a linear function, given
+in the equation below.
+#+BEGIN_EXPORT latex
+$$\frac{1}{1+e^{-(\beta_{0}+\beta_{1}X)}}$$
+#+END_EXPORT
+
+So a one unit change in $X$ multiplies the odds that $Y=1$ by $e^{\beta_{1}}$.
+- This is also called a discriminative classifier, as it directly models the
+  probability of a class being present on the basis of the features; a small
+  numeric check of the odds claim is sketched below.
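+
+A minimal check of the odds interpretation, assuming arbitrary coefficients
+(nothing here comes from the course repos):
+#+BEGIN_SRC python
+import numpy as np
+
+def sigmoid(z):
+    """P(Y=1 | X) = 1 / (1 + exp(-(b0 + b1 * X)))."""
+    return 1.0 / (1.0 + np.exp(-z))
+
+b0, b1, x = -1.0, 0.8, 2.0
+odds = lambda p: p / (1.0 - p)
+p_x  = sigmoid(b0 + b1 * x)
+p_x1 = sigmoid(b0 + b1 * (x + 1))          # one unit change in X
+print(odds(p_x1) / odds(p_x), np.exp(b1))  # both print e^{beta_1}
+#+END_SRC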
+* Sparsity
+JPEG by convention works on 8 by 8 pixel blocks. Consider the DCT (Discrete
+Cosine Transform):
+
+$$G_{u,v}=\frac{1}{4}\alpha(u)\alpha(v)\sum_{x=0}^{7}\sum_{y=0}^{7}g_{x,y}\cos{\left[\frac{(2x+1)u\pi}{16}\right]}\cos{\left[\frac{(2y+1)v\pi}{16}\right]}$$
+
+Low frequency variations are more common in images. The [[https://spie.org/publications/book/34917][SPIE handbook on
+compression]] is really very good on this.
+
+So it turns out that when we have colors to be encoded, we deal with them as
+linear combinations, like 3 matrices for an RGB image. The mosaicing class of
+methods is used to extrapolate other colors from existing ones to reduce
+complexity.
+
+Sparsity helps work with the scale invariance of image sets, which can then be
+used for image processing. Compressive sensing, or compressed sensing, is used
+for taking tiny datasets and recovering a lot of information from them. This
+is apparently lossless (must ask for the proof). The proof is by restricted
+isometry, an approximate isometry property: a matrix $A$ is said to satisfy
+the RIP of order $K$ with isometry constant $\delta_{K}$ if
+$(1-\delta_{K})\|x\|_{2}^{2} \le \|Ax\|_{2}^{2} \le (1+\delta_{K})\|x\|_{2}^{2}$
+for every $K$-sparse $x$.
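+
+A sketch of the block transform using scipy (the random block is a stand-in
+for real pixel data; note that =dctn= with ~norm="ortho"~ uses the standard
+orthonormal scaling rather than the exact $\frac{1}{4}\alpha(u)\alpha(v)$
+factors written above):
+#+BEGIN_SRC python
+import numpy as np
+from scipy.fft import dctn
+
+rng = np.random.default_rng(0)
+block = rng.integers(0, 256, (8, 8)).astype(float) - 128  # level-shifted pixels
+
+G = dctn(block, norm="ortho")  # 2-D type-II DCT over the 8x8 block
+# For natural images the energy concentrates in the low-frequency (top-left)
+# corner, which is what makes quantizing the rest away so cheap.
+print(np.round(G, 1))
+#+END_SRC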
+* Decision Trees
+:PROPERTIES:
+:CATEGORY: Pavlov
+:END:
+So this is different from KNN, because KNN does not look at the variables;
+basically it behaves like a clustering algorithm. Clustering algorithms may
+also look at density, but they do not consider labels. In a decision tree we
+simply draw the line where the probability of being in either class is equal.
+The decision tree approach does not look at the density per se. KNN is
+predictive as opposed to inferential. The decision boundary we put down is not
+just where the probabilities are equal; technically it is wherever we need to
+put the positive-prediction threshold.
+
+So the confidence of the model can only be visualized (by noting how peaked
+the histogram is). Basically, given the same point on the decision boundary,
+there may be multiple curves through it, but they will have different
+"sharpness-factors".
+
+#+BEGIN_QUOTE
+Basically, a flow chart whose graph is a tree (connected and no cycles)
+represents a model called a decision tree.
+#+END_QUOTE
+
+It turns out that though we could actually use *m-ary* trees instead of binary
+trees, it is more relevant (computationally) to use the binary one instead.
+
+** Errors
+The Gini index is used to determine purity after a split. Another method is to
+use the misclassification rate ($1-\max_{k}p_{k}$), weigh it according to the
+number of points in each child, and move towards the minimum of that; a small
+sketch of the weighted-impurity idea follows below.
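+
+A sketch of the impurity computations (plain numpy; the labels are made up):
+#+BEGIN_SRC python
+import numpy as np
+
+def gini(labels):
+    """Gini impurity 1 - sum_k p_k^2 for the labels at a node."""
+    _, counts = np.unique(labels, return_counts=True)
+    p = counts / counts.sum()
+    return 1.0 - np.sum(p ** 2)
+
+def split_impurity(left, right, node_impurity=gini):
+    """Post-split impurity, each child weighed by its share of the points."""
+    n = len(left) + len(right)
+    return (len(left) / n) * node_impurity(left) + \
+           (len(right) / n) * node_impurity(right)
+
+node = np.array([0, 0, 0, 1, 1, 1])
+print(gini(node))                          # 0.5: maximally impure, two classes
+print(split_impurity(node[:3], node[3:]))  # 0.0: a perfectly pure split
+#+END_SRC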
+** Classification Trees
+** Regression Trees
+When we discuss these, it is important to note that local effects are better
+captured by tree regression techniques than by the polynomial and linear
+regression methods. This means that the linear/polynomial version is global in
+nature, while the tree model is local. Naturally, the linear/polynomial fit
+has a guaranteed minimum, but the regression tree, typically being piecewise
+constant, will be better in situations where the data is less smooth. So we
+can test the Hessian to see if the data is smooth; if it is, polynomial or
+linear regression is better, while if it is a sort of jagged system, the
+polynomial would need a very high degree and therefore we don't want to try
+that.
+
+There is typically no need for standardization, because splits consider one
+feature at a time.
+
+- In both cases we report accuracies only from the leaf nodes, that is the
+  $R^2$ for the regression case, and the accuracy for classification.
+
+It has been proven that it is computationally more tractable to use decision
+trees when compared to piecewise linear regression.
+** Weighted Samples
+The most important thing about this is that though we do use weights which
+effectively change the distribution, this is ONLY done to the *training-set*.
+The test and validation sets must remain as biased as the original data.
+** Imputation
+This seems to be the way you fix missing data. Conditional imputation is when
+you condition on the values you can actually look at. KNN is also used for the
+imputation of points, though KNN imputation is expensive. Typically we can
+throw away columns if a lot of points are missing.
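+
+A minimal example of the KNN variant with scikit-learn (the tiny matrix is
+illustrative; =np.nan= marks the missing entries):
+#+BEGIN_SRC python
+import numpy as np
+from sklearn.impute import KNNImputer
+
+X = np.array([[1.0, 2.0],
+              [2.0, np.nan],
+              [3.0, 6.0],
+              [np.nan, 8.0]])
+# Each missing entry is filled in from the k nearest rows (measured on the
+# observed features) -- the per-point neighbour search is what makes this
+# expensive on large data-sets.
+print(KNNImputer(n_neighbors=2).fit_transform(X))
+#+END_SRC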
+* Machine Learning and Denoising
+So dictionary learning is something which can be used to learn a compressible
+basis, sort of like the DCT or the DFT; though it is better to consider it a
+*sparse basis*.
+
+The basic concept is that an image can be broken down into an n-dimensional
+representation with the relative energies of the frequencies.
+
+Now with the n dimensions, we can create a matrix $A$ (which is fat, as it
+contains m dimensions and $n < m$).
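+
+A sketch of learning such a fat (overcomplete) dictionary with scikit-learn;
+the random patches stand in for real image patches, and the sizes are
+arbitrary assumptions:
+#+BEGIN_SRC python
+import numpy as np
+from sklearn.decomposition import MiniBatchDictionaryLearning
+
+rng = np.random.default_rng(0)
+patches = rng.normal(size=(500, 64))  # stand-ins for flattened 8x8 patches
+
+# n = 64 signal dimensions, m = 128 atoms: a fat dictionary with n < m.
+dico = MiniBatchDictionaryLearning(n_components=128, alpha=1.0, random_state=0)
+codes = dico.fit_transform(patches)
+print(dico.components_.shape)  # (128, 64) -- the learned atoms
+print(np.mean(codes != 0))     # the codes themselves are mostly zero (sparse)
+#+END_SRC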