diff --git a/notes/00_lagrange/1_lagrange.tex b/notes/00_lagrange/1_lagrange.tex index 431306e..25f9ac2 100644 --- a/notes/00_lagrange/1_lagrange.tex +++ b/notes/00_lagrange/1_lagrange.tex @@ -216,11 +216,11 @@ \subsection{Multiple constraints} \label{eq:optimizationINequalitymultipe} \end{equation} -However, the Lagrangian remains the same\notesonly{ as in \eqref{eq:lagrangianmultiple}}. The inequality is taken into account in that the solutions for $\lambda$ extend to some range: +This leads to the following changes to the Lagrangian in order to account for the inequality and the type of optimization (\textcolor{magenta}{max} vs. min). The inequality is taken into account in that the solutions for $\lambda$ extend to some range: \begin{equation} L(\,\vec w\;, \{\lambda_k\} -) \; := \; f_0(\vec w) + \sum_{k=1}^{m} \lambda_k \, f_k(\vec w)\,,\qquad +) \; := \; {\color{magenta}-} \; f_0(\vec w) + \sum_{k=1}^{m} \lambda_k \, f_k(\vec w)\,,\qquad \lambda_{k} \;{\color{red}{\ge}}\; 0 \quad \forall k \in \{1,\ldots,m\} \label{eq:lagrangianINequalitymultiple} \end{equation} diff --git a/notes/01_pca/0_cov.tex b/notes/01_pca/0_cov.tex index 7d0db63..c204f1b 100644 --- a/notes/01_pca/0_cov.tex +++ b/notes/01_pca/0_cov.tex @@ -47,16 +47,16 @@ \subsection{Variances and Covariances} \mode{ \only<1>{ \begin{center} -\includegraphics[width=0.6\textwidth]{img/scatter}% +\includegraphics[width=0.55\textwidth]{img/scatter}% \end{center} } \only<2>{ \begin{center} -\includegraphics[width=0.6\textwidth]{img/cov}% +\includegraphics[width=0.55\textwidth]{img/cov}% \end{center} - -\slidesonly{\vspace{-10mm}} } + +\svspace{-8mm} } diff --git a/notes/01_pca/0_dim.tex b/notes/01_pca/0_dim.tex index 702ebee..d9ae265 100644 --- a/notes/01_pca/0_dim.tex +++ b/notes/01_pca/0_dim.tex @@ -74,27 +74,30 @@ \subsection{Simple truncation} \resizebox{\textwidth}{!}{% \begin{tabular}{c|cccc} - & \multicolumn{1}{c}{$\vec x^{(1)}$} & \multicolumn{1}{c}{$\vec x^{(2)}$} & \multicolumn{1}{c}{\ldots} & $\vec x^{(p)}$ \\ \hline%\cline{1-3} \cline{5-5} - $x_1$ & -0.2 & 0.1 & & 0.2 \\ \cline{1-1} - $x_2$ & 0.1 & 3.1 & & -1.0 \\ \cline{1-1} - $x_3$ & 2.5 & 7.2 & & -0.8 \\ %\cline{1-1} + \only<-3>{ & \multicolumn{1}{c}{$\vec{{ x}}^{(1)}$} & \multicolumn{1}{c}{$\vec{{ x}}^{(2)}$} & \multicolumn{1}{c}{\ldots} & $\vec{{ x}}^{(p)}$ \\ \hline%\cline{1-3} \cline{5-5} + } + \only<4->{ & \multicolumn{1}{c}{$\vec{{\widetilde x}}^{(1)}$} & \multicolumn{1}{c}{$\vec{{\widetilde x}}^{(2)}$} & \multicolumn{1}{c}{\ldots} & $\vec{{\widetilde x}}^{(p)}$ \\ \hline%\cline{1-3} \cline{5-5} + } + $ x_1$ & -0.2 & 0.1 & & 0.2 \\ \cline{1-1} + $ x_2$ & 0.1 & 3.1 & & -1.0 \\ \cline{1-1} + $ x_3$ & 2.5 & 7.2 & & -0.8 \\ %\cline{1-1} \vdots & \vdots & \vdots & \begin{tabular}[c]{@{}c@{}}\ldots\vspace{2.5mm}\end{tabular} & \vdots \\ %\cline{1-1} - $x_{N-2}$ & -7.1 & -3.5 & & 7.0 \\ \cline{1-1} + $ x_{N-2}$ & -7.1 & -3.5 & & 7.0 \\ \cline{1-1} \slidesonly{ \only<1>{ - $x_{N-1}$ & -10.3 & -0.3 & & 4.5 \\ \cline{1-1} - $x_N$ & 4.0 & 1.3 & & 6.6 \\ \hline + $ x_{N-1}$ & -10.3 & -0.3 & & 4.5 \\ \cline{1-1} + $ x_N$ & 4.0 & 1.3 & & 6.6 \\ \hline } \only<3>{ %\hline{\vspace{\dimexpr 2.2ex-\doublerulesep}} - $\hcancel[red]{x_{N-1}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \cline{1-1} - $\hcancel[red]{x_{N}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \hline + $\hcancel[red]{{{ x}}_{N-1}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \cline{1-1} + $\hcancel[red]{{{ x}}_{N}}$ & \textcolor{red}{?}
& \textcolor{red}{?} & & \textcolor{red}{?} \\ \hline } } \only<4->{ %\hline{\vspace{\dimexpr 2.2ex-\doublerulesep}} - $\color{blue}{x_{N-1}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \cline{1-1} - $\color{blue}{x_{N}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \hline + $\color{blue}{{{\widetilde x}}_{N-1}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \cline{1-1} + $\color{blue}{{{\widetilde x}}_{N}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \hline } \end{tabular}% } @@ -103,8 +106,8 @@ \subsection{Simple truncation} \only<2>{ \mode{ \begin{center} - \includegraphics[width=0.2\textwidth]{img/telegram} - \captionof*{figure}{\footnotesize + \includegraphics[width=0.15\textwidth]{img/telegram} + \captionof*{figure}{\tiny Photo by \href{https://unsplash.com/@sandratansh?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText}{Sandra Tan} on \href{https://unsplash.com/s/photos/telegram?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText}{Unsplash} }% \end{center} @@ -125,7 +128,7 @@ \subsection{Simple truncation} \hfill \begin{minipage}{0.33\textwidth} \begin{center} - \includegraphics[width=0.99\textwidth]{img/tram_missing}% + \includegraphics[width=0.99\textwidth]{img/tram_missing_box}% \end{center} \end{minipage} \end{minipage} @@ -142,19 +145,19 @@ \subsubsection{Procedure} \begin{frame}\frametitle{\subsecname:~\subsubsecname} Let -\slidesonly{\vspace{-5mm}} +\svspace{-5mm} \begin{itemize} \item[]$\vec x \in \R^N$,\\ \item[] w.l.o.g. $\E[\vec x] \eqexcl \vec 0$ \notesonly{\\ -}(i.e. for each variable $x_i$ its mean $m_i = \sum_{\alpha=1}^{p} x_i^{(\alpha)} \eqexcl 0$) +}, i.e. for each variable $x_i$ its mean $m_i = \sum_{\alpha=1}^{p} x_i^{(\alpha)} \eqexcl 0$. \end{itemize} \pause \begin{enumerate} -\item \underline{Dimensionality Reduction}: From $N$ to $M$ with $0 < M\, < N$\\ -$\Rightarrow$ simply transmit the first $M$ elements of $\vec x$. +\item \underline{Dimensionality Reduction}: From $N$ to $\color{red}M$ with $0 < {\color{red}M}\, < N$\\ +$\Rightarrow$ simply transmit the first $\color{red}M$ elements of $\vec x$. \pause \item \underline{Reconstruction}: The recipient reconstructs all $N$ elements by adding zero entries for all missing elements (i.e. 
\textit{zero-padding}):\\ Let $\widetilde{\vec{x}}$ be the reconstructed observation, where\\ @@ -162,8 +165,8 @@ \subsubsection{Procedure} \begin{equation} % = $ for $j=1,\ldots,M% (perfect reconstruction for the first $M$ elements), \widetilde{x}_j = \begin{cases} - {x}_j & \text{for}~j=1,\ldots,M \qquad \text{(perfect reconstruction)} \\ - 0 & \text{for}~j=M+1,\ldots,N \quad \text{zero-padding} + {x}_j & \text{for}~j=1,\ldots,{\color{red}M} \qquad \text{(perfect reconstruction)} \\ + 0 & \text{for}~j={\color{red}M}+1,\ldots,N \quad \text{zero-padding} \end{cases} \end{equation} \end{enumerate} @@ -187,22 +190,22 @@ \subsubsection{Measuring the error}\label{sec:objective} \label{eq:mse} \visible<1->{ \mathit{MSE} &= \frac{1}{p} \sum\limits_{\alpha = 1}^p ( \vec{x}^{(\alpha)} - \widetilde{\vec{x}}^{(\alpha)} )^2 - \notesonly{\\&}= \frac{1}{p} \sum\limits_{\alpha = 1}^p \sum\limits_{j = 1}^N ( {x}_j^{(\alpha)} - \widetilde{{x}}^{(\alpha)}_j )^2\\ + \notesonly{\\&}= \frac{1}{p} \sum\limits_{\alpha = 1}^p \sum\limits_{j = 1}^N ( {x}_j^{(\alpha)} - {\widetilde{{x}}}^{(\alpha)}_j )^2\\ \intertext{The \textcolor{red}{first $M$} elements were transmitted perfectly, \textcolor{blue}{zero padding} is used to extend the vector to its original size of $N$ elements} } \visible<2->{ &= \frac{1}{p} \sum\limits_{\alpha = 1}^p \bigg( \underbrace{ - {\color{red}\sum\limits_{j = 1}^M} ( x_j^{(\alpha)} - \widetilde{x_j}^{(\alpha)} )^2 + {\color{red}\sum\limits_{j = 1}^M} ( x_j^{(\alpha)} - \widetilde{x}_j^{(\alpha)} )^2 }_{ \substack{=0 \\\text{ (perfect transmission)}} } + {\color{blue}\sum\limits_{j = M+1}^N} ( x_j^{(\alpha)} - \underbrace{ - \vphantom{\sum\limits_{j = 1}^M ( x_j^{(\alpha)} - \widetilde{x_j}^{(\alpha)} )^2} - \widetilde{x_j}^{(\alpha)} + \vphantom{\sum\limits_{j = 1}^M ( x_j^{(\alpha)} - \widetilde{x}_j^{(\alpha)} )^2} + \widetilde{x}_j^{(\alpha)} }_{\substack{=0\\ \text{padded}}} - \;)^2 \bigg)\\ + \kern-0.5ex)^2 \bigg)\\ &= \frac{1}{p} \sum\limits_{\alpha = 1}^p {\color{blue}\sum\limits_{j = M+1}^N} ( x_j^{(\alpha)} )^2 \slidesonly{\;\;}\notesonly{\\&}= {\color{blue}\sum\limits_{j = M+1}^N} \frac{1}{p} \sum\limits_{\alpha = 1}^p ( x_j^{(\alpha)} )^2 \\ &= {\color{blue}\sum\limits_{j = M+1}^N} \sigma_j^2 @@ -239,7 +242,7 @@ \subsubsection{Measuring the error}\label{sec:objective} \vec u = \vec M^\top \vec x \qquad\quad \vec M := \text{TBD} \end{equation} -s.t. truncating the transformed vector $\vec u \in \R^N$ is optimum in the sense of minimal MSE. +s.t. truncating the transformed vector $\vec u \in \R^N$ \notesonly{is optimum in the sense of}\slidesonly{has} minimal MSE. \question{Any ideas?} @@ -248,8 +251,16 @@ \subsubsection{Measuring the error}\label{sec:objective} - Sort the $N$ components in $\vec x$ from highest to lowest variance. \notesonly{The transformation here would be some permutation of the identity matrix that accomplishes the sorting.}\slidesonly{transformation: permutation of identity matrix} +\pause + \question{Is this enough to be minimal in MSE?} +\only<4->{ +\slidesonly{ + \placeimage{11.5}{11.3}{img/meme_sort}{width=3.5cm} +} +} + \pause - No, we still have to take the \emph{covariances} into consideration. 
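The result derived in this hunk, $\mathit{MSE} = \sum_{j=M+1}^{N} \sigma_j^2$, is easy to check numerically: for zero-mean data, transmitting only the first $M$ components and zero-padding the rest costs exactly the summed variance of the dropped components. A minimal numpy sketch of that identity on synthetic data (the names and numbers here are illustrative, not code from this repository):

    import numpy as np

    rng = np.random.default_rng(0)
    N, p, M = 6, 10000, 4                     # dimensions, observations, kept components
    X = rng.normal(size=(N, p)) * rng.uniform(0.5, 3.0, size=(N, 1))
    X -= X.mean(axis=1, keepdims=True)        # enforce E[x] = 0, as assumed w.l.o.g. above

    X_tilde = X.copy()
    X_tilde[M:, :] = 0.0                      # keep the first M entries, zero-pad the rest

    mse = np.mean(np.sum((X - X_tilde) ** 2, axis=0))   # (1/p) sum_alpha ||x - x_tilde||^2
    print(mse, np.var(X[M:, :], axis=1).sum())          # both equal sum_{j>M} sigma_j^2

Sorting the components by variance before truncating keeps this sum as small as an axis-aligned choice allows, but, as the hunk notes, it still ignores the covariances between the components.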
diff --git a/notes/01_pca/1_pca.tex b/notes/01_pca/1_pca.tex index a209149..d8c1a14 100644 --- a/notes/01_pca/1_pca.tex +++ b/notes/01_pca/1_pca.tex @@ -11,34 +11,37 @@ \section{PCA} \end{frame} } -\subsection{Procedure} +\subsection{Procedure and Projection} -\begin{frame}\frametitle{\secname: \subsecname} +\begin{frame}{Visible}\frametitle{\secname: \subsecname} \begin{enumerate} - \item Center the data, $\E\lbrack\vec x\rbrack = \vec m = \frac{1}{p} \sum_{\alpha=1}^{p} \vec x^{(\alpha)}\eqexcl \vec 0$. + \item Center the data, $\E\lbrack\vec x\rbrack = \vec m = \frac{1}{p} \sum_{\alpha=1}^{p} \vec x^{(\alpha)}\eqexcl \vec 0$. \slidesonly{ \only<2>{ +\svspace{5mm} \begin{center} - \includegraphics[width=5.5cm]{img/meme_center} + \includegraphics[width=5cm]{img/meme_center} \end{center} } } - \visible<3->{ - \item Let $\vec X$ be the $N \times p$ matrix of the centered data. - \item Measure + %\visible<3-> + %{ + \item Let $\vec X$ be the $N \times p$ matrix of the centered data. + \item Measure \begin{itemize} \item the variance of each component in $\vec x$.\\ \textbf{Not enough}, the variables in $\vec x$ could be correlated. \item the covariances $C_{ij} \;\; \forall\,i,j = 1,\ldots,N$. - \end{itemize} - \item[$\Rightarrow$] Construct the covariance matrix $\vec C$. + \item[$\Rightarrow$] Construct the covariance matrix $\vec C$. \begin{equation} \vec C = \text{Cov}(\vec X) = \mathbf{\Sigma} = \E\lbrack\vec X~\vec X^\top\rbrack \in \R^{N \times N} \end{equation} - \item \textbf{eigenvalue decomposition} - \item Order eigenvalues in \emph{descending} order. (Highest variance first). The ordered eigenvectors are the \emph{principle components} of the dataset $\vec X$. - \item Project $\vec x$ onto the first $M$ PCs. - } + \end{itemize} + \svspace{-5mm} + \item \textbf{eigenvalue decomposition} + \item Order eigenvalues in \emph{descending} order (highest variance first). The ordered eigenvectors are the \emph{principal components} of the dataset $\vec X$. + \item Project $\vec x$ onto the \textcolor{red}{first} $\color{red}M$ PCs. + %} \end{enumerate} @@ -122,7 +125,7 @@ \subsubsection{Projection onto the PC space} \subsection{Reconstruction error} -\begin{frame}\frametitle{How much better is this vs. simple truncation?} +\begin{frame}\frametitle{How much better is this vs. simple truncation of $\vec x$?} %\newpage \pause @@ -153,11 +156,17 @@ \subsection{Reconstruction error} \only<3,4>{ \begin{equation} -E = \frac{1}{p} \sum_{\alpha = 1}^{p} (\vec x - \widetilde{\vec x})^2 = \frac{1}{p} \sum_{\alpha = 1}^{p} \sum_{j = M+1}^{N} (a_j^{(\alpha)})^2 +E = \frac{1}{p} \sum_{\alpha = 1}^{p} (\vec x^{(\alpha)} - \widetilde{\vec x}^{(\alpha)})^2 = \frac{1}{p} \sum_{\alpha = 1}^{p} \sum_{j = {\color{cyan}M+1}}^{{\color{cyan} N}} (a_j^{(\alpha)})^2 \end{equation} -The MSE is equal to the sum of variances of the final $\notesonly{N-(M+1)-1=}N-M$ components of the \emph{transformed} observations.\\ } -\only<4>{\notesonly{Since the PCs are ordered w.r.t to variance in descending order, }the variances of the last $N-M$ components of the transformed data are smallest. +\only<4>{ +\svspace{5mm} + +The MSE is equal to the sum of variances of the \textcolor{cyan}{last} $\notesonly{N-(M+1)+1=}{\color{cyan}N-M}$ components of the \emph{transformed} observations $\vec u=\vec M^\top \vec x$.\\ + +\svspace{5mm} + +\notesonly{Since the PCs are ordered w.r.t.\ variance in descending order, }\notesonly{t}\slidesonly{T}he variances of the \textcolor{cyan}{last} $\color{cyan}N-M$ components of the transformed data are smallest.
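The claim in this hunk, that after rotating onto the principal components, keeping the first $M$ and discarding the rest leaves an MSE equal to the summed variances (eigenvalues) of the last $N-M$ transformed components, can be checked with a few lines of numpy. A minimal sketch on synthetic data (variable names are illustrative, not the notes' notation):

    import numpy as np

    rng = np.random.default_rng(1)
    N, p, M = 6, 10000, 2
    X = rng.normal(size=(N, N)) @ rng.normal(size=(N, p))   # correlated data, N x p
    X -= X.mean(axis=1, keepdims=True)                      # center

    C = (X @ X.T) / p                                       # covariance matrix
    eigval, eigvec = np.linalg.eigh(C)                      # eigendecomposition (symmetric C)
    order = np.argsort(eigval)[::-1]                        # descending: highest variance first
    eigval, eigvec = eigval[order], eigvec[:, order]

    B = eigvec[:, :M]                                       # first M principal components
    X_rec = B @ (B.T @ X)                                   # project onto the PCs, map back to R^N

    mse = np.mean(np.sum((X - X_rec) ** 2, axis=0))         # (1/p) sum_alpha ||x - x_rec||^2
    print(mse, eigval[M:].sum())                            # MSE = sum of the trailing eigenvalues

Because the eigenvalues are sorted in descending order, the discarded trailing eigenvalues are the smallest ones, which is the optimality argument the hunk goes on to make.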
The transformation is therefore optimal in the sense of minimal MSE. } diff --git a/notes/01_pca/2_apply-pca.tex b/notes/01_pca/2_apply-pca.tex index 5012202..08971ca 100644 --- a/notes/01_pca/2_apply-pca.tex +++ b/notes/01_pca/2_apply-pca.tex @@ -40,7 +40,7 @@ \subsection{Applying PCA} \newpage -\subsubsection{A note on implementation:} +\subsubsection{A note on implementation} \begin{frame}\frametitle{\subsubsecname} @@ -88,8 +88,8 @@ \subsubsection{A note on implementation:} \end{equation} (i.e. $\vec U$, $\vec V$ are orthogonal). \begin{itemize} -\item The columns of $\vec V$ (or rows of $\vec V^\top$) make up the eigenvectors of $\vec X^\top\vec X$. -\item The columns of $\vec U$ make up the eigenvectors of $\vec X~\vec X^\top$. +\item The columns of $\vec V$ (or rows of $\vec V^\top$) make up the eigenvectors of $\vec X^\top\vec X$. +\item The columns of $\vec U$ make up the eigenvectors of $\vec X~\vec X^\top$. \item $\vec S$ is a diagonal matrix. The singular values in $\vec S$ (the diagonal entries) are the \textbf{square roots} of the eigenvalues of the eigenvectors of $\vec X~\vec X^\top$ or $\vec X^\top\vec X$. \end{itemize} @@ -104,12 +104,12 @@ \subsubsection{A note on implementation:} \begin{frame}\frametitle{\subsubsecname} -\question{When to use \textit{SVD} vs. \text{Eig}? \textbf{Not dogma}} +\question{When to use \textit{SVD} vs. \textit{eig}? \textbf{Not dogma}} \begin{itemize} \item Numerical instabilities in eigendecomposition of very large covariance matrix $\rightarrow$ \textit{SVD} -\item SVD not applicable to Kernel-PCA $\rightarrow$ \textit{Eig} +\item \textit{SVD} is not applicable to Kernel-PCA $\rightarrow$ \textit{eig} \item Computational efficiency $\rightarrow$ \textit{SVD} (depends...?) \slidesonly{ $$ \vec X \in \R^{N \times p} \quad \leadsto \quad \vec C = \frac{1}{p} \vec X~\vec X^\top $$ } -\item Possible differences between PCA via SVD and PCA via eig: SVD may flip the sign. +\item Possible differences between PCA via \textit{SVD} and PCA via \textit{eig}: \textit{SVD} may flip the sign. \notesonly{ -The reason for the difference in sign between the eigenvector's you would get via \text{Eig} and the eigenvector you would get via \text{SVD}\footnote{Python's scikit-learn package uses SVD for its PCA implementation.}, is that SVD, in practice, may arbitrarily flip the direction of the eigenevectors. +The reason for the difference in sign between the eigenvectors you would get via \textit{eig} and the eigenvectors you would get via \textit{SVD}\footnote{Python's scikit-learn package uses SVD for its PCA implementation.} is that SVD, in practice, may arbitrarily flip the direction of the eigenvectors. Possible reasons for the sign ambiguity according to \ref{bro2008resolving}: \begin{enumerate} -\item It's a mechanism in the SVD implementation (there's more than one SVD implementation) to mitigate numerical instabilities, +\item It's a mechanism in the \textit{SVD} implementation (there's more than one \textit{SVD} implementation) to mitigate numerical instabilities, \item Due to centering of the data, -\item an SVD impelementation could have a random component. +\item an \textit{SVD} implementation could have a random component. \end{enumerate} -In both approaches you should be able to get the same reconstructions from both of them, i.e. the sign of the components along the flipped PC will also be flipped.
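The relationship spelled out in this hunk, namely that the columns of $\vec U$ are eigenvectors of $\vec X\vec X^\top$, that the squared singular values give the eigenvalues (up to the $1/p$ factor of the covariance matrix), and that SVD may flip the sign of individual eigenvectors, can be illustrated with a short numpy sketch (illustrative code, not part of the course material):

    import numpy as np

    rng = np.random.default_rng(2)
    N, p = 5, 200
    X = rng.normal(size=(N, N)) @ rng.normal(size=(N, p))
    X -= X.mean(axis=1, keepdims=True)             # centered data matrix, N x p

    # PCA via eigendecomposition of the covariance matrix
    C = (X @ X.T) / p
    eigval, eigvec = np.linalg.eigh(C)
    order = np.argsort(eigval)[::-1]               # sort descending
    eigval, eigvec = eigval[order], eigvec[:, order]

    # PCA via SVD of the centered data matrix
    U, s, Vt = np.linalg.svd(X, full_matrices=False)

    print(np.allclose(eigval, s**2 / p))           # eigenvalues of C = singular values^2 / p
    signs = np.sign(np.sum(U * eigvec, axis=0))    # +1/-1 per component: the sign ambiguity
    print(np.allclose(U * signs, eigvec))          # columns agree once the signs are aligned

Here np.linalg.eigh is used rather than np.linalg.eig because the covariance matrix is symmetric, which guarantees real eigenvalues and orthogonal eigenvectors; the sign alignment in the last two lines is one simple way to detect a flip before comparing or visualizing eigenvectors.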
Depending on the application, it may be necessary to "detect" if SVD flipped the direction or not. Numerical methods to detect this exist. A reason to care about what the sign should be is when you want to interpret what an eigenvector represents to possibly assign meaning to the component along this PC: Is the component positive or negative along the PC or does this point have a larger component along the PC than another point. The classical example of "interpreting" PCs would be "Eigenfaces". PCA is applied on images of faces. It is similar to what we ask you to do in Exercise 1.4. The Eigenvectors extracted from image data can be represented as images themselves. If SVD gives you "flipped" eigenvectors, visualizing them would give you the "negative" of that image. It's not as intuitive to look at negatives, but a person will still be able to make sense of them. However, imagine trying to interpret the eigenvector of other high-dimensional data that isn't as pretty as pictures. Knowing that SVD could have flipped the sign could give you a hard time trying to make sense of what the eigenvector represents. +You should be able to get the same reconstructions from both approaches, i.e. the sign of the components along the flipped PC will also be flipped. Depending on the application, it may be necessary to ``detect'' if \textit{SVD} flipped the direction or not. Numerical methods to detect this exist. A reason to care about what the sign should be is when you want to interpret what an eigenvector represents to possibly assign meaning to the component along this PC: ``\textit{Is the component positive or negative along the PC or does this point have a larger component along the PC than another point?}'' +The classical example of ``interpreting'' PCs would be ``Eigenfaces''. PCA is applied to images of faces. It is similar to what we ask you to do in Exercise 1.4. The eigenvectors extracted from image data can be represented as images themselves. If \textit{SVD} gives you ``flipped'' eigenvectors, visualizing them would give you the ``negative'' of that image. It's not as intuitive to look at negatives, but a person will still be able to make sense of them. However, imagine trying to interpret the eigenvector of other high-dimensional data that isn't as pretty as pictures. Knowing that \textit{SVD} could have flipped the sign could give you a hard time trying to make sense of what the eigenvector represents. } diff --git a/notes/01_pca/img/meme_sort.jpg b/notes/01_pca/img/meme_sort.jpg new file mode 100644 index 0000000..0f5ef31 Binary files /dev/null and b/notes/01_pca/img/meme_sort.jpg differ diff --git a/notes/01_pca/img/tram_missing.JPG.svg b/notes/01_pca/img/tram_missing.JPG.svg new file mode 100644 index 0000000..c24b722 --- /dev/null +++ b/notes/01_pca/img/tram_missing.JPG.svg @@ -0,0 +1,21995 @@ + + + + + + image/svg+xml + + + + + + + + + + + 000... + + diff --git a/notes/01_pca/img/tram_missing_box.png b/notes/01_pca/img/tram_missing_box.png new file mode 100644 index 0000000..01f9a17 Binary files /dev/null and b/notes/01_pca/img/tram_missing_box.png differ diff --git a/notes/01_pca/tutorial.tex b/notes/01_pca/tutorial.tex index 49a7a1a..caec990 100644 --- a/notes/01_pca/tutorial.tex +++ b/notes/01_pca/tutorial.tex @@ -15,7 +15,7 @@ % for slides \subtitle{\sheetnum \tutorialtitle} -%\maxdeadcycles=1000 % Workaround for ! Output loop---100 consecutive dead cycles because of too many figures +\maxdeadcycles=1000 % Workaround for !
Output loop---100 consecutive dead cycles because of too many figures % The following use of algorithms does not work well with the notes: % @@ -57,7 +57,7 @@ \end{frame} \begin{frame} -\tableofcontents +\tableofcontents[subsubsectionstyle=hide] \end{frame} \newpage