Merge pull request #38 from kashefy/pca21
PCA updates
kashefy authored Apr 24, 2021
2 parents bed6603 + 3ecbff5 commit 15cebed
Showing 9 changed files with 22,077 additions and 61 deletions.
4 changes: 2 additions & 2 deletions notes/00_lagrange/1_lagrange.tex
@@ -216,11 +216,11 @@ \subsection{Multiple constraints}
\label{eq:optimizationINequalitymultipe}
\end{equation}

However, the Lagrangian remains the same\notesonly{ as in \eqref{eq:lagrangianmultiple}}. The inequality is taken into account in that the solutions for $\lambda$ extend to some range:
This leads to the following changes to the Lagrangian in order to account for the inequality and for the type of optimization (\textcolor{magenta}{max} vs. min). The inequality is reflected in that the solutions for $\lambda$ extend to some range:

\begin{equation}
L(\,\vec w\;, \{\lambda_k\}
) \; := \; f_0(\vec w) + \sum_{k=1}^{m} \lambda_k \, f_k(\vec w)\,,\qquad
) \; := \; {\color{magenta}-} \; f_0(\vec w) + \sum_{k=1}^{m} \lambda_k \, f_k(\vec w)\,,\qquad
\lambda_{k} \;{\color{red}{\ge}}\; 0 \quad \forall k \in \{1,\ldots,m\}
\label{eq:lagrangianINequalitymultiple}
\end{equation}
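As a quick sanity check of this convention, here is a minimal scipy sketch (the objective, constraint, and optimum are made up for illustration): a maximization of $f_0$ under $f_1(\vec w) \le 0$ is solved by minimizing $-f_0$, matching the sign flip above.

```python
# Illustrative sketch (made-up problem): maximize f0(w) = -(w1^2 + w2^2)
# subject to f1(w) = 1 - w1 - w2 <= 0, by minimizing -f0 as in the Lagrangian above.
import numpy as np
from scipy.optimize import minimize

def f0(w):                      # objective to be maximized
    return -(w[0]**2 + w[1]**2)

def f1(w):                      # constraint, required to satisfy f1(w) <= 0
    return 1.0 - w[0] - w[1]

res = minimize(lambda w: -f0(w),                 # minimize -f0  <=>  maximize f0
               x0=np.zeros(2),
               constraints=[{"type": "ineq",     # scipy expects g(w) >= 0,
                             "fun": lambda w: -f1(w)}])  # so pass g = -f1
print(res.x)                    # approx. [0.5, 0.5], the constrained maximizer
```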
8 changes: 4 additions & 4 deletions notes/01_pca/0_cov.tex
@@ -47,16 +47,16 @@ \subsection{Variances and Covariances}
\mode<presentation>{
\only<1>{
\begin{center}
\includegraphics[width=0.6\textwidth]{img/scatter}%
\includegraphics[width=0.55\textwidth]{img/scatter}%
\end{center}
}
\only<2>{
\begin{center}
\includegraphics[width=0.6\textwidth]{img/cov}%
\includegraphics[width=0.55\textwidth]{img/cov}%
\end{center}

\slidesonly{\vspace{-10mm}}
}

\svspace{-8mm}
}


63 changes: 37 additions & 26 deletions notes/01_pca/0_dim.tex
@@ -74,27 +74,30 @@ \subsection{Simple truncation}

\resizebox{\textwidth}{!}{%
\begin{tabular}{c|cccc}
& \multicolumn{1}{c}{$\vec x^{(1)}$} & \multicolumn{1}{c}{$\vec x^{(2)}$} & \multicolumn{1}{c}{\ldots} & $\vec x^{(p)}$ \\ \hline%\cline{1-3} \cline{5-5}
$x_1$ & -0.2 & 0.1 & & 0.2 \\ \cline{1-1}
$x_2$ & 0.1 & 3.1 & & -1.0 \\ \cline{1-1}
$x_3$ & 2.5 & 7.2 & & -0.8 \\ %\cline{1-1}
\only<-3>{ & \multicolumn{1}{c}{$\vec{{ x}}^{(1)}$} & \multicolumn{1}{c}{$\vec{{ x}}^{(2)}$} & \multicolumn{1}{c}{\ldots} & $\vec{{ x}}^{(p)}$ \\ \hline%\cline{1-3} \cline{5-5}
}
\only<4->{ & \multicolumn{1}{c}{$\vec{{\widetilde x}}^{(1)}$} & \multicolumn{1}{c}{$\vec{{\widetilde x}}^{(2)}$} & \multicolumn{1}{c}{\ldots} & $\vec{{\widetilde x}}^{(p)}$ \\ \hline%\cline{1-3} \cline{5-5}
}
$ x_1$ & -0.2 & 0.1 & & 0.2 \\ \cline{1-1}
$ x_2$ & 0.1 & 3.1 & & -1.0 \\ \cline{1-1}
$ x_3$ & 2.5 & 7.2 & & -0.8 \\ %\cline{1-1}
\vdots & \vdots & \vdots & \begin{tabular}[c]{@{}c@{}}\ldots\vspace{2.5mm}\end{tabular} & \vdots \\ %\cline{1-1}
$x_{N-2}$ & -7.1 & -3.5 & & 7.0 \\ \cline{1-1}
$ x_{N-2}$ & -7.1 & -3.5 & & 7.0 \\ \cline{1-1}
\slidesonly{
\only<1>{
$x_{N-1}$ & -10.3 & -0.3 & & 4.5 \\ \cline{1-1}
$x_N$ & 4.0 & 1.3 & & 6.6 \\ \hline
$ x_{N-1}$ & -10.3 & -0.3 & & 4.5 \\ \cline{1-1}
$ x_N$ & 4.0 & 1.3 & & 6.6 \\ \hline
}
\only<3>{
%\hline{\vspace{\dimexpr 2.2ex-\doublerulesep}}
$\hcancel[red]{x_{N-1}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \cline{1-1}
$\hcancel[red]{x_{N}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \hline
$\hcancel[red]{{{ x}}_{N-1}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \cline{1-1}
$\hcancel[red]{{{ x}}_{N}}$ & \textcolor{red}{?} & \textcolor{red}{?} & & \textcolor{red}{?} \\ \hline
}
}
\only<4->{
%\hline{\vspace{\dimexpr 2.2ex-\doublerulesep}}
$\color{blue}{x_{N-1}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \cline{1-1}
$\color{blue}{x_{N}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \hline
$\color{blue}{{{\widetilde x}}_{N-1}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \cline{1-1}
$\color{blue}{{{\widetilde x}}_{N}}$ & \textcolor{blue}{0} & \textcolor{blue}{0} & & \textcolor{blue}{0} \\ \hline
}
\end{tabular}%
}
@@ -103,8 +106,8 @@ \subsection{Simple truncation}
\only<2>{
\mode<presentation>{
\begin{center}
\includegraphics[width=0.2\textwidth]{img/telegram}
\captionof*{figure}{\footnotesize
\includegraphics[width=0.15\textwidth]{img/telegram}
\captionof*{figure}{\tiny
Photo by \href{https://unsplash.com/@sandratansh?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText}{Sandra Tan} on \href{https://unsplash.com/s/photos/telegram?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText}{Unsplash}
}%
\end{center}
@@ -125,7 +128,7 @@ \subsection{Simple truncation}
\hfill
\begin{minipage}{0.33\textwidth}
\begin{center}
\includegraphics[width=0.99\textwidth]{img/tram_missing}%
\includegraphics[width=0.99\textwidth]{img/tram_missing_box}%
\end{center}
\end{minipage}
\end{minipage}
@@ -142,28 +145,28 @@ \subsubsection{Procedure}
\begin{frame}\frametitle{\subsecname:~\subsubsecname}

Let
\slidesonly{\vspace{-5mm}}
\svspace{-5mm}
\begin{itemize}
\item[]$\vec x \in \R^N$,\\
\item[] w.l.o.g. $\E[\vec x] \eqexcl \vec 0$ \notesonly{\\

}(i.e. for each variable $x_i$ its mean $m_i = \sum_{\alpha=1}^{p} x_i^{(\alpha)} \eqexcl 0$)
}, i.e. for each variable $x_i$ its mean $m_i = \frac{1}{p}\sum_{\alpha=1}^{p} x_i^{(\alpha)} \eqexcl 0$.
\end{itemize}

\pause

\begin{enumerate}
\item \underline{Dimensionality Reduction}: From $N$ to $M$ with $0 < M\, < N$\\
$\Rightarrow$ simply transmit the first $M$ elements of $\vec x$.
\item \underline{Dimensionality Reduction}: From $N$ to $\color{red}M$ with $0 < {\color{red}M}\, < N$\\
$\Rightarrow$ simply transmit the first $\color{red}M$ elements of $\vec x$.
\pause
\item \underline{Reconstruction}: The recipient reconstructs all $N$ elements by adding zero entries for all missing elements (i.e. \textit{zero-padding}):\\
Let $\widetilde{\vec{x}}$ be the reconstructed observation, where\\

\begin{equation}
% = $ for $j=1,\ldots,M% (perfect reconstruction for the first $M$ elements),
\widetilde{x}_j = \begin{cases}
{x}_j & \text{for}~j=1,\ldots,M \qquad \text{(perfect reconstruction)} \\
0 & \text{for}~j=M+1,\ldots,N \quad \text{zero-padding}
{x}_j & \text{for}~j=1,\ldots,{\color{red}M} \qquad \text{(perfect reconstruction)} \\
0 & \text{for}~j={\color{red}M}+1,\ldots,N \quad \text{(zero-padding)}
\end{cases}
\end{equation}
\end{enumerate}
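A minimal numpy sketch of these two steps on made-up data (the dimensions and values are purely illustrative):

```python
# Simple truncation: transmit only the first M of N components, then
# reconstruct by zero-padding, as in the two steps above (toy data).
import numpy as np

rng = np.random.default_rng(0)
N, M, p = 6, 3, 100
X = rng.normal(size=(N, p))
X -= X.mean(axis=1, keepdims=True)     # center: each variable has zero mean

X_transmitted = X[:M, :]               # step 1: keep only the first M elements

X_tilde = np.zeros_like(X)             # step 2: reconstruct via zero-padding
X_tilde[:M, :] = X_transmitted         # first M elements are reconstructed perfectly
```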
@@ -187,22 +190,22 @@ \subsubsection{Measuring the error}\label{sec:objective}
\label{eq:mse}
\visible<1->{
\mathit{MSE} &= \frac{1}{p} \sum\limits_{\alpha = 1}^p ( \vec{x}^{(\alpha)} - \widetilde{\vec{x}}^{(\alpha)} )^2
\notesonly{\\&}= \frac{1}{p} \sum\limits_{\alpha = 1}^p \sum\limits_{j = 1}^N ( {x}_j^{(\alpha)} - \widetilde{{x}}^{(\alpha)}_j )^2\\
\notesonly{\\&}= \frac{1}{p} \sum\limits_{\alpha = 1}^p \sum\limits_{j = 1}^N ( {x}_j^{(\alpha)} - {\widetilde{{x}}}^{(\alpha)}_j )^2\\
\intertext{The \textcolor{red}{first $M$} elements were transmitted perfectly; \textcolor{blue}{zero-padding} is used to extend the vector to its original size of $N$ elements}
}
\visible<2->{
&= \frac{1}{p} \sum\limits_{\alpha = 1}^p \bigg(
\underbrace{
{\color{red}\sum\limits_{j = 1}^M} ( x_j^{(\alpha)} - \widetilde{x_j}^{(\alpha)} )^2
{\color{red}\sum\limits_{j = 1}^M} ( x_j^{(\alpha)} - \widetilde{x}_j^{(\alpha)} )^2
}_{
\substack{=0 \\\text{ (perfect transmission)}}
}
+ {\color{blue}\sum\limits_{j = M+1}^N} ( x_j^{(\alpha)} -
\underbrace{
\vphantom{\sum\limits_{j = 1}^M ( x_j^{(\alpha)} - \widetilde{x_j}^{(\alpha)} )^2}
\widetilde{x_j}^{(\alpha)}
\vphantom{\sum\limits_{j = 1}^M ( x_j^{(\alpha)} - \widetilde{x}_j^{(\alpha)} )^2}
\widetilde{x}_j^{(\alpha)}
}_{\substack{=0\\ \text{padded}}}
\;)^2 \bigg)\\
\kern-0.5ex)^2 \bigg)\\
&= \frac{1}{p} \sum\limits_{\alpha = 1}^p {\color{blue}\sum\limits_{j = M+1}^N} ( x_j^{(\alpha)} )^2
\slidesonly{\;\;}\notesonly{\\&}= {\color{blue}\sum\limits_{j = M+1}^N} \frac{1}{p} \sum\limits_{\alpha = 1}^p ( x_j^{(\alpha)} )^2 \\
&= {\color{blue}\sum\limits_{j = M+1}^N} \sigma_j^2
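The identity derived above is easy to verify numerically; a small self-contained check on toy data (all names and numbers are made up for illustration):

```python
# Check that the MSE of zero-padded truncation equals the summed variance
# of the discarded components (toy data with unequal variances).
import numpy as np

rng = np.random.default_rng(1)
N, M, p = 6, 3, 10000
X = rng.normal(size=(N, p)) * np.arange(1, N + 1)[:, None]
X -= X.mean(axis=1, keepdims=True)

X_tilde = np.zeros_like(X)
X_tilde[:M, :] = X[:M, :]                              # truncate and zero-pad

mse = np.mean(np.sum((X - X_tilde) ** 2, axis=0))      # (1/p) sum_alpha ||x - x_tilde||^2
sum_var = np.var(X, axis=1)[M:].sum()                  # sum_{j>M} sigma_j^2
print(np.isclose(mse, sum_var))                        # True
```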
@@ -239,7 +242,7 @@ \subsubsection{Measuring the error}\label{sec:objective}
\vec u = \vec M^\top \vec x \qquad\quad \vec M := \text{TBD}
\end{equation}

s.t. truncating the transformed vector $\vec u \in \R^N$ is optimum in the sense of minimal MSE.
s.t. truncating the transformed vector $\vec u \in \R^N$ \notesonly{is optimal in the sense of}\slidesonly{has} minimal MSE.


\question{Any ideas?}
@@ -248,8 +251,16 @@ \subsubsection{Measuring the error}\label{sec:objective}

- Sort the $N$ components in $\vec x$ from highest to lowest variance. \notesonly{The transformation here would be some permutation of the identity matrix that accomplishes the sorting.}\slidesonly{transformation: permutation of identity matrix}

\pause

\question{Is this enough to achieve minimal MSE?}

\only<4->{
\slidesonly{
\placeimage{11.5}{11.3}{img/meme_sort}{width=3.5cm}
}
}

\pause

- No, we still have to take the \emph{covariances} into consideration.
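For concreteness, the sorting transformation mentioned above can be written as a permutation of the identity matrix; a toy sketch (data made up):

```python
# Sort the components of x from highest to lowest variance via a permutation
# of the identity matrix.  This alone ignores the covariances, as noted above.
import numpy as np

rng = np.random.default_rng(2)
X = rng.normal(size=(4, 500)) * np.array([1.0, 3.0, 0.5, 2.0])[:, None]
X -= X.mean(axis=1, keepdims=True)

order = np.argsort(np.var(X, axis=1))[::-1]   # component indices, highest variance first
M_sort = np.eye(X.shape[0])[order]            # permutation of the identity matrix
U = M_sort @ X                                # plays the role of u = M^T x from above
```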
43 changes: 26 additions & 17 deletions notes/01_pca/1_pca.tex
@@ -11,34 +11,37 @@ \section{PCA}
\end{frame}
}

\subsection{Procedure}
\subsection{Procedure and Projection}

\begin{frame}\frametitle{\secname: \subsecname}
\begin{frame}\frametitle{\secname: \subsecname}

\begin{enumerate}
\item Center the data, $\E\lbrack\vec x\rbrack = \vec m = \frac{1}{p} \sum_{\alpha=1}^{p} \vec x^{(\alpha)}\eqexcl \vec 0$.
\item<visible@1-> Center the data, $\E\lbrack\vec x\rbrack = \vec m = \frac{1}{p} \sum_{\alpha=1}^{p} \vec x^{(\alpha)}\eqexcl \vec 0$.
\slidesonly{ \only<2>{
\svspace{5mm}
\begin{center}
\includegraphics[width=5.5cm]{img/meme_center}
\includegraphics[width=5cm]{img/meme_center}
\end{center}
}
}
\visible<3->{
\item Let $\vec X$ be the $N \times p$ matrix of the centered data.
\item Measure
%\visible<3->
%{
\item<visible@3-> Let $\vec X$ be the $N \times p$ matrix of the centered data.
\item<visible@4-> Measure
\begin{itemize}
\item the variance of each component in $\vec x$.\\
\textbf{Not enough}: the variables in $\vec x$ could be correlated.
\item the covariances $C_{ij} \;\; \forall\,i,j = 1,\ldots,N$.
\end{itemize}
\item[$\Rightarrow$] Construct the covariance matrix $\vec C$.
\item<visible@4->[$\Rightarrow$] Construct the covariance matrix $\vec C$.
\begin{equation}
\vec C = \text{Cov}(\vec X) = \mathbf{\Sigma} = \E\lbrack\vec x~\vec x^\top\rbrack = \frac{1}{p}\vec X~\vec X^\top \in \R^{N \times N}
\end{equation}
\item \textbf{eigenvalue decomposition}
\item Order eigenvalues in \emph{descending} order. (Highest variance first). The ordered eigenvectors are the \emph{principle components} of the dataset $\vec X$.
\item Project $\vec x$ onto the first $M$ PCs.
}
\end{itemize}
\svspace{-5mm}
\item<visible@6-> \textbf{eigenvalue decomposition}
\item<visible@7-> Order the eigenvalues in \emph{descending} order (highest variance first). The ordered eigenvectors are the \emph{principal components} of the dataset $\vec X$.
\item<visible@8-> Project $\vec x$ onto the \textcolor{red}{first} $\color{red}M$ PCs.
%}
\end{enumerate}
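A compact numpy sketch of the steps above on made-up toy data (variable names and sizes are illustrative, not prescribed by the notes):

```python
# PCA procedure on toy data: center, covariance matrix, eigenvalue
# decomposition, sort descending, project onto the first M PCs.
import numpy as np

rng = np.random.default_rng(3)
N, p, M = 5, 1000, 2
X = rng.normal(size=(N, N)) @ rng.normal(size=(N, p))   # correlated toy data
X -= X.mean(axis=1, keepdims=True)                      # 1. center

C = (X @ X.T) / p                                       # covariance matrix, N x N

eigvals, eigvecs = np.linalg.eigh(C)                    # eigenvalue decomposition (C is symmetric)
order = np.argsort(eigvals)[::-1]                       # descending: highest variance first
eigvals, eigvecs = eigvals[order], eigvecs[:, order]    # columns = principal components

A = eigvecs[:, :M].T @ X                                # projections onto the first M PCs
```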


@@ -122,7 +125,7 @@ \subsubsection{Projection onto the PC space}

\subsection{Reconstruction error}

\begin{frame}\frametitle{How much better is this vs. simple truncation?}
\begin{frame}\frametitle{How much better is this vs. simple truncation of $\vec x$?}
%\newpage

\pause
Expand Down Expand Up @@ -153,11 +156,17 @@ \subsection{Reconstruction error}

\only<3,4>{
\begin{equation}
E = \frac{1}{p} \sum_{\alpha = 1}^{p} (\vec x - \widetilde{\vec x})^2 = \frac{1}{p} \sum_{\alpha = 1}^{p} \sum_{j = M+1}^{N} (a_j^{(\alpha)})^2
E = \frac{1}{p} \sum_{\alpha = 1}^{p} (\vec x^{(\alpha)} - \widetilde{\vec x}^{(\alpha)})^2 = \frac{1}{p} \sum_{\alpha = 1}^{p} \sum_{j = {\color{cyan}M+1}}^{{\color{cyan} N}} (a_j^{(\alpha)})^2
\end{equation}
The MSE is equal to the sum of variances of the final $\notesonly{N-(M+1)-1=}N-M$ components of the \emph{transformed} observations.\\
}
\only<4>{\notesonly{Since the PCs are ordered w.r.t to variance in descending order, }the variances of the last $N-M$ components of the transformed data are smallest.
\only<4>{
\svspace{5mm}

The MSE is equal to the sum of variances of the \textcolor{cyan}{last} $\notesonly{N-(M+1)+1=}{\color{cyan}N-M}$ components of the \emph{transformed} observations $\vec u=\vec M^\top \vec x$.\\

\svspace{5mm}

\notesonly{Since the PCs are ordered w.r.t.\ variance in descending order, }\notesonly{t}\slidesonly{T}he variances of the \textcolor{cyan}{last} $\color{cyan}N-M$ components of the transformed data are smallest.
The transformation is therefore optimal in the sense of minimal MSE.
}
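This can again be checked numerically: reconstructing from the first $M$ PCs leaves an MSE equal to the sum of the $N-M$ smallest eigenvalues of the covariance matrix. A sketch with made-up data:

```python
# Toy check: the MSE of the rank-M PCA reconstruction equals the sum of the
# N-M smallest eigenvalues, i.e. the variances of the discarded components.
import numpy as np

rng = np.random.default_rng(4)
N, p, M = 5, 20000, 2
X = rng.normal(size=(N, N)) @ rng.normal(size=(N, p))
X -= X.mean(axis=1, keepdims=True)

C = (X @ X.T) / p
eigvals, eigvecs = np.linalg.eigh(C)
order = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

A = eigvecs[:, :M].T @ X                       # components along the first M PCs
X_rec = eigvecs[:, :M] @ A                     # reconstruction from M PCs

mse = np.mean(np.sum((X - X_rec) ** 2, axis=0))
print(np.isclose(mse, eigvals[M:].sum()))      # True
```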

21 changes: 11 additions & 10 deletions notes/01_pca/2_apply-pca.tex
@@ -40,7 +40,7 @@ \subsection{Applying PCA}
\newpage


\subsubsection{A note on implementation:}
\subsubsection{A note on implementation}

\begin{frame}\frametitle{\subsubsecname}

@@ -88,8 +88,8 @@ \subsubsection{A note on implementation:}
\end{equation} (i.e. $\vec U$, $\vec V$ are orthogonal).
\begin{itemize}
\item<only@2> The columns of $\vec V$ (or rows of $\vec V^\top$) make up the eigenvectors of $\vec X^\top\vec X$.
\item<only@3,4,5> The columns of $\vec U$ make up the eigenvectors of $\vec X~\vec X^\top$.
\item<only@2-> The columns of $\vec V$ (or rows of $\vec V^\top$) make up the eigenvectors of $\vec X^\top\vec X$.
\item<only@3-5> The columns of $\vec U$ make up the eigenvectors of $\vec X~\vec X^\top$.
\item<only@4,5> $\vec S$ is a diagonal matrix. The singular values in $\vec S$ (the diagonal entries) are the \textbf{square roots} of the eigenvalues of $\vec X~\vec X^\top$ (equivalently, of $\vec X^\top\vec X$).
\end{itemize}
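These relations can be checked directly with numpy on a small made-up matrix:

```python
# Toy check of the relations above: the columns of U are eigenvectors of X X^T
# and the singular values are the square roots of the corresponding eigenvalues.
import numpy as np

rng = np.random.default_rng(5)
X = rng.normal(size=(4, 7))

U, s, Vt = np.linalg.svd(X, full_matrices=False)
eigvals = np.linalg.eigvalsh(X @ X.T)[::-1]      # eigenvalues, descending

print(np.allclose(s**2, eigvals))                # squared singular values = eigenvalues
print(np.allclose((X @ X.T) @ U, U * s**2))      # columns of U are eigenvectors of X X^T
```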
@@ -104,32 +104,33 @@ \subsubsection{A note on implementation:}
\begin{frame}\frametitle{\subsubsecname}
\question{When to use \textit{SVD} vs. \text{Eig}? \textbf{Not dogma}}
\question{When to use \textit{SVD} vs. \textit{eig}? \textbf{Not dogma}}
\begin{itemize}
\item Numerical instabilities in eigendecomposition of very large covariance matrix $\rightarrow$ \textit{SVD}
\item SVD not applicable to Kernel-PCA $\rightarrow$ \textit{Eig}
\item \textit{SVD} is not applicable to Kernel-PCA $\rightarrow$ \textit{eig}
\item Computational efficiency $\rightarrow$ \textit{SVD} (depends...?)
\slidesonly{
$$
\vec X \in \R^{N \times p} \quad \leadsto \quad \vec C = \frac{1}{p} \vec X~\vec X^\top
$$
}
\item Possible differences between PCA via SVD and PCA via eig: SVD may flip the sign.
\item Possible differences between PCA via \textit{SVD} and PCA via \textit{eig}: \textit{SVD} may flip the sign.
\notesonly{
The reason for the difference in sign between the eigenvector's you would get via \text{Eig} and the eigenvector you would get via \text{SVD}\footnote{Python's scikit-learn package uses SVD for its PCA implementation.}, is that SVD, in practice, may arbitrarily flip the direction of the eigenevectors.
The reason for the difference in sign between the eigenvectors you would get via \textit{eig} and the eigenvectors you would get via \textit{SVD}\footnote{Python's scikit-learn package uses SVD for its PCA implementation.} is that \textit{SVD}, in practice, may arbitrarily flip the direction of the eigenvectors.
Possible reasons for the sign ambiguity according to \ref{bro2008resolving}:
\begin{enumerate}
\item It's a mechanism in the SVD implementation (there's more than one SVD implementation) to mitigate numerical instabilities,
\item It's a mechanism in the \textit{SVD} implementation (there's more than one \textit{SVD} implementation) to mitigate numerical instabilities,
\item Due to centering of the data,
\item an SVD impelementation could have a random component.
\item An \textit{SVD} implementation could have a random component.
\end{enumerate}
In both approaches you should be able to get the same reconstructions from both of them, i.e. the sign of the components along the flipped PC will also be flipped. Depending on the application, it may be necessary to "detect" if SVD flipped the direction or not. Numerical methods to detect this exist. A reason to care about what the sign should be is when you want to interpret what an eigenvector represents to possibly assign meaning to the component along this PC: Is the component positive or negative along the PC or does this point have a larger component along the PC than another point. The classical example of "interpreting" PCs would be "Eigenfaces". PCA is applied on images of faces. It is similar to what we ask you to do in Exercise 1.4. The Eigenvectors extracted from image data can be represented as images themselves. If SVD gives you "flipped" eigenvectors, visualizing them would give you the "negative" of that image. It's not as intuitive to look at negatives, but a person will still be able to make sense of them. However, imagine trying to interpret the eigenvector of other high-dimensional data that isn't as pretty as pictures. Knowing that SVD could have flipped the sign could give you a hard time trying to make sense of what the eigenvector represents.
Both approaches should give you the same reconstructions, i.e. the sign of the components along a flipped PC will also be flipped. Depending on the application, it may be necessary to ``detect'' whether \textit{SVD} flipped the direction or not. Numerical methods to detect this exist. A reason to care about the sign is when you want to interpret what an eigenvector represents and possibly assign meaning to the component along this PC: ``\textit{Is the component positive or negative along the PC, or does this point have a larger component along the PC than another point?}''
The classical example of ``interpreting'' PCs is ``Eigenfaces'': PCA is applied to images of faces, similar to what we ask you to do in Exercise 1.4. The eigenvectors extracted from image data can be represented as images themselves. If \textit{SVD} gives you ``flipped'' eigenvectors, visualizing them would give you the ``negative'' of that image. It is not as intuitive to look at negatives, but a person will still be able to make sense of them. However, imagine trying to interpret the eigenvector of other high-dimensional data that is not as pretty as pictures. Knowing that \textit{SVD} could have flipped the sign could make it hard to make sense of what the eigenvector represents.
}
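A short numpy sketch of the sign issue described above, on made-up data: the leading PCs from eig and from SVD may differ by a sign, yet the reconstructions agree.

```python
# Sketch: PCA via eig vs. PCA via SVD on toy data.  The leading eigenvectors
# may differ by a sign, but reconstructions from either set of PCs agree.
import numpy as np

rng = np.random.default_rng(6)
X = rng.normal(size=(3, 3)) @ rng.normal(size=(3, 500))
X -= X.mean(axis=1, keepdims=True)

# via eigendecomposition of the covariance matrix
C = (X @ X.T) / X.shape[1]
eigvals, V_eig = np.linalg.eigh(C)
V_eig = V_eig[:, np.argsort(eigvals)[::-1]]

# via SVD of the centered data matrix
U_svd, s, _ = np.linalg.svd(X, full_matrices=False)

print(np.allclose(np.abs(V_eig), np.abs(U_svd)))      # same PCs up to sign

M = 2
rec_eig = V_eig[:, :M] @ (V_eig[:, :M].T @ X)         # reconstruction from eig PCs
rec_svd = U_svd[:, :M] @ (U_svd[:, :M].T @ X)         # reconstruction from SVD PCs
print(np.allclose(rec_eig, rec_svd))                  # reconstructions agree
```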
Binary file added notes/01_pca/img/meme_sort.jpg
