Commit a34494d: soft kmeans
kashefy committed Jun 10, 2020
1 parent a9d35ee commit a34494d
Showing 11 changed files with 242 additions and 57 deletions.
26 changes: 16 additions & 10 deletions notes/08_clustering/1_kmeans.tex
@@ -11,27 +11,27 @@ \section{K-means}
\end{frame}
}

-\begin{frame}{Finding structure in the data}

\begin{frame}
K-means is simple.
\end{frame}

-\pause
\begin{frame}{Finding structure in the data}

\begin{center}
\slidesonly{
-\includegraphics<2>[width=6cm]{img/clustering_no-color_no-labels_2p}
-\includegraphics<3>[width=6cm]{img/clustering_no-color_no-labels_4p}
-\includegraphics<4>[width=6cm]{img/clustering_no-color_no-labels}
\includegraphics<1>[width=6cm]{img/clustering_no-color_no-labels_2p}
\includegraphics<2>[width=6cm]{img/clustering_no-color_no-labels_4p}
\includegraphics<3>[width=6cm]{img/clustering_no-color_no-labels}
}
-\includegraphics<5->[width=6cm]{img/clustering_color}
\includegraphics<4->[width=6cm]{img/clustering_color}
\notesonly{
\captionof{figure}{Example clustering of 2D points into M=2 clusters}
}
\end{center}


\slidesonly{
-\visible<6>{
\visible<5>{
\begin{itemize}
\item Proximity must count for something
\item Points that fall within a region form a cluster
@@ -58,8 +58,14 @@ \subsubsection{In plain English}

\question{What does clustering give us?}

-- Eventually, instead of describing each point by its absolute location, we will be able to describe it by the cluster it is assigned to.
-We will also be able to describe the entire dataset by the partitions we've found that separate the clusters.
\begin{itemize}
\item Instead of describing each point by its absolute location, we will be able to describe it by the cluster it is assigned to.
\begin{equation}
\text{point}~\vec x \in \R^N \quad \longrightarrow \quad~\text{cluster index}~q \in \N
\end{equation}
\item We will be able to describe the entire dataset by the partitions we've found that separate the clusters.
\item We'll draw relations between simple clustering and other algorithms (embedding algorithms, density estimation).
\end{itemize}

\end{frame}
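The point-to-cluster-index mapping above can be made concrete. The following is a minimal hard K-means sketch (our illustration, not part of the notes); it assumes numpy, and the names kmeans, W, q are ours:

\begin{verbatim}
import numpy as np

def kmeans(X, M, n_iter=100, seed=0):
    """Map each point x in R^N to a cluster index q in {0, ..., M-1}."""
    X = np.asarray(X, dtype=float)
    rng = np.random.default_rng(seed)
    W = X[rng.choice(len(X), size=M, replace=False)].copy()  # prototypes
    for _ in range(n_iter):
        # squared Euclidean distance of every point to every prototype
        d = ((X[:, None, :] - W[None, :, :]) ** 2).sum(axis=-1)
        q = d.argmin(axis=1)              # hard assignment: cluster index
        for k in range(M):                # move prototypes to cluster means
            if np.any(q == k):
                W[k] = X[q == k].mean(axis=0)
    return q, W
\end{verbatim}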

12 changes: 7 additions & 5 deletions notes/08_clustering/4_pairwise.tex
@@ -5,14 +5,16 @@ \section{Pairwise clustering}
\begin{center} \huge
\secname
\end{center}
-\begin{center}
-Proximity to $\underset{\text{other points}}{\cancel{\text{prototype}}}$
-\end{center}
\begin{itemize}
\item Proximity to $\underset{\text{other points}}{\cancel{\text{prototype}}}$
\item A proper application of mean-field annealing
\item A building block for ``soft'' K-means
\end{itemize}
\end{frame}
}


-\begin{frame}{Notions of distance}
\begin{frame}{Notions of proximity}

\textbf{Recall} that K-means clusters points based on their proximity to some prototype.\\

Expand All @@ -33,7 +35,7 @@ \section{Pairwise clustering}

\end{frame}

-\begin{frame}{Notions of distance: pairwise distance}
\begin{frame}{Notions of proximity: pairwise distance}

\begin{center}
\begin{minipage}{0.32\textwidth}
63 changes: 44 additions & 19 deletions notes/08_clustering/5_pairwise_data.tex
@@ -1,4 +1,4 @@
-\subsection{The data}
\subsection{Data representation}

\begin{frame}{\secname:~\subsecname}

@@ -67,12 +67,43 @@ \subsection{The data}
\begin{frame}
\slidesonly{\frametitle{Other sources for pairwise distances}}

-A further use-case is one where pairwise distance is not measured explicitly. The data representation can already be in a pairwise fashion as a result of an algorithm. Example: Sequence alignment procedures and graph-similarity measures produce pairwise representations.\\
A further use case is one where the pairwise distance is not measured explicitly: the data representation can already be pairwise as the output of an algorithm.\\
Example: Sequence alignment procedures and graph-similarity measures produce pairwise representations.\\

We will opt for the \emph{squared Euclidean distance} for our pairwise clustering algorithm.

\end{frame}
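As a concrete illustration (ours, not from the notes) of building such a pairwise representation directly from vectorial data, a squared-Euclidean distance matrix can be computed as:

\begin{verbatim}
import numpy as np

def sq_euclidean_pdist(X):
    """d[a, b] = ||x_a - x_b||^2 via ||x_a||^2 + ||x_b||^2 - 2 x_a.x_b."""
    X = np.asarray(X, dtype=float)
    sq = (X ** 2).sum(axis=1)
    D = sq[:, None] + sq[None, :] - 2.0 * X @ X.T
    return np.maximum(D, 0.0)  # clip tiny negatives from round-off
\end{verbatim}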

\begin{frame}{Possibilities: transformed distances}

\begin{center}
\begin{minipage}{0.99\textwidth}
\begin{center}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_data}
\end{minipage}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_pdist}
\end{minipage}
\end{center}
\end{minipage}
\end{center}

\begin{center}
\begin{minipage}{0.99\textwidth}
\begin{center}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_data_polar}
\end{minipage}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_pdist_polar}
\end{minipage}
\end{center}
\end{minipage}
\end{center}

\end{frame}
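Reading the figure pairs above (our interpretation of the img/m3_circ_* plots): data on concentric rings give a raw pairwise distance matrix with no block structure, but distances computed after a transformation to polar coordinates let the radius dominate, so the rings separate. A sketch, reusing the hypothetical sq_euclidean_pdist helper from above:

\begin{verbatim}
import numpy as np

def to_polar(X):
    """(x, y) -> (r, theta); assumes 2D input."""
    r = np.hypot(X[:, 0], X[:, 1])
    theta = np.arctan2(X[:, 1], X[:, 0])
    return np.column_stack([r, theta])

# D_raw   = sq_euclidean_pdist(X)            # rings are interleaved
# D_polar = sq_euclidean_pdist(to_polar(X))  # radius separates the rings
\end{verbatim}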

\subsection{Cost}

\begin{frame}
@@ -119,7 +150,18 @@ \subsection{Relation of pairwise clustering to K-means}

\begin{frame}\frametitle{\subsecname}

\notesonly{
When we choose the squared Euclidean distance as the distance measure for pairwise clustering, we can show that this choice lets pairwise clustering effectively find the same solution as K-means clustering.
}

\begin{center}
\resizebox{.7\textwidth}{!}{%
\begin{tabular}{cllll}
Pairwise Clustering & \multicolumn{1}{c}{$\Longrightarrow$} & \multicolumn{1}{c}{\begin{tabular}[c]{@{}c@{}}Some\\ clustering solution\end{tabular}} & \multicolumn{1}{c}{$\Longleftarrow$} & K-means \\
\begin{tabular}[c]{@{}c@{}}using\\ $d_{\alpha\alpha'}$ := sq. Eucl. distance\end{tabular} & & & &
\end{tabular}%
}
\end{center}

\end{frame}
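The claimed equivalence rests on a standard identity (our addition, with $\vec w_q$ the mean of cluster $C_q$ containing $N_q$ points):

\begin{equation}
\sum_{\alpha, \alpha' \in C_q} \big\| \vec x^{(\alpha)} - \vec x^{(\alpha')} \big\|^2
= 2 N_q \sum_{\alpha \in C_q} \big\| \vec x^{(\alpha)} - \vec w_q \big\|^2
\end{equation}

so within-cluster pairwise costs and the K-means scatter agree up to a normalization factor. A quick numeric check (ours):

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
C = rng.normal(size=(7, 3))          # points of one cluster C_q
D = ((C[:, None, :] - C[None, :, :]) ** 2).sum(axis=-1)
lhs = D.sum()                        # sum over all ordered pairs
rhs = 2 * len(C) * ((C - C.mean(axis=0)) ** 2).sum()
assert np.isclose(lhs, rhs)
\end{verbatim}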

@@ -327,20 +369,3 @@ \subsection{Relation of pairwise clustering to K-means}

\end{frame}

-\subsection{The mean-field approximation for pairwise clustering}

-\begin{frame}
-Nomenclature ($\otimes$ $\rightarrow$ \emph{set-product, Cartesian product})\\

-\begin{tabular}{r l p{9cm}}
-$\big\{ \vec{m}^{(\alpha)} \big\}$: & & set of all $M$-dimensional binary vectors $\big( m_1^{(\alpha)}, m_2^{(\alpha)}, \ldots,
-m_M^{(\alpha)} \big)^\top$ which fulfill the normalization condition (exactly one element equals 1). \\\\
-$\mathscr{M}$: & & $\big\{ \vec{m}^{(1)} \big\} \otimes \big\{ \vec{m}^{(2)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(p)} \big\}$\\
-& & set-product (Cartesian product) between all possible binary assignment variables i.e.\ all possible valid assignments for the full dataset\\\\
-$\mathscr{M}_{\gamma}$:& & $\big\{ \vec{m}^{(1)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(\gamma - 1)} \big\} \otimes
-\big\{ \vec{m}^{(\gamma + 1)} \big\} \otimes \ldots \otimes
-\big\{ \vec{m}^{(p)} \big\}$\\
-& &\ set of all possible assignments for all data points \\& & \hspace{0.03cm} except $\gamma$
-\end{tabular}

-\end{frame}
121 changes: 101 additions & 20 deletions notes/08_clustering/6_pairwsie_meanfield.tex
@@ -1,43 +1,46 @@
\subsection{The mean-field approximation for pairwise clustering}

\begin{frame}[t]
-\slidesonly{\frametitle{The mean-field approximation for pairwise clustering}}
\slidesonly{\frametitle{\subsecname}}

\svspace{-3mm}

\begin{block}{assignment noise $\rightarrow$ Gibbs distribution}
-$$
\begin{equation}
P_{ \big( \big\{ m_q^{(\alpha)} \big\} \big) }
= \frac{1}{Z_p} \exp \Big\{ -\beta
\overbrace{
E_{\big[ \big\{ m_q^{(\alpha)} \big\} \big]}
}^{= \, E_p}
\Big\}
-$$
\end{equation}
where
-$$
\begin{equation}
Z_p = \sum\limits_{\mathscr{M}} \exp \Big\{ -\beta
E_p
\Big\}
-$$
\end{equation}
\end{block}
\notesonly{
This is approximated by the mean-fields:
}
\begin{block}{factorizing distribution}
-$$
\begin{equation}
Q_{ \big[ \big\{ m_q^{(\alpha)} \big\} \big] }
= \frac{1}{Z_Q} \exp \Big\{ -\beta \sum\limits_{q, \gamma}
m_q^{(\gamma)} \underbrace{ e_q^{(\gamma)} }_{
\text{{\tiny mean-fields}} } \Big\}
-$$
\end{equation}
where:
-$$
\begin{equation}
Z_Q = \sum\limits_{\mathscr{M}} \exp \Big\{ -\beta \sum\limits_{q,
\gamma} m_q^{(\gamma)} e_q^{(\gamma)} \Big\}
-$$
\end{equation}
\end{block}
\end{frame}

\subsubsection{Calculation of the moments}

\begin{frame}\frametitle{Recap calculation of the moments (general mean-field case)}
The factorization of the distribution $Q$ simplifies the calculation of the moments.
This is based on the individual state variables being \emph{uncorrelated}.
@@ -49,6 +52,9 @@
\end{equation}

\end{frame}

\mode<article>{

\begin{frame}
\slidesonly{
\frametitle{Factorizing moments (general mean-field case)}
@@ -82,22 +88,48 @@
\end{frame}

\begin{frame}
-\slidesonly{\frametitle{Calculation of moments}}
-$$
\notesonly{Some nomenclature, before we demonstrate the same factorization for the assignment variables in pairwise clustering.}
\slidesonly{ Nomenclature }($\otimes$ $\rightarrow$ \emph{set-product, Cartesian product})\\

\begin{tabular}{r l p{9cm}}
$\big\{ \vec{m}^{(\alpha)} \big\}$: & & set of all $M$-dimensional binary vectors $\big( m_1^{(\alpha)}, m_2^{(\alpha)}, \ldots,
m_M^{(\alpha)} \big)^\top$ which fulfill the normalization condition (exactly one element equals 1). \\\\
$\mathscr{M}$: & & $\big\{ \vec{m}^{(1)} \big\} \otimes \big\{ \vec{m}^{(2)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(p)} \big\}$\\
& & set-product (Cartesian product) between all possible binary assignment variables i.e.\ all possible valid assignments for the full dataset\\\\
$\mathscr{M}_{\gamma}$:& & $\big\{ \vec{m}^{(1)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(\gamma - 1)} \big\} \otimes
\big\{ \vec{m}^{(\gamma + 1)} \big\} \otimes \ldots \otimes
\big\{ \vec{m}^{(p)} \big\}$\\
& &\ set of all possible assignments for all data points \\& & \hspace{0.03cm} except $\gamma$
\end{tabular}

\end{frame}

}
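The nomenclature can be made tangible by brute-force enumeration (illustration ours; feasible only for tiny $p$, since $|\mathscr{M}| = M^p$):

\begin{verbatim}
from itertools import product

M, p = 2, 3
# {m^(alpha)}: all M-dimensional one-hot assignment vectors
one_hot = [tuple(int(i == q) for i in range(M)) for q in range(M)]
# script M: Cartesian product over all p points -> all valid assignments
script_M = list(product(one_hot, repeat=p))
assert len(script_M) == M ** p
# script M_gamma: assignments of all points except one point gamma
script_M_gamma = list(product(one_hot, repeat=p - 1))
\end{verbatim}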

\begin{frame}
\slidesonly{\frametitle{Calculation of moments for the assignment variables}}
\begin{equation}
\begin{array}{lll}
\big< m_q^{(\gamma)} \big>_Q
& = \frac{1}{Z_Q} \sum\limits_{\mathscr{M}} m_q^{(\gamma)}
\exp \Big\{ -\beta \sum\limits_{r, \delta}
m_{r}^{(\delta)} e_{r}^{(\delta)} \Big\}
\end{array}
-$$
\end{equation}

\slidesonly{
\begin{itemize}
\item The factorization of $Q$ simplifies the calculation of the moments.
\end{itemize}

Here,\\
$\mathscr{M}$ is the set-product (Cartesian product) between \textbf{all possible} binary assignment variables i.e.\ all possible valid assignments for the full dataset.
}

\end{frame}
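A brute-force check (ours) that the factorizing $Q$ indeed yields per-point moments: enumerating $\mathscr{M}$ for a tiny problem reproduces a softmax over each point's mean-fields:

\begin{verbatim}
from itertools import product
import numpy as np

p, M, beta = 3, 2, 1.7
rng = np.random.default_rng(1)
e = rng.random((p, M))                      # mean-fields e_q^(gamma)
one_hot = [np.eye(M, dtype=int)[q] for q in range(M)]
states = list(product(one_hot, repeat=p))   # the set script M
w = np.array([np.exp(-beta * sum(m @ e[g] for g, m in enumerate(s)))
              for s in states])             # unnormalized Q
moment = (w * [s[0][0] for s in states]).sum() / w.sum()  # <m_1^(1)>_Q
softmax = np.exp(-beta * e[0, 0]) / np.exp(-beta * e[0]).sum()
assert np.isclose(moment, softmax)
\end{verbatim}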

\mode<article>{

\begin{frame}
\slidesonly{\frametitle{Calculation of moments (derivation)}
The factorization in \eqref{eq:factorizingMoments} regarding valid assignments $\big\{ \vec{m}^{(\gamma)} \big\}$ for observation $\gamma$ and the rest of the variables excluding $\gamma$ (i.e. $\mathscr{M}_{\gamma}$) is valid for any functions $f,g$:
@@ -153,6 +185,8 @@
\end{equation}
\end{frame}

}

\begin{frame}
\frametitle{Solution for calculating the moments}
$$
@@ -174,7 +208,7 @@
\end{block}
\end{frame}
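In code, the closed form from the algorithm below is a per-point softmax over the mean-fields (sketch ours; E is a $p \times M$ array of $e_q^{(\alpha)}$, shifted for numerical stability):

\begin{verbatim}
import numpy as np

def moments(E, beta):
    """<m_q^(alpha)>_Q = exp(-beta e_q^(a)) / sum_r exp(-beta e_r^(a))."""
    A = -beta * (E - E.min(axis=1, keepdims=True))  # max of A is 0
    P = np.exp(A)
    return P / P.sum(axis=1, keepdims=True)         # rows sum to 1
\end{verbatim}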

-\subsection{Soft clustering}
\subsubsection{Soft clustering}

\begin{frame}{\subsecname}

@@ -206,6 +240,8 @@ \subsection{Soft clustering}
\end{figure}
\end{frame}

\subsubsection{Determining the mean-field parameters}

\notesonly{

Going back to mean-field approximation:
@@ -215,26 +251,71 @@

\begin{frame}[t] \slidesonly{\frametitle{Minimization of the KL-divergence}}
\begin{block}{Mean Field equation (cf.\ the section on Stochastic Optimization for how to arrive at this result)}
-$$
\begin{equation}
\fbox{$ \frac{\partial}{\partial e_l}\big<E_p\big>_Q
- \sum\limits_k e_k \frac{\partial}{\partial e_l} \big<s_k\big>_Q = 0
$}
-$$
\end{equation}
\end{block}
-$$ \frac{\partial \big< E_p \big>_Q}{\partial e_q^{(\alpha)}}
\begin{equation} \frac{\partial \big< E_p \big>_Q}{\partial e_q^{(\alpha)}}
- \sum\limits_{r, \gamma} \frac{
\overbrace{ \partial \big< m_r^{(\gamma)} \big>_Q }^{
\substack{ \text{depends only on} \\
\text{data point } \gamma }}}{
\partial e_q^{(\alpha)}}
e_r^{(\gamma)} \eqexcl 0
-$$
-$$
\end{equation}
\begin{equation}
\frac{\partial \big< E_p \big>_Q}{\partial e_q^{(\alpha)}}
- \sum\limits_r \frac{\partial \big< m_r^{(\alpha)} \big>_Q}{
\partial e_q^{(\alpha)}}
e_r^{(\alpha)} \eqexcl 0
-$$
\end{equation}

\end{frame}

%--------------------------------------------------------------------------

-switch to lecture slides....
%--------------------------------------------------------------------------
\begin{frame}[shrink=17] \frametitle{Mean-field annealing for pairwise clustering}
\begin{figure}[!th]
\footnotesize
\removelatexerror
\begin{algorithm}[H]
\DontPrintSemicolon
\textbf{Initialization:}\;
- (max.) number $M$ of partitions, initial ($\beta_0$) and final ($\beta_f$) values of the noise parameter, annealing factor $\eta$, convergence criterion $\theta$\;
- initialize mean-fields $e_q^{(\alpha)}$ with random numbers $\in [0, 1]$\;

- $\beta \leftarrow \beta_0$\;
\While(annealing){$\beta < \beta_f$}{
\Repeat ( EM $\text{(fixed point iteration)}$){
$\big| \big( e_q^{(\alpha)} \big)_{\mathrm{new}}
- \big( e_q^{(\alpha)} \big)_{\mathrm{old}} \big| < \theta
\hspace{0.3cm}\forall q, \alpha$}{
\hspace{-0.2cm}compute assignment probabilities:
$ \big< m_q^{(\alpha)} \big>_Q = \frac{ \exp \big\{ -\beta \big(e_q^{(\alpha)}
\big)_{\mathrm{old}}\big\} }{ \sum\limits_r \exp \big\{ -\beta \big(
e_r^{(\alpha)} \big)_{\mathrm{old}} \big\} } \hspace{0.3cm} \forall
q, \alpha
$\;
\hspace{-0.2cm}compute new mean-fields:
\hspace{-0.6cm}
$$\hspace{-0.2cm}\big( e_q^{(\alpha)} \big)_{\mathrm{new}} = \frac{2}{p} \frac{1}{
\sum\limits_{\gamma} \big< m_q^{(\gamma)} \big>_Q } \sum\limits_{\delta}
\big<m_q^{(\delta)} \big>_Q \cdot \bigg\{ d_{\delta \alpha} - \frac{1}{2} \frac{1}{
\sum\limits_{\gamma} \big< m_q^{(\gamma)} \big>_Q }
\sum\limits_{\varepsilon} \big< m_q^{(\varepsilon)}
\big>_Q d_{\varepsilon \delta} \bigg\} \hspace{0.1cm}\forall q, \alpha$$

}
$\beta \leftarrow \eta \cdot \beta$\;
}
\label{alg:meanFieldClustering}
\caption{Mean-field annealing for pairwise clustering}
\end{algorithm}
\end{figure}
\end{frame}
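A direct numpy transcription of Alg.~\ref{alg:meanFieldClustering} (sketch ours, untested against any reference implementation; the max_em cap and the small epsilon guarding empty clusters are our additions; D must be a symmetric $p \times p$ matrix of pairwise distances):

\begin{verbatim}
import numpy as np

def mf_pairwise_clustering(D, M, beta0=0.01, beta_f=100.0, eta=1.1,
                           theta=1e-6, seed=0, max_em=500):
    p = D.shape[0]
    rng = np.random.default_rng(seed)
    E = rng.random((p, M))                 # mean-fields e_q^(alpha) in [0, 1]
    beta = beta0
    while beta < beta_f:                   # annealing
        for _ in range(max_em):            # EM (fixed point iteration)
            # assignment probabilities <m_q^(alpha)>_Q: softmax over clusters
            A = -beta * (E - E.min(axis=1, keepdims=True))
            m = np.exp(A)
            m /= m.sum(axis=1, keepdims=True)          # shape (p, M)
            norm = m.sum(axis=0) + 1e-12               # sum_gamma <m_q^(gamma)>
            # S1[q, a] = sum_delta <m_q^(delta)> d_{delta a}
            S1 = m.T @ D
            # c[q, delta] = (1/norm_q) sum_eps <m_q^(eps)> d_{eps delta}
            c = S1 / norm[:, None]
            # S2[q] = (1/2) sum_delta <m_q^(delta)> c[q, delta]
            S2 = 0.5 * np.einsum('qd,dq->q', c, m)
            E_new = (2.0 / p) * (S1 - S2[:, None]).T / norm[None, :]
            converged = np.max(np.abs(E_new - E)) < theta
            E = E_new
            if converged:
                break
        beta *= eta                        # anneal the noise parameter
    return m                               # soft assignments <m_q^(alpha)>_Q
\end{verbatim}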

%--------------------------------------------------------------------------

