Commit a34494d: soft kmeans
kashefy committed Jun 10, 2020
1 parent a9d35ee commit a34494d
Showing 11 changed files with 242 additions and 57 deletions.
26 changes: 16 additions & 10 deletions notes/08_clustering/1_kmeans.tex
@@ -11,27 +11,27 @@ \section{K-means}
\end{frame}
}

-\begin{frame}{Finding structure in the data}

\begin{frame}
K-means is simple.
\end{frame}

-\pause
\begin{frame}{Finding structure in the data}

\begin{center}
\slidesonly{
-\includegraphics<2>[width=6cm]{img/clustering_no-color_no-labels_2p}
-\includegraphics<3>[width=6cm]{img/clustering_no-color_no-labels_4p}
-\includegraphics<4>[width=6cm]{img/clustering_no-color_no-labels}
\includegraphics<1>[width=6cm]{img/clustering_no-color_no-labels_2p}
\includegraphics<2>[width=6cm]{img/clustering_no-color_no-labels_4p}
\includegraphics<3>[width=6cm]{img/clustering_no-color_no-labels}
}
-\includegraphics<5->[width=6cm]{img/clustering_color}
\includegraphics<4->[width=6cm]{img/clustering_color}
\notesonly{
\captionof{figure}{Example clustering of 2D points into M=2 clusters}
}
\end{center}


\slidesonly{
-\visible<6>{
\visible<5>{
\begin{itemize}
\item Proximity must count for something
\item Points that fall within a region form a cluster
@@ -58,8 +58,14 @@ \subsubsection{In plain English}

\question{What does clustering give us?}

-- Eventually, instead of describing each point by its absolute location, we will be able to describe it by the cluster it is assigned to.
-We will also be able to describe the entire dataset by the partitions we've found that separate the clusters.
\begin{itemize}
\item Instead of describing each point by its absolute location, we will be able to describe it by the cluster it is assigned to.
\begin{equation}
\text{point}~\vec x \in \R^N \quad \longrightarrow \quad~\text{cluster index}~q \in \N
\end{equation}
\item We will be able to describe the entire dataset by the partitions we've found that separate the clusters.
\item We'll draw relations between simple clustering and other algorithms (embedding algorithms, density estimation).
\end{itemize}

\end{frame}
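The point-to-cluster-index mapping above can be made concrete. The following is a minimal hard K-means sketch (our illustration, not part of the notes); it assumes numpy, and the names kmeans, W, q are ours:

\begin{verbatim}
import numpy as np

def kmeans(X, M, n_iter=100, seed=0):
    """Map each point x in R^N to a cluster index q in {0, ..., M-1}."""
    X = np.asarray(X, dtype=float)
    rng = np.random.default_rng(seed)
    W = X[rng.choice(len(X), size=M, replace=False)].copy()  # prototypes
    for _ in range(n_iter):
        # squared Euclidean distance of every point to every prototype
        d = ((X[:, None, :] - W[None, :, :]) ** 2).sum(axis=-1)
        q = d.argmin(axis=1)              # hard assignment: cluster index
        for k in range(M):                # move prototypes to cluster means
            if np.any(q == k):
                W[k] = X[q == k].mean(axis=0)
    return q, W
\end{verbatim}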

12 changes: 7 additions & 5 deletions notes/08_clustering/4_pairwise.tex
@@ -5,14 +5,16 @@ \section{Pairwise clustering}
\begin{center} \huge
\secname
\end{center}
-\begin{center}
-Proximity to $\underset{\text{other points}}{\cancel{\text{prototype}}}$
-\end{center}
\begin{itemize}
\item Proximity to $\underset{\text{other points}}{\cancel{\text{prototype}}}$
\item A proper application of mean-field annealing
\item A building block for ``soft'' K-means
\end{itemize}
\end{frame}
}


-\begin{frame}{Notions of distance}
\begin{frame}{Notions of proximity}

\textbf{Recall} that K-means clusters points based on their proximity to some prototype.\\

Expand All @@ -33,7 +35,7 @@ \section{Pairwise clustering}

\end{frame}

-\begin{frame}{Notions of distance: pairwise distance}
\begin{frame}{Notions of proximity: pairwise distance}

\begin{center}
\begin{minipage}{0.32\textwidth}
63 changes: 44 additions & 19 deletions notes/08_clustering/5_pairwise_data.tex
@@ -1,4 +1,4 @@
-\subsection{The data}
\subsection{Data representation}

\begin{frame}{\secname:~\subsecname}

@@ -67,12 +67,43 @@ \subsection{The data}
\begin{frame}
\slidesonly{\frametitle{Other sources for pairwise distances}}

-A further use-case is one where pairwise distance is not measured explicitly. The data representation can already be in a pairwise fashion as a result of an algorithm. Example: Sequence alignment procedures and graph-similarity measures produce pairwise representations.\\
A further use case is one where the pairwise distance is not measured explicitly: the data representation can already be pairwise as the output of an algorithm.\\
Example: Sequence alignment procedures and graph-similarity measures produce pairwise representations.\\

We will opt for the \emph{squared Euclidean distance} for our pairwise clustering algorithm.

\end{frame}
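As a concrete illustration (ours, not from the notes) of building such a pairwise representation directly from vectorial data, a squared-Euclidean distance matrix can be computed as:

\begin{verbatim}
import numpy as np

def sq_euclidean_pdist(X):
    """d[a, b] = ||x_a - x_b||^2 via ||x_a||^2 + ||x_b||^2 - 2 x_a.x_b."""
    X = np.asarray(X, dtype=float)
    sq = (X ** 2).sum(axis=1)
    D = sq[:, None] + sq[None, :] - 2.0 * X @ X.T
    return np.maximum(D, 0.0)  # clip tiny negatives from round-off
\end{verbatim}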

\begin{frame}{Possibilities: transformed distances}

\begin{center}
\begin{minipage}{0.99\textwidth}
\begin{center}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_data}
\end{minipage}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_pdist}
\end{minipage}
\end{center}
\end{minipage}
\end{center}

\begin{center}
\begin{minipage}{0.99\textwidth}
\begin{center}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_data_polar}
\end{minipage}
\begin{minipage}{0.35\textwidth}
\includegraphics[width=0.9\textwidth]{img/m3_circ_pdist_polar}
\end{minipage}
\end{center}
\end{minipage}
\end{center}

\end{frame}
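Reading the figure pairs above (our interpretation of the img/m3_circ_* plots): data on concentric rings give a raw pairwise distance matrix with no block structure, but distances computed after a transformation to polar coordinates let the radius dominate, so the rings separate. A sketch, reusing the hypothetical sq_euclidean_pdist helper from above:

\begin{verbatim}
import numpy as np

def to_polar(X):
    """(x, y) -> (r, theta); assumes 2D input."""
    r = np.hypot(X[:, 0], X[:, 1])
    theta = np.arctan2(X[:, 1], X[:, 0])
    return np.column_stack([r, theta])

# D_raw   = sq_euclidean_pdist(X)            # rings are interleaved
# D_polar = sq_euclidean_pdist(to_polar(X))  # radius separates the rings
\end{verbatim}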

\subsection{Cost}

\begin{frame}
@@ -119,7 +150,18 @@ \subsection{Relation of pairwise clustering to K-means}

\begin{frame}\frametitle{\subsecname}

\notesonly{
When we choose the squared Euclidean distance as the distance measure for pairwise clustering, we can show that this choice lets pairwise clustering effectively find the same solution as K-means clustering.
}

\begin{center}
\resizebox{.7\textwidth}{!}{%
\begin{tabular}{cllll}
Pairwise Clustering & \multicolumn{1}{c}{$\Longrightarrow$} & \multicolumn{1}{c}{\begin{tabular}[c]{@{}c@{}}Some\\ clustering solution\end{tabular}} & \multicolumn{1}{c}{$\Longleftarrow$} & K-means \\
\begin{tabular}[c]{@{}c@{}}using\\ $d_{\alpha\alpha'}$ := sq. Eucl. distance\end{tabular} & & & &
\end{tabular}%
}
\end{center}

\end{frame}
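The claimed equivalence rests on a standard identity (our addition, with $\vec w_q$ the mean of cluster $C_q$ containing $N_q$ points):

\begin{equation}
\sum_{\alpha, \alpha' \in C_q} \big\| \vec x^{(\alpha)} - \vec x^{(\alpha')} \big\|^2
= 2 N_q \sum_{\alpha \in C_q} \big\| \vec x^{(\alpha)} - \vec w_q \big\|^2
\end{equation}

so within-cluster pairwise costs and the K-means scatter agree up to a normalization factor. A quick numeric check (ours):

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
C = rng.normal(size=(7, 3))          # points of one cluster C_q
D = ((C[:, None, :] - C[None, :, :]) ** 2).sum(axis=-1)
lhs = D.sum()                        # sum over all ordered pairs
rhs = 2 * len(C) * ((C - C.mean(axis=0)) ** 2).sum()
assert np.isclose(lhs, rhs)
\end{verbatim}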

@@ -327,20 +369,3 @@ \subsection{Relation of pairwise clustering to K-means}

\end{frame}

-\subsection{The mean-field approximation for pairwise clustering}

-\begin{frame}
-Nomenclature ($\otimes$ $\rightarrow$ \emph{set-product, Cartesian product})\\

-\begin{tabular}{r l p{9cm}}
-$\big\{ \vec{m}^{(\alpha)} \big\}$: & & set of all $M$-dimensional binary vectors $\big( m_1^{(\alpha)}, m_2^{(\alpha)}, \ldots,
-m_M^{(\alpha)} \big)^\top$ which fulfill the normalization condition (exactly one element equals 1). \\\\
-$\mathscr{M}$: & & $\big\{ \vec{m}^{(1)} \big\} \otimes \big\{ \vec{m}^{(2)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(p)} \big\}$\\
-& & set-product (Cartesian product) between all possible binary assignment variables i.e.\ all possible valid assignments for the full dataset\\\\
-$\mathscr{M}_{\gamma}$:& & $\big\{ \vec{m}^{(1)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(\gamma - 1)} \big\} \otimes
-\big\{ \vec{m}^{(\gamma + 1)} \big\} \otimes \ldots \otimes
-\big\{ \vec{m}^{(p)} \big\}$\\
-& &\ set of all possible assignments for all data points \\& & \hspace{0.03cm} except $\gamma$
-\end{tabular}

-\end{frame}
121 changes: 101 additions & 20 deletions notes/08_clustering/6_pairwsie_meanfield.tex
@@ -1,43 +1,46 @@
\subsection{The mean-field approximation for pairwise clustering}

\begin{frame}[t]
-\slidesonly{\frametitle{The mean-field approximation for pairwise clustering}}
\slidesonly{\frametitle{\subsecname}}

\svspace{-3mm}

\begin{block}{assignment noise $\rightarrow$ Gibbs distribution}
-$$
\begin{equation}
P_{ \big( \big\{ m_q^{(\alpha)} \big\} \big) }
= \frac{1}{Z_p} \exp \Big\{ -\beta
\overbrace{
E_{\big[ \big\{ m_q^{(\alpha)} \big\} \big]}
}^{= \, E_p}
\Big\}
-$$
\end{equation}
where
-$$
\begin{equation}
Z_p = \sum\limits_{\mathscr{M}} \exp \Big\{ -\beta
E_p
\Big\}
-$$
\end{equation}
\end{block}
\notesonly{
This is approximated by the mean-fields:
}
\begin{block}{factorizing distribution}
-$$
\begin{equation}
Q_{ \big[ \big\{ m_q^{(\alpha)} \big\} \big] }
= \frac{1}{Z_Q} \exp \Big\{ -\beta \sum\limits_{q, \gamma}
m_q^{(\gamma)} \underbrace{ e_q^{(\gamma)} }_{
\text{{\tiny mean-fields}} } \Big\}
-$$
\end{equation}
where:
-$$
\begin{equation}
Z_Q = \sum\limits_{\mathscr{M}} \exp \Big\{ -\beta \sum\limits_{q,
\gamma} m_q^{(\gamma)} e_q^{(\gamma)} \Big\}
-$$
\end{equation}
\end{block}
\end{frame}

\subsubsection{Calculation of the moments}

\begin{frame}\frametitle{Recap calculation of the moments (general mean-field case)}
The factorization of the distribution $Q$ simplifies the calculation of the moments.
This is based on the individual state variables being \emph{uncorrelated}.
@@ -49,6 +52,9 @@
\end{equation}

\end{frame}

\mode<article>{

\begin{frame}
\slidesonly{
\frametitle{Factorizing moments (general mean-field case)}
@@ -82,22 +88,48 @@
\end{frame}

\begin{frame}
-\slidesonly{\frametitle{Calculation of moments}}
-$$
\notesonly{Some nomenclature, before we demonstrate the same factorization for the assignment variables in pairwise clustering.}
\slidesonly{ Nomenclature }($\otimes$ $\rightarrow$ \emph{set-product, Cartesian product})\\

\begin{tabular}{r l p{9cm}}
$\big\{ \vec{m}^{(\alpha)} \big\}$: & & set of all $M$-dimensional binary vectors $\big( m_1^{(\alpha)}, m_2^{(\alpha)}, \ldots,
m_M^{(\alpha)} \big)^\top$ which fulfill the normalization condition (exactly one element equals 1). \\\\
$\mathscr{M}$: & & $\big\{ \vec{m}^{(1)} \big\} \otimes \big\{ \vec{m}^{(2)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(p)} \big\}$\\
& & set-product (Cartesian product) between all possible binary assignment variables i.e.\ all possible valid assignments for the full dataset\\\\
$\mathscr{M}_{\gamma}$:& & $\big\{ \vec{m}^{(1)} \big\} \otimes \ldots \otimes \big\{ \vec{m}^{(\gamma - 1)} \big\} \otimes
\big\{ \vec{m}^{(\gamma + 1)} \big\} \otimes \ldots \otimes
\big\{ \vec{m}^{(p)} \big\}$\\
& &\ set of all possible assignments for all data points \\& & \hspace{0.03cm} except $\gamma$
\end{tabular}

\end{frame}

}
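The nomenclature can be made tangible by brute-force enumeration (illustration ours; feasible only for tiny $p$, since $|\mathscr{M}| = M^p$):

\begin{verbatim}
from itertools import product

M, p = 2, 3
# {m^(alpha)}: all M-dimensional one-hot assignment vectors
one_hot = [tuple(int(i == q) for i in range(M)) for q in range(M)]
# script M: Cartesian product over all p points -> all valid assignments
script_M = list(product(one_hot, repeat=p))
assert len(script_M) == M ** p
# script M_gamma: assignments of all points except one point gamma
script_M_gamma = list(product(one_hot, repeat=p - 1))
\end{verbatim}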

\begin{frame}
\slidesonly{\frametitle{Calculation of moments for the assignment variables}}
\begin{equation}
\begin{array}{lll}
\big< m_q^{(\gamma)} \big>_Q
& = \frac{1}{Z_Q} \sum\limits_{\mathscr{M}} m_q^{(\gamma)}
\exp \Big\{ -\beta \sum\limits_{r, \delta}
m_{r}^{(\delta)} e_{r}^{(\delta)} \Big\}
\end{array}
-$$
\end{equation}

\slidesonly{
\begin{itemize}
\item The factorization of $Q$ simplifies the calculation of the moments.
\end{itemize}

Here,\\
$\mathscr{M}$ is the set-product (Cartesian product) between \textbf{all possible} binary assignment variables i.e.\ all possible valid assignments for the full dataset.
}

\end{frame}
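A brute-force check (ours) that the factorizing $Q$ indeed yields per-point moments: enumerating $\mathscr{M}$ for a tiny problem reproduces a softmax over each point's mean-fields:

\begin{verbatim}
from itertools import product
import numpy as np

p, M, beta = 3, 2, 1.7
rng = np.random.default_rng(1)
e = rng.random((p, M))                      # mean-fields e_q^(gamma)
one_hot = [np.eye(M, dtype=int)[q] for q in range(M)]
states = list(product(one_hot, repeat=p))   # the set script M
w = np.array([np.exp(-beta * sum(m @ e[g] for g, m in enumerate(s)))
              for s in states])             # unnormalized Q
moment = (w * [s[0][0] for s in states]).sum() / w.sum()  # <m_1^(1)>_Q
softmax = np.exp(-beta * e[0, 0]) / np.exp(-beta * e[0]).sum()
assert np.isclose(moment, softmax)
\end{verbatim}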

\mode<article>{

\begin{frame}
\slidesonly{\frametitle{Calculation of moments (derivation)}
The factorization in \eqref{eq:factorizingMoments} regarding valid assignments $\big\{ \vec{m}^{(\gamma)} \big\}$ for observation $\gamma$ and the rest of the variables excluding $\gamma$ (i.e. $\mathscr{M}_{\gamma}$) is valid for any functions $f,g$:
@@ -153,6 +185,8 @@
\end{equation}
\end{frame}

}

\begin{frame}
\frametitle{Solution for calculating the moments}
$$
@@ -174,7 +208,7 @@
\end{block}
\end{frame}
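In code, the closed form from the algorithm below is a per-point softmax over the mean-fields (sketch ours; E is a $p \times M$ array of $e_q^{(\alpha)}$, shifted for numerical stability):

\begin{verbatim}
import numpy as np

def moments(E, beta):
    """<m_q^(alpha)>_Q = exp(-beta e_q^(a)) / sum_r exp(-beta e_r^(a))."""
    A = -beta * (E - E.min(axis=1, keepdims=True))  # max of A is 0
    P = np.exp(A)
    return P / P.sum(axis=1, keepdims=True)         # rows sum to 1
\end{verbatim}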

-\subsection{Soft clustering}
\subsubsection{Soft clustering}

\begin{frame}{\subsecname}

@@ -206,6 +240,8 @@ \subsection{Soft clustering}
\end{figure}
\end{frame}

\subsubsection{Determining the mean-field parameters}

\notesonly{

Going back to mean-field approximation:
@@ -215,26 +251,71 @@

\begin{frame}[t] \slidesonly{\frametitle{Minimization of the KL-divergence}}
\begin{block}{Mean Field equation (cf.\ the section on Stochastic Optimization for how to arrive at this result)}
-$$
\begin{equation}
\fbox{$ \frac{\partial}{\partial e_l}\big<E_p\big>_Q
- \sum\limits_k e_k \frac{\partial}{\partial e_l} \big<s_k\big>_Q = 0
$}
-$$
\end{equation}
\end{block}
-$$ \frac{\partial \big< E_p \big>_Q}{\partial e_q^{(\alpha)}}
\begin{equation} \frac{\partial \big< E_p \big>_Q}{\partial e_q^{(\alpha)}}
- \sum\limits_{r, \gamma} \frac{
\overbrace{ \partial \big< m_r^{(\gamma)} \big>_Q }^{
\substack{ \text{depends only on} \\
\text{data point } \gamma }}}{
\partial e_q^{(\alpha)}}
e_r^{(\gamma)} \eqexcl 0
-$$
-$$
\end{equation}
\begin{equation}
\frac{\partial \big< E_p \big>_Q}{\partial e_q^{(\alpha)}}
- \sum\limits_r \frac{\partial \big< m_r^{(\alpha)} \big>_Q}{
\partial e_q^{(\alpha)}}
e_r^{(\alpha)} \eqexcl 0
-$$
\end{equation}

\end{frame}

%--------------------------------------------------------------------------

-switch to lecture slides....
%--------------------------------------------------------------------------
\begin{frame}[shrink=17] \frametitle{Mean-field annealing for pairwise clustering}
\begin{figure}[!th]
\footnotesize
\removelatexerror
\begin{algorithm}[H]
\DontPrintSemicolon
\textbf{Initialization:}\;
- (max.) number $M$ of partitions, initial ($\beta_0$) and final ($\beta_f$) values of the noise parameter, annealing factor $\eta$, convergence criterion $\theta$\;
- initialize mean-fields $e_q^{(\alpha)}$ with random numbers $\in [0, 1]$\;

- $\beta \leftarrow \beta_0$\;
\While(annealing){$\beta < \beta_f$}{
\Repeat ( EM $\text{(fixed point iteration)}$){
$\big| \big( e_q^{(\alpha)} \big)_{\mathrm{new}}
- \big( e_q^{(\alpha)} \big)_{\mathrm{old}} \big| < \theta
\hspace{0.3cm}\forall q, \alpha$}{
\hspace{-0.2cm}compute assignment probabilities:
$ \big< m_q^{(\alpha)} \big>_Q = \frac{ \exp \big\{ -\beta \big(e_q^{(\alpha)}
\big)_{\mathrm{old}}\big\} }{ \sum\limits_r \exp \big\{ -\beta \big(
e_r^{(\alpha)} \big)_{\mathrm{old}} \big\} } \hspace{0.3cm} \forall
q, \alpha
$\;
\hspace{-0.2cm}compute new mean-fields:
\hspace{-0.6cm}
$$\hspace{-0.2cm}\big( e_q^{(\alpha)} \big)_{\mathrm{new}} = \frac{2}{p} \frac{1}{
\sum\limits_{\gamma} \big< m_q^{(\gamma)} \big>_Q } \sum\limits_{\delta}
\big<m_q^{(\delta)} \big>_Q \cdot \bigg\{ d_{\delta \alpha} - \frac{1}{2} \frac{1}{
\sum\limits_{\gamma} \big< m_q^{(\gamma)} \big>_Q }
\sum\limits_{\varepsilon} \big< m_q^{(\varepsilon)}
\big>_Q d_{\varepsilon \delta} \bigg\} \hspace{0.1cm}\forall q, \alpha$$

}
$\beta \leftarrow \eta \cdot \beta$\;
}
\label{alg:meanFieldClustering}
\caption{Mean-field annealing for pairwise clustering}
\end{algorithm}
\end{figure}
\end{frame}
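A direct numpy transcription of Alg.~\ref{alg:meanFieldClustering} (sketch ours, untested against any reference implementation; the max_em cap and the small epsilon guarding empty clusters are our additions; D must be a symmetric $p \times p$ matrix of pairwise distances):

\begin{verbatim}
import numpy as np

def mf_pairwise_clustering(D, M, beta0=0.01, beta_f=100.0, eta=1.1,
                           theta=1e-6, seed=0, max_em=500):
    p = D.shape[0]
    rng = np.random.default_rng(seed)
    E = rng.random((p, M))                 # mean-fields e_q^(alpha) in [0, 1]
    beta = beta0
    while beta < beta_f:                   # annealing
        for _ in range(max_em):            # EM (fixed point iteration)
            # assignment probabilities <m_q^(alpha)>_Q: softmax over clusters
            A = -beta * (E - E.min(axis=1, keepdims=True))
            m = np.exp(A)
            m /= m.sum(axis=1, keepdims=True)          # shape (p, M)
            norm = m.sum(axis=0) + 1e-12               # sum_gamma <m_q^(gamma)>
            # S1[q, a] = sum_delta <m_q^(delta)> d_{delta a}
            S1 = m.T @ D
            # c[q, delta] = (1/norm_q) sum_eps <m_q^(eps)> d_{eps delta}
            c = S1 / norm[:, None]
            # S2[q] = (1/2) sum_delta <m_q^(delta)> c[q, delta]
            S2 = 0.5 * np.einsum('qd,dq->q', c, m)
            E_new = (2.0 / p) * (S1 - S2[:, None]).T / norm[None, :]
            converged = np.max(np.abs(E_new - E)) < theta
            E = E_new
            if converged:
                break
        beta *= eta                        # anneal the noise parameter
    return m                               # soft assignments <m_q^(alpha)>_Q
\end{verbatim}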

%--------------------------------------------------------------------------

