diff --git a/notes/05_infomax/3_cost.tex b/notes/05_infomax/3_cost.tex index 2113712..55196a4 100644 --- a/notes/05_infomax/3_cost.tex +++ b/notes/05_infomax/3_cost.tex @@ -16,12 +16,16 @@ \section{Empirical Risk Minimization} Consider the following perceptron network with $N$ inputs and $N$ outputs: \begin{figure}[ht] \centering -\includegraphics[width=6cm]{img/section2_fig16} +\includegraphics[width=4.5cm]{img/section2_fig16} %\caption{$N-N$ perceptron network} \end{figure} where +\svspace{-4mm} \begin{equation} -\widehat{u}_i := \underbrace{ +\widehat{u}_i := \widehat{f}_i + \big( + s_i + \big) = \underbrace{ \widehat{f}_i \Big( \sum_{j=1}^{N} \mathrm{w}_{ij} \mathrm{x}_j @@ -33,6 +37,7 @@ \section{Empirical Risk Minimization} } \end{equation} and observations: +\svspace{-4mm} \begin{equation} \vec{x}^{(\alpha)} \in \mathbb{R}^N, \quad \alpha = 1, \ldots, p @@ -41,6 +46,8 @@ \section{Empirical Risk Minimization} \end{frame} +\begin{frame} + Deriving the cost function for this network to find the Infomax solution: \begin{equation} \label{eq:conservationvec} @@ -57,27 +64,69 @@ \section{Empirical Risk Minimization} & = \frac{P_{\vec{x}}(\vec{x})}{|\det \vec{J}\,|} \end{align} with elements of the Jacobian $\vec J$ given as +\slidesonly{ + \begingroup + \small +} \begin{align} \label{eq:jacobelement} J_{ij}= \frac{\partial \widehat{u}_i}{\partial \mathrm{x}_j} & = \frac{\partial}{\partial \mathrm{x}_j} \widehat{f}_i \bigg( \sum\limits_{k = 1}^N \mathrm{w}_{ik} - \mathrm{x}_k \bigg) \\ - & = \mathrm{w}_{ij} \widehat{f}_i^{'} \bigg( \sum\limits_{k = 1}^N + \mathrm{x}_k \bigg) \notesonly{\\ + &} = \mathrm{w}_{ij} \widehat{f}_i^{'} \bigg( \sum\limits_{k = 1}^N \mathrm{w}_{ik} \mathrm{x}_k \bigg). 
\end{align} +\slidesonly{ + \endgroup +} + We therefore obtain for the value of the Jacobian determinant +\slidesonly{ + \begingroup + \footnotesize +} \begin{equation} \label{eq:functionalDeterminant} |\det \vec {J}\,| = \Big| \det \frac{\partial \widehat{\vec{u}}}{\partial \vec{x}} \Big| = |\det \vec{W}\, | \prod\limits_{l = 1}^N \widehat{f}_l^{'} \Bigg( \sum\limits_{k = 1}^N \mathrm{w}_{lk} \mathrm{x}_k \Bigg). \end{equation} +\slidesonly{ + \endgroup +} + +\end{frame} \clearpage -Inserting \eqref{eq:conservationvec} and \eqref{eq:uxj} into the Infomax cost function from \eqref{eq:infomax} gives +\begin{frame} + +\slidesonly{ +\visible<1->{ +\vspace{-7mm} +\hspace{8.0cm} +\StickyNote[1.7cm]{ + \begingroup + \scriptsize +\begin{equation} +%\label{eq:conservationvec} + P_{\vec{u}} (\widehat{\vec{u}}) d \widehat{\vec{u}} + = P_{\vec{x}}(\vec{x}) d \vec{x} +\end{equation} +\begin{equation} +%\label{eq:uxj} + P_{\vec{u}} (\widehat{\vec{u}}) + = \frac{P_{\vec{x}}(\vec{x})}{|\det \vec{J}\,|} +\end{equation} + \endgroup +}[3.cm] % width +\vspace{-22mm} +} +} + +\notesonly{Inserting \eqref{eq:conservationvec} and \eqref{eq:uxj} into the Infomax cost function from \eqref{eq:infomax} gives} \begin{eqnarray} H & = & -\int d \widehat{\vec{u}} P_{\vec{u}} (\widehat{\vec{u}}) \ln P_{\vec{u}} (\widehat{\vec{u}}) \\ @@ -89,14 +138,45 @@ \section{Empirical Risk Minimization} }_{ \text{constant w.r.t. 
} \vec W } + \int d \vec{x} P_{\vec{x}} (\vec{x}) \ln |\det \vec{J}\,| \end{eqnarray} -and with \eqref{eq:functionalDeterminant} we can formulate the cost -in terms that explicitly depend on $\vec W$ and its components: +and with \notesonly{\eqref{eq:functionalDeterminant}} +\slidesonly{ + \begingroup + \footnotesize +} +\begin{equation} \label{eq:functionalDeterminant} +|\det \vec {J}\,| = + \Big| \det \frac{\partial \widehat{\vec{u}}}{\partial \vec{x}} \Big| + = |\det \vec{W}\, | \prod\limits_{l = 1}^N \widehat{f}_l^{'} \Bigg( + \sum\limits_{k = 1}^N \mathrm{w}_{lk} \mathrm{x}_k \Bigg). +\end{equation} +\slidesonly{ + \endgroup +} +we can formulate the cost +in terms that depend on components in $\vec W$: +\begin{equation} + H =~\text{const.} \, + \; \ln |\det \vec{W}\,| \underbrace{\int d \vec{x} P_{\vec{x}} (\vec{x})}_{=\,1} + + \int d \vec{x} P_{\vec{x}} (\vec{x}) \sum\limits_{l = 1}^N + \ln \widehat{f}_l^{'} \Bigg( \sum\limits_{k = 1}^N + \mathrm{w}_{lk} \mathrm{x}_k \Bigg). +\end{equation} + +\end{frame} + +\begin{frame}{Generalization cost} + +\only<1>{ +\slidesonly{ \begin{equation} - H = const. \, + \; \ln |\det \vec{W}\,| \underbrace{\int d \vec{x} P_{\vec{x}} (\vec{x})}_{=\,1} + H =~\text{const.} \, + \; \ln |\det \vec{W}\,| \underbrace{\int d \vec{x} P_{\vec{x}} (\vec{x})}_{=\,1} + \int d \vec{x} P_{\vec{x}} (\vec{x}) \sum\limits_{l = 1}^N \ln \widehat{f}_l^{'} \Bigg( \sum\limits_{k = 1}^N \mathrm{w}_{lk} \mathrm{x}_k \Bigg). 
\end{equation} +} +} + + This enables us to define the generalization cost $E^G$ for model selection: \begin{equation} \tag{generalization cost} E^G = \ln |\det \vec W\,| + \int d \vec{x} P_{\vec{x}} (\vec{x}) @@ -105,6 +185,8 @@ \section{Empirical Risk Minimization} \mathrm{w}_{lk} \mathrm{x}_k \Bigg) \Bigg\} \end{equation} + +\only<2>{ The \emph{principle of empirical risk minimization} (in our particular case maximization) allows \begin{center} mathematical expectation $E^G \longrightarrow$ empirical average $E^T$ \end{center} @@ -118,7 +200,10 @@ \section{Empirical Risk Minimization} \end{equation} can be used for model selection using empirical data \begin{equation} -E^T \eqexcl \max +E^T \eqexcl \max_{\vec W} \end{equation} +} + +\end{frame} \newpage