diff --git a/docs/background/plot_00_conceptual_intro.md b/docs/background/plot_00_conceptual_intro.md
index 8074fff0..49c4a250 100644
--- a/docs/background/plot_00_conceptual_intro.md
+++ b/docs/background/plot_00_conceptual_intro.md
@@ -227,13 +227,13 @@
 likely it is to observe the given spike train for the computed firing rate: if
 $y(t)$ is the spike counts and $\lambda(t)$ the firing rate, the equation for
 the log-likelihood is
-$$ \sum\_t \log P(y(t) | \lambda(t)) = \sum\_t y(t) \log(\lambda(t)) -
+$$ \sum_t \log P(y(t) | \lambda(t)) = \sum_t y(t) \log(\lambda(t)) -
 \lambda(t) - \log (y(t)!)\tag{3}$$

 Note that this last $\log(y(t)!)$ term does not depend on $\lambda(t)$ and thus
 is independent of the model, so it is normally ignored.

-$$ \sum\_t \log P(y(t) | \lambda(t)) \propto \sum\_t y(t) \log(\lambda(t)) -
+$$ \sum_t \log P(y(t) | \lambda(t)) \propto \sum_t y(t) \log(\lambda(t)) -
 \lambda(t))\tag{4}$$

 This is the objective function of the GLM model: we are trying to find the
diff --git a/src/nemos/observation_models.py b/src/nemos/observation_models.py
index bea7b3d6..4a72d561 100644
--- a/src/nemos/observation_models.py
+++ b/src/nemos/observation_models.py
@@ -466,16 +466,16 @@ def _negative_log_likelihood(
         .. math::
             \begin{aligned}
             \text{LL}(\hat{\lambda} | y) &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T}
-            [y\_{tn} \log(\hat{\lambda}\_{tn}) - \hat{\lambda}\_{tn} - \log({y\_{tn}!})] \\\
-            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y\_{tn} \log(\hat{\lambda}\_{tn}) -
-            \hat{\lambda}\_{tn} - \Gamma({y\_{tn}+1})] \\\
-            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y\_{tn} \log(\hat{\lambda}\_{tn}) -
-            \hat{\lambda}\_{tn}] + \\text{const}
+            [y_{tn} \log(\hat{\lambda}_{tn}) - \hat{\lambda}_{tn} - \log({y_{tn}!})] \\\
+            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y_{tn} \log(\hat{\lambda}_{tn}) -
+            \hat{\lambda}_{tn} - \Gamma({y_{tn}+1})] \\\
+            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y_{tn} \log(\hat{\lambda}_{tn}) -
+            \hat{\lambda}_{tn}] + \\text{const}
             \end{aligned}

         Because :math:`\Gamma(k+1)=k!`, see `wikipedia <https://en.wikipedia.org/wiki/Gamma_function>`_ for explanation.

-        The :math:`\log({y\_{tn}!})` term is not a function of the parameters and can be disregarded
+        The :math:`\log({y_{tn}!})` term is not a function of the parameters and can be disregarded
         when computing the loss-function. This is why we incorporated it into the `const` term.
         """
         predicted_rate = jnp.clip(
diff --git a/src/nemos/proximal_operator.py b/src/nemos/proximal_operator.py
index 51602ed5..dc4f5b6d 100644
--- a/src/nemos/proximal_operator.py
+++ b/src/nemos/proximal_operator.py
@@ -11,7 +11,7 @@
 More formally, proximal operators solve the minimization problem,

 $$
-\\text{prox}\_f(\bm{v}) = \arg\min\_{\bm{x}} \left( f(\bm{x}) + \frac{1}{2}\Vert \bm{x} - \bm{v}\Vert_2 ^2 \right)
+\\text{prox}_f(\bm{v}) = \arg\min_{\bm{x}} \left( f(\bm{x}) + \frac{1}{2}\Vert \bm{x} - \bm{v}\Vert_2 ^2 \right)
 $$


@@ -106,7 +106,7 @@ def prox_group_lasso(
     The proximal operator equation are,

     $$
-    \text{prox}(\beta_g) = \text{min}_{\beta} \left[ \lambda \sum\_{g=1}^G \Vert \beta_g \Vert_2 +
+    \text{prox}(\beta_g) = \text{min}_{\beta} \left[ \lambda \sum_{g=1}^G \Vert \beta_g \Vert_2 +
     \frac{1}{2} \Vert \hat{\beta} - \beta \Vert_2^2 \right],
     $$

@@ -115,15 +115,15 @@
     The analytical solution[$^{[1]}$](#references).
     for the beta is,
     $$
-    \text{prox}(\beta\_g) = \max \left(1 - \frac{\lambda \sqrt{p\_g}}{\Vert \hat{\beta}\_g \Vert_2},
-    0\right) \cdot \hat{\beta}\_g,
+    \text{prox}(\beta_g) = \max \left(1 - \frac{\lambda \sqrt{p_g}}{\Vert \hat{\beta}_g \Vert_2},
+    0\right) \cdot \hat{\beta}_g,
     $$
-    where $p_g$ is the dimensionality of $\beta\_g$ and $\hat{\beta}$ is typically the gradient step
+    where $p_g$ is the dimensionality of $\beta_g$ and $\hat{\beta}$ is typically the gradient step
     of the un-regularized optimization objective function. It's easy to see how the group-Lasso
     proximal operator acts as a shrinkage factor for the un-penalize update, and the half-rectification
     non-linearity that effectively sets to zero group of coefficients satisfying,
     $$
-    \Vert \hat{\beta}\_g \Vert_2 \le \frac{1}{\lambda \sqrt{p\_g}}.
+    \Vert \hat{\beta}_g \Vert_2 \le \frac{1}{\lambda \sqrt{p_g}}.
     $$

     # References
@@ -154,8 +154,8 @@ def prox_lasso(x: Any, l1reg: Optional[Any] = None, scaling: float = 1.0) -> Any
     Minimizes the following function:

     $$
-    \underset{y}{\text{argmin}} ~ \frac{1}{2} ||x - y||\_2^2
-    + \text{scaling} \cdot \text{l1reg} \cdot ||y||\_1
+    \underset{y}{\text{argmin}} ~ \frac{1}{2} ||x - y||_2^2
+    + \text{scaling} \cdot \text{l1reg} \cdot ||y||_1
     $$

     When `l1reg` is a pytree, the weights are applied coordinate-wise.
diff --git a/src/nemos/solvers/_svrg_defaults.py b/src/nemos/solvers/_svrg_defaults.py
index a12d1098..47aea11d 100644
--- a/src/nemos/solvers/_svrg_defaults.py
+++ b/src/nemos/solvers/_svrg_defaults.py
@@ -422,7 +422,7 @@ def _calculate_optimal_batch_size_svrg(
     num_samples:
         The number of samples.
     l_smooth_max:
-        The $L\_{\text{max}}$ smoothness constant.
+        The $L_{\text{max}}$ smoothness constant.
     l_smooth:
         The $L$ smoothness constant.
     strong_convexity:
@@ -480,7 +480,7 @@ def _calculate_b_hat(num_samples: int, l_smooth_max: float, l_smooth: float):
     num_samples :
         Total number of data points.
     l_smooth_max :
-        Maximum smoothness constant $L\_{\text{max}}$.
+        Maximum smoothness constant $L_{\text{max}}$.
     l_smooth :
         Smoothness constant $L$.
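
The docs and `observation_models.py` hunks above make the same point: the $\log(y!)$ term in the Poisson log-likelihood does not depend on the rate $\lambda$, so it can be dropped from the training objective (the `const` term). A minimal sketch of that simplification in JAX; the helper names `poisson_log_likelihood` and `poisson_objective` are hypothetical and not the library's `_negative_log_likelihood` implementation:

```python
import jax.numpy as jnp
from jax.scipy.special import gammaln  # gammaln(y + 1) == log(y!)

def poisson_log_likelihood(counts, rate):
    """Full Poisson log-likelihood, including the rate-independent log(y!) term (Eq. 3)."""
    return jnp.sum(counts * jnp.log(rate) - rate - gammaln(counts + 1.0))

def poisson_objective(counts, rate):
    """Same quantity up to an additive constant: log(y!) dropped, as in Eq. (4)."""
    return jnp.sum(counts * jnp.log(rate) - rate)

counts = jnp.array([0.0, 2.0, 1.0, 3.0])
rate = jnp.array([0.5, 1.8, 1.1, 2.7])

# The two objectives differ only by sum(gammaln(counts + 1)), which does not depend on `rate`,
# so maximizing either one over the rate gives the same fitted model.
constant = jnp.sum(gammaln(counts + 1.0))
assert jnp.allclose(poisson_log_likelihood(counts, rate) + constant,
                    poisson_objective(counts, rate))
```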
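The `proximal_operator.py` hunks quote two closed-form proximal operators: the Lasso prox (soft-thresholding) and the group-Lasso prox, which rescales each coefficient group by $\max(1 - \lambda\sqrt{p_g}/\Vert\hat{\beta}_g\Vert_2,\, 0)$, zeroing the group whenever $\Vert\hat{\beta}_g\Vert_2 \le \lambda\sqrt{p_g}$ (the condition implied by the $\max(\cdot, 0)$ expression). The sketch below is written directly against those formulas; `soft_threshold` and `group_shrink` are illustrative names, not the actual `prox_lasso` / `prox_group_lasso` signatures:

```python
import jax.numpy as jnp

def soft_threshold(x, l1reg, scaling=1.0):
    """Lasso prox: argmin_y 1/2 ||x - y||_2^2 + scaling * l1reg * ||y||_1."""
    thresh = scaling * l1reg
    return jnp.sign(x) * jnp.maximum(jnp.abs(x) - thresh, 0.0)

def group_shrink(beta_g, lam):
    """Group-Lasso prox for a single group: shrink the whole group toward zero,
    setting it exactly to zero when ||beta_g||_2 <= lam * sqrt(p_g)."""
    p_g = beta_g.size
    norm = jnp.linalg.norm(beta_g)
    factor = jnp.maximum(1.0 - lam * jnp.sqrt(p_g) / norm, 0.0)
    return factor * beta_g

beta_hat = jnp.array([0.05, -0.02, 0.01])            # small group norm -> zeroed out
print(group_shrink(beta_hat, lam=0.1))                # [0. 0. 0.]
print(soft_threshold(jnp.array([0.3, -0.05]), 0.1))   # [0.2 0. ]
```

In a proximal-gradient loop, `beta_hat` would be the un-regularized gradient step mentioned in the docstring, and the prox is applied group by group after each step.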