diff --git a/docs/background/plot_00_conceptual_intro.md b/docs/background/plot_00_conceptual_intro.md
index 8074fff0..49c4a250 100644
--- a/docs/background/plot_00_conceptual_intro.md
+++ b/docs/background/plot_00_conceptual_intro.md
@@ -227,13 +227,13 @@
 likely it is to observe the given spike train for the computed firing rate: if
 $y(t)$ is the spike counts and $\lambda(t)$ the firing rate, the equation for
 the log-likelihood is
-$$ \sum\_t \log P(y(t) | \lambda(t)) = \sum\_t y(t) \log(\lambda(t)) -
+$$ \sum_t \log P(y(t) | \lambda(t)) = \sum_t y(t) \log(\lambda(t)) -
 \lambda(t) - \log (y(t)!)\tag{3}$$

 Note that this last $\log(y(t)!)$ term does not depend on $\lambda(t)$ and thus
 is independent of the model, so it is normally ignored.

-$$ \sum\_t \log P(y(t) | \lambda(t)) \propto \sum\_t y(t) \log(\lambda(t)) -
+$$ \sum_t \log P(y(t) | \lambda(t)) \propto \sum_t y(t) \log(\lambda(t)) -
 \lambda(t))\tag{4}$$

 This is the objective function of the GLM model: we are trying to find the
diff --git a/src/nemos/observation_models.py b/src/nemos/observation_models.py
index bea7b3d6..4a72d561 100644
--- a/src/nemos/observation_models.py
+++ b/src/nemos/observation_models.py
@@ -466,16 +466,16 @@ def _negative_log_likelihood(
         .. math::
             \begin{aligned}
             \text{LL}(\hat{\lambda} | y) &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T}
-            [y\_{tn} \log(\hat{\lambda}\_{tn}) - \hat{\lambda}\_{tn} - \log({y\_{tn}!})] \\\
-            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y\_{tn} \log(\hat{\lambda}\_{tn}) -
-            \hat{\lambda}\_{tn} - \Gamma({y\_{tn}+1})] \\\
-            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y\_{tn} \log(\hat{\lambda}\_{tn}) -
-            \hat{\lambda}\_{tn}] + \\text{const}
+            [y_{tn} \log(\hat{\lambda}_{tn}) - \hat{\lambda}_{tn} - \log({y_{tn}!})] \\\
+            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y_{tn} \log(\hat{\lambda}_{tn}) -
+            \hat{\lambda}_{tn} - \Gamma({y_{tn}+1})] \\\
+            &= \frac{1}{T \cdot N} \sum_{n=1}^{N} \sum_{t=1}^{T} [y_{tn} \log(\hat{\lambda}_{tn}) -
+            \hat{\lambda}_{tn}] + \\text{const}
             \end{aligned}

         Because :math:`\Gamma(k+1)=k!`, see `wikipedia <https://en.wikipedia.org/wiki/Gamma_function>`_ for explanation.

-        The :math:`\log({y\_{tn}!})` term is not a function of the parameters and can be disregarded
+        The :math:`\log({y_{tn}!})` term is not a function of the parameters and can be disregarded
         when computing the loss-function. This is why we incorporated it into the `const` term.
         """
         predicted_rate = jnp.clip(
diff --git a/src/nemos/proximal_operator.py b/src/nemos/proximal_operator.py
index 51602ed5..dc4f5b6d 100644
--- a/src/nemos/proximal_operator.py
+++ b/src/nemos/proximal_operator.py
@@ -11,7 +11,7 @@
 More formally, proximal operators solve the minimization problem,

 $$
-\\text{prox}\_f(\bm{v}) = \arg\min\_{\bm{x}} \left( f(\bm{x}) + \frac{1}{2}\Vert \bm{x} - \bm{v}\Vert_2 ^2 \right)
+\\text{prox}_f(\bm{v}) = \arg\min_{\bm{x}} \left( f(\bm{x}) + \frac{1}{2}\Vert \bm{x} - \bm{v}\Vert_2 ^2 \right)
 $$


@@ -106,7 +106,7 @@ def prox_group_lasso(
     The proximal operator equation are,

     $$
-    \text{prox}(\beta_g) = \text{min}_{\beta} \left[ \lambda \sum\_{g=1}^G \Vert \beta_g \Vert_2 +
+    \text{prox}(\beta_g) = \text{min}_{\beta} \left[ \lambda \sum_{g=1}^G \Vert \beta_g \Vert_2 +
     \frac{1}{2} \Vert \hat{\beta} - \beta \Vert_2^2 \right],
     $$

@@ -115,15 +115,15 @@
     The analytical solution[$^{[1]}$](#references).
     for the beta is,
     $$
-    \text{prox}(\beta\_g) = \max \left(1 - \frac{\lambda \sqrt{p\_g}}{\Vert \hat{\beta}\_g \Vert_2},
-    0\right) \cdot \hat{\beta}\_g,
+    \text{prox}(\beta_g) = \max \left(1 - \frac{\lambda \sqrt{p_g}}{\Vert \hat{\beta}_g \Vert_2},
+    0\right) \cdot \hat{\beta}_g,
     $$
-    where $p_g$ is the dimensionality of $\beta\_g$ and $\hat{\beta}$ is typically the gradient step
+    where $p_g$ is the dimensionality of $\beta_g$ and $\hat{\beta}$ is typically the gradient step
     of the un-regularized optimization objective function. It's easy to see how the group-Lasso
     proximal operator acts as a shrinkage factor for the un-penalize update, and the half-rectification
     non-linearity that effectively sets to zero group of coefficients satisfying,
     $$
-    \Vert \hat{\beta}\_g \Vert_2 \le \frac{1}{\lambda \sqrt{p\_g}}.
+    \Vert \hat{\beta}_g \Vert_2 \le \frac{1}{\lambda \sqrt{p_g}}.
     $$

     # References
@@ -154,8 +154,8 @@ def prox_lasso(x: Any, l1reg: Optional[Any] = None, scaling: float = 1.0) -> Any
     Minimizes the following function:

     $$
-    \underset{y}{\text{argmin}} ~ \frac{1}{2} ||x - y||\_2^2
-    + \text{scaling} \cdot \text{l1reg} \cdot ||y||\_1
+    \underset{y}{\text{argmin}} ~ \frac{1}{2} ||x - y||_2^2
+    + \text{scaling} \cdot \text{l1reg} \cdot ||y||_1
     $$

     When `l1reg` is a pytree, the weights are applied coordinate-wise.
diff --git a/src/nemos/solvers/_svrg_defaults.py b/src/nemos/solvers/_svrg_defaults.py
index a12d1098..47aea11d 100644
--- a/src/nemos/solvers/_svrg_defaults.py
+++ b/src/nemos/solvers/_svrg_defaults.py
@@ -422,7 +422,7 @@ def _calculate_optimal_batch_size_svrg(
     num_samples:
         The number of samples.
     l_smooth_max:
-        The $L\_{\text{max}}$ smoothness constant.
+        The $L_{\text{max}}$ smoothness constant.
     l_smooth:
         The $L$ smoothness constant.
     strong_convexity:
@@ -480,7 +480,7 @@ def _calculate_b_hat(num_samples: int, l_smooth_max: float, l_smooth: float):
     num_samples :
         Total number of data points.
     l_smooth_max :
-        Maximum smoothness constant $L\_{\text{max}}$.
+        Maximum smoothness constant $L_{\text{max}}$.
     l_smooth :
         Smoothness constant $L$.
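
The docs and `observation_models.py` hunks above make the same point: the $\log(y!)$ term in the Poisson log-likelihood does not depend on the rate $\lambda$, so it can be dropped from the training objective (the `const` term). A minimal sketch of that simplification in JAX; the helper names `poisson_log_likelihood` and `poisson_objective` are hypothetical and not the library's `_negative_log_likelihood` implementation:

```python
import jax.numpy as jnp
from jax.scipy.special import gammaln  # gammaln(y + 1) == log(y!)

def poisson_log_likelihood(counts, rate):
    """Full Poisson log-likelihood, including the rate-independent log(y!) term (Eq. 3)."""
    return jnp.sum(counts * jnp.log(rate) - rate - gammaln(counts + 1.0))

def poisson_objective(counts, rate):
    """Same quantity up to an additive constant: log(y!) dropped, as in Eq. (4)."""
    return jnp.sum(counts * jnp.log(rate) - rate)

counts = jnp.array([0.0, 2.0, 1.0, 3.0])
rate = jnp.array([0.5, 1.8, 1.1, 2.7])

# The two objectives differ only by sum(gammaln(counts + 1)), which does not depend on `rate`,
# so maximizing either one over the rate gives the same fitted model.
constant = jnp.sum(gammaln(counts + 1.0))
assert jnp.allclose(poisson_log_likelihood(counts, rate) + constant,
                    poisson_objective(counts, rate))
```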
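The `proximal_operator.py` hunks quote two closed-form proximal operators: the Lasso prox (soft-thresholding) and the group-Lasso prox, which rescales each coefficient group by $\max(1 - \lambda\sqrt{p_g}/\Vert\hat{\beta}_g\Vert_2,\, 0)$, zeroing the group whenever $\Vert\hat{\beta}_g\Vert_2 \le \lambda\sqrt{p_g}$ (the condition implied by the $\max(\cdot, 0)$ expression). The sketch below is written directly against those formulas; `soft_threshold` and `group_shrink` are illustrative names, not the actual `prox_lasso` / `prox_group_lasso` signatures:

```python
import jax.numpy as jnp

def soft_threshold(x, l1reg, scaling=1.0):
    """Lasso prox: argmin_y 1/2 ||x - y||_2^2 + scaling * l1reg * ||y||_1."""
    thresh = scaling * l1reg
    return jnp.sign(x) * jnp.maximum(jnp.abs(x) - thresh, 0.0)

def group_shrink(beta_g, lam):
    """Group-Lasso prox for a single group: shrink the whole group toward zero,
    setting it exactly to zero when ||beta_g||_2 <= lam * sqrt(p_g)."""
    p_g = beta_g.size
    norm = jnp.linalg.norm(beta_g)
    factor = jnp.maximum(1.0 - lam * jnp.sqrt(p_g) / norm, 0.0)
    return factor * beta_g

beta_hat = jnp.array([0.05, -0.02, 0.01])            # small group norm -> zeroed out
print(group_shrink(beta_hat, lam=0.1))                # [0. 0. 0.]
print(soft_threshold(jnp.array([0.3, -0.05]), 0.1))   # [0.2 0. ]
```

In a proximal-gradient loop, `beta_hat` would be the un-regularized gradient step mentioned in the docstring, and the prox is applied group by group after each step.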