diff --git a/optax/_src/loss.py b/optax/_src/loss.py
index e1062bde..bebd3e1b 100644
--- a/optax/_src/loss.py
+++ b/optax/_src/loss.py
@@ -86,7 +86,7 @@ def l2_loss(
 def huber_loss(
     predictions: chex.Array,
     targets: Optional[chex.Array] = None,
-    delta: float = 1.) -> chex.Array:
+    delta: float = 1.) -> jnp.ndarray:
   """Huber loss, similar to L2 loss close to zero, L1 loss away from zero.
 
   If gradient descent is applied to the `huber loss`, it is equivalent to
@@ -118,7 +118,7 @@ def huber_loss(
 def smooth_labels(
     labels: chex.Array,
     alpha: float,
-) -> jnp.ndarray:
+) -> chex.Array:
   """Apply label smoothing.
 
   Label smoothing is often used in combination with a cross-entropy loss.
@@ -140,7 +140,10 @@ def smooth_labels(
   return (1.0 - alpha) * labels + alpha / num_categories
 
 
-def sigmoid_binary_cross_entropy(logits, labels):
+def sigmoid_binary_cross_entropy(
+    logits: chex.Array,
+    labels: chex.Array,
+) -> jnp.ndarray:
   """Computes element-wise sigmoid cross entropy given logits and labels.
 
   This function can be used for binary or multiclass classification (where each
@@ -178,7 +181,7 @@ class is an independent binary prediction and different classes are not
 def softmax_cross_entropy(
     logits: chex.Array,
     labels: chex.Array,
-) -> chex.Array:
+) -> jnp.ndarray:
   """Computes the softmax cross entropy between sets of logits and labels.
 
   Measures the probability error in discrete classification tasks in which
@@ -206,7 +209,7 @@ def softmax_cross_entropy(
 def softmax_cross_entropy_with_integer_labels(
     logits: chex.Array,
     labels: chex.Array,
-) -> chex.Array:
+) -> jnp.ndarray:
   """Computes softmax cross entropy between sets of logits and integer labels.
 
   Measures the probability error in discrete classification tasks in which
@@ -242,7 +245,7 @@ def cosine_similarity(
     predictions: chex.Array,
     targets: chex.Array,
     epsilon: float = 0.,
-) -> chex.Array:
+) -> jnp.ndarray:
   r"""Computes the cosine similarity between targets and predictions.
 
   The cosine **similarity** is a measure of similarity between vectors defined
@@ -277,7 +280,7 @@ def cosine_distance(
     predictions: chex.Array,
     targets: chex.Array,
     epsilon: float = 0.,
-) -> chex.Array:
+) -> jnp.ndarray:
   r"""Computes the cosine distance between targets and predictions.
 
   The cosine **distance**, implemented here, measures the **dissimilarity**
@@ -302,7 +305,7 @@ def cosine_distance(
 def log_cosh(
     predictions: chex.Array,
     targets: Optional[chex.Array] = None,
-) -> chex.Array:
+) -> jnp.ndarray:
   """Calculates the log-cosh loss for a set of predictions.
 
   log(cosh(x)) is approximately `(x**2) / 2` for small x and `abs(x) - log(2)`
@@ -331,7 +334,7 @@ def ctc_loss_with_forward_probs(
     labels: chex.Array,
     label_paddings: chex.Array,
     blank_id: int = 0,
-    log_epsilon: float = -1e5) -> Tuple[chex.Array, chex.Array, chex.Array]:
+    log_epsilon: float = -1e5) -> Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray]:
   r"""Computes CTC loss and CTC forward-probabilities.
 
   The CTC loss is a loss function based on log-likelihoods of the model that
@@ -459,7 +462,7 @@ def ctc_loss(logits: chex.Array,
              labels: chex.Array,
              label_paddings: chex.Array,
              blank_id: int = 0,
-             log_epsilon: float = -1e5) -> chex.Array:
+             log_epsilon: float = -1e5) -> jnp.ndarray:
   """Computes CTC loss.
 
   See docstring for ``ctc_loss_with_forward_probs`` for details.
@@ -494,7 +497,7 @@ def ctc_loss(logits: chex.Array,
 
 def convex_kl_divergence(
     log_predictions: chex.Array, targets: chex.Array
-) -> chex.Array:
+) -> jnp.ndarray:
   """Computes a convex version of the Kullback-Leibler divergence loss.
 
   Measures the information gain achieved if target probability distribution
@@ -521,7 +524,7 @@ def convex_kl_divergence(
 
 def kl_divergence(
     log_predictions: chex.Array, targets: chex.Array
-) -> chex.Array:
+) -> jnp.ndarray:
   """Computes the Kullback-Leibler divergence (relative entropy) loss.
 
   Measures the information gain achieved if target probability distribution
@@ -548,7 +551,7 @@ def kl_divergence(
 
 
 def kl_divergence_with_log_targets(log_predictions: chex.Array,
-                                   log_targets: chex.Array) -> chex.Array:
+                                   log_targets: chex.Array) -> jnp.ndarray:
   """Computes the Kullback-Leibler divergence (relative entropy) loss.
 
   Version of kl_div_loss where targets are given in log-space.
@@ -569,7 +572,7 @@ def kl_divergence_with_log_targets(log_predictions: chex.Array,
 
 
 def hinge_loss(predictor_outputs: chex.Array,
-               targets: chex.Array) -> chex.Array:
+               targets: chex.Array) -> jnp.ndarray:
   """Computes the hinge loss for binary classification.
 
   Args:
@@ -584,7 +587,7 @@ def hinge_loss(predictor_outputs: chex.Array,
 
 def poly_loss_cross_entropy(
     logits: chex.Array, labels: chex.Array, epsilon: float = 2.0
-) -> chex.Array:
+) -> jnp.ndarray:
   r"""Computes PolyLoss between logits and labels.
 
   The PolyLoss is a loss function that decomposes commonly
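
For reference, a minimal usage sketch (illustrative only, not part of the patch, with made-up example values) exercising two of the touched functions through the public optax API; in optax these losses return JAX arrays, which is what a `jnp.ndarray` return annotation describes:

```python
# Illustrative sketch with made-up inputs; exercises two of the losses
# touched in this patch via the public optax API.
import jax.numpy as jnp
import optax

logits = jnp.array([[2.0, -1.0, 0.5], [0.1, 0.3, -0.7]])
labels = jnp.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])

# Per-example softmax cross entropy: shape (2,).
xent = optax.softmax_cross_entropy(logits=logits, labels=labels)

# Element-wise Huber loss: shape (2,).
huber = optax.huber_loss(
    predictions=jnp.array([0.5, -1.2]),
    targets=jnp.array([0.0, -1.0]),
    delta=1.0,
)

# Both results are JAX arrays.
assert isinstance(xent, jnp.ndarray)
assert isinstance(huber, jnp.ndarray)
```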