From d6ceae1bba0ccd76e7b68ae4c56ed911f5559096 Mon Sep 17 00:00:00 2001
From: Till Hoffmann
Date: Tue, 30 Apr 2024 13:00:23 -0400
Subject: [PATCH] Add entropy implementations. (#1787)

* Add entropy implementations.

* Use `betaln` to evaluate entropy of `Beta` distribution.
---
 numpyro/distributions/continuous.py   | 82 +++++++++++++++++++++++++++
 numpyro/distributions/discrete.py     | 31 ++++++++++
 numpyro/distributions/distribution.py |  6 ++
 test/test_distributions.py            | 31 ++++++++++
 4 files changed, 150 insertions(+)

diff --git a/numpyro/distributions/continuous.py b/numpyro/distributions/continuous.py
index 0e895c827..273fa4a43 100644
--- a/numpyro/distributions/continuous.py
+++ b/numpyro/distributions/continuous.py
@@ -36,6 +36,7 @@
 from jax.scipy.linalg import cho_solve, solve_triangular
 from jax.scipy.special import (
     betaln,
+    digamma,
     expi,
     expit,
     gammainc,
@@ -198,6 +199,15 @@ def cdf(self, value):
     def icdf(self, q):
         return betaincinv(self.concentration1, self.concentration0, q)
 
+    def entropy(self):
+        total = self.concentration0 + self.concentration1
+        return (
+            betaln(self.concentration0, self.concentration1)
+            - (self.concentration0 - 1) * digamma(self.concentration0)
+            - (self.concentration1 - 1) * digamma(self.concentration1)
+            + (total - 2) * digamma(total)
+        )
+
 
 class Cauchy(Distribution):
     arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
@@ -239,6 +249,9 @@ def cdf(self, value):
     def icdf(self, q):
         return self.loc + self.scale * jnp.tan(jnp.pi * (q - 0.5))
 
+    def entropy(self):
+        return jnp.broadcast_to(jnp.log(4 * np.pi * self.scale), self.batch_shape)
+
 
 class Dirichlet(Distribution):
     arg_constraints = {
@@ -293,6 +306,16 @@ def infer_shapes(concentration):
         event_shape = concentration[-1:]
         return batch_shape, event_shape
 
+    def entropy(self):
+        (n,) = self.event_shape
+        total = self.concentration.sum(axis=-1)
+        return (
+            gammaln(self.concentration).sum(axis=-1)
+            - gammaln(total)
+            + (total - n) * digamma(total)
+            - ((self.concentration - 1) * digamma(self.concentration)).sum(axis=-1)
+        )
+
 
 class EulerMaruyama(Distribution):
     """
@@ -458,6 +481,9 @@ def cdf(self, value):
     def icdf(self, q):
         return -jnp.log1p(-q) / self.rate
 
+    def entropy(self):
+        return 1 - jnp.log(self.rate)
+
 
 class Gamma(Distribution):
     arg_constraints = {
@@ -504,6 +530,14 @@ def cdf(self, x):
     def icdf(self, q):
         return gammaincinv(self.concentration, q) / self.rate
 
+    def entropy(self):
+        return (
+            self.concentration
+            - jnp.log(self.rate)
+            + gammaln(self.concentration)
+            + (1 - self.concentration) * digamma(self.concentration)
+        )
+
 
 class Chi2(Gamma):
     arg_constraints = {"df": constraints.positive}
@@ -861,6 +895,9 @@ def icdf(self, q):
         a = q - 0.5
         return self.loc - self.scale * jnp.sign(a) * jnp.log1p(-2 * jnp.abs(a))
 
+    def entropy(self):
+        return jnp.log(2 * self.scale) + 1
+
 
 class LKJ(TransformedDistribution):
     r"""
@@ -1161,6 +1198,9 @@ def variance(self):
     def cdf(self, x):
         return self.base_dist.cdf(jnp.log(x))
 
+    def entropy(self):
+        return (1 + jnp.log(2 * jnp.pi)) / 2 + self.loc + jnp.log(self.scale)
+
 
 class Logistic(Distribution):
     arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
@@ -1201,6 +1241,9 @@ def cdf(self, value):
     def icdf(self, q):
         return self.loc + self.scale * logit(q)
 
+    def entropy(self):
+        return jnp.broadcast_to(jnp.log(self.scale) + 2, self.batch_shape)
+
 
 class LogUniform(TransformedDistribution):
     arg_constraints = {"low": constraints.positive, "high": constraints.positive}
@@ -1233,6 +1276,11 @@ def variance(self):
     def cdf(self, x):
         return self.base_dist.cdf(jnp.log(x))
 
+    def entropy(self):
+        log_low = jnp.log(self.low)
+        log_high = jnp.log(self.high)
+        return (log_low + log_high) / 2 + jnp.log(log_high - log_low)
+
 
 def _batch_solve_triangular(A, B):
     """
@@ -1521,6 +1569,13 @@ def infer_shapes(
         event_shape = lax.broadcast_shapes(event_shape, matrix[-1:])
         return batch_shape, event_shape
 
+    def entropy(self):
+        (n,) = self.event_shape
+        half_log_det = jnp.log(jnp.diagonal(self.scale_tril, axis1=-2, axis2=-1)).sum(
+            -1
+        )
+        return n * (jnp.log(2 * np.pi) + 1) / 2 + half_log_det
+
 
 def _is_sparse(A):
     from scipy import sparse
@@ -2062,6 +2117,11 @@ def mean(self):
     def variance(self):
         return jnp.broadcast_to(self.scale**2, self.batch_shape)
 
+    def entropy(self):
+        return jnp.broadcast_to(
+            (jnp.log(2 * np.pi * self.scale**2) + 1) / 2, self.batch_shape
+        )
+
 
 class Pareto(TransformedDistribution):
     arg_constraints = {"scale": constraints.positive, "alpha": constraints.positive}
@@ -2103,6 +2163,9 @@ def cdf(self, value):
     def icdf(self, q):
         return self.scale / jnp.power(1 - q, 1 / self.alpha)
 
+    def entropy(self):
+        return jnp.log(self.scale / self.alpha) + 1 + 1 / self.alpha
+
 
 class RelaxedBernoulliLogits(TransformedDistribution):
     arg_constraints = {"temperature": constraints.positive, "logits": constraints.real}
@@ -2257,6 +2320,15 @@ def icdf(self, q):
         scaled = jnp.sign(q - 0.5) * jnp.sqrt(scaled_squared)
         return scaled * self.scale + self.loc
 
+    def entropy(self):
+        return jnp.broadcast_to(
+            (self.df + 1) / 2 * (digamma((self.df + 1) / 2) - digamma(self.df / 2))
+            + jnp.log(self.df) / 2
+            + betaln(self.df / 2, 0.5)
+            + jnp.log(self.scale),
+            self.batch_shape,
+        )
+
 
 class Uniform(Distribution):
     arg_constraints = {"low": constraints.dependent, "high": constraints.dependent}
@@ -2303,6 +2375,9 @@ def infer_shapes(low=(), high=()):
         event_shape = ()
         return batch_shape, event_shape
 
+    def entropy(self):
+        return jnp.log(self.high - self.low)
+
 
 class Weibull(Distribution):
     arg_constraints = {
@@ -2348,6 +2423,13 @@ def variance(self):
             - jnp.exp(gammaln(1.0 + 1.0 / self.concentration)) ** 2
         )
 
+    def entropy(self):
+        return (
+            jnp.euler_gamma * (1 - 1 / self.concentration)
+            + jnp.log(self.scale / self.concentration)
+            + 1
+        )
+
 
 class BetaProportion(Beta):
     """
diff --git a/numpyro/distributions/discrete.py b/numpyro/distributions/discrete.py
index a5fd12536..fc0c81dd9 100644
--- a/numpyro/distributions/discrete.py
+++ b/numpyro/distributions/discrete.py
@@ -108,6 +108,11 @@ def enumerate_support(self, expand=True):
         values = jnp.broadcast_to(values, values.shape[:1] + self.batch_shape)
         return values
 
+    def entropy(self):
+        return -self.probs * jnp.log(self.probs) - (1 - self.probs) * jnp.log1p(
+            -self.probs
+        )
+
 
 class BernoulliLogits(Distribution):
     arg_constraints = {"logits": constraints.real}
@@ -149,6 +154,10 @@ def enumerate_support(self, expand=True):
         values = jnp.broadcast_to(values, values.shape[:1] + self.batch_shape)
         return values
 
+    def entropy(self):
+        nexp = jnp.exp(-self.logits)
+        return ((1 + nexp) * jnp.log1p(nexp) + nexp * self.logits) / (1 + nexp)
+
 
 def Bernoulli(probs=None, logits=None, *, validate_args=None):
     if probs is not None:
@@ -341,6 +350,9 @@ def enumerate_support(self, expand=True):
         values = jnp.broadcast_to(values, values.shape[:1] + self.batch_shape)
         return values
 
+    def entropy(self):
+        return -(self.probs * jnp.log(self.probs)).sum(axis=-1)
+
 
 class CategoricalLogits(Distribution):
     arg_constraints = {"logits": constraints.real_vector}
@@ -393,6 +405,10 @@ def enumerate_support(self, expand=True):
         values = jnp.broadcast_to(values, values.shape[:1] + self.batch_shape)
         return values
 
+    def entropy(self):
+        probs = softmax(self.logits, axis=-1)
+        return -(probs * self.logits).sum(axis=-1) + logsumexp(self.logits, axis=-1)
+
 
 def Categorical(probs=None, logits=None, *, validate_args=None):
     if probs is not None:
@@ -462,6 +478,9 @@ def enumerate_support(self, expand=True):
         values = jnp.broadcast_to(values, values.shape[:1] + self.batch_shape)
         return values
 
+    def entropy(self):
+        return jnp.log(self.high - self.low + 1)
+
 
 class OrderedLogistic(CategoricalProbs):
     """
@@ -498,6 +517,9 @@ def infer_shapes(predictor, cutpoints):
         event_shape = ()
         return batch_shape, event_shape
 
+    def entropy(self):
+        raise NotImplementedError
+
 
 class MultinomialProbs(Distribution):
     arg_constraints = {
@@ -879,6 +901,11 @@ def mean(self):
     def variance(self):
         return (1.0 / self.probs - 1.0) / self.probs
 
+    def entropy(self):
+        return -(1 - self.probs) * jnp.log1p(-self.probs) / self.probs - jnp.log(
+            self.probs
+        )
+
 
 class GeometricLogits(Distribution):
     arg_constraints = {"logits": constraints.real}
@@ -914,6 +941,10 @@ def mean(self):
     def variance(self):
         return (1.0 / self.probs - 1.0) / self.probs
 
+    def entropy(self):
+        nexp = jnp.exp(-self.logits)
+        return nexp * self.logits + jnp.log1p(nexp) * (1 + nexp)
+
 
 def Geometric(probs=None, logits=None, *, validate_args=None):
     if probs is not None:
diff --git a/numpyro/distributions/distribution.py b/numpyro/distributions/distribution.py
index 8463c10c2..fa9e4613c 100644
--- a/numpyro/distributions/distribution.py
+++ b/numpyro/distributions/distribution.py
@@ -391,6 +391,12 @@ def enumerate_support(self, expand=True):
         """
         raise NotImplementedError
 
+    def entropy(self):
+        """
+        Returns the entropy of the distribution.
+        """
+        raise NotImplementedError
+
     def expand(self, batch_shape):
         """
         Returns a new :class:`ExpandedDistribution` instance with batch
diff --git a/test/test_distributions.py b/test/test_distributions.py
index 961ab449f..43360b74b 100644
--- a/test/test_distributions.py
+++ b/test/test_distributions.py
@@ -408,6 +408,7 @@ def __init__(
     dist.Cauchy: lambda loc, scale: osp.cauchy(loc=loc, scale=scale),
     dist.Chi2: lambda df: osp.chi2(df),
     dist.Dirichlet: lambda conc: osp.dirichlet(conc),
+    dist.DiscreteUniform: lambda low, high: osp.randint(low, high + 1),
     dist.Exponential: lambda rate: osp.expon(scale=jnp.reciprocal(rate)),
     dist.Gamma: lambda conc, rate: osp.gamma(conc, scale=1.0 / rate),
     dist.GeometricProbs: lambda probs: osp.geom(p=probs, loc=-1),
@@ -1390,6 +1391,36 @@ def test_log_prob(jax_dist, sp_dist, params, prepend_shape, jit):
         assert_allclose(jit_fn(jax_dist.log_prob)(samples), expected, atol=1e-5)
 
 
+@pytest.mark.parametrize(
+    "jax_dist, sp_dist, params", CONTINUOUS + DISCRETE + DIRECTIONAL
+)
+def test_entropy(jax_dist, sp_dist, params):
+    jax_dist = jax_dist(*params)
+
+    if _is_batched_multivariate(jax_dist):
+        pytest.skip("batching not allowed in multivariate distns.")
+    if sp_dist is None:
+        pytest.skip(reason="no corresponding scipy distribution")
+    try:
+        actual = jax_dist.entropy()
+    except NotImplementedError:
+        pytest.skip(reason="distribution does not implement `entropy`")
+
+    sp_dist = sp_dist(*params)
+    expected = sp_dist.entropy()
+    assert_allclose(actual, expected, atol=1e-5)
+
+
+def test_entropy_categorical():
+    # There is no scipy mapping for categorical distributions, but the multinomial with
+    # one trial has the same entropy--which we check here.
+    logits = jax.random.normal(jax.random.key(9), (7,))
+    probs = _to_probs_multinom(logits)
+    sp_dist = osp.multinomial(1, probs)
+    for jax_dist in [dist.CategoricalLogits(logits), dist.CategoricalProbs(probs)]:
+        assert_allclose(jax_dist.entropy(), sp_dist.entropy())
+
+
 def test_mixture_log_prob():
     gmm = dist.MixtureSameFamily(
         dist.Categorical(logits=np.zeros(2)), dist.Normal(0, 1).expand([2])
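
Usage note (not part of the patch): the sketch below shows how the `entropy` methods added above could be called once a NumPyro build containing this change is installed, and how the results line up with scipy where a counterpart exists. The specific parameter values are illustrative only, and distributions without an implementation still raise `NotImplementedError` through the base-class default added in distribution.py, which the new test relies on to skip them.

    import jax.numpy as jnp
    import scipy.stats as osp

    import numpyro.distributions as dist

    # Scalar case: Normal entropy is (log(2 * pi * scale**2) + 1) / 2.
    d = dist.Normal(loc=0.0, scale=2.0)
    print(d.entropy())                   # approximately 2.112
    print(osp.norm(0.0, 2.0).entropy())  # matching scipy value

    # Batched parameters broadcast, so the result has the batch shape.
    batched = dist.Gamma(jnp.array([1.0, 2.0]), jnp.array([0.5, 3.0]))
    print(batched.entropy().shape)       # (2,)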