diff --git a/csep/core/catalog_evaluations.py b/csep/core/catalog_evaluations.py index 214ceca0..885898f3 100644 --- a/csep/core/catalog_evaluations.py +++ b/csep/core/catalog_evaluations.py @@ -1,11 +1,14 @@ -# Third-Party Imports +# Python imports import time +from typing import Optional, TYPE_CHECKING +# Third-Party Imports import numpy import scipy.stats # PyCSEP imports from csep.core.exceptions import CSEPEvaluationException +from csep.core.catalogs import CSEPCatalog from csep.models import ( CatalogNumberTestResult, CatalogSpatialTestResult, @@ -14,7 +17,10 @@ CalibrationTestResult ) from csep.utils.calc import _compute_likelihood -from csep.utils.stats import get_quantiles, cumulative_square_diff +from csep.utils.stats import get_quantiles, cumulative_square_diff, MLL_score + +if TYPE_CHECKING: + from csep.core.forecasts import CatalogForecast def number_test(forecast, observed_catalog, verbose=True): @@ -55,6 +61,7 @@ def number_test(forecast, observed_catalog, verbose=True): obs_name=observed_catalog.name) return result + def spatial_test(forecast, observed_catalog, verbose=True): """ Performs spatial test for catalog-based forecasts. @@ -140,6 +147,7 @@ def spatial_test(forecast, observed_catalog, verbose=True): return result + def magnitude_test(forecast, observed_catalog, verbose=True): """ Performs magnitude test for catalog-based forecasts """ test_distribution = [] @@ -215,6 +223,7 @@ def magnitude_test(forecast, observed_catalog, verbose=True): return result + def pseudolikelihood_test(forecast, observed_catalog, verbose=True): """ Performs the spatial pseudolikelihood test for catalog forecasts. @@ -310,6 +319,7 @@ def pseudolikelihood_test(forecast, observed_catalog, verbose=True): return result + def calibration_test(evaluation_results, delta_1=False): """ Perform the calibration test by computing a Kolmogorov-Smirnov test of the observed quantiles against a uniform distribution. @@ -345,3 +355,261 @@ def calibration_test(evaluation_results, delta_1=False): return result +def resampled_magnitude_test(forecast: "CatalogForecast", + observed_catalog: CSEPCatalog, + verbose: bool = False, + seed: Optional[int] = None) -> CatalogMagnitudeTestResult: + """ + Performs the resampled magnitude test for catalog-based forecasts (Serafini et al., 2024), + which corrects the original M-test implementation's bias towards the total number of events N. + Calculates the (pseudo log-likelihood) test statistic distribution from the forecast's + synthetic catalogs Λ_j as: + + D_j = Σ_k [log(Λ_u(k) * N / N_u + 1) - log(Λ̃_j(k) + 1)] ^ 2 + + where k are the magnitude bins, Λ_u the union of all synthetic catalogs, N_u the total + number of events in Λ_u, and Λ̃_j the resampled catalog containing exactly N observed events. + + The pseudo log-likelihood statistic from the observations is calculated as: + + D_o = Σ_k [log(Λ_u(k) * N / N_u + 1) - log(Ω(k) + 1)]^2 + + where Ω is the observed catalog. + + Args: + forecast (CatalogForecast): The forecast to be evaluated + observed_catalog (CSEPCatalog): The observation/testing catalog. + verbose (bool): Flag to display debug messages + seed (int): Random number generator seed + + Returns: + A CatalogMagnitudeTestResult object containing the statistic distribution and the + observed statistic. 
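+ Example: + A minimal usage sketch (hypothetical variable names; assumes forecast is a CatalogForecast whose region defines magnitude bins and catalog is the filtered observed CSEPCatalog): + + >>> result = resampled_magnitude_test(forecast, catalog, seed=1) + >>> result.plot(show=True)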
+ """ + + # set seed + if seed: + numpy.random.seed(seed) + """ """ + test_distribution = [] + + if forecast.region.magnitudes is None: + raise CSEPEvaluationException( + "Forecast must have region.magnitudes member to perform magnitude test.") + + # short-circuit if zero events + if observed_catalog.event_count == 0: + print("Cannot perform magnitude test when observed event count is zero.") + # prepare result + result = CatalogMagnitudeTestResult(test_distribution=test_distribution, + name='M-Test', + observed_statistic=None, + quantile=(None, None), + status='not-valid', + min_mw=forecast.min_magnitude, + obs_catalog_repr=str(observed_catalog), + obs_name=observed_catalog.name, + sim_name=forecast.name) + + return result + + # compute expected rates for forecast if needed + if forecast.expected_rates is None: + forecast.get_expected_rates(verbose=verbose) + + # THIS IS NEW - returns the average events in the magnitude bins + union_histogram = numpy.zeros(len(forecast.magnitudes)) + for j, cat in enumerate(forecast): + union_histogram += cat.magnitude_counts() + + mag_half_bin = numpy.diff(observed_catalog.region.magnitudes)[0] / 2. + # end new + n_union_events = numpy.sum(union_histogram) + obs_histogram = observed_catalog.magnitude_counts() + n_obs = numpy.sum(obs_histogram) + union_scale = n_obs / n_union_events + scaled_union_histogram = union_histogram * union_scale + + # this is new - prob to be used for resampling + probs = union_histogram / n_union_events + # end new + + # compute the test statistic for each catalog + t0 = time.time() + for i, catalog in enumerate(forecast): + # THIS IS NEW - sampled from the union forecast histogram + mag_values = numpy.random.choice(forecast.magnitudes + mag_half_bin, p=probs, + size=int(n_obs)) + extended_mag_max = max(forecast.magnitudes) + 10 + mag_counts, tmp = numpy.histogram(mag_values, bins=numpy.append(forecast.magnitudes, + extended_mag_max)) + # end new + n_events = numpy.sum(mag_counts) + if n_events == 0: + # print("Skipping to next because catalog contained zero events.") + continue + scale = n_obs / n_events + catalog_histogram = mag_counts * scale + # compute magnitude test statistic for the catalog + test_distribution.append( + cumulative_square_diff(numpy.log10(catalog_histogram + 1), + numpy.log10(scaled_union_histogram + 1)) + ) + # output status + if verbose: + tens_exp = numpy.floor(numpy.log10(i + 1)) + if (i + 1) % 10 ** tens_exp == 0: + t1 = time.time() + print(f'Processed {i + 1} catalogs in {t1 - t0} seconds', flush=True) + + # compute observed statistic + obs_d_statistic = cumulative_square_diff(numpy.log10(obs_histogram + 1), + numpy.log10(scaled_union_histogram + 1)) + + # score evaluation + delta_1, delta_2 = get_quantiles(test_distribution, obs_d_statistic) + + # prepare result + result = CatalogMagnitudeTestResult(test_distribution=test_distribution, + name='M-Test', + observed_statistic=obs_d_statistic, + quantile=(delta_1, delta_2), + status='normal', + min_mw=forecast.min_magnitude, + obs_catalog_repr=str(observed_catalog), + obs_name=observed_catalog.name, + sim_name=forecast.name) + + return result + + +def MLL_magnitude_test(forecast: "CatalogForecast", + observed_catalog: CSEPCatalog, + full_calculation: bool = False, + verbose: bool = False, + seed: Optional[int] = None) -> CatalogMagnitudeTestResult: + """ + Implements the modified Multinomial log-likelihood ratio (MLL) magnitude test (Serafini et + al., 2024). 
Calculates the test statistic distribution as: + + D̃_j = 2 * log( L(Λ_u + N_u / N_j + Λ̃_j + 1) / + [L(Λ_u + N_u / N_j) * L(Λ̃_j + 1)] + ) + + where L is the multinomial likelihood function, Λ_u the union of all the forecasts' + synthetic catalogs, N_u the total number of events in Λ_u, Λ̃_j the resampled catalog + containing exactly N observed events. The observed statistic is defined as: + + D_o = 2 * log( L(Λ_u + N_u / N + Ω + 1) / + [L(Λ_u + N_u / N) * L(Ω + 1)] + ) + + where Ω is the observed catalog. + + Args: + forecast (CatalogForecast): The forecast to be evaluated + observed_catalog (CSEPCatalog): The observation/testing catalog. + full_calculation (bool): Whether to resample magnitudes from the complete set of + stochastic catalogs or from their pre-computed magnitude histogram. + verbose (bool): Flag to display debug messages + seed (int): Random number generator seed + + Returns: + A CatalogMagnitudeTestResult object containing the statistic distribution and the + observed statistic. + """ + + # set seed + if seed: + numpy.random.seed(seed) + + test_distribution = [] + + if forecast.region.magnitudes is None: + raise CSEPEvaluationException( + "Forecast must have region.magnitudes member to perform magnitude test.") + + # short-circuit if zero events + if observed_catalog.event_count == 0: + print("Cannot perform magnitude test when observed event count is zero.") + # prepare result + result = CatalogMagnitudeTestResult(test_distribution=test_distribution, + name='M-Test', + observed_statistic=None, + quantile=(None, None), + status='not-valid', + min_mw=forecast.min_magnitude, + obs_catalog_repr=str(observed_catalog), + obs_name=observed_catalog.name, + sim_name=forecast.name) + + return result + + # compute expected rates for forecast if needed + if forecast.expected_rates is None: + forecast.get_expected_rates(verbose=verbose) + + # calculate histograms of union forecast and total number of events + Lambda_u_histogram = numpy.zeros(len(forecast.magnitudes)) + + if full_calculation: + Lambda_u = [] + else: + mag_half_bin = numpy.diff(observed_catalog.region.magnitudes)[0] / 2. 
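+ # Build the union catalog Lambda_u: pool the raw magnitudes when full_calculation is set, + # and always accumulate each synthetic catalog's magnitude histogram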
+ + for j, cat in enumerate(forecast): + if full_calculation: + Lambda_u = numpy.append(Lambda_u, cat.get_magnitudes()) + Lambda_u_histogram += cat.magnitude_counts() + + # calculate histograms of observations and observed number of events + Omega_histogram = observed_catalog.magnitude_counts() + n_obs = numpy.sum(Omega_histogram) + + # compute observed statistic + obs_d_statistic = MLL_score(union_catalog_counts=Lambda_u_histogram, + catalog_counts=Omega_histogram) + + probs = Lambda_u_histogram / numpy.sum(Lambda_u_histogram) + + # compute the test statistic for each catalog + t0 = time.time() + for i, catalog in enumerate(forecast): + # resample magnitudes, either from the pooled catalogs (full calculation) or from the union histogram + if full_calculation: + mag_values = numpy.random.choice(Lambda_u, size=int(n_obs)) + else: + mag_values = numpy.random.choice(forecast.magnitudes + mag_half_bin, p=probs, + size=int(n_obs)) + extended_mag_max = max(forecast.magnitudes) + 10 + Lambda_j_histogram, tmp = numpy.histogram(mag_values, + bins=numpy.append(forecast.magnitudes, + extended_mag_max)) + + # compute magnitude test statistic for the catalog + test_distribution.append( + MLL_score(union_catalog_counts=Lambda_u_histogram, + catalog_counts=Lambda_j_histogram) + ) + # output status + if verbose: + tens_exp = numpy.floor(numpy.log10(i + 1)) + if (i + 1) % 10 ** tens_exp == 0: + t1 = time.time() + print(f'Processed {i + 1} catalogs in {t1 - t0} seconds', flush=True) + + # score evaluation + delta_1, delta_2 = get_quantiles(test_distribution, obs_d_statistic) + + # prepare result + result = CatalogMagnitudeTestResult(test_distribution=test_distribution, + name='M-Test', + observed_statistic=obs_d_statistic, + quantile=(delta_1, delta_2), + status='normal', + min_mw=forecast.min_magnitude, + obs_catalog_repr=str(observed_catalog), + obs_name=observed_catalog.name, + sim_name=forecast.name) + + return result \ No newline at end of file diff --git a/csep/utils/stats.py b/csep/utils/stats.py index 3550b0de..f735206a 100644 --- a/csep/utils/stats.py +++ b/csep/utils/stats.py @@ -1,8 +1,7 @@ import numpy -import scipy.stats import scipy.special -# PyCSEP imports -from csep.core import regions +import scipy.stats + def sup_dist(cdf1, cdf2): """ @@ -269,3 +268,61 @@ def get_Kagan_I1_score(forecasts, catalog): I_1[j] = numpy.dot(counts[non_zero_idx], numpy.log2(rate_den[non_zero_idx] / uniform_forecast)) / n_event return I_1 + + +def log_d_multinomial(x: numpy.ndarray, size: int, prob: numpy.ndarray): + """ + Computes the logarithm of the multinomial probability mass function. + + Args: + x: Array of event counts per magnitude bin. + size: Total number of events (the multinomial size parameter). + prob: Array of bin probabilities. + + Returns: + The multinomial log-likelihood of the counts x. + """ + return scipy.special.loggamma(size + 1) + numpy.sum( + x * numpy.log(prob) - scipy.special.loggamma(x + 1)) + + +def MLL_score(union_catalog_counts: numpy.ndarray, catalog_counts: numpy.ndarray): + """ + Calculates the modified Multinomial log-likelihood (MLL) score, defined by Serafini et al. + (2024). It is built from a collection of catalogs Λ_u and a single catalog Ω as: + + MLL_score = 2 * log( L(Λ_u + N_u / N_o + Ω + 1) / + [L(Λ_u + N_u / N_o) * L(Ω + 1)] + ) + where N_u and N_o are the total number of events in Λ_u and Ω, respectively. 
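+ + For example (the worked single-event case in tests/test_magnitude_tests.py): union counts [1, 0] + and catalog counts [1, 0] give modified histograms [2, 1] and [2, 1], a merged histogram [4, 2], and + MLL_score = 2 * log( L([4, 2]) / [L([2, 1]) * L([2, 1])] ) ≈ 1.0217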
+ + Args: + union_catalog_counts (numpy.ndarray): + catalog_counts (numpy.ndarray): + + Returns: + The MLL score for the collection of catalogs and + """ + + N_u = numpy.sum(union_catalog_counts) + N_j = numpy.sum(catalog_counts) + events_ratio = N_u / N_j + + union_catalog_counts_mod = union_catalog_counts + events_ratio + catalog_counts_mod = catalog_counts + 1 + merged_catalog_j = union_catalog_counts_mod + catalog_counts_mod + + pr_merged_cat = merged_catalog_j / numpy.sum(merged_catalog_j) + pr_union_cat = union_catalog_counts_mod / numpy.sum(union_catalog_counts_mod) + pr_cat_j = catalog_counts_mod / numpy.sum(catalog_counts_mod) + + log_lik_merged = log_d_multinomial(x=merged_catalog_j, + size=numpy.sum(merged_catalog_j), + prob=pr_merged_cat) + log_lik_union = log_d_multinomial(x=union_catalog_counts_mod, + size=numpy.sum(union_catalog_counts_mod), + prob=pr_union_cat) + log_like_cat_j = log_d_multinomial(x=catalog_counts_mod, + size=numpy.sum(catalog_counts_mod), + prob=pr_cat_j) + + return 2 * (log_lik_merged - log_lik_union - log_like_cat_j) diff --git a/docs/concepts/evaluations.rst b/docs/concepts/evaluations.rst index 05e94ad3..4e26ae81 100644 --- a/docs/concepts/evaluations.rst +++ b/docs/concepts/evaluations.rst @@ -30,7 +30,7 @@ consistency tests and they verify whether a forecast in consistent with an obser that can be used to compare the performance of two (or more) competing forecasts. PyCSEP implements the following evaluation routines for grid-based forecasts. These functions are intended to work with :class:`GriddedForecasts` and :class:`CSEPCatalogs``. -Visit the :ref:`catalogs reference` and the :ref:`forecasts reference` to learn +Visit the :ref:`catalogs reference` and the :ref:`forecasts reference` to learn more about to import your forecasts and catalogs into PyCSEP. .. note:: @@ -105,6 +105,8 @@ Consistency tests magnitude_test pseudolikelihood_test calibration_test + resampled_magnitude_test + MLL_magnitude_test Publication reference ===================== @@ -114,6 +116,8 @@ Publication reference 3. Magnitude test (:ref:`Savran et al., 2020`) 4. Pseudolikelihood test (:ref:`Savran et al., 2020`) 5. Calibration test (:ref:`Savran et al., 2020`) +6. Resampled Magnitude Test (Serafini et al., in-prep) +7. MLL Magnitude Test (Serafini et al., in-prep) **************************** Preparing evaluation catalog @@ -121,6 +125,7 @@ Preparing evaluation catalog The evaluations in PyCSEP do not implicitly filter the observed catalogs or modify the forecast data when called. For most cases, the observation catalog should be filtered according to: + 1. Magnitude range of the forecast 2. Spatial region of the forecast 3. 
Start and end-time of the forecast diff --git a/docs/conf.py b/docs/conf.py index 208045ef..2d3f07d0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -99,10 +99,10 @@ # intersphinx configuration intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), - "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "numpy": ("https://numpy.org/doc/stable/", None), "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), - "scipy": ('http://docs.scipy.org/doc/scipy/reference', None), - "matplotlib": ('http://matplotlib.sourceforge.net/', None) + "scipy": ('https://docs.scipy.org/doc/scipy/', None), + "matplotlib": ('https://matplotlib.org/stable', None) } html_theme_options = {} diff --git a/docs/getting_started/static/output_37_0.png b/docs/getting_started/static/output_37_0.png new file mode 100644 index 00000000..f1399388 Binary files /dev/null and b/docs/getting_started/static/output_37_0.png differ diff --git a/docs/getting_started/static/output_38_0.png b/docs/getting_started/static/output_38_0.png new file mode 100644 index 00000000..6397c60b Binary files /dev/null and b/docs/getting_started/static/output_38_0.png differ diff --git a/docs/getting_started/theory.rst b/docs/getting_started/theory.rst index 4c9c8ddd..24252a2c 100644 --- a/docs/getting_started/theory.rst +++ b/docs/getting_started/theory.rst @@ -971,7 +971,7 @@ Spatial test Aim: The spatial test again aims to isolate the spatial component of the forecast and test the consistency of spatial rates with observed events. -Method We perform the spatial test in the catalog-based approach in a +Method: We perform the spatial test in the catalog-based approach in a similar way to the grid-based spatial test approach: by normalising the approximate rate density. In this case, we use the normalisation :math:`\hat{\lambda}_s = \hat{\lambda}_s \big/ \sum_{R} \hat{\lambda}_s`. @@ -1015,6 +1015,90 @@ dashed vertical line shows the observed test statistic distribution. The quantile score :math:`\gamma_s = 0.36` is also printed on the figure by default. + +Resampled Magnitude Test +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Aim: Perform the resampled magnitude test for catalog-based forecasts (Serafini et al., in-prep), +which corrects the original M-test implementation, whose statistic is biased towards the total +number of events N. + +Method: Calculates the (pseudo log-likelihood) test statistic distribution :math:`D_j` from the +resampled forecast's synthetic catalogs :math:`\hat{\Lambda}_j` as: + +.. math:: D_j = \sum_{k}\Bigg(\log\Bigg[\frac{N_{obs}}{N_U} \Lambda_U^{(m)}(k) + 1\Bigg]- \log\Bigg[\hat{\Lambda}_j^{(m)}(k) + 1\Bigg]\Bigg)^2; j= 1...J + +where :math:`\Lambda_U^{(m)}(k)` and :math:`\hat{\Lambda}_j^{(m)}(k)` represent the count in the :math:`k`\ th bin of the magnitude-frequency +distribution in the union catalog and resampled catalog, respectively. The resampled catalogs :math:`\hat{\Lambda}_j^{(m)}` +are built from drawing exactly :math:`N_{obs}` events from the union catalog :math:`\Lambda_U^{(m)}`. + +The observed statistic is built identically to the original Magnitude test: + +.. math:: d_{obs}= \sum_{k}\Bigg(\log\Bigg[\frac{N_{obs}}{N_U} \Lambda_U^{(m)}(k) + 1\Bigg]- \log\Big[\Omega^{(m)}(k) + 1\Big]\Bigg)^2 + + +where :math:`\Omega^{(m)}(k)` represents the count in the :math:`k`\ th bin of the magnitude-frequency +distribution in the observed catalog. + + +* Implementation in pyCSEP + +.. 
code:: ipython3 + + from csep.core.catalog_evaluations import resampled_magnitude_test + + resampled_m_test = resampled_magnitude_test(forecast, comcat_catalog, seed=1) + resampled_m_test.plot(show=True) + + +.. image:: static/output_37_0.png + + +The histogram shows the resulting test statistic distribution :math:`D^*`, which is smoother +than that of the original M-test. +The observed statistic :math:`\omega = d_{obs}` is shown with the dashed +vertical line. The quantile score for this forecast is +:math:`\gamma = 0.98`. + + +Modified Multinomial Log-Likelihood (MLL) Magnitude Test +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Aim: Implements the modified Multinomial log-likelihood (MLL) magnitude test (Serafini et al., in-prep). + +* Method: Calculates the test statistic distribution as: + + .. math:: D_j = 2 \log\Bigg[\cfrac{\mathcal{L}\big(\Lambda_u^{(m)} + \frac{N_u}{N_j}\boldsymbol{1} + \hat{\Lambda}_j^{(m)} + \boldsymbol{1}\big)} {\mathcal{L}\big(\Lambda_u^{(m)} + \frac{N_u}{N_j}\boldsymbol{1}\big) \mathcal{L}\big(\hat{\Lambda}_j^{(m)} + \boldsymbol{1}\big)}\Bigg] + + where :math:`\mathcal{L}` is the multinomial likelihood function. :math:`\Lambda_u` is the union of all the forecasts' + synthetic catalogs, :math:`\Lambda_u^{(m)}` its magnitude histogram, and :math:`N_u` the total number of events in :math:`\Lambda_u`. :math:`\hat{\Lambda}_j` are resampled catalogs, which are built from drawing exactly :math:`N_{obs}` events from the union catalog :math:`\Lambda_U`, whereas :math:`\hat{\Lambda}_j^{(m)}` is its histogram. The vector :math:`\boldsymbol{1}` has a value of 1 on each magnitude histogram bin. + + The observed statistic is defined as: + + .. math:: d_{obs} = 2 \log\Bigg[\cfrac{\mathcal{L}\big(\Lambda_u^{(m)} + \frac{N_u}{N_j}\boldsymbol{1} + \Omega^{(m)} + \boldsymbol{1}\big)} {\mathcal{L}\big(\Lambda_u^{(m)} + \frac{N_u}{N_j}\boldsymbol{1}\big) \mathcal{L}\big(\Omega^{(m)} + \boldsymbol{1}\big)}\Bigg] + + The multinomial likelihood function :math:`\mathcal{L}` for an arbitrary catalog's magnitude histogram :math:`\Lambda^{(m)}` is defined as: + + + .. math:: \mathcal{L}(\Lambda^{(m)}) = \cfrac{N_\Lambda!}{n_{\Lambda,1}!\, ... n_{\Lambda,K}!}\prod_{k=1}^{K} \left(\cfrac{n_{\Lambda,k}}{N_\Lambda}\right)^{n_{\Lambda,k}} + + where :math:`N_\Lambda` is the total number of events in :math:`\Lambda^{(m)}`, + :math:`n_{\Lambda,k}` the number of events in the :math:`k`\ th bin, and :math:`K` the total + number of bins. + +* Implementation in pyCSEP + + + .. code:: ipython3 + + from csep.core.catalog_evaluations import MLL_magnitude_test + + MLL_t = MLL_magnitude_test(forecast, comcat_catalog, full_calculation=False, verbose=False, seed=22) + MLL_t.plot(show=True) + + .. 
image:: static/output_38_0.png + + References ---------- diff --git a/docs/reference/api_reference.rst b/docs/reference/api_reference.rst index 6e3ce176..e85c5723 100644 --- a/docs/reference/api_reference.rst +++ b/docs/reference/api_reference.rst @@ -159,6 +159,8 @@ Catalog-based forecast evaluations: number_test spatial_test magnitude_test + resampled_magnitude_test + MLL_magnitude_test pseudolikelihood_test calibration_test diff --git a/tests/test_magnitude_tests.py b/tests/test_magnitude_tests.py new file mode 100644 index 00000000..493c78b7 --- /dev/null +++ b/tests/test_magnitude_tests.py @@ -0,0 +1,454 @@ +from unittest import TestCase + +import numpy +import scipy.special + +from csep.core.catalog_evaluations import resampled_magnitude_test, MLL_magnitude_test +from csep.core.catalogs import CSEPCatalog +from csep.core.forecasts import CatalogForecast +from csep.core.regions import CartesianGrid2D + + +class TestResampledMagnitudeTest(TestCase): + def setUp(self): + self.mag_bins = [1.0, 2.0] + self.region = CartesianGrid2D.from_origins( + numpy.array([[0.0, 0.0]]), dh=1.0, magnitudes=self.mag_bins + ) + + @staticmethod + def D_o(forecast, obs): + """ + Independent implementation of D_o for the resampled M-test, just following the equations + from Serafini et al. (2024) + + Args: + forecast: A CatalogForecast object + obs: A CSEPCatalog + + Returns: + The observed statistic + """ + d_o = 0 + N_u = numpy.sum([i.get_number_of_events() for i in forecast.catalogs]) + N_o = obs.get_number_of_events() + for i in range(len(obs.region.magnitudes)): + union_mag_counts = forecast.magnitude_counts()[i] * forecast.n_cat + first_term = numpy.log10(union_mag_counts * N_o / N_u + 1) + second_term = numpy.log10(obs.magnitude_counts()[i] + 1) + d_o += (first_term - second_term) ** 2 + + return d_o + + @staticmethod + def D_j(forecast, obs, seed=None): + """ + Independent implementation of D_j for the resampled M-test, just following the equations + from Serafini et al. (2024) + + Args: + forecast: A CatalogForecast object + obs: A CSEPCatalog + seed: random number seed value for the sampling of magnitudes + Returns: + The test statistic distribution + """ + if seed: + numpy.random.seed(seed) + + D = [] + N_u = numpy.sum([i.get_number_of_events() for i in forecast.catalogs]) + N_o = obs.get_number_of_events() + mag_histogram = forecast.magnitude_counts() + + for n in range(forecast.n_cat): + d_j = 0 + lambda_j = numpy.random.choice( + obs.region.magnitudes, size=N_o, p=mag_histogram / numpy.sum(mag_histogram) + ) + for i in range(len(obs.region.magnitudes)): + union_mag_counts = forecast.magnitude_counts()[i] * forecast.n_cat + first_term = numpy.log10(union_mag_counts * N_o / N_u + 1) + counts = numpy.sum(lambda_j == obs.region.magnitudes[i]) + second_term = numpy.log10(counts + 1) + d_j += (first_term - second_term) ** 2 + D.append(d_j) + + return D + + def test_no_region(self): + synthetic_cats = [CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)])] + forecast = CatalogForecast(catalogs=synthetic_cats) + observation_cat = CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)]) + + with self.assertRaises(AttributeError): + resampled_magnitude_test(forecast, observation_cat) + + def test_single_cat_single_event(self): + + # Same magnitude as observation + synthetic_cats = [CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)])] + forecast = CatalogForecast(catalogs=synthetic_cats, region=self.region, n_cat=1) + observation_cat = CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)], region=self.region) + result = 
resampled_magnitude_test(forecast, observation_cat) + self.assertEqual(result.test_distribution, [0]) # [0] + self.assertEqual(result.observed_statistic, self.D_o(forecast, observation_cat)) # 0 + + # Different magnitude as observation + + observation_cat = CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 2.0)], region=self.region) + result = resampled_magnitude_test(forecast, observation_cat) + self.assertEqual(result.test_distribution, [0]) + self.assertEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 0.18123811 + + def test_single_cat_multiple_events(self): + + # Same MFD + forecast = CatalogForecast( + catalogs=[ + CSEPCatalog( + data=[ + *(4 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), # 4 events of this type + *(2 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), # 2 events of this type + ] + ) + ], + region=self.region, + n_cat=1, + ) + + observation_cat = CSEPCatalog( + data=[ + *(2 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), # 2 events of this type + *(1 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), # 1 event of this type + ], + region=self.region, + ) + + seed = 2 # Set seed so both methods sample the same events from the Union catalog. + result = resampled_magnitude_test(forecast, observation_cat, seed=seed) + self.assertEqual(result.observed_statistic, self.D_o(forecast, observation_cat)) # 0. + self.assertEqual( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [0.10622874] + + # Different MFD + observation_cat = CSEPCatalog( + data=[*(2 * [("0", 1, 0.0, 0.0, 0.0, 1.0)])], + region=self.region, + ) + seed = 3 + result = resampled_magnitude_test(forecast, observation_cat, seed=seed) + self.assertEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 0.0611293829124204 + self.assertEqual( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # 0.010751542367500075 + + def test_multiple_cat_multiple_events(self): + + # Same MFD + forecast = CatalogForecast( + catalogs=[ + CSEPCatalog( + data=[ + *(2 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(1 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ] + ), + CSEPCatalog( + data=[ + *(8 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(4 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ] + ), + CSEPCatalog( + data=[ + *(12 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(6 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ] + ), + ], + region=self.region, + n_cat=3, + ) + + observation_cat = CSEPCatalog( + data=[ + *(4 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(2 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ], + region=self.region, + ) + + seed = 3 # Set seed so both methods sample the same events from the Union catalog. + result = resampled_magnitude_test(forecast, observation_cat, seed=seed) + + self.assertEqual(result.observed_statistic, self.D_o(forecast, observation_cat)) # 0. 
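+ # With a shared seed, both samplers draw identical magnitude sequences, so the full + # test distribution must match the reference implementation catalog by catalog: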
+ self.assertEqual( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [0.025001238526499825, 0.2489980945164454, 0.03727780124146953] + + # Different MFD + observation_cat = CSEPCatalog( + data=[*(3 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), *(1 * [("0", 1, 0.0, 0.0, 0.0, 1.0)])], + region=self.region, + ) + seed = 3 + result = resampled_magnitude_test(forecast, observation_cat, seed=seed) + self.assertAlmostEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 0.1535506203257525 + numpy.testing.assert_almost_equal( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [0.005909847975937456, 0.019507668333914787, 0.1535506203257525] + + +class TestMLLTest(TestCase): + def setUp(self): + self.mag_bins = [1.0, 2.0] + self.region = CartesianGrid2D.from_origins( + numpy.array([[0.0, 0.0]]), dh=1.0, magnitudes=self.mag_bins + ) + + @staticmethod + def D_o(forecast, catalog): + + def multinomial_likelihood(array): + """ + Calculates the likelihood value. Only valid for small arrays and values. + """ + total = numpy.sum(array) + + C = scipy.special.factorial(total) + for i in array: + C /= scipy.special.factorial(i) + + product = numpy.prod([(i / total) ** i for i in array]) + + return C * product + + lambda_u = forecast.magnitude_counts() * forecast.n_cat + omega = catalog.magnitude_counts() + N_u = numpy.sum([i.get_number_of_events() for i in forecast.catalogs]) + N_o = catalog.get_number_of_events() + + likelihood_merged = multinomial_likelihood(lambda_u + N_u / N_o + omega + 1) + likelihood_union = multinomial_likelihood(lambda_u + N_u / N_o) + likelihood_cat = multinomial_likelihood(omega + 1) + + return 2 * numpy.log(likelihood_merged / likelihood_union / likelihood_cat) + + @staticmethod + def D_j(forecast, catalog, seed=None): + if seed: + numpy.random.seed(seed) + + def multinomial_likelihood(array): + """ + Calculates the likelihood value. Only valid for small arrays and values. 
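+ (Relies on scipy.special.factorial, which overflows for large totals; hence the + deliberately tiny catalogs in these tests.)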
+ """ + total = numpy.sum(array) + + C = scipy.special.factorial(total) + for i in array: + C /= scipy.special.factorial(i) + + product = numpy.prod([(i / total) ** i for i in array]) + + return C * product + + D = [] + + lambda_u = forecast.magnitude_counts() * forecast.n_cat + N_u = numpy.sum([i.get_number_of_events() for i in forecast.catalogs]) + N_o = catalog.get_number_of_events() + + for n in range(forecast.n_cat): + lambda_j = numpy.random.choice( + catalog.region.magnitudes, size=N_o, p=lambda_u / numpy.sum(lambda_u) + ) + counts = numpy.array([numpy.sum(lambda_j == i) for i in catalog.region.magnitudes]) + likelihood_merged = multinomial_likelihood(lambda_u + N_u / N_o + counts + 1) + likelihood_union = multinomial_likelihood(lambda_u + N_u / N_o) + likelihood_cat = multinomial_likelihood(counts + 1) + D.append(2 * numpy.log(likelihood_merged / likelihood_union / likelihood_cat)) + + return D + + def test_no_region(self): + synthetic_cats = [CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)])] + forecast = CatalogForecast(catalogs=synthetic_cats) + observation_cat = CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)]) + + with self.assertRaises(AttributeError): + MLL_magnitude_test(forecast, observation_cat) + + def test_single_cat_single_event(self): + + # Same magnitudes as observation + synthetic_cats = [CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)])] + forecast = CatalogForecast(catalogs=synthetic_cats, region=self.region, n_cat=1) + observation_cat = CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 1.0)], region=self.region) + result = MLL_magnitude_test(forecast, observation_cat) + + # D_j = - 2 * log( L(Lambda_U + N_u / N_j + Lambda_j + 1) / + # [L(Lambda_U + N_u / N_j) * L(Lambda_j + 1)] + # ) + + # First term: L(Lambda_U + N_u / N_j + Lambda_j + 1) + # Array: ( [1, 0] + 1/1 + [1, 0] + 1) = ( [4, 2] ) + # L ( [4, 2] ) = 6! / ( 4! * 2!) * (4 / 6) ** 4 * (2 / 6) ** 2 + # = 5 * 6 / 2 * 2 ** 10 / 6 ** 6 + # = 0.3292181069958848 + + # Second term: L(Lambda_U + N_u / N_j) + # Array: ([1, 0] + 1/1) = ( [2, 1] ) + # L ( [2, 1] ) = 3! / ( 2! * 1!) * (2 / 3) ** 2 * (1 / 3) ** 1 + # = 3 * 4 / 9 * 1 / 3 + # = 0.4444444444444444 + + # Third term: L(Lambda_j + 1) + # Array: ([1, 0] + 1) = ( [2, 1] ) + # L ( [2, 1] ) = 0.4444444444444444 + + # D_j = -2 log( 0.3292181069958848 / 0.4444444444444444 / 0.4444444444444444) + # = -1.0216512475319817 + self.assertAlmostEqual(result.test_distribution[0], 1.0216512475319817) + + # Different magnitude as observation + observation_cat = CSEPCatalog(data=[("0", 1, 0.0, 0.0, 0.0, 3.0)], region=self.region) + result = MLL_magnitude_test(forecast, observation_cat) + # D_o = - 2 * log( L(Lambda_U + N_u / N_o + Omega + 1) / + # [L(Lambda_U + N_u / N_o) * L(Omega + 1)] + # ) + + # First term: L(Lambda_U + N_u / N_o + Omega + 1) + # Array: ( [1, 0] + 1/1 + [0, 1] + 1) = ( [3, 3] ) + # L ( [3, 3] ) = 6! / ( 3! * 3!) 
* (3 / 6) ** 3 * (3 / 6) ** 3 + # = 4 * 5 * 6 / (1 * 2 * 3) * 0.5 ** 3 * 0.5 ** 3 + # = 0.3125 + + # Second term: L(Lambda_U + N_u / N_o) + # Array: ([1, 0] + 1/1) = ( [2, 1] ) + # = 0.4444444444444444 + + # Third term: L(Omega + 1) + # Array: ([0, 1] + 1) = ( [1, 2] ) + # L ( [1, 2] ) = 0.4444444444444444 + + # D_o = 2 * log( 0.3125 / 0.4444444444444444 / 0.4444444444444444) + # = 0.9174192452539534 + + self.assertAlmostEqual(result.observed_statistic, 0.9174192452539534) + # test own implementation + self.assertAlmostEqual(result.observed_statistic, self.D_o(forecast, observation_cat)) + + def test_single_cat_multiple_events(self): + + # Same MFD + forecast = CatalogForecast( + catalogs=[ + CSEPCatalog( + data=[ + *(4 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), # 4 events of this type + *(2 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), # 2 events of this type + ] + ) + ], + region=self.region, + n_cat=1, + ) + + observation_cat = CSEPCatalog( + data=[ + *(2 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), # 2 events of this type + *(1 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), # 1 event of this type + ], + region=self.region, + ) + + seed = 2 # Set seed so both methods sample the same events from the Union catalog. + result = MLL_magnitude_test(forecast, observation_cat, seed=seed) + self.assertAlmostEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 1.7370001360756202 + numpy.testing.assert_almost_equal( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [1.4704757763861487] + + # Different MFD + observation_cat = CSEPCatalog( + data=[*(2 * [("0", 1, 0.0, 0.0, 0.0, 1.0)])], + region=self.region, + ) + seed = 3 + result = MLL_magnitude_test(forecast, observation_cat, seed=seed) + self.assertAlmostEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 1.483977055813141 + numpy.testing.assert_almost_equal( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [1.6728620032121357] + + def test_multiple_cat_multiple_events(self): + + # Same MFD + forecast = CatalogForecast( + catalogs=[ + CSEPCatalog( + data=[ + *(2 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(1 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ] + ), + CSEPCatalog( + data=[ + *(8 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(4 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ] + ), + CSEPCatalog( + data=[ + *(12 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(6 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ] + ), + ], + region=self.region, + n_cat=3, + ) + + observation_cat = CSEPCatalog( + data=[ + *(4 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), + *(2 * [("0", 1, 0.0, 0.0, 0.0, 2.0)]), + ], + region=self.region, + ) + + seed = 2 # Set seed so both methods sample the same events from the Union catalog. 
+ result = MLL_magnitude_test(forecast, observation_cat, seed=seed) + self.assertAlmostEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 2.3691574300045595 + numpy.testing.assert_almost_equal( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [1.752103594329519, 1.752103594329519, 2.3691574300045595] + + # Different MFD + observation_cat = CSEPCatalog( + data=[*(3 * [("0", 1, 0.0, 0.0, 0.0, 1.0)]), *(1 * [("0", 1, 0.0, 0.0, 0.0, 1.0)])], + region=self.region, + ) + seed = 3 + result = MLL_magnitude_test(forecast, observation_cat, seed=seed) + self.assertAlmostEqual( + result.observed_statistic, self.D_o(forecast, observation_cat) + ) # 1.7348577364545044 + numpy.testing.assert_almost_equal( + result.test_distribution, self.D_j(forecast, observation_cat, seed=seed) + ) # [2.114537837794348, 2.202622612026193, 1.7348577364545044]