From dcda4ad0d75ee244d3742cff0e474d791609c5b6 Mon Sep 17 00:00:00 2001
From: Shing Zhan <shing.zhan@gmail.com>
Date: Wed, 28 Feb 2024 21:16:01 +0000
Subject: [PATCH] Update docstrings

---
 python/tests/beagle.py       | 653 -----------------------------------
 python/tests/beagle_numba.py |  39 ++-
 2 files changed, 23 insertions(+), 669 deletions(-)
 delete mode 100644 python/tests/beagle.py

diff --git a/python/tests/beagle.py b/python/tests/beagle.py
deleted file mode 100644
index 47ebb6920b..0000000000
--- a/python/tests/beagle.py
+++ /dev/null
@@ -1,653 +0,0 @@
-"""
-Implementation of the BEAGLE 4.1 algorithm to impute alleles.
-
-This was implemented while closely consulting the BEAGLE 4.1 paper:
-Browning & Browning (2016). Genotype imputation with millions of reference samples.
-Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020
-
-The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was also consulted:
-https://faculty.washington.edu/browning/beagle/b4_1.html
-
-These notations are used throughout this implementation:
-h = number of reference haplotypes.
-m = number of genotyped markers.
-x = number of ungenotyped markers.
-
-This implementation takes the following inputs:
-1. Panel of reference haplotypes in a matrix of size (m + x, h).
-2. One query haplotype in an array of size (m + x).
-3. Site positions of all the markers in an array of size (m + x).
-
-In the query haplotype:
-1. The genotyped markers take values of 0, 1, 2, or 3 (ACGT encoding).
-2. The ungenotyped markers take -1.
-
-The forward and backward probability matrices are of size (m, h).
-The HMM state probability matrix is of size (m + 1, h).
-The interpolated allele probability matrix is of size (x, 4),
-The imputed alleles are the maximum a posteriori (MAP) alleles.
-
-For efficiency, BEAGLE uses aggregated markers, which are clusters of markers
-within a 0.005 cM interval (default). Because the genotypes are phased,
-the alleles in the aggregated markers form distinct "allele sequences".
-Below, we do not use aggregated markers or allele sequences, which would complicate
-the implementation.
-
-Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm,
-this implementation computes Equation 1 of BB2016. We keep functions used
-in a first attempt to faithfully implement the BEAGLE algorithm for documentation.
-"""
-import numpy as np
-
-
-"""
-TODO: Update use of genetic map.
-TODO: Refactor and clean up this code.
-TODO: Move useful documentation over to beagle_numba.py.
-"""
-
-
-def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=None):
-    """
-    Convert physical positions (bp) to genetic map positions (cM).
-
-    In BEAGLE 4.1, when a genetic map is not specified, it is assumed
-    that the recombination rate is constant (1 cM / 1 Mb).
-
-    If a genetic map is specified, then the genetic map positions are
-    either taken straight from it or interpolated from it. The genetic map
-    needs to contain physical positions and corresponding genetic positions.
-    See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details.
-
-    :param numpy.ndarray pos: Physical positions (bp).
-    :param numpy.ndarray genetic_map_pos: Physical positions of genetic map.
-    :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map.
-    :return: Genetic map positions (cM).
-    :rtype: numpy.ndarray
-    """
-    if genetic_map_pos is None or genetic_map_cm is None:
-        return pos / 1e6
-    # TODO: Better way to pass the components of a genetic map.
-    base_pos = genetic_map_pos
-    gen_pos = genetic_map_cm
-    assert len(base_pos) == len(gen_pos), "Bad genetic map."
-    assert np.all(pos >= base_pos[0]) and np.all(
-        pos <= base_pos[-1]
-    ), "Physical positions outside of genetic map."
-    left_idx = np.searchsorted(base_pos, pos, side="right")
-    xMa = np.zeros(len(pos), dtype=np.float32)
-    bMa = np.zeros(len(pos), dtype=np.float32)
-    fbMfa = np.zeros(len(pos), dtype=np.float32)
-    for i in range(len(pos)):
-        xMa[i] = pos[i] - base_pos[left_idx[i] - 1]
-        bMa[i] = base_pos[left_idx[i]] - base_pos[left_idx[i] - 1]
-        fbMfa[i] = gen_pos[left_idx[i]] - gen_pos[left_idx[i] - 1]
-    return xMa / bMa * fbMfa
-
-
-def get_weights(genotyped_pos, ungenotyped_pos, genetic_map=None):
-    """
-    Get weights at ungenotyped markers in the query haplotype, which are used in
-    linear interpolation of HMM state probabilities at ungenotyped markers.
-
-    In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1
-    at ungenotyped marker x is denoted lambda_m,x.
-
-    lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)],
-    where g(i) = genetic map position of marker i.
-
-    :param numpy.ndarray genotyped_pos: Site positions of genotyped markers.
-    :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers.
-    :param msprime.RateMap genetic_map: Genetic map.
-    :return: Weights and associated marker interval start indices.
-    :rtype: tuple(numpy.ndarray, numpy.ndarray)
-    """
-    MIN_CM_DIST = 1e-7  # See `ImpData.java` in BEAGLE 4.1 source code
-    assert len(genotyped_pos) > 1
-    assert len(ungenotyped_pos) > 0
-    # Check that the two sets are mutually exclusive.
-    m = len(genotyped_pos)
-    x = len(ungenotyped_pos)
-    assert len(np.union1d(genotyped_pos, ungenotyped_pos)) == m + x
-    if genetic_map is None:
-        genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
-        ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos)
-    else:
-        genotyped_cm = genetic_map.get_cumulative_mass(genotyped_pos)
-        ungenotyped_cm = genetic_map.get_cumulative_mass(ungenotyped_pos)
-    # Calculate weights for ungenotyped markers.
-    weights = np.zeros(x, dtype=np.float32)
-    # Identify genotype markers k and k + 1 sandwiching marker ungenotyped marker i.
-    marker_interval_start = np.zeros(x, dtype=np.int32)
-    for i in range(x):
-        if ungenotyped_pos[i] < genotyped_pos[0]:
-            # Ungenotyped marker is before first genotyped marker.
-            k = 0
-            weights[i] = 1.0
-        elif ungenotyped_pos[i] > genotyped_pos[-1]:
-            # Ungenotyped marker is after last genotyped marker.
-            k = m - 1
-            weights[i] = 0.0
-        else:
-            # Ungenotyped marker is between genotyped markers k and k + 1.
-            k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0])  # Inefficient
-            cm_mP1_x = genotyped_cm[k + 1] - ungenotyped_cm[i]
-            cm_mP1_m = genotyped_cm[k + 1] - genotyped_cm[k]
-            if cm_mP1_m == 0:
-                cm_mP1_m = MIN_CM_DIST
-            weights[i] = cm_mP1_x / cm_mP1_m
-        marker_interval_start[i] = k
-    assert 0 <= np.min(weights)
-    assert np.max(weights) <= 1
-    return (weights, marker_interval_start)
-
-
-def get_mismatch_prob(pos, miscall_rate):
-    """
-    Set mismatch probabilities at genotyped markers.
-
-    In BEAGLE, the mismatch probability is dominated by the allelic miscall rate.
-    By default, it is set to 0.0001 and capped at 0.50.
-
-    :param numpy.ndarray pos: Site positions.
-    :param float miscall_rate: Allelic miscall rate.
-    :return: Mismatch probabilities.
-    :rtype: numpy.ndarray
-    """
-    assert isinstance(miscall_rate, float)
-    if miscall_rate >= 0.5:
-        # TODO: Print warning.
-        miscall_rate = 0.5
-    mu = np.zeros(len(pos), dtype=np.float32) + miscall_rate
-    return mu
-
-
-def get_switch_prob(pos, h, ne, genetic_map=None):
-    """
-    Set switch probabilities at genotyped markers.
-
-    Note that in BEAGLE, the default value of `ne` is set to 1e6.
-    If `h` is small and `ne` is large, the switch probability is nearly 1.
-    In such cases, it may be desirable to set `ne` to a small value
-    to avoid switching between haplotypes all the time.
-
-    :param numpy.ndarray pos: Site positions.
-    :param int h: Number of reference haplotypes.
-    :param float ne: Effective population size.
-    :param msprime.RateMap genetic_map: Genetic map.
-    :return: Switch probabilities.
-    :rtype: numpy.ndarray
-    """
-    assert np.min(pos) > 0
-    assert isinstance(h, int)
-    assert isinstance(ne, float)
-    # Get genetic distance between adjacent markers.
-    if genetic_map is None:
-        cm = convert_to_genetic_map_position(pos)
-    else:
-        cm = genetic_map.get_cumulative_mass(pos)
-    d = np.zeros(len(pos), dtype=np.float32)
-    d[0] = cm[0]
-    d[1:] = cm[1:] - cm[:-1]
-    assert len(d) == len(pos)
-    rho = -np.expm1(-(4 * ne * d / h))
-    assert len(d) == len(rho)
-    return rho
-
-
-def compute_forward_matrix(ref_h, query_h, rho, mu):
-    """
-    Implement forward algorithm to compute a forward probablity matrix of size (m, h).
-
-    Reference haplotypes and query haplotype are subsetted to genotyped markers.
-    So, they are a matrix of size (m, h) and an array of size m, respectively.
-
-    :param numpy.ndarray ref_h: Reference haplotypes.
-    :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray rho: Switch probabilities.
-    :param numpy.ndarray mu: Mismatch probabilities.
-    :return: Forward probability matrix.
-    :rtype: numpy.ndarray
-    """
-    alleles = np.arange(4)  # ACGT
-    assert np.all(np.isin(ref_h, alleles))
-    assert np.all(np.isin(query_h, alleles))
-    m = ref_h.shape[0]
-    h = ref_h.shape[1]
-    assert m == len(query_h)
-    assert m == len(rho)
-    assert m == len(mu)
-    assert 0 <= np.min(rho)
-    assert np.max(rho) <= 1
-    assert 0 <= np.min(mu)
-    assert np.max(mu) <= 0.5
-    fm = np.zeros((m, h), dtype=np.float32)
-    last_sum = 1.0  # Part of normalization factor
-    for i in range(m):
-        # Get site-specific parameters.
-        shift = rho[i] / h
-        scale = (1 - rho[i]) / last_sum
-        # Get allele at genotyped marker i on query haplotype.
-        query_a = query_h[i]
-        for j in range(h):
-            # Get allele at genotyped marker i on reference haplotype j.
-            ref_a = ref_h[i, j]
-            # Get emission probability.
-            em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i]
-            fm[i, j] = em_prob
-            if i == 0:
-                fm[i, j] *= 1.0 / h
-            else:
-                fm[i, j] *= scale * fm[i - 1, j] + shift
-        site_sum = np.sum(fm[i, :])
-        last_sum = site_sum / h if i == 0 else site_sum
-    return fm
-
-
-def compute_backward_matrix(ref_h, query_h, rho, mu):
-    """
-    Implement backward algorithm to compute a backward probablity matrix of size (m, h).
-
-    Reference haplotypes and query haplotype are subsetted to genotyped markers.
-    So, they are a matrix of size (m, h) and an array of size m, respectively.
-
-    In BEAGLE 4.1, the values are kept one site at a time.
-    Here, we keep the values at all the sites.
-
-    :param numpy.ndarray ref_h: Reference haplotypes.
-    :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray rho: Switch probabilities.
-    :param numpy.ndarray mu: Mismatch probabilities.
-    :return: Backward probability matrix.
-    :rtype: numpy.ndarray
-    """
-    alleles = np.arange(4)  # ACGT
-    assert np.all(np.isin(ref_h, alleles))
-    assert np.all(np.isin(query_h, alleles))
-    m = ref_h.shape[0]
-    h = ref_h.shape[1]
-    assert m == len(query_h)
-    assert m == len(rho)
-    assert m == len(mu)
-    assert 0 <= np.min(rho)
-    assert np.max(rho) <= 1
-    assert 0 <= np.min(mu)
-    assert np.max(mu) <= 0.5
-    bm = np.zeros((m, h), dtype=np.float32)
-    bm[-1, :] = 1.0 / h  # Initialise the last column
-    for i in range(m - 2, -1, -1):
-        query_a = query_h[i + 1]
-        for j in range(h):
-            ref_a = ref_h[i + 1, j]
-            em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1]
-            # Note that BEAGLE keeps the values at one site at a time.
-            bm[i, j] = bm[i + 1, j] * em_prob
-        sum_site = np.sum(bm[i, :])
-        scale = (1 - rho[i + 1]) / sum_site
-        shift = rho[i + 1] / h
-        bm[i, :] = scale * bm[i, :] + shift
-    return bm
-
-
-def _get_reference_haplotype_segments(a, b):
-    """
-    WARN: This function is part of an attempt to faithfully reimplement
-    the BEAGLE 4.1 algorithm. It is not complete and not used
-    in the current implementation. It is kept for documentation.
-
-    Assuming all biallelic sites, get the index of a reference haplotype segment (i)
-    defined by the alleles a and b at two adjacent genotyped markers.
-
-    #i, a, b
-    0, 0, 0
-    1, 1, 0
-    2, 0, 1
-    3, 1, 1
-
-    See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169
-
-    This is a helper function for `_compute_state_probability_matrix`.
-    """
-    ref_hap_idx = a * 1 + b * 2
-    return ref_hap_idx
-
-
-def _compute_state_prob_matrix(fm, bm, ref_h, query_h, rho, mu):
-    """
-    WARN: This function is part of an attempt to faithfully reimplement
-    the BEAGLE 4.1 algorithm. It is not complete and not used
-    in the current implementation. It is kept for documentation.
-
-    Implement the forward-backward algorithm to compute:
-    1. A HMM state probability matrix across genotyped markers.
-    2. "Forward haplotype probabilities".
-    3. "Backward haplotype probabilities".
-
-    The HMM state probability matrix is of size (m + 1, h).
-
-    Note that the additional column at m + 1 is for interpolation of
-    ungenotyped markers right of the last genotyped marker.
-
-    Both the forward and backward haplotype probability matrices
-    are of size (m, 2), assuming only biallelic sites.
-
-    Reference haplotypes and query haplotype are subsetted to genotyped markers.
-    So, they are a matrix of size (m, h) and an array of size m, respectively.
-
-    This implementation is based on the function `setStateProbs`
-    of `LSHapBaum.java` in BEAGLE. It performs the computation
-    described in Equation 2 of BB2016.
-
-    :param numpy.ndarray fm: Forward probability matrix.
-    :param numpy.ndarray bm: Backward probability matrix.
-    :param numpy.ndarray ref_h: Reference haplotypes.
-    :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray rho: Switch probabilities.
-    :param numpy.ndarray mu: Mismatch probabilities.
-    :return: HMM state probability matrix,
-        forward haplotype probability matrix, and
-        backward haplotype probability matrix.
-    :rtype: tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray)
-    """
-    alleles = np.arange(4)  # ACGT
-    m = fm.shape[0]
-    h = fm.shape[1]
-    assert (m, h) == bm.shape
-    assert not np.any(fm < 0)
-    assert not np.any(np.isnan(fm))
-    assert not np.any(bm < 0)
-    assert not np.any(np.isnan(bm))
-    assert np.all(np.isin(ref_h, alleles))
-    assert np.all(np.isin(query_h, alleles))
-    assert m == len(query_h)
-    assert m == len(rho)
-    assert m == len(mu)
-    assert 0 <= np.min(rho)
-    assert np.max(rho) <= 1
-    assert 0 <= np.min(mu)
-    assert np.max(mu) <= 0.5
-    # m + 1 columns to allow for ungenotyped markers right of the last genotyped marker.
-    sm = np.zeros((m + 1, h), dtype=np.float32)
-    fwd_hap_probs = np.zeros((m, 4), dtype=np.float32)
-    bwd_hap_probs = np.zeros((m, 4), dtype=np.float32)
-    for i in range(m - 1, -1, -1):
-        for j in range(h):
-            sm[i, j] = fm[i, j] * bm[i, j]
-            ref_hap_seg_mP1 = _get_reference_haplotype_segments(
-                ref_h[i, j], ref_h[i + 1, j]
-            )
-            ref_hap_seg_m = _get_reference_haplotype_segments(
-                ref_h[i - 1, j], ref_h[i, j]
-            )
-            fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j]
-            bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j]
-        sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :])
-        fwd_hap_probs[i, :] /= sum_fwd_hap_probs
-        bwd_hap_probs[i, :] /= sum_fwd_hap_probs  # Sum of forward hap probs
-    sm[m, :] = sm[m - 1, :]  # Not in BEAGLE
-    return (sm, fwd_hap_probs, bwd_hap_probs)
-
-
-def _interpolate_allele_prob(
-    fwd_hap_probs, bwd_hap_probs, genotyped_pos, ungenotyped_pos, genetic_map=None
-):
-    """
-    WARN: This function is part of an attempt to faithfully reimplement
-    the BEAGLE 4.1 algorithm. It is not complete and not used
-    in the current implementation. It is kept for documentation.
-
-    Compute the interpolated allele probabilities at the ungenotyped markers.
-
-    The forward and backward haplotype probability matrices are of size (m, 2).
-    The interpolated allele probability matrix is of size (x, 2).
-    We assume all biallelic sites, hence the 2.
-
-    This implementation is based on Equation 2 (the rearranged one) in BB2016.
-
-    :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix.
-    :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix.
-    :param numpy.ndarray genotyped_pos: Site positions of genotyped markers.
-    :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers.
-    :param msprime.RateMap genetic_map: Genetic map.
-    :return: Interpolated allele probabilities.
-    :rtype: numpy.ndarray
-    """
-    m = len(genotyped_pos)
-    x = len(ungenotyped_pos)
-    assert (m, 2) == fwd_hap_probs.shape
-    assert (m, 2) == bwd_hap_probs.shape
-    genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
-    assert m == len(genotyped_cm)
-    ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos)
-    assert x == len(ungenotyped_cm)
-    weights, _ = get_weights(genotyped_pos, ungenotyped_pos, genetic_map=genetic_map)
-    assert x == len(weights)
-    i_hap_probs = np.zeros((x, 2), dtype=np.float32)
-    for i in range(x):
-        # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i.
-        if ungenotyped_pos[i] < genotyped_pos[0]:
-            k = 0
-        elif ungenotyped_pos[i] > genotyped_pos[-1]:
-            k = m - 2
-        else:
-            k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0])
-        # Vectorised over the allelic states.
-        i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :]
-        i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :]
-    return i_hap_probs
-
-
-def compute_state_prob_matrix(fm, bm):
-    """
-    Compute the HMM state probability matrix, in which each element is
-    the probability of a HMM state at a genotyped marker.
-
-    Implementing this is simpler than faithfully reproducing the BEAGLE 4.1 algorithm.
-
-    The output HMM state probability matrix is of size (m + 1, h).
-
-    Note that the additional column at m + 1 is used for interpolation of
-    ungenotyped markers to the right of the last genotyped marker.
-
-    The forward and backward probability matrices are of size (m, h).
-
-    :param numpy.ndarray fm: Forward probability matrix.
-    :param numpy.ndarray bm: Backward probability matrix.
-    :return: HMM state probability matrix.
-    :rtype: numpy.ndarray
-    """
-    assert fm.shape == bm.shape
-    m = fm.shape[0]
-    h = bm.shape[1]
-    assert not np.any(fm < 0)
-    assert not np.any(np.isnan(fm))
-    assert not np.any(bm < 0)
-    assert not np.any(np.isnan(bm))
-    # m + 1 columns to allow for ungenotyped markers right of the last genotyped marker.
-    sm = np.zeros((m + 1, h), dtype=np.float32)
-    sm[:-1, :] = np.multiply(fm, bm)
-    sm[m, :] = sm[m - 1, :]  # Not in BEAGLE
-    return sm
-
-
-def interpolate_allele_probabilities(
-    sm, ref_h, genotyped_pos, ungenotyped_pos, genetic_map=None
-):
-    """
-    Compute the interpolated allele probabilities at ungenotyped markers of
-    a query haplotype following Equation 1 of BB2016.
-
-    The interpolated allele probability matrix is of size (x, a),
-    where a is the number of alleles.
-
-    This function takes the output of `compute_state_prob_matrix`.
-
-    Note that this function takes:
-    1. HMM state probability matrix across genotyped markers of size (m + 1, h).
-    2. reference haplotypes subsetted to ungenotyped markers of size (x, h).
-
-    :param numpy.ndarray sm: HMM state probability matrix at genotyped markers.
-    :param numpy.ndarray ref_h: Reference haplotypes subsetted to ungenotyped markers.
-    :param numpy.ndarray genotyped_pos: Site positions at genotyped markers.
-    :param numpy.ndarray ungenotyped_pos: Site positions at ungenotyped markers.
-    :param msprime.RateMap genetic_map: Genetic map.
-    :return: Interpolated allele probabilities.
-    :rtype: numpy.ndarray
-    """
-    alleles = np.arange(4)  # ACGT
-    assert not np.any(sm < 0)
-    assert not np.any(np.isnan(sm))
-    m = sm.shape[0] - 1
-    h = sm.shape[1]
-    assert m == len(genotyped_pos)
-    x = len(ungenotyped_pos)
-    assert (x, h) == ref_h.shape
-    weights, marker_interval_start = get_weights(
-        genotyped_pos, ungenotyped_pos, genetic_map=genetic_map
-    )
-    assert x == len(weights)
-    assert x == len(marker_interval_start)
-    p = np.zeros((x, len(alleles)), dtype=np.float32)
-    # Compute allele probabilities as per Equation 1 in BB2016.
-    for a in alleles:
-        # Going from left to right.
-        for i in range(x):
-            # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i.
-            k = marker_interval_start[i]
-            # Sum over the reference haplotypes where the allele is a at marker i.
-            has_a = ref_h[i, :] == a
-            p[i, a] += np.sum(weights[i] * sm[k, has_a])
-            p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a])
-    # Rescale probabilities.
-    p_rescaled = p / np.sum(p, axis=1)[:, np.newaxis]
-    return p_rescaled
-
-
-def get_map_alleles(allele_prob):
-    """
-    Compute the maximum a posteriori alleles at ungenotyped markers of a query haplotype.
-
-    The imputed alleles is an array of size x.
-
-    WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen.
-
-    :param numpy.ndarray allele_prob: Interpolated allele probabilities.
-    :return: Imputed alleles and their associated probabilities.
-    :rtype: tuple(numpy.ndarray, numpy.ndarray)
-    """
-    assert np.all(allele_prob >= 0)
-    assert not np.any(np.isnan(allele_prob))
-    imputed_alleles = np.argmax(allele_prob, axis=1)
-    max_allele_prob = np.max(allele_prob, axis=1)
-    assert len(imputed_alleles) == allele_prob.shape[0]
-    assert len(max_allele_prob) == allele_prob.shape[0]
-    return (imputed_alleles, max_allele_prob)
-
-
-def run_beagle(
-    ref_h, query_h, pos, miscall_rate=1e-4, ne=1e6, genetic_map=None, debug=False
-):
-    """
-    Run a simplified version of the BEAGLE 4.1 imputation algorithm
-    based on Equation 1 of BB2016.
-
-    Reference haplotypes and query haplotype are of size (m + x, h) and (m + x),
-    respectively,
-
-    The site positions of all the markers are an array of size (m + x).
-
-    This produces imputed alleles and interpolated allele probabilities
-    at the ungenotyped markers of the query haplotype.
-
-    :param numpy.ndarray ref_h: Reference haplotypes.
-    :param numpy.ndarray query_h: One query haplotype.
-    :param numpy.ndarray pos: Site positions of all the markers.
-    :param float miscall_rate: Allelic miscall rate.
-    :param int ne: Effective population size.
-    :param msprime.RateMap genetic_map: Genetic map.
-    :param bool debug: Whether to print intermediate results.
-    :return: Imputed alleles and their associated probabilities.
-    :rtype: tuple(numpy.ndarray, numpy.ndarray)
-    """
-    # Set precision for printing.
-    if debug:
-        np.set_printoptions(precision=20)
-    assert ref_h.shape[0] == len(pos)
-    assert len(query_h) == len(pos)
-    # Indices of genotyped markers in the query haplotype.
-    genotyped_idx = np.where(query_h != -1)[0]
-    assert len(genotyped_idx) > 1
-    # Indices of ungenotyped markers in the query haplotype.
-    ungenotyped_idx = np.where(query_h == -1)[0]
-    assert len(ungenotyped_idx) > 0
-    # Site positions of genotyped markers.
-    genotyped_pos = pos[genotyped_idx]
-    # Site positions of ungenotyped markers.
-    ungenotyped_pos = pos[ungenotyped_idx]
-    h = ref_h.shape[1]
-    m = len(genotyped_pos)
-    x = len(ungenotyped_pos)
-    assert m + x == len(pos)
-    # Subset reference haplotypes to genotyped markers.
-    ref_h_genotyped = ref_h[genotyped_idx, :]
-    if debug:
-        print("Reference haplotypes subsetted to genotyped markers")
-        print(ref_h_genotyped)
-    assert (m, h) == ref_h_genotyped.shape
-    # Subset query haplotype to genotyped markers.
-    query_h_genotyped = query_h[genotyped_idx]
-    if debug:
-        print("Query haplotype subsetted to genotyped markers")
-        print(query_h_genotyped)
-    assert m == len(query_h_genotyped)
-    # Set mismatch probabilities at genotyped markers.
-    mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate)
-    if debug:
-        print("Mismatch probabilities")
-        print(mu)
-    assert m == len(mu)
-    # Set switch probabilities at genotyped markers.
-    rho = get_switch_prob(genotyped_pos, h, ne=ne)
-    if debug:
-        print("Switch probabilities")
-        print(rho)
-    assert m == len(rho)
-    # Compute forward probability matrix at genotyped markers.
-    fm = compute_forward_matrix(ref_h_genotyped, query_h_genotyped, rho, mu)
-    if debug:
-        print("Forward probability matrix")
-        print(fm)
-    assert (m, h) == fm.shape
-    # Compute backward probability matrix at genotyped markers.
-    bm = compute_backward_matrix(ref_h_genotyped, query_h_genotyped, rho, mu)
-    if debug:
-        print("Backward probability matrix")
-        print(bm)
-    assert (m, h) == bm.shape
-    # Compute HMM state probability matrix at genotyped markers.
-    sm = compute_state_prob_matrix(fm, bm)
-    if debug:
-        print("HMM state probability matrix")
-        print(sm)
-    assert (m + 1, h) == sm.shape
-    # Subset the reference haplotypes to ungenotyped markers.
-    ref_h_ungenotyped = ref_h[ungenotyped_idx, :]
-    # Interpolate allele probabilities at ungenotyped markers.
-    alleles = np.arange(4)  # ACGT
-    i_allele_prob = interpolate_allele_probabilities(
-        sm, ref_h_ungenotyped, genotyped_pos, ungenotyped_pos, genetic_map=genetic_map
-    )
-    if debug:
-        print("Interpolated allele probabilities")
-        print(i_allele_prob)
-    assert (x, len(alleles)) == i_allele_prob.shape
-    # Get MAP alleles at ungenotyped markers.
-    imputed_alleles, max_allele_prob = get_map_alleles(i_allele_prob)
-    if debug:
-        print("Imputed alleles")
-        print(imputed_alleles)
-        print("Max allele probabilities")
-        print(max_allele_prob)
-    assert x == len(imputed_alleles)
-    assert x == len(max_allele_prob)
-    return (imputed_alleles, max_allele_prob)
diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py
index dee2c01f74..7de8667336 100644
--- a/python/tests/beagle_numba.py
+++ b/python/tests/beagle_numba.py
@@ -2,12 +2,9 @@
 Implementation of the BEAGLE 4.1 algorithm to impute alleles by linear interpolation
 of state probabilities at ungenotyped positions in the hidden state probability matrix.
 
-This was implemented while closely consulting the BEAGLE 4.1 paper:
-Browning & Browning (2016). Genotype imputation with millions of reference samples.
-Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020
-
-The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was closely consulted:
-https://faculty.washington.edu/browning/beagle/b4_1.html
+This was implemented while closely consulting the BEAGLE 4.1 paper and source code:
+* Browning & Browning (2016). Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020
+* Source code: https://faculty.washington.edu/browning/beagle/b4_1.html
 
 These notations are used throughout:
 h = number of reference haplotypes.
@@ -15,18 +12,28 @@
 x = number of ungenotyped positions.
 
 This implementation takes the following inputs:
-1. Panel of reference haplotypes in a matrix of size (m + x, h).
-2. One query haplotype in an array of size (m + x).
-3. Physical positions of all the positions in an array of size (m + x).
+* Panel of reference haplotypes in a matrix of size (m + x, h).
+* Query haplotype in an array of size (m + x).
+* Physical positions of all the markers in an array of size (m + x).
+* Genetic map.
 
 In the query haplotype:
-1. The genotyped positions take values of 0, 1, 2, or 3 (ACGT encoding).
-2. The ungenotyped positions take -1.
-
-The forward and backward probability matrices are of size (m, h).
-The state probability matrix is of size (m, h).
-The imputed allele probability matrix is of size (x, 4),
-The imputed alleles are the maximum a posteriori (MAP) alleles.
+* Genotyped positions take values of 0, 1, 2, or 3 (i.e. ACGT encoding).
+* Ungenotyped positions take -1.
+
+The following objects are computed:
+* Forward and backward probability matrices of size (m, h).
+* Hidden state probability matrix of size (m, h).
+* Interpolated state probability matrix of size (x, h).
+* Imputed allele probability matrix of size (x, 4),
+* Imputed alleles as the maximum a posteriori alleles.
+
+The following evaluation metrics are produced in VCF format:
+* Estimated allelic R-squared (AR2).
+* Dosage R-squared (DR2).
+* Estimated allele frequency (AF).
+* Genotype probabilities of 00, 01/10, and 11 (GP).
+* Estimated dosage (DS).
 
 To improve computational efficiency, BEAGLE uses aggregated markers,
 which are clusters of markers within a 0.005 cM interval (default).