From aad036def08210e816fa4c0360a11ecac9491757 Mon Sep 17 00:00:00 2001
From: Shing Zhan <shing.zhan@gmail.com>
Date: Sun, 14 Jan 2024 09:53:10 +0000
Subject: [PATCH] Add GeneticMap class and refactor

---
 python/tests/beagle_numba.py | 98 ++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 50 deletions(-)

diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py
index dd9544dc75..fec799680b 100644
--- a/python/tests/beagle_numba.py
+++ b/python/tests/beagle_numba.py
@@ -6,6 +6,8 @@
 
 This is the numba-fied version of `beagle.py`.
 """
+from dataclasses import dataclass
+
 import numpy as np
 from numba import njit
 
@@ -13,7 +15,22 @@
 import tskit
 
 
-def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=None):
+@dataclass
+class GeneticMap:
+    """
+    A genetic map having:
+    1. Physical positions (bp).
+    2. Genetic map positions (cM).
+    """
+
+    base_pos: np.ndarray
+    gen_pos: np.ndarray
+
+    def __post_init__(self):
+        assert len(self.base_pos) == len(self.gen_pos)
+
+
+def convert_to_genetic_map_positions(pos, genetic_map=None):
     """
     Convert physical positions (bp) to genetic map positions (cM).
 
@@ -26,29 +43,31 @@ def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=No
     See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details.
 
     :param numpy.ndarray pos: Physical positions (bp).
-    :param numpy.ndarray genetic_map_pos: Physical positions of genetic map.
-    :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map.
+    :param GeneticMap genetic_map: Genetic map.
     :return: Genetic map positions (cM).
     :rtype: numpy.ndarray
     """
-    if genetic_map_pos is None or genetic_map_cm is None:
+    if genetic_map is None:
         return pos / 1e6
-    # TODO: Better way to pass the components of a genetic map.
-    base_pos = genetic_map_pos
-    gen_pos = genetic_map_cm
-    assert len(base_pos) == len(gen_pos), "Bad genetic map."
-    assert np.all(pos >= base_pos[0]) and np.all(
-        pos <= base_pos[-1]
+    assert np.all(pos >= genetic_map.base_pos[0]) and np.all(
+        pos <= genetic_map.base_pos[-1]
     ), "Physical positions outside of genetic map."
-    left_idx = np.searchsorted(base_pos, pos, side="right")
+    # Approximate genetic map distances between adjacent base positions.
+    left_idx = np.searchsorted(genetic_map.base_pos, pos, side="right")
     xMa = np.zeros(len(pos), dtype=np.float32)
     bMa = np.zeros(len(pos), dtype=np.float32)
     fbMfa = np.zeros(len(pos), dtype=np.float32)
     for i in range(len(pos)):
-        xMa[i] = pos[i] - base_pos[left_idx[i] - 1]
-        bMa[i] = base_pos[left_idx[i]] - base_pos[left_idx[i] - 1]
-        fbMfa[i] = gen_pos[left_idx[i]] - gen_pos[left_idx[i] - 1]
-    return xMa / bMa * fbMfa
+        xMa[i] = pos[i] - genetic_map.base_pos[left_idx[i] - 1]
+        bMa[i] = (
+            genetic_map.base_pos[left_idx[i]] - genetic_map.base_pos[left_idx[i] - 1]
+        )
+        fbMfa[i] = (
+            genetic_map.gen_pos[left_idx[i]] - genetic_map.gen_pos[left_idx[i] - 1]
+        )
+    est_cM_diff = xMa / bMa * fbMfa
+    # TODO: Calculate cumulative genetic map distances.
+    return est_cM_diff
 
 
 @njit
@@ -338,19 +357,17 @@ def run_beagle(
     pos,
     miscall_rate=1e-4,
     ne=1e6,
-    genetic_map_pos=None,
-    genetic_map_cm=None,
+    genetic_map=None,
 ):
     """
     Run a simplified version of the BEAGLE 4.1 imputation algorithm
     based on Equation 1 of BB2016.
 
-    Reference haplotypes and query haplotype are of size (m + x, h) and (m + x),
-    respectively,
+    Reference haplotypes and query haplotype are of size (m + x, h) and (m + x).
 
-    The site positions of all the markers are an array of size (m + x).
+    The physical positions of all the markers are an array of size (m + x).
 
-    This produces imputed alleles and interpolated allele probabilities
+    This produces imputed alleles and their interpolated probabilities
     at the ungenotyped markers of the query haplotype.
 
     :param numpy.ndarray ref_h: Reference haplotypes.
@@ -358,8 +375,7 @@ def run_beagle(
     :param numpy.ndarray pos: Physical positions of all the markers.
     :param float miscall_rate: Allelic miscall rate.
     :param int ne: Effective population size.
-    :param numpy.ndarray genetic_map_pos: Physical positions of genetic map.
-    :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map.
+    :param GeneticMap genetic_map: Genetic map.
     :return: Imputed alleles and their interpolated probabilities.
     :rtype: tuple(numpy.ndarray, numpy.ndarray)
     """
@@ -370,16 +386,8 @@ def run_beagle(
     genotyped_pos = pos[genotyped_idx]
     ungenotyped_pos = pos[ungenotyped_idx]
     # Get genetic map positions of markers.
-    if genetic_map_pos is None or genetic_map_cm is None:
-        genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
-        ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos)
-    else:
-        genotyped_cm = convert_to_genetic_map_position(
-            genotyped_pos, genetic_map_pos, genetic_map_cm
-        )
-        ungenotyped_cm = convert_to_genetic_map_position(
-            ungenotyped_pos, genetic_map_pos, genetic_map_cm
-        )
+    genotyped_cm = convert_to_genetic_map_positions(genotyped_pos, genetic_map)
+    ungenotyped_cm = convert_to_genetic_map_positions(ungenotyped_pos, genetic_map)
     # Subset haplotypes.
     ref_h_genotyped = ref_h[genotyped_idx, :]
     ref_h_ungenotyped = ref_h[ungenotyped_idx, :]
@@ -413,8 +421,7 @@ def run_tsimpute(
     mu,
     rho,
     precision=22,
-    genetic_map_pos=None,
-    genetic_map_cm=None,
+    genetic_map=None,
 ):
     """
     Run the BEAGLE 4.1 algorithm except that the forward and backward probability
@@ -441,16 +448,8 @@ def run_tsimpute(
     genotyped_pos = pos[genotyped_idx]
     ungenotyped_pos = pos[ungenotyped_idx]
     # Get genetic map positions of markers.
-    if genetic_map_pos is None or genetic_map_cm is None:
-        genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
-        ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos)
-    else:
-        genotyped_cm = convert_to_genetic_map_position(
-            genotyped_pos, genetic_map_pos, genetic_map_cm
-        )
-        ungenotyped_cm = convert_to_genetic_map_position(
-            ungenotyped_pos, genetic_map_pos, genetic_map_cm
-        )
+    genotyped_cm = convert_to_genetic_map_positions(genotyped_pos, genetic_map)
+    ungenotyped_cm = convert_to_genetic_map_positions(ungenotyped_pos, genetic_map)
     # Get parameters at genotyped markers.
     mu = mu[genotyped_idx]
     rho = rho[genotyped_idx]
@@ -459,10 +458,11 @@ def run_tsimpute(
     ref_ts_ungenotyped = ref_ts.delete_sites(site_ids=genotyped_idx)
     ref_h_ungenotyped = ref_ts_ungenotyped.genotype_matrix(alleles=tskit.ALLELES_ACGT)
     query_h_genotyped = query_h[genotyped_idx]
-    # Get forward and backward matrices from tree sequence.
+    # Get matrices from tree sequence.
     fm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence)
     bm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence)
-    # WARN: Be careful with the position arguments rho and mu!!!
+    # WARN: Be careful with the positional arguments rho and mu!!!
+    # WARN: Be careful with the argument `acgt_alleles`!!!
     ls_hmm = _tskit.LsHmm(
         ref_ts_genotyped._ll_tree_sequence,
         rho,
@@ -472,9 +472,8 @@ def run_tsimpute(
     )
     ls_hmm.forward_matrix(query_h_genotyped.T, fm)
     ls_hmm.backward_matrix(query_h_genotyped.T, fm.normalisation_factor, bm)
-    # Compute HMM state probability matrix.
     sm = compute_state_prob_matrix(fm.decode(), bm.decode())
-    # Interpolate allele probabilities.
+    # Perform interpolation.
     i_allele_prob = interpolate_allele_prob(
         sm,
         ref_h_ungenotyped,
@@ -483,6 +482,5 @@ def run_tsimpute(
         genotyped_cm,
         ungenotyped_cm,
     )
-    # Get MAP alleles at ungenotyped markers.
     imputed_alleles, max_allele_prob = get_map_alleles(i_allele_prob)
     return (imputed_alleles, max_allele_prob)