From 0f1501d54be1326c79c6ea4affd9c90598cebc6c Mon Sep 17 00:00:00 2001
From: Shing Zhan <shing.zhan@gmail.com>
Date: Sun, 25 Feb 2024 13:22:19 +0000
Subject: [PATCH] WIP: Allelic R-squared

---
 python/tests/beagle_numba.py | 95 +++++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 45 deletions(-)

diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py
index ff0553a4ca..8cd04b34f5 100644
--- a/python/tests/beagle_numba.py
+++ b/python/tests/beagle_numba.py
@@ -577,18 +577,50 @@ def run_tsimpute(
     return (imputed_alleles, max_allele_probs)
 
 
-def compute_alt_allele_frequencies():
+def compute_genotype_probs(alleles_1, allele_probs_1, alleles_2, allele_probs_2):
     """
-    In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies".
+    Compute genotype probabilities of diploid individuals at an imputed position.
+
+    Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
+    Note that 0 refers to the major allele and 1 the minor allele.
+
+    Allele dosages are as follows: P(00) = 0, P(01 or 10) = 1, P(11) = 2.
+
+    In BEAGLE 4.1 output, GP: "Estimated Genotype Probability".
+
+    :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1.
+    :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
+    :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2.
+    :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
+    :return: Genotype probabilities.
+    :rtype: numpy.ndarray(dtype=np.float64)
     """
-    pass
+    n = len(alleles_1)
+    assert len(alleles_2) == n, "Lengths of alleles differ."
+    assert (
+        len(allele_probs_1) == n
+    ), "Lengths of alleles and allele probabilities differ."
+    assert (
+        len(allele_probs_2) == n
+    ), "Lengths of alleles and allele probabilities differ."
+    gt_probs = np.zeros((n, 3), dtype=np.float64)
+    for i in range(n):
+        ap_hap1_0 = allele_probs_1[i] if alleles_1[i] == 0 else 1 - allele_probs_1[i]
+        ap_hap1_1 = 1 - ap_hap1_0
+        ap_hap2_0 = allele_probs_2[i] if alleles_2[i] == 0 else 1 - allele_probs_2[i]
+        ap_hap2_1 = 1 - ap_hap2_0
+        gt_probs[i, 0] = ap_hap1_0 * ap_hap2_0  # P(00)
+        gt_probs[i, 1] = ap_hap1_0 * ap_hap2_1 + ap_hap1_1 * ap_hap2_0  # P(01 or 10)
+        gt_probs[i, 2] = ap_hap1_1 * ap_hap2_1  # P(11)
+    return gt_probs
 
 
-def compute_estimated_allelic_r_squared(
-    alleles_1, allele_probs_1, alleles_2, allele_probs_2
-):
+def compute_estimated_allelic_r_squared(gt_probs):
     """
-    Compute the estimated allelic R^2 for a given imputed site.
+    Compute the estimated allelic R^2 for an imputed position.
+
+    Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
+    Note that 0 refers to the major allele and 1 the minor allele.
 
     It is not the true allelic R^2, which needs access to true genotypes to compute.
     The true allelic R^s is the squared correlation between true and imputed ALT dosages.
@@ -600,9 +632,6 @@ def compute_estimated_allelic_r_squared(
     See formulation in the Appendix 1 of Browning and Browning. (2009).
     Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005.
 
-    Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
-    Note that 0 refers to the major allele and 1 the minor allele.
-
     :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1.
     :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
     :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2.
@@ -610,21 +639,10 @@ def compute_estimated_allelic_r_squared(
     :return: Estimated allelic R-squared.
     :rtype: float
     """
-    n = len(alleles_1)
-    assert len(alleles_2) == n, "Lengths of alleles differ."
-    assert (
-        len(allele_probs_1) == n
-    ), "Lengths of alleles and allele probabilities differ."
-    assert (
-        len(allele_probs_2) == n
-    ), "Lengths of alleles and allele probabilities differ."
-    # Genotype probabilities for ith sample out of n samples.
-    # Dossges: P(RR) = 0, P(RA or AR) = 1, P(AA) = 2.
-    # TODO: Refactor so that genotyped are the input.
-    y = np.zeros((n, 3), dtype=np.float64)
-    z = np.argmax(y, axis=1)  # Most likely imputed genotype of ith sample.
-    u = y[:, 1] + 2 * y[:, 2]  # E[X | y_i].
-    w = y[:, 1] + 4 * y[:, 2]  # E[X^2 | y_i].
+    n = len(gt_probs)  # Number of individuals.
+    z = np.argmax(gt_probs, axis=1)  # Most likely imputed genotype of ith sample.
+    u = gt_probs[:, 1] + 2 * gt_probs[:, 2]  # E[X | y_i].
+    w = gt_probs[:, 1] + 4 * gt_probs[:, 2]  # E[X^2 | y_i].
     term_1 = (np.sum(z * u) - (1 / n) * (np.sum(u) * np.sum(z))) ** 2
     term_2 = np.sum(w) - (1 / n) * np.sum(u) ** 2
     term_3 = np.sum(z) - (1 / n) * np.sum(z) ** 2
@@ -649,26 +667,6 @@ def compute_genotype_likelihoods():
     pass
 
 
-def compute_genotype_probs(allele_probs_1, allele_probs_2):
-    """
-    Compute genotype probabilities at ungenotyped positions of an individual.
-
-    Take element-wise products of allele probabilities at each ungenotyped positions.
-
-    In BEAGLE 4.1 output, GP: "Estimated Genotype Probability".
-
-    :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
-    :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
-    :return: Genotype probabilities.
-    :rtype: numpy.ndarray(dtype=np.float64)
-    """
-    assert len(allele_probs_1) == len(
-        allele_probs_2
-    ), "Lengths of allele probabilities differ."
-    genotype_probs = allele_probs_1 * allele_probs_2
-    return genotype_probs
-
-
 def compute_dosage_scores(
     imputed_alleles_1, allele_probs_1, imputed_alleles_2, allele_probs_2
 ):
@@ -696,6 +694,13 @@ def compute_dosage_scores(
     return dosage_scores
 
 
+def compute_alt_allele_frequencies():
+    """
+    In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies".
+    """
+    pass
+
+
 def write_vcf(out_file):
     """
     :param str out_file: Path to output VCF file.