From 0f1501d54be1326c79c6ea4affd9c90598cebc6c Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Sun, 25 Feb 2024 13:22:19 +0000 Subject: [PATCH] WIP: Allelic R-squared --- python/tests/beagle_numba.py | 95 +++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 45 deletions(-) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py index ff0553a4ca..8cd04b34f5 100644 --- a/python/tests/beagle_numba.py +++ b/python/tests/beagle_numba.py @@ -577,18 +577,50 @@ def run_tsimpute( return (imputed_alleles, max_allele_probs) -def compute_alt_allele_frequencies(): +def compute_genotype_probs(alleles_1, allele_probs_1, alleles_2, allele_probs_2): """ - In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies". + Compute genotype probabilities of diploid individuals at an imputed position. + + Assume that all sites are biallelic. Otherwise, the calculation below is incorrect. + Note that 0 refers to the major allele and 1 the minor allele. + + Allele dosages are as follows: P(00) = 0, P(01 or 10) = 1, P(11) = 2. + + In BEAGLE 4.1 output, GP: "Estimated Genotype Probability". + + :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1. + :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. + :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2. + :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2. + :return: Genotype probabilities. + :rtype: numpy.ndarray(dtype=np.float64) """ - pass + n = len(alleles_1) + assert len(alleles_2) == n, "Lengths of alleles differ." + assert ( + len(allele_probs_1) == n + ), "Lengths of alleles and allele probabilities differ." + assert ( + len(allele_probs_2) == n + ), "Lengths of alleles and allele probabilities differ." + gt_probs = np.zeros((n, 3), dtype=np.float64) + for i in range(n): + ap_hap1_0 = allele_probs_1[i] if alleles_1[i] == 0 else 1 - allele_probs_1[i] + ap_hap1_1 = 1 - ap_hap1_0 + ap_hap2_0 = allele_probs_2[i] if alleles_2[i] == 0 else 1 - allele_probs_2[i] + ap_hap2_1 = 1 - ap_hap2_0 + gt_probs[i, 0] = ap_hap1_0 * ap_hap2_0 # P(00) + gt_probs[i, 1] = ap_hap1_0 * ap_hap2_1 + ap_hap1_1 * ap_hap2_0 # P(01 or 10) + gt_probs[i, 2] = ap_hap1_1 * ap_hap2_1 # P(11) + return gt_probs -def compute_estimated_allelic_r_squared( - alleles_1, allele_probs_1, alleles_2, allele_probs_2 -): +def compute_estimated_allelic_r_squared(gt_probs): """ - Compute the estimated allelic R^2 for a given imputed site. + Compute the estimated allelic R^2 for an imputed position. + + Assume that all sites are biallelic. Otherwise, the calculation below is incorrect. + Note that 0 refers to the major allele and 1 the minor allele. It is not the true allelic R^2, which needs access to true genotypes to compute. The true allelic R^s is the squared correlation between true and imputed ALT dosages. @@ -600,9 +632,6 @@ def compute_estimated_allelic_r_squared( See formulation in the Appendix 1 of Browning and Browning. (2009). Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005. - Assume that all sites are biallelic. Otherwise, the calculation below is incorrect. - Note that 0 refers to the major allele and 1 the minor allele. - :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1. :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2. @@ -610,21 +639,10 @@ def compute_estimated_allelic_r_squared( :return: Estimated allelic R-squared. :rtype: float """ - n = len(alleles_1) - assert len(alleles_2) == n, "Lengths of alleles differ." - assert ( - len(allele_probs_1) == n - ), "Lengths of alleles and allele probabilities differ." - assert ( - len(allele_probs_2) == n - ), "Lengths of alleles and allele probabilities differ." - # Genotype probabilities for ith sample out of n samples. - # Dossges: P(RR) = 0, P(RA or AR) = 1, P(AA) = 2. - # TODO: Refactor so that genotyped are the input. - y = np.zeros((n, 3), dtype=np.float64) - z = np.argmax(y, axis=1) # Most likely imputed genotype of ith sample. - u = y[:, 1] + 2 * y[:, 2] # E[X | y_i]. - w = y[:, 1] + 4 * y[:, 2] # E[X^2 | y_i]. + n = len(gt_probs) # Number of individuals. + z = np.argmax(gt_probs, axis=1) # Most likely imputed genotype of ith sample. + u = gt_probs[:, 1] + 2 * gt_probs[:, 2] # E[X | y_i]. + w = gt_probs[:, 1] + 4 * gt_probs[:, 2] # E[X^2 | y_i]. term_1 = (np.sum(z * u) - (1 / n) * (np.sum(u) * np.sum(z))) ** 2 term_2 = np.sum(w) - (1 / n) * np.sum(u) ** 2 term_3 = np.sum(z) - (1 / n) * np.sum(z) ** 2 @@ -649,26 +667,6 @@ def compute_genotype_likelihoods(): pass -def compute_genotype_probs(allele_probs_1, allele_probs_2): - """ - Compute genotype probabilities at ungenotyped positions of an individual. - - Take element-wise products of allele probabilities at each ungenotyped positions. - - In BEAGLE 4.1 output, GP: "Estimated Genotype Probability". - - :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. - :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2. - :return: Genotype probabilities. - :rtype: numpy.ndarray(dtype=np.float64) - """ - assert len(allele_probs_1) == len( - allele_probs_2 - ), "Lengths of allele probabilities differ." - genotype_probs = allele_probs_1 * allele_probs_2 - return genotype_probs - - def compute_dosage_scores( imputed_alleles_1, allele_probs_1, imputed_alleles_2, allele_probs_2 ): @@ -696,6 +694,13 @@ def compute_dosage_scores( return dosage_scores +def compute_alt_allele_frequencies(): + """ + In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies". + """ + pass + + def write_vcf(out_file): """ :param str out_file: Path to output VCF file.