Skip to content

Commit

Permalink
WIP: Allelic R-squared
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Feb 25, 2024
1 parent 9a3132b commit 0f1501d
Showing 1 changed file with 50 additions and 45 deletions.
95 changes: 50 additions & 45 deletions python/tests/beagle_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,18 +577,50 @@ def run_tsimpute(
return (imputed_alleles, max_allele_probs)


def compute_alt_allele_frequencies():
def compute_genotype_probs(alleles_1, allele_probs_1, alleles_2, allele_probs_2):
"""
In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies".
Compute genotype probabilities of diploid individuals at an imputed position.
Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
Note that 0 refers to the major allele and 1 the minor allele.
Allele dosages are as follows: P(00) = 0, P(01 or 10) = 1, P(11) = 2.
In BEAGLE 4.1 output, GP: "Estimated Genotype Probability".
:param numpy.ndarray alleles_1: Imputed alleles for haplotype 1.
:param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
:param numpy.ndarray alleles_2: Imputed alleles for haplotype 2.
:param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
:return: Genotype probabilities.
:rtype: numpy.ndarray(dtype=np.float64)
"""
pass
n = len(alleles_1)
assert len(alleles_2) == n, "Lengths of alleles differ."
assert (
len(allele_probs_1) == n
), "Lengths of alleles and allele probabilities differ."
assert (
len(allele_probs_2) == n
), "Lengths of alleles and allele probabilities differ."
gt_probs = np.zeros((n, 3), dtype=np.float64)
for i in range(n):
ap_hap1_0 = allele_probs_1[i] if alleles_1[i] == 0 else 1 - allele_probs_1[i]
ap_hap1_1 = 1 - ap_hap1_0
ap_hap2_0 = allele_probs_2[i] if alleles_2[i] == 0 else 1 - allele_probs_2[i]
ap_hap2_1 = 1 - ap_hap2_0
gt_probs[i, 0] = ap_hap1_0 * ap_hap2_0 # P(00)
gt_probs[i, 1] = ap_hap1_0 * ap_hap2_1 + ap_hap1_1 * ap_hap2_0 # P(01 or 10)
gt_probs[i, 2] = ap_hap1_1 * ap_hap2_1 # P(11)
return gt_probs


def compute_estimated_allelic_r_squared(
alleles_1, allele_probs_1, alleles_2, allele_probs_2
):
def compute_estimated_allelic_r_squared(gt_probs):
"""
Compute the estimated allelic R^2 for a given imputed site.
Compute the estimated allelic R^2 for an imputed position.
Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
Note that 0 refers to the major allele and 1 the minor allele.
It is not the true allelic R^2, which needs access to true genotypes to compute.
The true allelic R^s is the squared correlation between true and imputed ALT dosages.
Expand All @@ -600,31 +632,17 @@ def compute_estimated_allelic_r_squared(
See formulation in the Appendix 1 of Browning and Browning. (2009).
Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005.
Assume that all sites are biallelic. Otherwise, the calculation below is incorrect.
Note that 0 refers to the major allele and 1 the minor allele.
:param numpy.ndarray alleles_1: Imputed alleles for haplotype 1.
:param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
:param numpy.ndarray alleles_2: Imputed alleles for haplotype 2.
:param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
:return: Estimated allelic R-squared.
:rtype: float
"""
n = len(alleles_1)
assert len(alleles_2) == n, "Lengths of alleles differ."
assert (
len(allele_probs_1) == n
), "Lengths of alleles and allele probabilities differ."
assert (
len(allele_probs_2) == n
), "Lengths of alleles and allele probabilities differ."
# Genotype probabilities for ith sample out of n samples.
# Dossges: P(RR) = 0, P(RA or AR) = 1, P(AA) = 2.
# TODO: Refactor so that genotyped are the input.
y = np.zeros((n, 3), dtype=np.float64)
z = np.argmax(y, axis=1) # Most likely imputed genotype of ith sample.
u = y[:, 1] + 2 * y[:, 2] # E[X | y_i].
w = y[:, 1] + 4 * y[:, 2] # E[X^2 | y_i].
n = len(gt_probs) # Number of individuals.
z = np.argmax(gt_probs, axis=1) # Most likely imputed genotype of ith sample.
u = gt_probs[:, 1] + 2 * gt_probs[:, 2] # E[X | y_i].
w = gt_probs[:, 1] + 4 * gt_probs[:, 2] # E[X^2 | y_i].
term_1 = (np.sum(z * u) - (1 / n) * (np.sum(u) * np.sum(z))) ** 2
term_2 = np.sum(w) - (1 / n) * np.sum(u) ** 2
term_3 = np.sum(z) - (1 / n) * np.sum(z) ** 2
Expand All @@ -649,26 +667,6 @@ def compute_genotype_likelihoods():
pass


def compute_genotype_probs(allele_probs_1, allele_probs_2):
"""
Compute genotype probabilities at ungenotyped positions of an individual.
Take element-wise products of allele probabilities at each ungenotyped positions.
In BEAGLE 4.1 output, GP: "Estimated Genotype Probability".
:param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
:param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
:return: Genotype probabilities.
:rtype: numpy.ndarray(dtype=np.float64)
"""
assert len(allele_probs_1) == len(
allele_probs_2
), "Lengths of allele probabilities differ."
genotype_probs = allele_probs_1 * allele_probs_2
return genotype_probs


def compute_dosage_scores(
imputed_alleles_1, allele_probs_1, imputed_alleles_2, allele_probs_2
):
Expand Down Expand Up @@ -696,6 +694,13 @@ def compute_dosage_scores(
return dosage_scores


def compute_alt_allele_frequencies():
"""
In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies".
"""
pass


def write_vcf(out_file):
"""
:param str out_file: Path to output VCF file.
Expand Down

0 comments on commit 0f1501d

Please sign in to comment.