Skip to content

Commit

Permalink
WIP: Print results to VCF
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Feb 20, 2024
1 parent fad3b8d commit 6a2035d
Showing 1 changed file with 108 additions and 0 deletions.
108 changes: 108 additions & 0 deletions python/tests/beagle_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,3 +571,111 @@ def run_tsimpute(
)
imputed_alleles, max_allele_probs = get_map_alleles(int_allele_probs)
return (imputed_alleles, max_allele_probs)


def compute_alt_allele_frequencies():
"""
In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies".
"""
pass


def compute_allelic_r_squared():
"""
In BEAGLE 4.1, AR2: "Allelic R-Squared: estimated squared correlation
between most probable REF dose and true REF dose".
"""
pass


def compute_dosage_r_squared():
"""
In BEAGLE 4.1, DR2: "Dosage R-Squared: estimated squared correlation
between estimated REF dose [P(RA) + 2 * P(RR)] and true REF dose".
"""
pass


def compute_genotype_likelihoods():
"""
In BEAGLE 4.1, GL: "Log10-scaled Genotype Likelihood".
TODO: Check if BEAGLE 4.1 actually outputs this.
"""
pass


def compute_genotype_probs(allele_probs_1, allele_probs_2):
"""
Compute genotype probabilities at ungenotyped positions of an individual.
Take element-wise products of allele probabilities at each ungenotyped positions.
In BEAGLE 4.1 output, GP: "Estimated Genotype Probability".
:param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
:param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
:return: Genotype probabilities.
:rtype: numpy.ndarray(dtype=np.float64)
"""
assert len(allele_probs_1) == len(
allele_probs_2
), "Lengths of allele probabilities differ."
genotype_probs = allele_probs_1 * allele_probs_2
return genotype_probs


def compute_dosage_scores(
imputed_alleles_1, allele_probs_1, imputed_alleles_2, allele_probs_2
):
"""
Compute dosage scores at ungenotyped positions of an individual.
In BEAGLE 4.1 output, DS: "Estimated ALT dose [P(RA) + P(AA)".
:param numpy.ndarray imputed_alleles_1: Imputed alleles for haplotype 1.
:param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
:param numpy.ndarray imputed_alleles_2: Imputed alleles for haplotype 2.
:param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
:return: Dosage scores at ungenotyped positions.
:rtype: numpy.ndarray(dtype=np.float64)
"""
assert len(imputed_alleles_1) == len(
imputed_alleles_2
), "Lengths of imputed alleles differ."
assert len(allele_probs_1) == len(
allele_probs_2
), "Lengths of allele probabilities differ."
x = len(imputed_alleles_1)
dosage_scores = np.zeros(x, dtype=np.float64)
# TODO: Figure out how BEAGLE does this.
return dosage_scores


def write_vcf(out_file):
"""
:param str out_file: Path to output VCF file.
:return: None
"""
_HEADER = [
"##fileformat=VCFv4.2",
"##INFO=<ID=AF,Number=A,Type=Float,"
+ 'Description="Estimated ALT Allele Frequencies">',
"##INFO=<ID=AR2,Number=1,Type=Float,"
+ 'Description="Allelic R-Squared: estimated squared correlation '
+ 'between most probable REF dose and true REF dose">',
"##INFO=<ID=DR2,Number=1,Type=Float,"
+ 'Description="Dosage R-Squared: estimated squared correlation '
+ 'between estimated REF dose [P(RA) + 2*P(RR)] and true REF dose">',
'##INFO=<ID=IMP,Number=0,Type=Flag,Description="Imputed marker">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
"##FORMAT=<ID=DS,Number=A,Type=Float,"
+ 'Description="estimated ALT dose [P(RA) + P(AA)]">',
"##FORMAT=<ID=GL,Number=G,Type=Float,"
+ 'Description="Log10-scaled Genotype Likelihood">',
"##FORMAT=<ID=GP,Number=G,Type=Float,"
+ 'Description="Estimated Genotype Probability">',
]
with open(out_file, "w") as f:
for line in _HEADER:
f.write(line + "\n")

0 comments on commit 6a2035d

Please sign in to comment.