WIP: Print results to VCF

szhan · Feb 20, 2024 · 6a2035d · 6a2035d
1 parent fad3b8d
commit 6a2035d
Showing 1 changed file with 108 additions and 0 deletions.
diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py
@@ -571,3 +571,111 @@ def run_tsimpute(
     )
     imputed_alleles, max_allele_probs = get_map_alleles(int_allele_probs)
     return (imputed_alleles, max_allele_probs)
+
+
+def compute_alt_allele_frequencies():
+    """
+    In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies".
+    """
+    pass
+
+
+def compute_allelic_r_squared():
+    """
+    In BEAGLE 4.1, AR2: "Allelic R-Squared: estimated squared correlation
+    between most probable REF dose and true REF dose".
+    """
+    pass
+
+
+def compute_dosage_r_squared():
+    """
+    In BEAGLE 4.1, DR2: "Dosage R-Squared: estimated squared correlation
+    between estimated REF dose [P(RA) + 2 * P(RR)] and true REF dose".
+    """
+    pass
+
+
+def compute_genotype_likelihoods():
+    """
+    In BEAGLE 4.1, GL: "Log10-scaled Genotype Likelihood".
+
+    TODO: Check if BEAGLE 4.1 actually outputs this.
+    """
+    pass
+
+
+def compute_genotype_probs(allele_probs_1, allele_probs_2):
+    """
+    Compute genotype probabilities at ungenotyped positions of an individual.
+
+    Take element-wise products of allele probabilities at each ungenotyped positions.
+
+    In BEAGLE 4.1 output, GP: "Estimated Genotype Probability".
+
+    :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
+    :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
+    :return: Genotype probabilities.
+    :rtype: numpy.ndarray(dtype=np.float64)
+    """
+    assert len(allele_probs_1) == len(
+        allele_probs_2
+    ), "Lengths of allele probabilities differ."
+    genotype_probs = allele_probs_1 * allele_probs_2
+    return genotype_probs
+
+
+def compute_dosage_scores(
+    imputed_alleles_1, allele_probs_1, imputed_alleles_2, allele_probs_2
+):
+    """
+    Compute dosage scores at ungenotyped positions of an individual.
+
+    In BEAGLE 4.1 output, DS: "Estimated ALT dose [P(RA) + P(AA)".
+
+    :param numpy.ndarray imputed_alleles_1: Imputed alleles for haplotype 1.
+    :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1.
+    :param numpy.ndarray imputed_alleles_2: Imputed alleles for haplotype 2.
+    :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2.
+    :return: Dosage scores at ungenotyped positions.
+    :rtype: numpy.ndarray(dtype=np.float64)
+    """
+    assert len(imputed_alleles_1) == len(
+        imputed_alleles_2
+    ), "Lengths of imputed alleles differ."
+    assert len(allele_probs_1) == len(
+        allele_probs_2
+    ), "Lengths of allele probabilities differ."
+    x = len(imputed_alleles_1)
+    dosage_scores = np.zeros(x, dtype=np.float64)
+    # TODO: Figure out how BEAGLE does this.
+    return dosage_scores
+
+
+def write_vcf(out_file):
+    """
+    :param str out_file: Path to output VCF file.
+    :return: None
+    """
+    _HEADER = [
+        "##fileformat=VCFv4.2",
+        "##INFO=<ID=AF,Number=A,Type=Float,"
+        + 'Description="Estimated ALT Allele Frequencies">',
+        "##INFO=<ID=AR2,Number=1,Type=Float,"
+        + 'Description="Allelic R-Squared: estimated squared correlation '
+        + 'between most probable REF dose and true REF dose">',
+        "##INFO=<ID=DR2,Number=1,Type=Float,"
+        + 'Description="Dosage R-Squared: estimated squared correlation '
+        + 'between estimated REF dose [P(RA) + 2*P(RR)] and true REF dose">',
+        '##INFO=<ID=IMP,Number=0,Type=Flag,Description="Imputed marker">',
+        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+        "##FORMAT=<ID=DS,Number=A,Type=Float,"
+        + 'Description="estimated ALT dose [P(RA) + P(AA)]">',
+        "##FORMAT=<ID=GL,Number=G,Type=Float,"
+        + 'Description="Log10-scaled Genotype Likelihood">',
+        "##FORMAT=<ID=GP,Number=G,Type=Float,"
+        + 'Description="Estimated Genotype Probability">',
+    ]
+    with open(out_file, "w") as f:
+        for line in _HEADER:
+            f.write(line + "\n")