Skip to content

Commit

Permalink
updated the api to allow tree testing in tskit
Browse files Browse the repository at this point in the history
  • Loading branch information
astheeggeggs committed Aug 29, 2022
1 parent 62237a0 commit b565924
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 4 deletions.
2 changes: 1 addition & 1 deletion lshmm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
Viterbi algorithms on haploid or diploid genotype data.
"""

from .api import backwards, forwards, viterbi, checks
from .api import backwards, checks, forwards, path_ll, viterbi
85 changes: 82 additions & 3 deletions lshmm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
backwards_viterbi_dip,
forwards_viterbi_dip_low_mem,
get_phased_path,
path_ll_dip,
)
from .vit_haploid_variants_samples import (
backwards_viterbi_hap,
forwards_viterbi_hap_lower_mem_rescaling,
path_ll_hap,
)

EQUAL_BOTH_HOM = 4
Expand Down Expand Up @@ -200,7 +202,7 @@ def forwards(
e = np.zeros((m, 8))
e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2
e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2
e[:, BOTH_HET] = 1 - mutation_rate
e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2
e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate)
e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate)

Expand Down Expand Up @@ -311,7 +313,7 @@ def backwards(
e = np.zeros((m, 8))
e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2
e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2
e[:, BOTH_HET] = 1 - mutation_rate
e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2
e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate)
e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate)

Expand Down Expand Up @@ -420,7 +422,7 @@ def viterbi(
e = np.zeros((m, 8))
e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2
e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2
e[:, BOTH_HET] = 1 - mutation_rate
e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2
e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate)
e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate)

Expand All @@ -431,3 +433,80 @@ def viterbi(
most_likely_path = get_phased_path(n, unphased_path)

return most_likely_path, log_likelihood


# Finally, need to include a function to evaluate the likelihood of a given path


def path_ll(
reference_panel,
query,
path,
recombination_rate,
alleles=None,
mutation_rate=None,
scale_mutation_based_on_n_alleles=True,
):

n, m, ploidy = checks(
reference_panel,
query,
mutation_rate,
recombination_rate,
scale_mutation_based_on_n_alleles,
)
# Check alleles should go in here, and mofify e before passing to the algorithm
# If alleles is not passed, we don't perform a test of alleles, but set n_alleles based on the reference_panel.
if alleles is None:
n_alleles = np.int8(
[
len(np.unique(np.append(reference_panel[j, :], query[:, j])))
for j in range(reference_panel.shape[0])
]
)
else:
n_alleles = check_alleles(alleles, m)

if mutation_rate is None:
# Set the mutation rate to be the proposed mutation rate in Li and Stephens (2003).
theta_tilde = 1 / np.sum([1 / k for k in range(1, n - 1)])
mutation_rate = 0.5 * (theta_tilde / (n + theta_tilde))

if ploidy == 1:
# Haploid
# Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector.
e = np.zeros((m, 2))

if scale_mutation_based_on_n_alleles:
# Scale mutation based on the number of alleles - so the mutation rate is the mutation rate to one of the alleles.
# The overall mutation rate is then (n_alleles - 1) * mutation_rate.
e[:, 0] = mutation_rate - mutation_rate * np.equal(
n_alleles, np.ones(m)
) # Added boolean in case we're at an invariant site
e[:, 1] = 1 - (n_alleles - 1) * mutation_rate
else:
# No scaling based on the number of alleles - so the mutation rate is the mutation rate to anything.
# Which means that we must rescale the mutation rate to a different allele, by the number of alleles.
for j in range(m):
if n_alleles[j] == 1: # In case we're at an invariant site
e[j, 0] = 0
e[j, 1] = 1
else:
e[j, 0] = mutation_rate[j] / (n_alleles[j] - 1)
e[j, 1] = 1 - mutation_rate[j]

ll = path_ll_hap(n, m, reference_panel, path, query, e, recombination_rate)
else:
# Diploid
# Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector.
# DEV: there's a wrinkle here.
e = np.zeros((m, 8))
e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2
e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2
e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2
e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate)
e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate)

ll = path_ll_dip(n, m, reference_panel, path, query, e, recombination_rate)

return ll

0 comments on commit b565924

Please sign in to comment.