Skip to content

Commit

Permalink
Merge pull request #41 from szhan/noncopy
Browse files Browse the repository at this point in the history
Implement NONCOPY for both the haploid and diploid cases
  • Loading branch information
astheeggeggs authored May 22, 2024
2 parents 3c9cdd3 + 338bcd8 commit 8a8eba4
Show file tree
Hide file tree
Showing 9 changed files with 591 additions and 270 deletions.
40 changes: 37 additions & 3 deletions lshmm/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MISSING_INDEX = 3

MISSING = -1
NONCOPY = -2


""" Helper functions. """
Expand Down Expand Up @@ -72,9 +73,9 @@ def check_alleles(alleles, num_sites):
If alleles is a list of strings, then each string represents distinct alleles
at a site, and each character in a string represents a distinct allele.
It is assumed that MISSING is not encoded in these strings.
It is assumed that MISSING and NONCOPY are not encoded in these strings.
Note MISSING values in the allele lists are excluded from the counts.
Note that MISSING and NONCOPY values are excluded from the counts.
:param list alleles: A list of lists of alleles (or strings).
:param int num_sites: Number of sites.
Expand All @@ -88,7 +89,7 @@ def check_alleles(alleles, num_sites):
if isinstance(alleles[0], str):
return np.int8([len(alleles) for _ in range(num_sites)])
# Otherwise, process allele lists.
exclusion_set = np.array([MISSING])
exclusion_set = np.array([MISSING, NONCOPY])
num_alleles = np.zeros(num_sites, dtype=np.int32)
for i in range(num_sites):
uniq_alleles = np.unique(alleles[i])
Expand Down Expand Up @@ -165,3 +166,36 @@ def get_emission_matrix_diploid(mu, num_sites):
e[:, REF_HET_OBS_HOM] = mu * (1 - mu)
e[:, MISSING_INDEX] = 1
return e


@jit.numba_njit
def get_emission_probability_haploid(ref_allele, query_allele, site, emission_matrix):
if ref_allele == NONCOPY:
return 0.0
else:
emission_index = get_index_in_emission_matrix_haploid(ref_allele, query_allele)
return emission_matrix[site, emission_index]


@jit.numba_njit
def get_emission_probability_diploid(ref_allele, query_allele, site, emission_matrix):
if ref_allele == NONCOPY:
return 0.0
else:
emission_index = get_index_in_emission_matrix_diploid(ref_allele, query_allele)
return emission_matrix[site, emission_index]


@jit.numba_njit
def get_emission_probability_diploid_G(ref_G, query_allele, site, emission_matrix):
emission_probs = np.zeros(ref_G.shape, dtype=np.float64)
for i in range(len(ref_G)):
for j in range(len(ref_G)):
if ref_G[i, j] == NONCOPY:
emission_probs[i, j] = 0.0
else:
emission_index = get_index_in_emission_matrix_diploid(
ref_G[i, j], query_allele
)
emission_probs[i, j] = emission_matrix[site, emission_index]
return emission_probs
111 changes: 68 additions & 43 deletions lshmm/fb_diploid.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,25 @@ def forwards_ls_dip(n, m, G, s, e, r, norm=True):
c = np.ones(m)
r_n = r / n

emission_index = core.get_index_in_emission_matrix_diploid_G(
ref_G=G[0, :, :], query_allele=s[0, 0], n=n
emission_probs = core.get_emission_probability_diploid_G(
ref_G=G[0, :, :],
query_allele=s[0, 0],
site=0,
emission_matrix=e,
)
F[0, :, :] *= e[0, emission_index]
F[0, :, :] *= emission_probs

if norm:
c[0] = np.sum(F[0, :, :])
F[0, :, :] *= 1 / c[0]

# Forwards
for l in range(1, m):
emission_index = core.get_index_in_emission_matrix_diploid_G(
ref_G=G[l, :, :], query_allele=s[0, l], n=n
emission_probs = core.get_emission_probability_diploid_G(
ref_G=G[l, :, :],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)

# No change in both
Expand All @@ -43,16 +49,19 @@ def forwards_ls_dip(n, m, G, s, e, r, norm=True):
F[l, :, :] += ((1 - r[l]) * r_n[l]) * (sum_j + sum_j.T)

# Emission
F[l, :, :] *= e[l, emission_index]
F[l, :, :] *= emission_probs
c[l] = np.sum(F[l, :, :])
F[l, :, :] *= 1 / c[l]

ll = np.sum(np.log10(c))
else:
# Forwards
for l in range(1, m):
emission_index = core.get_index_in_emission_matrix_diploid_G(
ref_G=G[l, :, :], query_allele=s[0, l], n=n
emission_probs = core.get_emission_probability_diploid_G(
ref_G=G[l, :, :],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)

# No change in both
Expand All @@ -67,7 +76,7 @@ def forwards_ls_dip(n, m, G, s, e, r, norm=True):
F[l, :, :] += ((1 - r[l]) * r_n[l]) * (sum_j + sum_j.T)

# Emission
F[l, :, :] *= e[l, emission_index]
F[l, :, :] *= emission_probs

ll = np.log10(np.sum(F[l, :, :]))

Expand All @@ -83,27 +92,22 @@ def backwards_ls_dip(n, m, G, s, e, c, r):

# Backwards
for l in range(m - 2, -1, -1):
emission_index = core.get_index_in_emission_matrix_diploid_G(
ref_G=G[l + 1, :, :], query_allele=s[0, l + 1], n=n
emission_probs = core.get_emission_probability_diploid_G(
ref_G=G[l + 1, :, :],
query_allele=s[0, l + 1],
site=l + 1,
emission_matrix=e,
)

# No change in both
B[l, :, :] = r_n[l + 1] ** 2 * np.sum(
e[l + 1, emission_index.reshape((n, n))] * B[l + 1, :, :]
)
B[l, :, :] = r_n[l + 1] ** 2 * np.sum(emission_probs * B[l + 1, :, :])

# Both change
B[l, :, :] += (
(1 - r[l + 1]) ** 2
* B[l + 1, :, :]
* e[l + 1, emission_index.reshape((n, n))]
)
B[l, :, :] += (1 - r[l + 1]) ** 2 * B[l + 1, :, :] * emission_probs

# One changes
sum_j = (
core.np_sum(B[l + 1, :, :] * e[l + 1, emission_index], 0)
.repeat(n)
.reshape((-1, n))
core.np_sum(B[l + 1, :, :] * emission_probs, 0).repeat(n).reshape((-1, n))
)
B[l, :, :] += ((1 - r[l + 1]) * r_n[l + 1]) * (sum_j + sum_j.T)
B[l, :, :] *= 1 / c[l + 1]
Expand All @@ -121,10 +125,13 @@ def forward_ls_dip_starting_point(n, m, G, s, e, r):
for j1 in range(n):
for j2 in range(n):
F[0, j1, j2] = 1 / (n**2)
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[0, j1, j2], query_allele=s[0, 0]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[0, j1, j2],
query_allele=s[0, 0],
site=0,
emission_matrix=e,
)
F[0, j1, j2] *= e[0, emission_index]
F[0, j1, j2] *= emission_prob

for l in range(1, m):
F_no_change = np.zeros((n, n))
Expand Down Expand Up @@ -162,10 +169,13 @@ def forward_ls_dip_starting_point(n, m, G, s, e, r):

for j1 in range(n):
for j2 in range(n):
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[l, j1, j2], query_allele=s[0, l]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[l, j1, j2],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)
F[l, j1, j2] *= e[l, emission_index]
F[l, j1, j2] *= emission_prob

ll = np.log10(np.sum(F[l, :, :]))

Expand All @@ -190,10 +200,13 @@ def backward_ls_dip_starting_point(n, m, G, s, e, r):
e_tmp = np.zeros((n, n))
for j1 in range(n):
for j2 in range(n):
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[l + 1, j1, j2], query_allele=s[0, l + 1]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[l + 1, j1, j2],
query_allele=s[0, l + 1],
site=l + 1,
emission_matrix=e,
)
e_tmp[j1, j2] = e[l + 1, emission_index]
e_tmp[j1, j2] = emission_prob

for j1 in range(n):
for j2 in range(n):
Expand Down Expand Up @@ -244,10 +257,13 @@ def forward_ls_dip_loop(n, m, G, s, e, r, norm=True):
for j1 in range(n):
for j2 in range(n):
F[0, j1, j2] = 1 / (n**2)
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[0, j1, j2], query_allele=s[0, 0]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[0, j1, j2],
query_allele=s[0, 0],
site=0,
emission_matrix=e,
)
F[0, j1, j2] *= e[0, emission_index]
F[0, j1, j2] *= emission_prob
r_n = r / n
c = np.ones(m)

Expand All @@ -274,10 +290,13 @@ def forward_ls_dip_loop(n, m, G, s, e, r, norm=True):

for j1 in range(n):
for j2 in range(n):
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[l, j1, j2], query_allele=s[0, l]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[l, j1, j2],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)
F[l, j1, j2] *= e[l, emission_index]
F[l, j1, j2] *= emission_prob

c[l] = np.sum(F[l, :, :])
F[l, :, :] *= 1 / c[l]
Expand Down Expand Up @@ -308,10 +327,13 @@ def forward_ls_dip_loop(n, m, G, s, e, r, norm=True):

for j1 in range(n):
for j2 in range(n):
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[l, j1, j2], query_allele=s[0, l]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[l, j1, j2],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)
F[l, j1, j2] *= e[l, emission_index]
F[l, j1, j2] *= emission_prob

ll = np.log10(np.sum(F[l, :, :]))

Expand Down Expand Up @@ -340,10 +362,13 @@ def backward_ls_dip_loop(n, m, G, s, e, c, r):
e_tmp = np.zeros((n, n))
for j1 in range(n):
for j2 in range(n):
emission_index = core.get_index_in_emission_matrix_diploid(
ref_allele=G[l + 1, j1, j2], query_allele=s[0, l + 1]
emission_prob = core.get_emission_probability_diploid(
ref_allele=G[l + 1, j1, j2],
query_allele=s[0, l + 1],
site=l + 1,
emission_matrix=e,
)
e_tmp[j1, j2] = e[l + 1, emission_index]
e_tmp[j1, j2] = emission_prob

for j1 in range(n):
for j2 in range(n):
Expand Down
45 changes: 30 additions & 15 deletions lshmm/fb_haploid.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,13 @@ def forwards_ls_hap(n, m, H, s, e, r, norm=True):
if norm:
c = np.zeros(m)
for i in range(n):
emission_index = core.get_index_in_emission_matrix_haploid(
ref_allele=H[0, i], query_allele=s[0, 0]
emission_prob = core.get_emission_probability_haploid(
ref_allele=H[0, i],
query_allele=s[0, 0],
site=0,
emission_matrix=e,
)
F[0, i] = 1 / n * e[0, emission_index]
F[0, i] = 1 / n * emission_prob
c[0] += F[0, i]

for i in range(n):
Expand All @@ -35,10 +38,13 @@ def forwards_ls_hap(n, m, H, s, e, r, norm=True):
for l in range(1, m):
for i in range(n):
F[l, i] = F[l - 1, i] * (1 - r[l]) + r_n[l]
emission_index = core.get_index_in_emission_matrix_haploid(
ref_allele=H[l, i], query_allele=s[0, l]
emission_prob = core.get_emission_probability_haploid(
ref_allele=H[l, i],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)
F[l, i] *= e[l, emission_index]
F[l, i] *= emission_prob
c[l] += F[l, i]

for i in range(n):
Expand All @@ -49,19 +55,25 @@ def forwards_ls_hap(n, m, H, s, e, r, norm=True):
else:
c = np.ones(m)
for i in range(n):
emission_index = core.get_index_in_emission_matrix_haploid(
ref_allele=H[0, i], query_allele=s[0, 0]
emission_prob = core.get_emission_probability_haploid(
ref_allele=H[0, i],
query_allele=s[0, 0],
site=0,
emission_matrix=e,
)
F[0, i] = 1 / n * e[0, emission_index]
F[0, i] = 1 / n * emission_prob

# Forwards pass
for l in range(1, m):
for i in range(n):
F[l, i] = F[l - 1, i] * (1 - r[l]) + np.sum(F[l - 1, :]) * r_n[l]
emission_index = core.get_index_in_emission_matrix_haploid(
ref_allele=H[l, i], query_allele=s[0, l]
emission_prob = core.get_emission_probability_haploid(
ref_allele=H[l, i],
query_allele=s[0, l],
site=l,
emission_matrix=e,
)
F[l, i] *= e[l, emission_index]
F[l, i] *= emission_prob

ll = np.log10(np.sum(F[m - 1, :]))

Expand All @@ -85,10 +97,13 @@ def backwards_ls_hap(n, m, H, s, e, c, r):
tmp_B = np.zeros(n)
tmp_B_sum = 0
for i in range(n):
emission_index = core.get_index_in_emission_matrix_haploid(
ref_allele=H[l + 1, i], query_allele=s[0, l + 1]
emission_prob = core.get_emission_probability_haploid(
ref_allele=H[l + 1, i],
query_allele=s[0, l + 1],
site=l + 1,
emission_matrix=e,
)
tmp_B[i] = e[l + 1, emission_index] * B[l + 1, i]
tmp_B[i] = emission_prob * B[l + 1, i]
tmp_B_sum += tmp_B[i]
for i in range(n):
B[l, i] = r_n[l + 1] * tmp_B_sum
Expand Down
Loading

0 comments on commit 8a8eba4

Please sign in to comment.