From 65e3aa7ede576d32117e08436594241e89e4e8a5 Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Tue, 30 Aug 2022 14:15:49 +0100 Subject: [PATCH 1/6] fixed initial lint error --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9ea1ed9..2ad0da4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ disable = [ "C", # Do not warn about TODO and FIXME comments "fixme", + "E1136" ] [tool.pytest.ini_options] From c3f1b2d3697a258c0521c1524caeaf3ebb2a0035 Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Tue, 30 Aug 2022 17:22:22 +0100 Subject: [PATCH 2/6] updated testing, ensured everything is njitted. --- lshmm/api.py | 4 +- .../fb_diploid_samples_variants.py | 12 +- .../fb_diploid_variants_samples.py | 273 ------------------ .../fb_haploid_samples_variants.py | 4 +- .../fb_haploid_variants_samples.py | 59 +--- ...b_haploid_variants_samples_multiallelic.py | 212 +++++++------- lshmm/vit_diploid_samples_variants.py | 12 +- lshmm/vit_diploid_variants_samples.py | 201 ------------- lshmm/vit_haploid_samples_variants.py | 20 +- lshmm/vit_haploid_variants_samples.py | 230 +-------------- tests/test_API.py | 6 +- tests/test_API_multialllelic.py | 213 -------------- tests/test_LS_haploid_diploid.py | 13 +- tests/test_LS_haploid_multiallelic.py | 27 +- 14 files changed, 167 insertions(+), 1119 deletions(-) delete mode 100644 tests/test_API_multialllelic.py diff --git a/lshmm/api.py b/lshmm/api.py index 8a5faec..acb62cd 100644 --- a/lshmm/api.py +++ b/lshmm/api.py @@ -309,7 +309,7 @@ def backwards( else: # Diploid # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - # DEV: there's a wrinkle here. + # DEV: there's a wrinkle here - multiallelics not yet done. e = np.zeros((m, 8)) e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2 e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2 @@ -418,7 +418,7 @@ def viterbi( else: # Diploid # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - # DEV: there's a wrinkle here. + # DEV: there's a wrinkle here - multiallelics not yet done. e = np.zeros((m, 8)) e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2 e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2 diff --git a/lshmm/forward_backward/fb_diploid_samples_variants.py b/lshmm/forward_backward/fb_diploid_samples_variants.py index dee32f2..c65ada3 100644 --- a/lshmm/forward_backward/fb_diploid_samples_variants.py +++ b/lshmm/forward_backward/fb_diploid_samples_variants.py @@ -47,7 +47,7 @@ def np_argmax(array, axis): return np_apply_along_axis(np.argmax, axis, array) -@nb.jit +@nb.njit def forwards_ls_dip(n, m, G, s, e, r, norm=True): """Matrix based diploid LS forward algorithm using numpy vectorisation.""" # Initialise the forward tensor @@ -121,7 +121,7 @@ def forwards_ls_dip(n, m, G, s, e, r, norm=True): return F, c, ll -@nb.jit +@nb.njit def backwards_ls_dip(n, m, G, s, e, c, r): """Matrix based diploid LS backward algorithm using numpy vectorisation.""" # Initialise the backward tensor @@ -161,7 +161,7 @@ def backwards_ls_dip(n, m, G, s, e, c, r): return B -@nb.jit +@nb.njit def forward_ls_dip_starting_point(n, m, G, s, e, r): """Naive implementation of LS diploid forwards algorithm.""" # Initialise the forward tensor @@ -234,7 +234,7 @@ def forward_ls_dip_starting_point(n, m, G, s, e, r): return F, ll -@nb.jit +@nb.njit def backward_ls_dip_starting_point(n, m, G, s, e, r): """Naive implementation of LS diploid backwards algorithm.""" # Backwards @@ -310,7 +310,7 @@ def backward_ls_dip_starting_point(n, m, G, s, e, r): return B -@nb.jit +@nb.njit def forward_ls_dip_loop(n, m, G, s, e, r, norm=True): """LS diploid forwards algoritm without vectorisation.""" # Initialise the forward tensor @@ -422,7 +422,7 @@ def forward_ls_dip_loop(n, m, G, s, e, r, norm=True): return F, c, ll -@nb.jit +@nb.njit def backward_ls_dip_loop(n, m, G, s, e, c, r): """LS diploid backwards algoritm without vectorisation.""" # Initialise the backward tensor diff --git a/lshmm/forward_backward/fb_diploid_variants_samples.py b/lshmm/forward_backward/fb_diploid_variants_samples.py index e376523..231d385 100644 --- a/lshmm/forward_backward/fb_diploid_variants_samples.py +++ b/lshmm/forward_backward/fb_diploid_variants_samples.py @@ -148,10 +148,6 @@ def backwards_ls_dip(n, m, G, s, e, c, r): ) # One changes - # sum_j1 = np.tile(np.sum(B[l+1,:,:] * e[l+1, index], 0, keepdims=True), (n,1)) - # sum_j1 = np_sum(B[l + 1, :, :], 0).repeat(n).reshape((-1, n)).T - # sum_j2 = np.tile(np.sum(B[l+1,:,:] * e[l+1, index], 1, keepdims=True), (1,n)) - # sum_j2 = np_sum(B[l + 1, :, :], 1).repeat(n).reshape((-1, n)) sum_j = np_sum(B[l + 1, :, :] * e[l + 1, index], 0).repeat(n).reshape((-1, n)) B[l, :, :] += ((1 - r[l + 1]) * r_n[l + 1]) * (sum_j + sum_j.T) B[l, :, :] *= 1 / c[l + 1] @@ -159,83 +155,6 @@ def backwards_ls_dip(n, m, G, s, e, c, r): return B -# def forward_ls_dip_starting_point(n, m, G, s, e, r): -# ''' -# Unbelievably naive implementation of LS diploid forwards. Just to get something down -# that works. -# ''' - -# # Initialise the forward tensor -# F = np.zeros((m,n,n)) -# F[0,:,:] = 1/(n**2) -# index = ( -# 4*np.equal(G[0,:,:], s[0,0]).astype(np.int64) + -# 2*(G[0,:,:] == 1).astype(np.int64) + -# np.int64(s[0,0] == 1) -# ) -# F[0,:,:] *= e[0, index] -# r_n = r/n - -# for l in range(1,m): - -# # Determine the various components -# F_no_change = np.zeros((n,n)) -# F_j1_change = np.zeros(n) -# F_j2_change = np.zeros(n) -# F_both_change = 0 - -# for j1 in range(n): -# for j2 in range(n): -# F_no_change[j1, j2] = (1-r[l])**2 * F[l-1, j1, j2] - -# for j1 in range(n): -# for j2 in range(n): -# F_both_change += r_n[l]**2 * F[l-1, j1, j2] - -# for j1 in range(n): -# for j2 in range(n): # This is the variable to sum over - it changes -# F_j2_change[j1] += (1 - r[l]) * r_n[l] * F[l-1, j1, j2] - -# for j2 in range(n): -# for j1 in range(n): # This is the variable to sum over - it changes -# F_j1_change[j2] += (1 - r[l]) * r_n[l] * F[l-1, j1, j2] - -# F[l,:,:] = F_both_change - -# for j1 in range(n): -# F[l, j1, :] += F_j2_change - -# for j2 in range(n): -# F[l, :, j2] += F_j1_change - -# for j1 in range(n): -# for j2 in range(n): -# F[l, j1, j2] += F_no_change[j1, j2] - -# for j1 in range(n): -# for j2 in range(n): -# # What is the emission? -# if s[0,l] == 1: -# # OBS is het -# if G[l, j1, j2] == 1: # REF is het -# F[l, j1, j2] *= e[l,BOTH_HET] -# else: # REF is hom -# F[l, j1, j2] *= e[l,REF_HOM_OBS_HET] -# else: -# # OBS is hom -# if G[l, j1, j2] == 1: # REF is het -# F[l, j1, j2] *= e[l,REF_HET_OBS_HOM] -# else: # REF is hom -# if G[l, j1, j2] == s[0,l]: # Equal -# F[l, j1, j2] *= e[l,EQUAL_BOTH_HOM] -# else: # Unequal -# F[l, j1, j2] *= e[l,UNEQUAL_BOTH_HOM] - -# ll = np.log10(np.sum(F[l,:,:])) - -# return F, ll - - @nb.njit def forward_ls_dip_starting_point(n, m, G, s, e, r): """Naive implementation of LS diploid forwards algorithm.""" @@ -312,79 +231,6 @@ def forward_ls_dip_starting_point(n, m, G, s, e, r): return F, ll -# def backward_ls_dip_starting_point(n, m, G, s, e, r): -# ''' -# Unbelievably naive implementation of LS diploid backwards. Just to get something down -# that works. -# ''' - -# # Backwards -# B = np.zeros((m,n,n)) - -# # Initialise -# B[m-1, :, :] = 1 -# r_n = r/n - -# for l in range(m-2, -1, -1): - -# # Determine the various components -# B_no_change = np.zeros((n,n)) -# B_j1_change = np.zeros(n) -# B_j2_change = np.zeros(n) -# B_both_change = 0 - -# # Evaluate the emission matrix at this site, for all pairs -# e_tmp = np.zeros((n,n)) -# for j1 in range(n): -# for j2 in range(n): -# # What is the emission? -# if s[0,l+1] == 1: -# # OBS is het -# if G[l+1, j1, j2] == 1: # REF is het -# e_tmp[j1, j2] = e[l+1, BOTH_HET] -# else: # REF is hom -# e_tmp[j1, j2] = e[l+1, REF_HOM_OBS_HET] -# else: -# # OBS is hom -# if G[l+1, j1, j2] == 1: # REF is het -# e_tmp[j1, j2] = e[l+1,REF_HET_OBS_HOM] -# else: # REF is hom -# if G[l+1, j1, j2] == s[0,l+1]: # Equal -# e_tmp[j1, j2] = e[l+1,EQUAL_BOTH_HOM] -# else: # Unequal -# e_tmp[j1, j2] = e[l+1,UNEQUAL_BOTH_HOM] - -# for j1 in range(n): -# for j2 in range(n): -# B_no_change[j1, j2] = (1-r[l+1])**2 * B[l+1,j1,j2] * e_tmp[j1, j2] - -# for j1 in range(n): -# for j2 in range(n): -# B_both_change += r_n[l+1]**2 * e_tmp[j1, j2] * B[l+1,j1,j2] - -# for j1 in range(n): -# for j2 in range(n): # This is the variable to sum over - it changes -# B_j2_change[j1] += (1 - r[l+1]) * r_n[l+1] * B[l+1,j1,j2] * e_tmp[j1, j2] - -# for j2 in range(n): -# for j1 in range(n): # This is the variable to sum over - it changes -# B_j1_change[j2] += (1 - r[l+1]) * r_n[l+1] * B[l+1,j1,j2] * e_tmp[j1, j2] - -# B[l,:,:] = B_both_change - -# for j1 in range(n): -# B[l, j1, :] += B_j2_change - -# for j2 in range(n): -# B[l, :, j2] += B_j1_change - -# for j1 in range(n): -# for j2 in range(n): -# B[l, j1, j2] += B_no_change[j1, j2] - -# return B - - @nb.njit def backward_ls_dip_starting_point(n, m, G, s, e, r): """Naive implementation of LS diploid backwards algorithm.""" @@ -461,68 +307,6 @@ def backward_ls_dip_starting_point(n, m, G, s, e, r): return B -# def forward_ls_dip_loop(n, m, G, s, e, r): -# ''' -# LS diploid forwards with lots of loops. -# ''' - -# # Initialise the forward tensor -# F = np.zeros((m,n,n)) -# F[0,:,:] = 1/(n**2) -# index = ( -# 4*np.equal(G[0,:,:], s[0,0]).astype(np.int64) + -# 2*(G[0,:,:] == 1).astype(np.int64) + -# np.int64(s[0,0] == 1) -# ) -# F[0,:,:] *= e[0, index] -# r_n = r/n - -# for l in range(1,m): - -# # Determine the various components -# F_no_change = np.zeros((n,n)) -# F_j1_change = np.zeros(n) -# F_j2_change = np.zeros(n) -# F_both_change = 0 - -# for j1 in range(n): -# for j2 in range(n): -# F_no_change[j1, j2] = (1-r[l])**2 * F[l-1,j1,j2] -# F_j1_change[j1] += (1 - r[l]) * r_n[l] * F[l-1,j2,j1] -# F_j2_change[j1] += (1 - r[l]) * r_n[l] * F[l-1,j1,j2] -# F_both_change += r_n[l]**2 * F[l-1,j1,j2] - -# F[l,:,:] = F_both_change - -# for j1 in range(n): -# F[l, j1, :] += F_j2_change -# F[l, :, j1] += F_j1_change -# for j2 in range(n): -# F[l, j1, j2] += F_no_change[j1, j2] - -# for j1 in range(n): -# for j2 in range(n): -# # What is the emission? -# if s[0,l] == 1: -# # OBS is het -# if G[l, j1, j2] == 1: # REF is het -# F[l, j1, j2] *= e[l, BOTH_HET] -# else: # REF is hom -# F[l, j1, j2] *= e[l, REF_HOM_OBS_HET] -# else: -# # OBS is hom -# if G[l, j1, j2] == 1: # REF is het -# F[l, j1, j2] *= e[l, REF_HET_OBS_HOM] -# else: # REF is hom -# if G[l, j1, j2] == s[0,l]: # Equal -# F[l, j1, j2] *= e[l, EQUAL_BOTH_HOM] -# else: # Unequal -# F[l, j1, j2] *= e[l, UNEQUAL_BOTH_HOM] - -# ll = np.log10(np.sum(F[l,:,:])) -# return F, ll - - @nb.njit def forward_ls_dip_loop(n, m, G, s, e, r, norm=True): """LS diploid forwards algoritm without vectorisation.""" @@ -637,63 +421,6 @@ def forward_ls_dip_loop(n, m, G, s, e, r, norm=True): return F, c, ll -# def backward_ls_dip_loop(n, m, G, s, e, r): -# ''' -# LS diploid backwards with lots of loops. -# ''' - -# # Initialise the backward tensor -# B = np.zeros((m,n,n)) -# B[m-1, :, :] = 1 -# r_n = r/n - -# for l in range(m-2, -1, -1): - -# # Determine the various components -# B_no_change = np.zeros((n,n)) -# B_j1_change = np.zeros(n) -# B_j2_change = np.zeros(n) -# B_both_change = 0 - -# # Evaluate the emission matrix at this site, for all pairs -# e_tmp = np.zeros((n,n)) -# for j1 in range(n): -# for j2 in range(n): -# # What is the emission? -# if s[0,l+1] == 1: -# # OBS is het -# if G[l+1, j1, j2] == 1: # REF is het -# e_tmp[j1, j2] = e[l+1, BOTH_HET] -# else: # REF is hom -# e_tmp[j1, j2] = e[l+1, REF_HOM_OBS_HET] -# else: -# # OBS is hom -# if G[l+1, j1, j2] == 1: # REF is het -# e_tmp[j1, j2] = e[l+1, REF_HET_OBS_HOM] -# else: # REF is hom -# if G[l+1, j1, j2] == s[0,l+1]: # Equal -# e_tmp[j1, j2] = e[l+1, EQUAL_BOTH_HOM] -# else: # Unequal -# e_tmp[j1, j2] = e[l+1, UNEQUAL_BOTH_HOM] - -# for j1 in range(n): -# for j2 in range(n): -# B_no_change[j1, j2] = (1-r[l+1])**2 * B[l+1,j1,j2] * e_tmp[j1, j2] -# B_j2_change[j1] += (1 - r[l+1]) * r_n[l+1] * B[l+1,j1,j2] * e_tmp[j1, j2] -# B_j1_change[j1] += (1 - r[l+1]) * r_n[l+1] * B[l+1,j2,j1] * e_tmp[j2, j1] -# B_both_change += r_n[l+1]**2 * e_tmp[j1, j2] * B[l+1,j1,j2] - -# B[l,:,:] = B_both_change - -# for j1 in range(n): -# B[l, j1, :] += B_j2_change -# B[l, :, j1] += B_j1_change -# for j2 in range(n): -# B[l, j1, j2] += B_no_change[j1, j2] - -# return B - - @nb.njit def backward_ls_dip_loop(n, m, G, s, e, c, r): """LS diploid backwards algoritm without vectorisation.""" diff --git a/lshmm/forward_backward/fb_haploid_samples_variants.py b/lshmm/forward_backward/fb_haploid_samples_variants.py index 580f63e..683412a 100644 --- a/lshmm/forward_backward/fb_haploid_samples_variants.py +++ b/lshmm/forward_backward/fb_haploid_samples_variants.py @@ -3,7 +3,7 @@ import numpy as np -@nb.jit +@nb.njit def forwards_ls_hap(n, m, H, s, e, r, norm=True): """Matrix based haploid LS forward algorithm using numpy vectorisation.""" # Initialise @@ -39,7 +39,7 @@ def forwards_ls_hap(n, m, H, s, e, r, norm=True): return F, c, ll -@nb.jit +@nb.njit def backwards_ls_hap(n, m, H, s, e, c, r): """Matrix based haploid LS backward algorithm using numpy vectorisation.""" # Initialise diff --git a/lshmm/forward_backward/fb_haploid_variants_samples.py b/lshmm/forward_backward/fb_haploid_variants_samples.py index 0ea5b1d..f35dac1 100644 --- a/lshmm/forward_backward/fb_haploid_variants_samples.py +++ b/lshmm/forward_backward/fb_haploid_variants_samples.py @@ -2,49 +2,13 @@ import numba as nb import numpy as np -# def forwards_ls_hap(n, m, H, s, e, r, norm=True): -# ''' -# Simple matrix based method for LS forward algorithm using numpy vectorisation. -# ''' -# # Initialise -# F = np.zeros((m,n)) -# c = np.ones(m) -# F[0,:] = 1/n * e[0, np.equal(H[0, :], s[0,0]).astype(np.int64)] -# r_n = r/n - -# if norm: - -# c[0] = np.sum(F[0,:]) -# F[0,:] *= 1/c[0] - -# # Forwards pass -# for l in range(1,m): -# F[l,:] = F[l-1,:] * (1 - r[l]) + r_n[l] # Don't need to multiply by r_n[l] F[:,l-1] as we've normalised. -# F[l,:] *= e[l,np.equal(H[l,:], s[0,l]).astype(np.int64)] -# c[l] = np.sum(F[l,:]) -# F[l,:] *= 1/c[l] - -# ll = np.sum(np.log10(c)) - -# else: -# # Forwards pass -# for l in range(1,m): -# F[l,:] = F[l-1,:] * (1 - r[l]) + np.sum(F[l-1,:]) * r_n[l] -# F[l,:] *= e[l, np.equal(H[l,:], s[0,l]).astype(np.int64)] - -# ll = np.log10(np.sum(F[m-1,:])) - -# return F, c, ll - - -@nb.jit +@nb.njit def forwards_ls_hap(n, m, H, s, e, r, norm=True): """Matrix based haploid LS forward algorithm using numpy vectorisation.""" # Initialise F = np.zeros((m, n)) r_n = r / n - print("running") if norm: @@ -87,26 +51,7 @@ def forwards_ls_hap(n, m, H, s, e, r, norm=True): return F, c, ll -# def backwards_ls_hap(n, m, H, s, e, c, r): -# ''' -# Simple matrix based method for LS backwards algorithm using numpy vectorisation. -# ''' - -# # Initialise -# B = np.zeros((m,n)) -# B[m-1,:] = 1 -# r_n = r/n - -# # Backwards pass -# for l in range(m-2, -1, -1): -# B[l,:] = r_n[l+1] * np.sum(e[l+1, np.equal(H[l+1,:], s[0,l+1]).astype(np.int64)] * B[l+1,:]) -# B[l,:] += (1 - r[l+1]) * e[l+1, np.equal(H[l+1,:], s[0,l+1]).astype(np.int64)] * B[l+1,:] -# B[l,:] *= 1/c[l+1] - -# return B - - -@nb.jit +@nb.njit def backwards_ls_hap(n, m, H, s, e, c, r): """Matrix based haploid LS backward algorithm using numpy vectorisation.""" # Initialise diff --git a/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py b/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py index 171aeb8..f3c59b4 100644 --- a/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py +++ b/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py @@ -1,106 +1,106 @@ -"""Collection of functions to run forwards and backwards algorithms on haploid genotype data, where the data is structured as variants x samples.""" -import numba as nb -import numpy as np - - -def check_alleles(alleles, m): - """ - Checks the specified allele list and returns a list of lists - of alleles of length num_sites. - If alleles is a 1D list of strings, assume that this list is used - for each site and return num_sites copies of this list. - Otherwise, raise a ValueError if alleles is not a list of length - num_sites. - """ - if isinstance(alleles[0], str): - return np.int8([len(alleles) for _ in range(m)]) - if len(alleles) != m: - raise ValueError("Malformed alleles list") - n_alleles = np.int8([(len(alleles_site)) for alleles_site in alleles]) - return n_alleles - - -@nb.jit -def forwards_ls_hap(n, m, n_alleles, H, s, e, r, norm=True): - """Matrix based haploid LS forward algorithm using numpy vectorisation.""" - # Initialise - F = np.zeros((m, n)) - r_n = r / n - - if norm: - - c = np.zeros(m) - for i in range(n): - F[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] - c[0] += F[0, i] - - for i in range(n): - F[0, i] *= 1 / c[0] - - # Forwards pass - for l in range(1, m): - for i in range(n): - F[l, i] = F[l - 1, i] * (1 - r[l]) + r_n[l] - F[l, i] *= e[l, np.int64(np.equal(H[l, i], s[0, l]))] - c[l] += F[l, i] - - for i in range(n): - F[l, i] *= 1 / c[l] - - ll = np.sum(np.log10(c)) - - else: - - c = np.ones(m) - - for i in range(n): - F[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] - - # Forwards pass - for l in range(1, m): - for i in range(n): - F[l, i] = F[l - 1, i] * (1 - r[l]) + np.sum(F[l - 1, :]) * r_n[l] - F[l, i] *= e[l, np.int64(np.equal(H[l, i], s[0, l]))] - - ll = np.log10(np.sum(F[m - 1, :])) - - return F, c, ll - - -@nb.jit -def backwards_ls_hap(n, m, n_alleles, H, s, e, c, r): - """Matrix based haploid LS backward algorithm using numpy vectorisation.""" - # Initialise - # alleles = check_alleles(alleles, m) - B = np.zeros((m, n)) - for i in range(n): - B[m - 1, i] = 1 - r_n = r / n - - # Backwards pass - for l in range(m - 2, -1, -1): - tmp_B = np.zeros(n) - tmp_B_sum = 0 - for i in range(n): - tmp_B[i] = ( - e[l + 1, np.int64(np.equal(H[l + 1, i], s[0, l + 1]))] * B[l + 1, i] - ) - tmp_B_sum += tmp_B[i] - for i in range(n): - B[l, i] = r_n[l + 1] * tmp_B_sum - B[l, i] += (1 - r[l + 1]) * tmp_B[i] - B[l, i] *= 1 / c[l + 1] - - return B - - -def forwards_ls_hap_wrapper(n, m, alleles, H, s, e, r, norm=True): - n_alleles = check_alleles(alleles, m) - F, c, ll = forwards_ls_hap(n, m, n_alleles, H, s, e, r, norm) - return F, c, ll - - -def backwards_ls_hap_wrapper(n, m, alleles, H, s, e, c, r): - n_alleles = check_alleles(alleles, m) - B = backwards_ls_hap(n, m, n_alleles, H, s, e, c, r) - return B +# """Collection of functions to run forwards and backwards algorithms on haploid genotype data, where the data is structured as variants x samples.""" +# import numba as nb +# import numpy as np + + +# def check_alleles(alleles, m): +# """ +# Checks the specified allele list and returns a list of lists +# of alleles of length num_sites. +# If alleles is a 1D list of strings, assume that this list is used +# for each site and return num_sites copies of this list. +# Otherwise, raise a ValueError if alleles is not a list of length +# num_sites. +# """ +# if isinstance(alleles[0], str): +# return np.int8([len(alleles) for _ in range(m)]) +# if len(alleles) != m: +# raise ValueError("Malformed alleles list") +# n_alleles = np.int8([(len(alleles_site)) for alleles_site in alleles]) +# return n_alleles + + +# @nb.jit +# def forwards_ls_hap(n, m, n_alleles, H, s, e, r, norm=True): +# """Matrix based haploid LS forward algorithm using numpy vectorisation.""" +# # Initialise +# F = np.zeros((m, n)) +# r_n = r / n + +# if norm: + +# c = np.zeros(m) +# for i in range(n): +# F[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] +# c[0] += F[0, i] + +# for i in range(n): +# F[0, i] *= 1 / c[0] + +# # Forwards pass +# for l in range(1, m): +# for i in range(n): +# F[l, i] = F[l - 1, i] * (1 - r[l]) + r_n[l] +# F[l, i] *= e[l, np.int64(np.equal(H[l, i], s[0, l]))] +# c[l] += F[l, i] + +# for i in range(n): +# F[l, i] *= 1 / c[l] + +# ll = np.sum(np.log10(c)) + +# else: + +# c = np.ones(m) + +# for i in range(n): +# F[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] + +# # Forwards pass +# for l in range(1, m): +# for i in range(n): +# F[l, i] = F[l - 1, i] * (1 - r[l]) + np.sum(F[l - 1, :]) * r_n[l] +# F[l, i] *= e[l, np.int64(np.equal(H[l, i], s[0, l]))] + +# ll = np.log10(np.sum(F[m - 1, :])) + +# return F, c, ll + + +# @nb.jit +# def backwards_ls_hap(n, m, n_alleles, H, s, e, c, r): +# """Matrix based haploid LS backward algorithm using numpy vectorisation.""" +# # Initialise +# # alleles = check_alleles(alleles, m) +# B = np.zeros((m, n)) +# for i in range(n): +# B[m - 1, i] = 1 +# r_n = r / n + +# # Backwards pass +# for l in range(m - 2, -1, -1): +# tmp_B = np.zeros(n) +# tmp_B_sum = 0 +# for i in range(n): +# tmp_B[i] = ( +# e[l + 1, np.int64(np.equal(H[l + 1, i], s[0, l + 1]))] * B[l + 1, i] +# ) +# tmp_B_sum += tmp_B[i] +# for i in range(n): +# B[l, i] = r_n[l + 1] * tmp_B_sum +# B[l, i] += (1 - r[l + 1]) * tmp_B[i] +# B[l, i] *= 1 / c[l + 1] + +# return B + + +# def forwards_ls_hap_wrapper(n, m, alleles, H, s, e, r, norm=True): +# n_alleles = check_alleles(alleles, m) +# F, c, ll = forwards_ls_hap(n, m, n_alleles, H, s, e, r, norm) +# return F, c, ll + + +# def backwards_ls_hap_wrapper(n, m, alleles, H, s, e, c, r): +# n_alleles = check_alleles(alleles, m) +# B = backwards_ls_hap(n, m, n_alleles, H, s, e, c, r) +# return B diff --git a/lshmm/vit_diploid_samples_variants.py b/lshmm/vit_diploid_samples_variants.py index fbf542d..4eb849b 100644 --- a/lshmm/vit_diploid_samples_variants.py +++ b/lshmm/vit_diploid_samples_variants.py @@ -38,7 +38,7 @@ def np_argmax(array, axis): return np_apply_along_axis(np.argmax, axis, array) -@nb.jit +@nb.njit def forwards_viterbi_dip_naive(n, m, G, s, e, r): """Naive implementation of LS diploid Viterbi algorithm.""" # Initialise @@ -84,7 +84,7 @@ def forwards_viterbi_dip_naive(n, m, G, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): """Naive implementation of LS diploid Viterbi algorithm, with reduced memory.""" # Initialise @@ -132,7 +132,7 @@ def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_dip_low_mem(n, m, G, s, e, r): """LS diploid Viterbi algorithm, with reduced memory.""" # Initialise @@ -207,7 +207,7 @@ def forwards_viterbi_dip_low_mem(n, m, G, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_dip_naive_vec(n, m, G, s, e, r): """Vectorised LS diploid Viterbi algorithm using numpy.""" # Initialise @@ -290,7 +290,7 @@ def forwards_viterbi_dip_naive_full_vec(n, m, G, s, e, r): return V, P, ll -@nb.jit +@nb.njit def backwards_viterbi_dip(m, V_last, P): """Run a backwards pass to determine the most likely path.""" assert V_last.ndim == 2 @@ -311,7 +311,7 @@ def get_phased_path(n, path): return np.unravel_index(path, (n, n)) -@nb.jit +@nb.njit def path_ll_dip(n, m, G, phased_path, s, e, r): """Evaluate log-likelihood path through a reference panel which results in sequence s.""" index = ( diff --git a/lshmm/vit_diploid_variants_samples.py b/lshmm/vit_diploid_variants_samples.py index b2b7afb..9e8ebae 100644 --- a/lshmm/vit_diploid_variants_samples.py +++ b/lshmm/vit_diploid_variants_samples.py @@ -38,49 +38,6 @@ def np_argmax(array, axis): return np_apply_along_axis(np.argmax, axis, array) -# def forwards_viterbi_dip_naive(n, m, G, s, e, r): - -# # Initialise -# V = np.zeros((m, n, n)) -# P = np.zeros((m, n, n)).astype(np.int64) -# c = np.ones(m) -# index = ( -# 4*np.equal(G[0,:,:], s[0,0]).astype(np.int64) + -# 2*(G[0,:,:] == 1).astype(np.int64) + -# np.int64(s[0,0] == 1) -# ) -# V[0,:,:] = 1/(n**2) * e[0,index] -# r_n = r/n - -# for l in range(1,m): -# index = ( -# 4*np.equal(G[l,:,:], s[0,l]).astype(np.int64) + -# 2*(G[l,:,:] == 1).astype(np.int64) + -# np.int64(s[0,l] == 1) -# ) -# for j1 in range(n): -# for j2 in range(n): -# # Get the vector to maximise over -# v = np.zeros((n,n)) -# for k1 in range(n): -# for k2 in range(n): -# v[k1, k2] = V[l-1,k1, k2] -# if ((k1 == j1) and (k2 == j2)): -# v[k1, k2] *= ((1 - r[l])**2 + 2*(1-r[l]) * r_n[l] + r_n[l]**2) -# elif ((k1 == j1) or (k2 == j2)): -# v[k1, k2] *= (r_n[l] * (1 - r[l]) + r_n[l]**2) -# else: -# v[k1, k2] *= r_n[l]**2 -# V[l,j1,j2] = np.amax(v) * e[l, index[j1, j2]] -# P[l,j1,j2] = np.argmax(v) -# c[l] = np.amax(V[l,:,:]) -# V[l,:,:] *= 1/c[l] - -# ll = np.sum(np.log10(c)) - -# return V, P, ll - - @nb.njit def forwards_viterbi_dip_naive(n, m, G, s, e, r): """Naive implementation of LS diploid Viterbi algorithm.""" @@ -130,51 +87,6 @@ def forwards_viterbi_dip_naive(n, m, G, s, e, r): return V, P, ll -# def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): - -# # Initialise -# V = np.zeros((n,n)) -# P = np.zeros((m,n,n)).astype(np.int64) -# c = np.ones(m) -# index = ( -# 4*np.equal(G[0,:,:], s[0,0]).astype(np.int64) + -# 2*(G[0,:,:] == 1).astype(np.int64) + -# np.int64(s[0,0] == 1) -# ) -# V_previous = 1/(n**2) * e[0,index] -# r_n = r/n - -# # Take a look at Haploid Viterbi implementation in Jeromes code and see if we can pinch some ideas. -# # Diploid Viterbi, with smaller memory footprint. -# for l in range(1,m): -# index = ( -# 4*np.equal(G[l,:,:], s[0,l]).astype(np.int64) + -# 2*(G[l,:,:] == 1).astype(np.int64) + -# np.int64(s[0,l] == 1) -# ) -# for j1 in range(n): -# for j2 in range(n): -# # Get the vector to maximise over -# v = np.zeros((n,n)) -# for k1 in range(n): -# for k2 in range(n): -# v[k1, k2] = V_previous[k1, k2] -# if ((k1 == j1) and (k2 == j2)): -# v[k1, k2] *= ((1 - r[l])**2 + 2*(1-r[l]) * r_n[l] + r_n[l]**2) -# elif ((k1 == j1) or (k2 == j2)): -# v[k1, k2] *= (r_n[l] * (1 - r[l]) + r_n[l]**2) -# else: -# v[k1, k2] *= r_n[l]**2 -# V[j1,j2] = np.amax(v) * e[l,index[j1, j2]] -# P[l,j1,j2] = np.argmax(v) -# c[l] = np.amax(V) -# V_previous = np.copy(V) / c[l] - -# ll = np.sum(np.log10(c)) - -# return V, P, ll - - @nb.njit def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): """Naive implementation of LS diploid Viterbi algorithm, with reduced memory.""" @@ -227,78 +139,6 @@ def forwards_viterbi_dip_naive_low_mem(n, m, G, s, e, r): return V, P, ll -# def forwards_viterbi_dip_low_mem(n, m, G, s, e, r): - -# # Initialise -# V = np.zeros((n, n)) -# P = np.zeros((m,n,n)).astype(np.int64) -# index = ( -# 4*np.equal(G[0,:,:], s[0,0]).astype(np.int64) + -# 2*(G[0,:,:] == 1).astype(np.int64) + -# np.int64(s[0,0] == 1) -# ) -# V_previous = 1/(n**2) * e[0,index] -# c = np.ones(m) -# r_n = r/n - -# # Diploid Viterbi, with smaller memory footprint, rescaling, and using the structure of the HMM. -# for l in range(1,m): - -# index = ( -# 4*np.equal(G[l,:,:], s[0,l]).astype(np.int64) + -# 2*(G[l,:,:] == 1).astype(np.int64) + -# np.int64(s[0,l] == 1) -# ) - -# c[l] = np.amax(V_previous) -# argmax = np.argmax(V_previous) - -# V_previous *= 1/c[l] -# V_rowcol_max = np_amax(V_previous, 0) -# arg_rowcol_max = np_argmax(V_previous, 0) - -# no_switch = (1 - r[l])**2 + 2*(r_n[l]*(1 - r[l])) + r_n[l]**2 -# single_switch = r_n[l]*(1 - r[l]) + r_n[l]**2 -# double_switch = r_n[l]**2 - -# j1_j2 = 0 - -# for j1 in range(n): -# for j2 in range(n): - -# V_single_switch = max(V_rowcol_max[j1], V_rowcol_max[j2]) -# P_single_switch = np.argmax(np.array([V_rowcol_max[j1], V_rowcol_max[j2]])) - -# if P_single_switch == 0: -# template_single_switch = j1*n + arg_rowcol_max[j1] -# else: -# template_single_switch = arg_rowcol_max[j2]*n + j2 - -# V[j1,j2] = V_previous[j1,j2] * no_switch # No switch in either -# P[l, j1, j2] = j1_j2 - -# # Single or double switch? -# single_switch_tmp = single_switch * V_single_switch -# if (single_switch_tmp > double_switch): -# # Then single switch is the alternative -# if (V[j1,j2] < single_switch * V_single_switch): -# V[j1,j2] = single_switch * V_single_switch -# P[l, j1, j2] = template_single_switch -# else: -# # Double switch is the alternative -# if V[j1, j2] < double_switch: -# V[j1, j2] = double_switch -# P[l, j1, j2] = argmax - -# V[j1,j2] *= e[l, index[j1, j2]] -# j1_j2 += 1 -# V_previous = np.copy(V) - -# ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - -# return V, P, ll - - @nb.njit def forwards_viterbi_dip_low_mem(n, m, G, s, e, r): """LS diploid Viterbi algorithm, with reduced memory.""" @@ -471,47 +311,6 @@ def forwards_viterbi_dip_low_mem_no_pointer(n, m, G, s, e, r): ) -# def forwards_viterbi_dip_naive_vec(n, m, G, s, e, r): - -# # Initialise -# V = np.zeros((m,n,n)) -# P = np.zeros((m,n,n)).astype(np.int64) -# c = np.ones(m) -# index = ( -# 4*np.equal(G[0,:,:], s[0,0]).astype(np.int64) + -# 2*(G[0,:,:] == 1).astype(np.int64) + -# np.int64(s[0,0] == 1) -# ) -# V[0,:,:] = 1/(n**2) * e[0,index] -# r_n = r/n - -# # Jumped the gun - vectorising. -# for l in range(1,m): - -# index = ( -# 4*np.equal(G[l,:,:], s[0,l]).astype(np.int64) + -# 2*(G[l,:,:] == 1).astype(np.int64) + -# np.int64(s[0,l] == 1) -# ) - -# for j1 in range(n): -# for j2 in range(n): -# v = (r_n[l]**2) * np.ones((n,n)) -# v[j1,j2] += (1-r[l])**2 -# v[j1, :] += (r_n[l] * (1 - r[l])) -# v[:, j2] += (r_n[l] * (1 - r[l])) -# v *= V[l-1,:,:] -# V[l,j1,j2] = np.amax(v) * e[l,index[j1, j2]] -# P[l,j1,j2] = np.argmax(v) - -# c[l] = np.amax(V[l,:,:]) -# V[l,:,:] *= 1/c[l] - -# ll = np.sum(np.log10(c)) - -# return V, P, ll - - @nb.njit def forwards_viterbi_dip_naive_vec(n, m, G, s, e, r): """Vectorised LS diploid Viterbi algorithm using numpy.""" diff --git a/lshmm/vit_haploid_samples_variants.py b/lshmm/vit_haploid_samples_variants.py index 7ac4c7f..1793c5d 100644 --- a/lshmm/vit_haploid_samples_variants.py +++ b/lshmm/vit_haploid_samples_variants.py @@ -3,7 +3,7 @@ import numpy as np -@nb.jit +@nb.njit def viterbi_naive_init(n, m, H, s, e, r): """Initialise naive implementation of LS viterbi.""" V = np.zeros((n, m)) @@ -15,7 +15,7 @@ def viterbi_naive_init(n, m, H, s, e, r): return V, P, r_n -@nb.jit +@nb.njit def viterbi_init(n, m, H, s, e, r): """Initialise naive, but more space memory efficient implementation of LS viterbi.""" V_previous = 1 / n * e[np.equal(H[:, 0], s[0, 0]).astype(np.int64), 0] @@ -27,7 +27,7 @@ def viterbi_init(n, m, H, s, e, r): return V, V_previous, P, r_n -@nb.jit +@nb.njit def forwards_viterbi_hap_naive(n, m, H, s, e, r): """Naive implementation of LS haploid Viterbi algorithm.""" # Initialise @@ -51,7 +51,7 @@ def forwards_viterbi_hap_naive(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_hap_naive_vec(n, m, H, s, e, r): """Naive matrix based implementation of LS haploid forward Viterbi algorithm using numpy.""" # Initialise @@ -88,7 +88,7 @@ def forwards_viterbi_hap_naive_full_vec(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): """Naive implementation of LS haploid Viterbi algorithm, with reduced memory.""" # Initialise @@ -113,7 +113,7 @@ def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): """Naive implementation of LS haploid Viterbi algorithm, with reduced memory and rescaling.""" # Initialise @@ -142,7 +142,7 @@ def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): """LS haploid Viterbi algorithm, with reduced memory and exploits the Markov process structure.""" # Initialise @@ -168,7 +168,7 @@ def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" # Initialise @@ -195,7 +195,7 @@ def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def backwards_viterbi_hap(m, V_last, P): """Run a backwards pass to determine the most likely path.""" # Initialise @@ -209,7 +209,7 @@ def backwards_viterbi_hap(m, V_last, P): return path -@nb.jit +@nb.njit def path_ll_hap(n, m, H, path, s, e, r): """Evaluate log-likelihood path through a reference panel which results in sequence s.""" index = np.int64(np.equal(H[path[0], 0], s[0, 0])) diff --git a/lshmm/vit_haploid_variants_samples.py b/lshmm/vit_haploid_variants_samples.py index 3e15373..f1d367e 100644 --- a/lshmm/vit_haploid_variants_samples.py +++ b/lshmm/vit_haploid_variants_samples.py @@ -2,24 +2,8 @@ import numba as nb import numpy as np -# Speedier version, variants x samples - -# def viterbi_naive_init(n, m, H, s, e, r): -# ''' -# Initialisation portion of initial naive implementation of LS viterbi to avoid -# lots of code duplication -# ''' - -# V = np.zeros((m,n)) -# P = np.zeros((m,n)).astype(np.int64) -# V[0,:] = 1/n * e[0,np.equal(H[0,:], s[0,0]).astype(np.int64)] -# P[0,:] = 0 # Reminder -# r_n = r/n -# return V, P, r_n - - -@nb.jit +@nb.njit def viterbi_naive_init(n, m, H, s, e, r): """Initialise naive implementation of LS viterbi.""" V = np.zeros((m, n)) @@ -31,22 +15,7 @@ def viterbi_naive_init(n, m, H, s, e, r): return V, P, r_n -# def viterbi_init(n, m, H, s, e, r): -# ''' -# Initialisation portion of initial naive, but more space memory efficient implementation -# of LS viterbi to avoid lots of code duplication -# ''' - -# V_previous = 1/n * e[0,np.equal(H[0,:], s[0,0]).astype(np.int64)] -# V = np.zeros(n) -# P = np.zeros((m,n)).astype(np.int64) -# P[0,:] = 0 # Reminder -# r_n = r/n - -# return V, V_previous, P, r_n - - -@nb.jit +@nb.njit def viterbi_init(n, m, H, s, e, r): """Initialise naive, but more space memory efficient implementation of LS viterbi.""" V_previous = np.zeros(n) @@ -60,34 +29,7 @@ def viterbi_init(n, m, H, s, e, r): return V, V_previous, P, r_n -# def forwards_viterbi_hap_naive(n, m, H, s, e, r): -# ''' -# Simple naive LS forward Viterbi algorithm. -# ''' - -# # Initialise -# V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - -# for j in range(1,m): -# for i in range(n): -# # Get the vector to maximise over -# v = np.zeros(n) -# for k in range(n): -# # v[k] = e[j,np.equal(H[j,i], s[0,j]).astype(np.int64)] * V[j-1,k] -# v[k] = e[j,np.int64(np.equal(H[j,i], s[0,j]))] * V[j-1,k] -# if k == i: -# v[k] *= 1 - r[j] + r_n[j] -# else: -# v[k] *= r_n[j] -# P[j,i] = np.argmax(v) -# V[j,i] = v[P[j,i]] - -# ll = np.log10(np.amax(V[m-1,:])) - -# return V, P, ll - - -@nb.jit +@nb.njit def forwards_viterbi_hap_naive(n, m, H, s, e, r): """Naive implementation of LS haploid Viterbi algorithm.""" # Initialise @@ -112,30 +54,7 @@ def forwards_viterbi_hap_naive(n, m, H, s, e, r): return V, P, ll -# def forwards_viterbi_hap_naive_vec(n, m, H, s, e, r): -# ''' -# Simple matrix based method naive LS forward Viterbi algorithm. Vectorised things -# - I jumped the gun! -# ''' - -# # Initialise -# V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - -# for j in range(1,m): -# v_tmp = V[j-1,:] * r_n[j] -# for i in range(n): -# v = np.copy(v_tmp) -# v[i] += V[j-1,i] * (1 - r[j]) -# v *= e[j,np.int64(np.equal(H[j,i], s[0,j]))] -# P[j,i] = np.argmax(v) -# V[j,i] = v[P[j,i]] - -# ll = np.log10(np.amax(V[m-1,:])) - -# return V, P, ll - - -@nb.jit +@nb.njit def forwards_viterbi_hap_naive_vec(n, m, H, s, e, r): """Naive matrix based implementation of LS haploid forward Viterbi algorithm using numpy.""" # Initialise @@ -155,34 +74,7 @@ def forwards_viterbi_hap_naive_vec(n, m, H, s, e, r): return V, P, ll -# def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): -# ''' -# Simple naive LS forward Viterbi algorithm. More memory efficient. -# ''' - -# # Initialise -# V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) - -# for j in range(1,m): -# for i in range(n): -# # Get the vector to maximise over -# v = np.zeros(n) -# for k in range(n): -# v[k] = e[j,np.int64(np.equal(H[j,i], s[0,j]))] * V_previous[k] -# if k == i: -# v[k] *= 1 - r[j] + r_n[j] -# else: -# v[k] *= r_n[j] -# P[j,i] = np.argmax(v) -# V[i] = v[P[j,i]] -# V_previous = np.copy(V) - -# ll = np.log10(np.amax(V)) - -# return V, P, ll - - -@nb.jit +@nb.njit def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): """Naive implementation of LS haploid Viterbi algorithm, with reduced memory.""" # Initialise @@ -207,39 +99,7 @@ def forwards_viterbi_hap_naive_low_mem(n, m, H, s, e, r): return V, P, ll -# def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): -# ''' -# Simple naive LS forward Viterbi algorithm. More memory efficient, and with -# a rescaling to avoid underflow problems -# ''' - -# # Initialise -# V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) -# c = np.ones(m) - -# for j in range(1,m): -# c[j] = np.amax(V_previous) -# V_previous *= 1/c[j] -# for i in range(n): -# # Get the vector to maximise over -# v = np.zeros(n) -# for k in range(n): -# v[k] = e[j,np.int64(np.equal(H[j,i], s[0,j]))] * V_previous[k] -# if k == i: -# v[k] *= 1 - r[j] + r_n[j] -# else: -# v[k] *= r_n[j] -# P[j,i] = np.argmax(v) -# V[i] = v[P[j,i]] - -# V_previous = np.copy(V) - -# ll = np.sum(np.log10(c)) + np.log10(np.amax(V)) - -# return V, P, ll - - -@nb.jit +@nb.njit def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): """Naive implementation of LS haploid Viterbi algorithm, with reduced memory and rescaling.""" # Initialise @@ -268,36 +128,7 @@ def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r): return V, P, ll -# def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): -# ''' -# Simple LS forward Viterbi algorithm. Smaller memory footprint and rescaling, -# and considers the structure of the Markov process. -# ''' - -# # Initialise -# V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r) -# c = np.ones(m) - -# for j in range(1,m): -# argmax = np.argmax(V_previous) -# c[j] = V_previous[argmax] -# V_previous *= 1/c[j] -# V = np.zeros(n) -# for i in range(n): -# V[i] = V_previous[i] * (1 - r[j] + r_n[j]) -# P[j,i] = i -# if V[i] < r_n[j]: -# V[i] = r_n[j] -# P[j,i] = argmax -# V[i] *= e[j,np.equal(H[j,i], s[0,j]).astype(np.int64)] -# V_previous = np.copy(V) - -# ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - -# return V, P, ll - - -@nb.jit +@nb.njit def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): """LS haploid Viterbi algorithm, with reduced memory and exploits the Markov process structure.""" # Initialise @@ -323,37 +154,7 @@ def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r): return V, P, ll -# def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): -# ''' -# Simple LS forward Viterbi algorithm. Even smaller memory footprint and rescaling, -# and considers the structure of the Markov process. -# ''' - -# # Initialise -# V = 1/n * e[0, np.equal(H[0,:], s[0,0]).astype(np.int64)] -# P = np.zeros((m,n)).astype(np.int64) -# # P[0,:] = 0 -# r_n = r/n -# c = np.ones(m) - -# for j in range(1,m): -# argmax = np.argmax(V) -# c[j] = V[argmax] -# V *= 1/c[j] -# for i in range(n): -# V[i] = V[i] * (1 - r[j] + r_n[j]) -# P[j,i] = i -# if V[i] < r_n[j]: -# V[i] = r_n[j] -# P[j,i] = argmax -# V[i] *= e[j,np.int64(np.equal(H[j,i], s[0,j]))] - -# ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - -# return V, P, ll - - -@nb.jit +@nb.njit def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" # Initialise @@ -381,7 +182,7 @@ def forwards_viterbi_hap_lower_mem_rescaling(n, m, H, s, e, r): return V, P, ll -@nb.jit +@nb.njit def forwards_viterbi_hap_lower_mem_rescaling_no_pointer(n, m, H, s, e, r): """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" # Initialise @@ -391,8 +192,9 @@ def forwards_viterbi_hap_lower_mem_rescaling_no_pointer(n, m, H, s, e, r): r_n = r / n c = np.ones(m) recombs = [ - set() for _ in range(m) + np.zeros(shape=0, dtype=np.int64) for _ in range(m) ] # This is going to be filled with the templates we can recombine to that have higher prob than staying where we are. + V_argmaxes = np.zeros(m) for j in range(1, m): @@ -404,8 +206,8 @@ def forwards_viterbi_hap_lower_mem_rescaling_no_pointer(n, m, H, s, e, r): V[i] = V[i] * (1 - r[j] + r_n[j]) if V[i] < r_n[j]: V[i] = r_n[j] - recombs[j].add( - i + recombs[j] = np.append( + recombs[j], i ) # We add template i as a potential template to recombine to at site j. V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]))] @@ -416,7 +218,7 @@ def forwards_viterbi_hap_lower_mem_rescaling_no_pointer(n, m, H, s, e, r): # Speedier version, variants x samples -@nb.jit +@nb.njit def backwards_viterbi_hap(m, V_last, P): """Run a backwards pass to determine the most likely path.""" # Initialise @@ -430,7 +232,7 @@ def backwards_viterbi_hap(m, V_last, P): return path -@nb.jit +@nb.njit def backwards_viterbi_hap_no_pointer(m, V_argmaxes, recombs): """Run a backwards pass to determine the most likely path.""" # Initialise @@ -446,7 +248,7 @@ def backwards_viterbi_hap_no_pointer(m, V_argmaxes, recombs): return path -@nb.jit +@nb.njit def path_ll_hap(n, m, H, path, s, e, r): """Evaluate log-likelihood path through a reference panel which results in sequence s.""" index = np.int64(np.equal(H[0, path[0]], s[0, 0])) diff --git a/tests/test_API.py b/tests/test_API.py index 82b7a48..c84aec1 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -44,7 +44,7 @@ def genotype_emission(self, mu, m): e = np.zeros((m, 8)) e[:, EQUAL_BOTH_HOM] = (1 - mu) ** 2 e[:, UNEQUAL_BOTH_HOM] = mu ** 2 - e[:, BOTH_HET] = 1 - mu + e[:, BOTH_HET] = (1 - mu) ** 2 + mu ** 2 e[:, REF_HOM_OBS_HET] = 2 * mu * (1 - mu) e[:, REF_HET_OBS_HOM] = mu * (1 - mu) @@ -203,7 +203,6 @@ class FBAlgorithmBase(LSBase): """Base for forwards backwards algorithm tests.""" -# @pytest.mark.skip(reason="DEV: skip for time being") class TestMethodsHap(FBAlgorithmBase): """Test that we compute the sample likelihoods across all implementations.""" @@ -221,7 +220,6 @@ def verify(self, ts): B = ls.backwards(H_vs, s, c, r, mu) -# @pytest.mark.skip(reason="DEV: skip for time being") class TestMethodsDip(FBAlgorithmBase): """Test that we compute the sample likelihoods across all implementations.""" @@ -243,7 +241,6 @@ class VitAlgorithmBase(LSBase): """Base for viterbi algoritm tests.""" -# @pytest.mark.skip(reason="DEV: skip for time being") class TestViterbiHap(VitAlgorithmBase): """Test that we have the same log-likelihood across all implementations""" @@ -260,7 +257,6 @@ def verify(self, ts): self.assertAllClose(path_vs, path) -# @pytest.mark.skip(reason="DEV: skip for time being") class TestViterbiDip(VitAlgorithmBase): """Test that we have the same log-likelihood across all implementations""" diff --git a/tests/test_API_multialllelic.py b/tests/test_API_multialllelic.py deleted file mode 100644 index c62bed1..0000000 --- a/tests/test_API_multialllelic.py +++ /dev/null @@ -1,213 +0,0 @@ -# Simulation -import itertools - -# Python libraries -import msprime -import numpy as np -import pytest -import tskit - -import lshmm as ls -import lshmm.forward_backward.fb_diploid_variants_samples as fbd_vs -import lshmm.forward_backward.fb_haploid_variants_samples as fbh_vs -import lshmm.vit_diploid_variants_samples as vd_vs -import lshmm.vit_haploid_variants_samples as vh_vs - -EQUAL_BOTH_HOM = 4 -UNEQUAL_BOTH_HOM = 0 -BOTH_HET = 7 -REF_HOM_OBS_HET = 1 -REF_HET_OBS_HOM = 2 - - -class LSBase: - """Superclass of Li and Stephens tests.""" - - def example_haplotypes(self, ts): - - H = ts.genotype_matrix() - s = H[:, 0].reshape(1, H.shape[0]) - H = H[:, 1:] - - return H, s - - def haplotype_emission(self, mu, m, n_alleles, scale_mutation_based_on_n_alleles): - # Define the emission probability matrix - e = np.zeros((m, 2)) - if isinstance(mu, float): - mu = mu * np.ones(m) - - if scale_mutation_based_on_n_alleles: - e[:, 0] = mu - mu * np.equal( - n_alleles, np.ones(m) - ) # Added boolean in case we're at an invariant site - e[:, 1] = 1 - (n_alleles - 1) * mu - else: - for j in range(m): - if n_alleles[j] == 1: # In case we're at an invariant site - e[j, 0] = 0 - e[j, 1] = 1 - else: - e[j, 0] = mu[j] / (n_alleles[j] - 1) - e[j, 1] = 1 - mu[j] - return e - - def example_parameters_haplotypes(self, ts, seed=42, scale_mutation=True): - """Returns an iterator over combinations of haplotype, recombination and - mutation rates.""" - np.random.seed(seed) - H, s = self.example_haplotypes(ts) - n = H.shape[1] - m = ts.get_num_sites() - - # alleles = [] - # for variant in ts.variants(): - # alleles.append(variant.alleles) - # n_alleles = np.int8([(len(alleles_site)) for alleles_site in alleles]) - - # Must be calculated from the genotype matrix because we can now get back mutations that - # result in the number of alleles being higher than the number of alleles in the reference panel. - n_alleles = np.int8( - [len(np.unique(np.append(H[j, :], s[:, j]))) for j in range(m)] - ) - - # Here we have equal mutation and recombination - r = np.zeros(m) + 0.01 - mu = np.zeros(m) + 0.01 - r[0] = 0 - - e = self.haplotype_emission( - mu, m, n_alleles, scale_mutation_based_on_n_alleles=scale_mutation - ) - - yield n, m, H, s, e, r, mu - - # Mixture of random and extremes - rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] - - mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] - - e = self.haplotype_emission( - mu, m, n_alleles, scale_mutation_based_on_n_alleles=scale_mutation - ) - - for r, mu in itertools.product(rs, mus): - r[0] = 0 - e = self.haplotype_emission( - mu, m, n_alleles, scale_mutation_based_on_n_alleles=scale_mutation - ) - yield n, m, H, s, e, r, mu - - def assertAllClose(self, A, B): - """Assert that all entries of two matrices are 'close'""" - assert np.allclose(A, B, rtol=1e-9, atol=0.0) - - # Define a bunch of very small tree-sequences for testing a collection of parameters on - def test_simple_n_10_no_recombination(self): - ts = msprime.sim_ancestry( - samples=10, - recombination_rate=0, - random_seed=42, - sequence_length=10, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-5, random_seed=42) - assert ts.num_sites > 3 - self.verify(ts) - - def test_simple_n_6(self): - ts = msprime.sim_ancestry( - samples=6, - recombination_rate=1e-4, - random_seed=42, - sequence_length=40, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-3, random_seed=42) - assert ts.num_sites > 5 - self.verify(ts) - - def test_simple_n_8(self): - ts = msprime.sim_ancestry( - samples=8, - recombination_rate=1e-4, - random_seed=42, - sequence_length=20, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=42) - assert ts.num_sites > 5 - assert ts.num_trees > 15 - self.verify(ts) - - def test_simple_n_16(self): - ts = msprime.sim_ancestry( - samples=16, - recombination_rate=1e-2, - random_seed=42, - sequence_length=20, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=42) - assert ts.num_sites > 5 - self.verify(ts) - - def verify(self, ts): - raise NotImplementedError() - - -class FBAlgorithmBase(LSBase): - """Base for forwards backwards algorithm tests.""" - - -# @pytest.mark.skip(reason="DEV: skip for time being") -class TestMethodsHap(FBAlgorithmBase): - """Test that we compute the sample likelihoods across all implementations.""" - - def verify(self, ts): - for n, m, H_vs, s, e_vs, r, mu in self.example_parameters_haplotypes(ts): - F_vs, c_vs, ll_vs = fbh_vs.forwards_ls_hap(n, m, H_vs, s, e_vs, r) - B_vs = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) - F, c, ll = ls.forwards(H_vs, s, r, mutation_rate=mu) - B = ls.backwards(H_vs, s, c, r, mutation_rate=mu) - self.assertAllClose(F, F_vs) - self.assertAllClose(B, B_vs) - self.assertAllClose(ll_vs, ll) - - for n, m, H_vs, s, e_vs, r, mu in self.example_parameters_haplotypes( - ts, scale_mutation=False - ): - F_vs, c_vs, ll_vs = fbh_vs.forwards_ls_hap(n, m, H_vs, s, e_vs, r) - B_vs = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) - F, c, ll = ls.forwards( - H_vs, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False - ) - B = ls.backwards( - H_vs, s, c, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False - ) - self.assertAllClose(F, F_vs) - self.assertAllClose(B, B_vs) - self.assertAllClose(ll_vs, ll) - - -class VitAlgorithmBase(LSBase): - """Base for viterbi algoritm tests.""" - - -# @pytest.mark.skip(reason="DEV: skip for time being") -class TestViterbiHap(VitAlgorithmBase): - """Test that we have the same log-likelihood across all implementations""" - - def verify(self, ts): - for n, m, H_vs, s, e_vs, r, mu in self.example_parameters_haplotypes(ts): - - V_vs, P_vs, ll_vs = vh_vs.forwards_viterbi_hap_lower_mem_rescaling( - n, m, H_vs, s, e_vs, r - ) - path_vs = vh_vs.backwards_viterbi_hap(m, V_vs, P_vs) - path_ll_hap = vh_vs.path_ll_hap(n, m, H_vs, path_vs, s, e_vs, r) - path, ll = ls.viterbi(H_vs, s, r, mutation_rate=mu) - - self.assertAllClose(ll_vs, ll) - self.assertAllClose(ll_vs, path_ll_hap) - self.assertAllClose(path_vs, path) diff --git a/tests/test_LS_haploid_diploid.py b/tests/test_LS_haploid_diploid.py index 63ebdc0..34012a4 100644 --- a/tests/test_LS_haploid_diploid.py +++ b/tests/test_LS_haploid_diploid.py @@ -221,7 +221,6 @@ class FBAlgorithmBase(LSBase): """Base for forwards backwards algorithm tests.""" -@pytest.mark.skip(reason="DEV: skip for time being") class TestNonTreeMethodsHap(FBAlgorithmBase): """Test that we compute the sample likelihoods across all implementations.""" @@ -287,7 +286,6 @@ def verify_larger(self, ts): self.assertAllClose(ll_vs, ll_sv) -@pytest.mark.skip(reason="DEV: skip for time being") class TestNonTreeMethodsDip(FBAlgorithmBase): """Test that we compute the sample likelihoods across all implementations.""" @@ -454,12 +452,10 @@ def verify_larger(self, ts): self.assertAllClose(ll_vs, ll_sv) -# @pytest.mark.skip(reason="DEV: skip for time being") class VitAlgorithmBase(LSBase): """Base for viterbi algoritm tests.""" -@pytest.mark.skip(reason="DEV: skip for time being") class TestNonTreeViterbiHap(VitAlgorithmBase): """Test that we have the same log-likelihood across all implementations""" @@ -518,7 +514,9 @@ def verify(self, ts): n, m, H_vs, s, e_vs, r ) path_tmp = vh_vs.backwards_viterbi_hap_no_pointer( - m, V_argmaxes_tmp, recombs + m, + V_argmaxes_tmp, + nb.typed.List(recombs), ) ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) @@ -622,7 +620,9 @@ def verify_larger(self, ts): ) = vh_vs.forwards_viterbi_hap_lower_mem_rescaling_no_pointer( n, m, H_vs, s, e_vs, r ) - path_tmp = vh_vs.backwards_viterbi_hap_no_pointer(m, V_argmaxes_tmp, recombs) + path_tmp = vh_vs.backwards_viterbi_hap_no_pointer( + m, V_argmaxes_tmp, nb.typed.List(recombs) + ) ll_check = vh_vs.path_ll_hap(n, m, H_vs, path_tmp, s, e_vs, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll_vs, ll_tmp) @@ -672,7 +672,6 @@ def verify_larger(self, ts): self.assertAllClose(ll_vs, ll_sv) -# @pytest.mark.skip(reason="DEV: skip for time being") class TestNonTreeViterbiDip(VitAlgorithmBase): """Test that we have the same log-likelihood across all implementations""" diff --git a/tests/test_LS_haploid_multiallelic.py b/tests/test_LS_haploid_multiallelic.py index d78e4cd..96b8bda 100644 --- a/tests/test_LS_haploid_multiallelic.py +++ b/tests/test_LS_haploid_multiallelic.py @@ -7,8 +7,8 @@ import pytest import tskit -import lshmm.forward_backward.fb_haploid_variants_samples_multiallelic as fbh -import lshmm.vit_haploid_variants_samples_multiallelic as vh +import lshmm.forward_backward.fb_haploid_variants_samples as fbh +import lshmm.vit_haploid_variants_samples as vh class LSBase: @@ -125,46 +125,39 @@ class FBAlgorithmBase(LSBase): """Base for forwards backwards algorithm tests.""" -# @pytest.mark.skip(reason="DEV: skip for time being") class TestNonTreeMethodsHap(FBAlgorithmBase): """Test that we compute the sample likelihoods across all implementations.""" def verify(self, ts): for n, m, alleles, H, s, e, r in self.example_parameters_haplotypes(ts): - F, c, ll = fbh.forwards_ls_hap_wrapper( - n, m, alleles, H, s, e, r, norm=False - ) - B = fbh.backwards_ls_hap_wrapper(n, m, alleles, H, s, e, c, r) + F, c, ll = fbh.forwards_ls_hap(n, m, H, s, e, r, norm=False) + B = fbh.backwards_ls_hap(n, m, H, s, e, c, r) self.assertAllClose(np.log10(np.sum(F * B, 1)), ll * np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbh.forwards_ls_hap_wrapper( - n, m, alleles, H, s, e, r, norm=True - ) - B_tmp = fbh.backwards_ls_hap_wrapper(n, m, alleles, H, s, e, c_tmp, r) + F_tmp, c_tmp, ll_tmp = fbh.forwards_ls_hap(n, m, H, s, e, r, norm=True) + B_tmp = fbh.backwards_ls_hap(n, m, H, s, e, c_tmp, r) self.assertAllClose(ll, ll_tmp) self.assertAllClose(np.sum(F_tmp * B_tmp, 1), np.ones(m)) -@pytest.mark.skip(reason="DEV: skip for time being") class VitAlgorithmBase(LSBase): """Base for viterbi algoritm tests.""" -@pytest.mark.skip(reason="DEV: skip for time being") class TestNonTreeViterbiHap(VitAlgorithmBase): """Test that we have the same log-likelihood across all implementations""" def verify(self, ts): for n, m, alleles, H, s, e, r in self.example_parameters_haplotypes(ts): - V, P, ll = vh.forwards_viterbi_hap_naive_wrapper(n, m, H, s, e, r) - path = vh.backwards_viterbi_hap_wrapper(m, V[m - 1, :], P) + V, P, ll = vh.forwards_viterbi_hap_naive(n, m, H, s, e, r) + path = vh.backwards_viterbi_hap(m, V[m - 1, :], P) ll_check = vh.path_ll_hap(n, m, H, path, s, e, r) self.assertAllClose(ll, ll_check) - V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_lower_mem_rescaling_wrapper( + V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_lower_mem_rescaling( n, m, H, s, e, r ) - path_tmp = vh.backwards_viterbi_hap_wrapper(m, V_tmp, P_tmp) + path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) ll_check = vh.path_ll_hap(n, m, H, path_tmp, s, e, r) self.assertAllClose(ll_tmp, ll_check) self.assertAllClose(ll, ll_tmp) From 3e00844062afa37deb15587136b379422ec8e87d Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Tue, 30 Aug 2022 17:23:45 +0100 Subject: [PATCH 3/6] added tests for completeness, will remove in next commit --- ...t_haploid_variants_samples_multiallelic.py | 147 +++++++++++++ tests/test_API_multiallelic.py | 206 ++++++++++++++++++ 2 files changed, 353 insertions(+) create mode 100644 lshmm/vit_haploid_variants_samples_multiallelic.py create mode 100644 tests/test_API_multiallelic.py diff --git a/lshmm/vit_haploid_variants_samples_multiallelic.py b/lshmm/vit_haploid_variants_samples_multiallelic.py new file mode 100644 index 0000000..9dcdacf --- /dev/null +++ b/lshmm/vit_haploid_variants_samples_multiallelic.py @@ -0,0 +1,147 @@ +# """Collection of functions to run Viterbi algorithms on haploid genotype data, where the data is structured as variants x samples.""" +# import numba as nb +# import numpy as np + +# def check_alleles(alleles, m): +# """ +# Checks the specified allele list and returns a list of lists +# of alleles of length num_sites. +# If alleles is a 1D list of strings, assume that this list is used +# for each site and return num_sites copies of this list. +# Otherwise, raise a ValueError if alleles is not a list of length +# num_sites. +# """ +# if isinstance(alleles[0], str): +# return np.int8([len(alleles) for _ in range(m)]) +# if len(alleles) != m: +# raise ValueError("Malformed alleles list") +# n_alleles = np.int8([(len(alleles_site)) for alleles_site in alleles]) +# return n_alleles + +# # Speedier version, variants x samples + +# @nb.jit +# def viterbi_naive_init(n, m, H, s, e, r): +# """Initialise naive implementation of LS viterbi.""" +# V = np.zeros((m, n)) +# P = np.zeros((m, n)).astype(np.int64) +# r_n = r / n +# for i in range(n): +# V[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] + +# return V, P, r_n + +# @nb.jit +# def viterbi_init(n, m, H, s, e, r): +# """Initialise naive, but more space memory efficient implementation of LS viterbi.""" +# V_previous = np.zeros(n) +# V = np.zeros(n) +# P = np.zeros((m, n)).astype(np.int64) +# r_n = r / n + +# for i in range(n): +# V_previous[i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] + +# return V, V_previous, P, r_n + + +# @nb.jit +# def forwards_viterbi_hap_naive(n, m, n_alleles, H, s, e, r): +# """Naive implementation of LS haploid Viterbi algorithm.""" +# # Initialise +# V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) + +# for j in range(1, m): +# for i in range(n): +# # Get the vector to maximise over +# v = np.zeros(n) +# for k in range(n): +# # v[k] = e[j,np.equal(H[j,i], s[0,j]).astype(np.int64)] * V[j-1,k] +# v[k] = e[j, np.int64(np.equal(H[j, i], s[0, j]))] * V[j - 1, k] +# if k == i: +# v[k] *= 1 - r[j] + r_n[j] +# else: +# v[k] *= r_n[j] +# P[j, i] = np.argmax(v) +# V[j, i] = v[P[j, i]] + +# ll = np.log10(np.amax(V[m - 1, :])) + +# return V, P, ll + + +# @nb.jit +# def forwards_viterbi_hap_lower_mem_rescaling(n, m, n_alleles, H, s, e, r): +# """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" +# # Initialise +# V = np.zeros(n) +# for i in range(n): +# V[i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] +# P = np.zeros((m, n)).astype(np.int64) +# r_n = r / n +# c = np.ones(m) + +# for j in range(1, m): +# argmax = np.argmax(V) +# c[j] = V[argmax] +# V *= 1 / c[j] +# for i in range(n): +# V[i] = V[i] * (1 - r[j] + r_n[j]) +# P[j, i] = i +# if V[i] < r_n[j]: +# V[i] = r_n[j] +# P[j, i] = argmax +# V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]))] + +# ll = np.sum(np.log10(c)) + np.log10(np.max(V)) + +# return V, P, ll + + +# def forwards_viterbi_hap_naive_wrapper(n, m, alleles, H, s, e, r): +# n_alleles = check_alleles(alleles, m) +# V, P, ll = forwards_viterbi_hap_naive(n, m, n_alleles, H, s, e, r) +# return V, P, ll + +# def forwards_viterbi_hap_lower_mem_rescaling_wrapper(n, m, alleles, H, s, e, r): +# n_alleles = check_alleles(alleles, m) +# V, P, ll = forwards_viterbi_hap_lower_mem_rescaling(n, m, n_alleles, H, s, e, r) +# return V, P, ll + +# # Speedier version, variants x samples +# @nb.jit +# def backwards_viterbi_hap(m, V_last, P): +# """Run a backwards pass to determine the most likely path.""" +# # Initialise +# assert len(V_last.shape) == 1 +# path = np.zeros(m).astype(np.int64) +# path[m - 1] = np.argmax(V_last) + +# for j in range(m - 2, -1, -1): +# path[j] = P[j + 1, path[j + 1]] + +# return path + +# # DEV: This might need some work +# @nb.jit +# def path_ll_hap(n, m, H, path, s, e, r): +# """Evaluate log-likelihood path through a reference panel which results in sequence s.""" +# index = np.int64(np.equal(H[0, path[0]], s[0, 0])) +# log_prob_path = np.log10((1 / n) * e[0, index]) +# old = path[0] +# r_n = r / n + +# for l in range(1, m): +# index = np.int64(np.equal(H[l, path[l]], s[0, l])) +# current = path[l] +# same = old == current + +# if same: +# log_prob_path += np.log10((1 - r[l]) + r_n[l]) +# else: +# log_prob_path += np.log10(r_n[l]) + +# log_prob_path += np.log10(e[l, index]) +# old = current + +# return log_prob_path diff --git a/tests/test_API_multiallelic.py b/tests/test_API_multiallelic.py new file mode 100644 index 0000000..a840d1e --- /dev/null +++ b/tests/test_API_multiallelic.py @@ -0,0 +1,206 @@ +# Simulation +import itertools + +# Python libraries +import msprime +import numpy as np +import pytest +import tskit + +import lshmm as ls +import lshmm.forward_backward.fb_diploid_variants_samples as fbd_vs +import lshmm.forward_backward.fb_haploid_variants_samples as fbh_vs +import lshmm.vit_diploid_variants_samples as vd_vs +import lshmm.vit_haploid_variants_samples as vh_vs + +EQUAL_BOTH_HOM = 4 +UNEQUAL_BOTH_HOM = 0 +BOTH_HET = 7 +REF_HOM_OBS_HET = 1 +REF_HET_OBS_HOM = 2 + + +class LSBase: + """Superclass of Li and Stephens tests.""" + + def example_haplotypes(self, ts): + + H = ts.genotype_matrix() + s = H[:, 0].reshape(1, H.shape[0]) + H = H[:, 1:] + + return H, s + + def haplotype_emission(self, mu, m, n_alleles, scale_mutation_based_on_n_alleles): + # Define the emission probability matrix + e = np.zeros((m, 2)) + if isinstance(mu, float): + mu = mu * np.ones(m) + + if scale_mutation_based_on_n_alleles: + e[:, 0] = mu - mu * np.equal( + n_alleles, np.ones(m) + ) # Added boolean in case we're at an invariant site + e[:, 1] = 1 - (n_alleles - 1) * mu + else: + for j in range(m): + if n_alleles[j] == 1: # In case we're at an invariant site + e[j, 0] = 0 + e[j, 1] = 1 + else: + e[j, 0] = mu[j] / (n_alleles[j] - 1) + e[j, 1] = 1 - mu[j] + return e + + def example_parameters_haplotypes(self, ts, seed=42, scale_mutation=True): + """Returns an iterator over combinations of haplotype, recombination and + mutation rates.""" + np.random.seed(seed) + H, s = self.example_haplotypes(ts) + n = H.shape[1] + m = ts.get_num_sites() + + # Must be calculated from the genotype matrix because we can now get back mutations that + # result in the number of alleles being higher than the number of alleles in the reference panel. + n_alleles = np.int8( + [len(np.unique(np.append(H[j, :], s[:, j]))) for j in range(m)] + ) + + # Here we have equal mutation and recombination + r = np.zeros(m) + 0.01 + mu = np.zeros(m) + 0.01 + r[0] = 0 + + e = self.haplotype_emission( + mu, m, n_alleles, scale_mutation_based_on_n_alleles=scale_mutation + ) + + yield n, m, H, s, e, r, mu + + # Mixture of random and extremes + rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] + + mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] + + e = self.haplotype_emission( + mu, m, n_alleles, scale_mutation_based_on_n_alleles=scale_mutation + ) + + for r, mu in itertools.product(rs, mus): + r[0] = 0 + e = self.haplotype_emission( + mu, m, n_alleles, scale_mutation_based_on_n_alleles=scale_mutation + ) + yield n, m, H, s, e, r, mu + + def assertAllClose(self, A, B): + """Assert that all entries of two matrices are 'close'""" + assert np.allclose(A, B, rtol=1e-9, atol=0.0) + + # Define a bunch of very small tree-sequences for testing a collection of parameters on + def test_simple_n_10_no_recombination(self): + ts = msprime.sim_ancestry( + samples=10, + recombination_rate=0, + random_seed=42, + sequence_length=10, + population_size=10000, + ) + ts = msprime.sim_mutations(ts, rate=1e-5, random_seed=42) + assert ts.num_sites > 3 + self.verify(ts) + + def test_simple_n_6(self): + ts = msprime.sim_ancestry( + samples=6, + recombination_rate=1e-4, + random_seed=42, + sequence_length=40, + population_size=10000, + ) + ts = msprime.sim_mutations(ts, rate=1e-3, random_seed=42) + assert ts.num_sites > 5 + self.verify(ts) + + def test_simple_n_8(self): + ts = msprime.sim_ancestry( + samples=8, + recombination_rate=1e-4, + random_seed=42, + sequence_length=20, + population_size=10000, + ) + ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=42) + assert ts.num_sites > 5 + assert ts.num_trees > 15 + self.verify(ts) + + def test_simple_n_16(self): + ts = msprime.sim_ancestry( + samples=16, + recombination_rate=1e-2, + random_seed=42, + sequence_length=20, + population_size=10000, + ) + ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=42) + assert ts.num_sites > 5 + self.verify(ts) + + def verify(self, ts): + raise NotImplementedError() + + +class FBAlgorithmBase(LSBase): + """Base for forwards backwards algorithm tests.""" + + +class TestMethodsHap(FBAlgorithmBase): + """Test that we compute the sample likelihoods across all implementations.""" + + def verify(self, ts): + for n, m, H_vs, s, e_vs, r, mu in self.example_parameters_haplotypes(ts): + F_vs, c_vs, ll_vs = fbh_vs.forwards_ls_hap(n, m, H_vs, s, e_vs, r) + B_vs = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) + F, c, ll = ls.forwards(H_vs, s, r, mutation_rate=mu) + B = ls.backwards(H_vs, s, c, r, mutation_rate=mu) + self.assertAllClose(F, F_vs) + self.assertAllClose(B, B_vs) + self.assertAllClose(ll_vs, ll) + + for n, m, H_vs, s, e_vs, r, mu in self.example_parameters_haplotypes( + ts, scale_mutation=False + ): + F_vs, c_vs, ll_vs = fbh_vs.forwards_ls_hap(n, m, H_vs, s, e_vs, r) + B_vs = fbh_vs.backwards_ls_hap(n, m, H_vs, s, e_vs, c_vs, r) + F, c, ll = ls.forwards( + H_vs, s, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + ) + B = ls.backwards( + H_vs, s, c, r, mutation_rate=mu, scale_mutation_based_on_n_alleles=False + ) + self.assertAllClose(F, F_vs) + self.assertAllClose(B, B_vs) + self.assertAllClose(ll_vs, ll) + + +class VitAlgorithmBase(LSBase): + """Base for viterbi algoritm tests.""" + + +class TestViterbiHap(VitAlgorithmBase): + """Test that we have the same log-likelihood across all implementations""" + + def verify(self, ts): + for n, m, H_vs, s, e_vs, r, mu in self.example_parameters_haplotypes(ts): + + V_vs, P_vs, ll_vs = vh_vs.forwards_viterbi_hap_lower_mem_rescaling( + n, m, H_vs, s, e_vs, r + ) + path_vs = vh_vs.backwards_viterbi_hap(m, V_vs, P_vs) + path_ll_hap = vh_vs.path_ll_hap(n, m, H_vs, path_vs, s, e_vs, r) + path, ll = ls.viterbi(H_vs, s, r, mutation_rate=mu) + + self.assertAllClose(ll_vs, ll) + self.assertAllClose(ll_vs, path_ll_hap) + self.assertAllClose(path_vs, path) From 588d2d7500e010d674408955d6e818e4a3836edc Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Tue, 30 Aug 2022 18:17:47 +0100 Subject: [PATCH 4/6] cleaned up the api.py --- lshmm/api.py | 379 ++++++------------ ...b_haploid_variants_samples_multiallelic.py | 106 ----- ...t_haploid_variants_samples_multiallelic.py | 147 ------- tests/test_LS_haploid_multiallelic.py | 163 -------- 4 files changed, 120 insertions(+), 675 deletions(-) delete mode 100644 lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py delete mode 100644 lshmm/vit_haploid_variants_samples_multiallelic.py delete mode 100644 tests/test_LS_haploid_multiallelic.py diff --git a/lshmm/api.py b/lshmm/api.py index acb62cd..9d2f658 100644 --- a/lshmm/api.py +++ b/lshmm/api.py @@ -105,48 +105,18 @@ def checks( return n, m, ploidy -# def forwards(n, m, G_or_H, s, e, r): -# """ -# Run the Li and Stephens forwards algorithm on haplotype or -# unphased genotype data. -# """ -# template_dimensions = G_or_H.shape -# assert len(template_dimensions) in [2, 3] - -# if len(template_dimensions) == 2: -# # Haploid -# assert (G_or_H.shape == np.array([m, n])).all() -# F, c, ll = forwards_ls_hap(n, m, G_or_H, s, e, r, norm=True) -# else: -# # Diploid -# assert (G_or_H.shape == np.array([m, n, n])).all() -# F, c, ll = forward_ls_dip_loop(n, m, G_or_H, s, e, r, norm=True) - -# return F, c, ll - - -def forwards( +def set_emission_probabilities( + n, + m, reference_panel, query, - recombination_rate, - alleles=None, - mutation_rate=None, - scale_mutation_based_on_n_alleles=True, + alleles, + mutation_rate, + ploidy, + scale_mutation_based_on_n_alleles, ): - """ - Run the Li and Stephens forwards algorithm on haplotype or - unphased genotype data. - """ - n, m, ploidy = checks( - reference_panel, - query, - mutation_rate, - recombination_rate, - scale_mutation_based_on_n_alleles, - ) # Check alleles should go in here, and modify e before passing to the algorithm # If alleles is not passed, we don't perform a test of alleles, but set n_alleles based on the reference_panel. - if alleles is None: n_alleles = np.int8( [ @@ -187,14 +157,6 @@ def forwards( else: e[j, 0] = mutation_rate[j] / (n_alleles[j] - 1) e[j, 1] = 1 - mutation_rate[j] - - ( - forward_array, - normalisation_factor_from_forward, - log_likelihood, - ) = forwards_ls_hap( - n, m, reference_panel, query, e, recombination_rate, norm=True - ) else: # Diploid # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. @@ -206,50 +168,43 @@ def forwards( e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate) e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate) - ( - forward_array, - normalisation_factor_from_forward, - log_likelihood, - ) = forward_ls_dip_loop( - n, m, reference_panel, query, e, recombination_rate, norm=True - ) + return e - return forward_array, normalisation_factor_from_forward, log_likelihood +def viterbi_hap(n, m, reference_panel, query, emissions, recombination_rate): -# def backwards(n, m, G_or_H, s, e, c, r): -# """ -# Run the Li and Stephens backwards algorithm on haplotype or -# unphased genotype data. -# """ -# template_dimensions = G_or_H.shape -# assert len(template_dimensions) in [2, 3] + V, P, log_likelihood = forwards_viterbi_hap_lower_mem_rescaling( + n, m, reference_panel, query, emissions, recombination_rate + ) + most_likely_path = backwards_viterbi_hap(m, V, P) -# if len(template_dimensions) == 2: -# # Haploid -# assert (G_or_H.shape == np.array([m, n])).all() -# B = backwards_ls_hap(n, m, G_or_H, s, e, c, r) -# else: -# # Diploid -# assert (G_or_H.shape == np.array([m, n, n])).all() -# B = backward_ls_dip_loop(n, m, G_or_H, s, e, c, r) + return most_likely_path, log_likelihood -# return B +def viterbi_dip(n, m, reference_panel, query, emissions, recombination_rate): -def backwards( + V, P, log_likelihood = forwards_viterbi_dip_low_mem( + n, m, reference_panel, query, emissions, recombination_rate + ) + unphased_path = backwards_viterbi_dip(m, V, P) + most_likely_path = get_phased_path(n, unphased_path) + + return most_likely_path, log_likelihood + + +def forwards( reference_panel, query, - normalisation_factor_from_forward, recombination_rate, alleles=None, mutation_rate=None, scale_mutation_based_on_n_alleles=True, ): """ - Run the Li and Stephens backwards algorithm on haplotype or + Run the Li and Stephens forwards algorithm on haplotype or unphased genotype data. """ + n, m, ploidy = checks( reference_panel, query, @@ -257,100 +212,78 @@ def backwards( recombination_rate, scale_mutation_based_on_n_alleles, ) - # Check alleles should go in here, and mofify e before passing to the algorithm - # If alleles is not passed, we don't perform a test of alleles, but set n_alleles based on the reference_panel. - if alleles is None: - n_alleles = np.int8( - [ - len(np.unique(np.append(reference_panel[j, :], query[:, j]))) - for j in range(reference_panel.shape[0]) - ] - ) - else: - n_alleles = check_alleles(alleles, m) - if mutation_rate is None: - # Set the mutation rate to be the proposed mutation rate in Li and Stephens (2003). - theta_tilde = 1 / np.sum([1 / k for k in range(1, n - 1)]) - mutation_rate = 0.5 * (theta_tilde / (n + theta_tilde)) + emissions = set_emission_probabilities( + n, + m, + reference_panel, + query, + alleles, + mutation_rate, + ploidy, + scale_mutation_based_on_n_alleles, + ) if ploidy == 1: - # Haploid - # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - e = np.zeros((m, 2)) + forward_pass = forwards_ls_hap + else: + forward_pass = forward_ls_dip_loop - if scale_mutation_based_on_n_alleles: - # Scale mutation based on the number of alleles - so the mutation rate is the mutation rate to one of the alleles. - # The overall mutation rate is then (n_alleles - 1) * mutation_rate. - e[:, 0] = mutation_rate - mutation_rate * np.equal( - n_alleles, np.ones(m) - ) # Added boolean in case we're at an invariant site - e[:, 1] = 1 - (n_alleles - 1) * mutation_rate - else: - # No scaling based on the number of alleles - so the mutation rate is the mutation rate to anything. - # Which means that we must rescale the mutation rate to a different allele, by the number of alleles. - for j in range(m): - if n_alleles[j] == 1: # In case we're at an invariant site - e[j, 0] = 0 - e[j, 1] = 1 - else: - e[j, 0] = mutation_rate[j] / (n_alleles[j] - 1) - e[j, 1] = 1 - mutation_rate[j] + (forward_array, normalisation_factor_from_forward, log_likelihood) = forward_pass( + n, m, reference_panel, query, emissions, recombination_rate, norm=True + ) - backwards_array = backwards_ls_hap( - n, - m, - reference_panel, - query, - e, - normalisation_factor_from_forward, - recombination_rate, - ) - else: - # Diploid - # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - # DEV: there's a wrinkle here - multiallelics not yet done. - e = np.zeros((m, 8)) - e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2 - e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2 - e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2 - e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate) - e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate) + return forward_array, normalisation_factor_from_forward, log_likelihood - backwards_array = backward_ls_dip_loop( - n, - m, - reference_panel, - query, - e, - normalisation_factor_from_forward, - recombination_rate, - ) - return backwards_array +def backwards( + reference_panel, + query, + normalisation_factor_from_forward, + recombination_rate, + alleles=None, + mutation_rate=None, + scale_mutation_based_on_n_alleles=True, +): + """ + Run the Li and Stephens backwards algorithm on haplotype or + unphased genotype data. + """ + n, m, ploidy = checks( + reference_panel, + query, + mutation_rate, + recombination_rate, + scale_mutation_based_on_n_alleles, + ) + emissions = set_emission_probabilities( + n, + m, + reference_panel, + query, + alleles, + mutation_rate, + ploidy, + scale_mutation_based_on_n_alleles, + ) -# def viterbi(n, m, G_or_H, s, e, r): -# """ -# Run the Li and Stephens Viterbi algorithm on haplotype or -# unphased genotype data. -# """ -# template_dimensions = G_or_H.shape -# assert len(template_dimensions) in [2, 3] + if ploidy == 1: + backward_pass = backwards_ls_hap + else: + backward_pass = backward_ls_dip_loop -# if len(template_dimensions) == 2: -# # Haploid -# assert (G_or_H.shape == np.array([m, n])).all() -# V, P, ll = forwards_viterbi_hap_lower_mem_rescaling(n, m, G_or_H, s, e, r) -# path = backwards_viterbi_hap(m, V, P) -# else: -# # Diploid -# assert (G_or_H.shape == np.array([m, n, n])).all() -# V, P, ll = forwards_viterbi_dip_low_mem(n, m, G_or_H, s, e, r) -# unphased_path = backwards_viterbi_dip(m, V, P) -# path = get_phased_path(n, unphased_path) + backwards_array = backward_pass( + n, + m, + reference_panel, + query, + emissions, + normalisation_factor_from_forward, + recombination_rate, + ) -# return path, ll + return backwards_array def viterbi( @@ -373,71 +306,29 @@ def viterbi( scale_mutation_based_on_n_alleles, ) - if alleles is None: - n_alleles = np.int8( - [ - len(np.unique(np.append(reference_panel[j, :], query[:, j]))) - for j in range(reference_panel.shape[0]) - ] - ) - else: - n_alleles = check_alleles(alleles, m) - - if mutation_rate is None: - # Set the mutation rate to be the proposed mutation rate in Li and Stephens (2003). - theta_tilde = 1 / np.sum([1 / k for k in range(1, n - 1)]) - mutation_rate = 0.5 * (theta_tilde / (n + theta_tilde)) + emissions = set_emission_probabilities( + n, + m, + reference_panel, + query, + alleles, + mutation_rate, + ploidy, + scale_mutation_based_on_n_alleles, + ) if ploidy == 1: - # Haploid - # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - e = np.zeros((m, 2)) - - if scale_mutation_based_on_n_alleles: - # Scale mutation based on the number of alleles - so the mutation rate is the mutation rate to one of the alleles. - # The overall mutation rate is then (n_alleles - 1) * mutation_rate. - e[:, 0] = mutation_rate - mutation_rate * np.equal( - n_alleles, np.ones(m) - ) # Added boolean in case we're at an invariant site - e[:, 1] = 1 - (n_alleles - 1) * mutation_rate - else: - # No scaling based on the number of alleles - so the mutation rate is the mutation rate to anything. - # Which means that we must rescale the mutation rate to a different allele, by the number of alleles. - for j in range(m): - if n_alleles[j] == 1: # In case we're at an invariant site - e[j, 0] = 0 - e[j, 1] = 1 - else: - e[j, 0] = mutation_rate[j] / (n_alleles[j] - 1) - e[j, 1] = 1 - mutation_rate[j] - - V, P, log_likelihood = forwards_viterbi_hap_lower_mem_rescaling( - n, m, reference_panel, query, e, recombination_rate - ) - most_likely_path = backwards_viterbi_hap(m, V, P) + viterbi_forward_backward = viterbi_hap else: - # Diploid - # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - # DEV: there's a wrinkle here - multiallelics not yet done. - e = np.zeros((m, 8)) - e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2 - e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2 - e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2 - e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate) - e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate) + viterbi_forward_backward = viterbi_dip - V, P, log_likelihood = forwards_viterbi_dip_low_mem( - n, m, reference_panel, query, e, recombination_rate - ) - unphased_path = backwards_viterbi_dip(m, V, P) - most_likely_path = get_phased_path(n, unphased_path) + most_likely_path, log_likelihood = viterbi_forward_backward( + n, m, reference_panel, query, emissions, recombination_rate + ) return most_likely_path, log_likelihood -# Finally, need to include a function to evaluate the likelihood of a given path - - def path_ll( reference_panel, query, @@ -455,58 +346,28 @@ def path_ll( recombination_rate, scale_mutation_based_on_n_alleles, ) - # Check alleles should go in here, and mofify e before passing to the algorithm - # If alleles is not passed, we don't perform a test of alleles, but set n_alleles based on the reference_panel. - if alleles is None: - n_alleles = np.int8( - [ - len(np.unique(np.append(reference_panel[j, :], query[:, j]))) - for j in range(reference_panel.shape[0]) - ] - ) - else: - n_alleles = check_alleles(alleles, m) - if mutation_rate is None: - # Set the mutation rate to be the proposed mutation rate in Li and Stephens (2003). - theta_tilde = 1 / np.sum([1 / k for k in range(1, n - 1)]) - mutation_rate = 0.5 * (theta_tilde / (n + theta_tilde)) + emissions = set_emission_probabilities( + n, + m, + reference_panel, + query, + alleles, + mutation_rate, + ploidy, + scale_mutation_based_on_n_alleles, + ) if ploidy == 1: - # Haploid - # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - e = np.zeros((m, 2)) - - if scale_mutation_based_on_n_alleles: - # Scale mutation based on the number of alleles - so the mutation rate is the mutation rate to one of the alleles. - # The overall mutation rate is then (n_alleles - 1) * mutation_rate. - e[:, 0] = mutation_rate - mutation_rate * np.equal( - n_alleles, np.ones(m) - ) # Added boolean in case we're at an invariant site - e[:, 1] = 1 - (n_alleles - 1) * mutation_rate - else: - # No scaling based on the number of alleles - so the mutation rate is the mutation rate to anything. - # Which means that we must rescale the mutation rate to a different allele, by the number of alleles. - for j in range(m): - if n_alleles[j] == 1: # In case we're at an invariant site - e[j, 0] = 0 - e[j, 1] = 1 - else: - e[j, 0] = mutation_rate[j] / (n_alleles[j] - 1) - e[j, 1] = 1 - mutation_rate[j] + viterbi = viterbi_hap + else: + viterbi = viterbi_dip - ll = path_ll_hap(n, m, reference_panel, path, query, e, recombination_rate) + if ploidy == 1: + path_ll = path_ll_hap else: - # Diploid - # Evaluate emission probabilities here, using the mutation rate - this can take a scalar or vector. - # DEV: there's a wrinkle here. - e = np.zeros((m, 8)) - e[:, EQUAL_BOTH_HOM] = (1 - mutation_rate) ** 2 - e[:, UNEQUAL_BOTH_HOM] = mutation_rate ** 2 - e[:, BOTH_HET] = (1 - mutation_rate) ** 2 + mutation_rate ** 2 - e[:, REF_HOM_OBS_HET] = 2 * mutation_rate * (1 - mutation_rate) - e[:, REF_HET_OBS_HOM] = mutation_rate * (1 - mutation_rate) + path_ll = path_ll_dip - ll = path_ll_dip(n, m, reference_panel, path, query, e, recombination_rate) + ll = path_ll(n, m, reference_panel, path, query, emissions, recombination_rate) return ll diff --git a/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py b/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py deleted file mode 100644 index f3c59b4..0000000 --- a/lshmm/forward_backward/fb_haploid_variants_samples_multiallelic.py +++ /dev/null @@ -1,106 +0,0 @@ -# """Collection of functions to run forwards and backwards algorithms on haploid genotype data, where the data is structured as variants x samples.""" -# import numba as nb -# import numpy as np - - -# def check_alleles(alleles, m): -# """ -# Checks the specified allele list and returns a list of lists -# of alleles of length num_sites. -# If alleles is a 1D list of strings, assume that this list is used -# for each site and return num_sites copies of this list. -# Otherwise, raise a ValueError if alleles is not a list of length -# num_sites. -# """ -# if isinstance(alleles[0], str): -# return np.int8([len(alleles) for _ in range(m)]) -# if len(alleles) != m: -# raise ValueError("Malformed alleles list") -# n_alleles = np.int8([(len(alleles_site)) for alleles_site in alleles]) -# return n_alleles - - -# @nb.jit -# def forwards_ls_hap(n, m, n_alleles, H, s, e, r, norm=True): -# """Matrix based haploid LS forward algorithm using numpy vectorisation.""" -# # Initialise -# F = np.zeros((m, n)) -# r_n = r / n - -# if norm: - -# c = np.zeros(m) -# for i in range(n): -# F[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] -# c[0] += F[0, i] - -# for i in range(n): -# F[0, i] *= 1 / c[0] - -# # Forwards pass -# for l in range(1, m): -# for i in range(n): -# F[l, i] = F[l - 1, i] * (1 - r[l]) + r_n[l] -# F[l, i] *= e[l, np.int64(np.equal(H[l, i], s[0, l]))] -# c[l] += F[l, i] - -# for i in range(n): -# F[l, i] *= 1 / c[l] - -# ll = np.sum(np.log10(c)) - -# else: - -# c = np.ones(m) - -# for i in range(n): -# F[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] - -# # Forwards pass -# for l in range(1, m): -# for i in range(n): -# F[l, i] = F[l - 1, i] * (1 - r[l]) + np.sum(F[l - 1, :]) * r_n[l] -# F[l, i] *= e[l, np.int64(np.equal(H[l, i], s[0, l]))] - -# ll = np.log10(np.sum(F[m - 1, :])) - -# return F, c, ll - - -# @nb.jit -# def backwards_ls_hap(n, m, n_alleles, H, s, e, c, r): -# """Matrix based haploid LS backward algorithm using numpy vectorisation.""" -# # Initialise -# # alleles = check_alleles(alleles, m) -# B = np.zeros((m, n)) -# for i in range(n): -# B[m - 1, i] = 1 -# r_n = r / n - -# # Backwards pass -# for l in range(m - 2, -1, -1): -# tmp_B = np.zeros(n) -# tmp_B_sum = 0 -# for i in range(n): -# tmp_B[i] = ( -# e[l + 1, np.int64(np.equal(H[l + 1, i], s[0, l + 1]))] * B[l + 1, i] -# ) -# tmp_B_sum += tmp_B[i] -# for i in range(n): -# B[l, i] = r_n[l + 1] * tmp_B_sum -# B[l, i] += (1 - r[l + 1]) * tmp_B[i] -# B[l, i] *= 1 / c[l + 1] - -# return B - - -# def forwards_ls_hap_wrapper(n, m, alleles, H, s, e, r, norm=True): -# n_alleles = check_alleles(alleles, m) -# F, c, ll = forwards_ls_hap(n, m, n_alleles, H, s, e, r, norm) -# return F, c, ll - - -# def backwards_ls_hap_wrapper(n, m, alleles, H, s, e, c, r): -# n_alleles = check_alleles(alleles, m) -# B = backwards_ls_hap(n, m, n_alleles, H, s, e, c, r) -# return B diff --git a/lshmm/vit_haploid_variants_samples_multiallelic.py b/lshmm/vit_haploid_variants_samples_multiallelic.py deleted file mode 100644 index 9dcdacf..0000000 --- a/lshmm/vit_haploid_variants_samples_multiallelic.py +++ /dev/null @@ -1,147 +0,0 @@ -# """Collection of functions to run Viterbi algorithms on haploid genotype data, where the data is structured as variants x samples.""" -# import numba as nb -# import numpy as np - -# def check_alleles(alleles, m): -# """ -# Checks the specified allele list and returns a list of lists -# of alleles of length num_sites. -# If alleles is a 1D list of strings, assume that this list is used -# for each site and return num_sites copies of this list. -# Otherwise, raise a ValueError if alleles is not a list of length -# num_sites. -# """ -# if isinstance(alleles[0], str): -# return np.int8([len(alleles) for _ in range(m)]) -# if len(alleles) != m: -# raise ValueError("Malformed alleles list") -# n_alleles = np.int8([(len(alleles_site)) for alleles_site in alleles]) -# return n_alleles - -# # Speedier version, variants x samples - -# @nb.jit -# def viterbi_naive_init(n, m, H, s, e, r): -# """Initialise naive implementation of LS viterbi.""" -# V = np.zeros((m, n)) -# P = np.zeros((m, n)).astype(np.int64) -# r_n = r / n -# for i in range(n): -# V[0, i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] - -# return V, P, r_n - -# @nb.jit -# def viterbi_init(n, m, H, s, e, r): -# """Initialise naive, but more space memory efficient implementation of LS viterbi.""" -# V_previous = np.zeros(n) -# V = np.zeros(n) -# P = np.zeros((m, n)).astype(np.int64) -# r_n = r / n - -# for i in range(n): -# V_previous[i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] - -# return V, V_previous, P, r_n - - -# @nb.jit -# def forwards_viterbi_hap_naive(n, m, n_alleles, H, s, e, r): -# """Naive implementation of LS haploid Viterbi algorithm.""" -# # Initialise -# V, P, r_n = viterbi_naive_init(n, m, H, s, e, r) - -# for j in range(1, m): -# for i in range(n): -# # Get the vector to maximise over -# v = np.zeros(n) -# for k in range(n): -# # v[k] = e[j,np.equal(H[j,i], s[0,j]).astype(np.int64)] * V[j-1,k] -# v[k] = e[j, np.int64(np.equal(H[j, i], s[0, j]))] * V[j - 1, k] -# if k == i: -# v[k] *= 1 - r[j] + r_n[j] -# else: -# v[k] *= r_n[j] -# P[j, i] = np.argmax(v) -# V[j, i] = v[P[j, i]] - -# ll = np.log10(np.amax(V[m - 1, :])) - -# return V, P, ll - - -# @nb.jit -# def forwards_viterbi_hap_lower_mem_rescaling(n, m, n_alleles, H, s, e, r): -# """LS haploid Viterbi algorithm with even smaller memory footprint and exploits the Markov process structure.""" -# # Initialise -# V = np.zeros(n) -# for i in range(n): -# V[i] = 1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]))] -# P = np.zeros((m, n)).astype(np.int64) -# r_n = r / n -# c = np.ones(m) - -# for j in range(1, m): -# argmax = np.argmax(V) -# c[j] = V[argmax] -# V *= 1 / c[j] -# for i in range(n): -# V[i] = V[i] * (1 - r[j] + r_n[j]) -# P[j, i] = i -# if V[i] < r_n[j]: -# V[i] = r_n[j] -# P[j, i] = argmax -# V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]))] - -# ll = np.sum(np.log10(c)) + np.log10(np.max(V)) - -# return V, P, ll - - -# def forwards_viterbi_hap_naive_wrapper(n, m, alleles, H, s, e, r): -# n_alleles = check_alleles(alleles, m) -# V, P, ll = forwards_viterbi_hap_naive(n, m, n_alleles, H, s, e, r) -# return V, P, ll - -# def forwards_viterbi_hap_lower_mem_rescaling_wrapper(n, m, alleles, H, s, e, r): -# n_alleles = check_alleles(alleles, m) -# V, P, ll = forwards_viterbi_hap_lower_mem_rescaling(n, m, n_alleles, H, s, e, r) -# return V, P, ll - -# # Speedier version, variants x samples -# @nb.jit -# def backwards_viterbi_hap(m, V_last, P): -# """Run a backwards pass to determine the most likely path.""" -# # Initialise -# assert len(V_last.shape) == 1 -# path = np.zeros(m).astype(np.int64) -# path[m - 1] = np.argmax(V_last) - -# for j in range(m - 2, -1, -1): -# path[j] = P[j + 1, path[j + 1]] - -# return path - -# # DEV: This might need some work -# @nb.jit -# def path_ll_hap(n, m, H, path, s, e, r): -# """Evaluate log-likelihood path through a reference panel which results in sequence s.""" -# index = np.int64(np.equal(H[0, path[0]], s[0, 0])) -# log_prob_path = np.log10((1 / n) * e[0, index]) -# old = path[0] -# r_n = r / n - -# for l in range(1, m): -# index = np.int64(np.equal(H[l, path[l]], s[0, l])) -# current = path[l] -# same = old == current - -# if same: -# log_prob_path += np.log10((1 - r[l]) + r_n[l]) -# else: -# log_prob_path += np.log10(r_n[l]) - -# log_prob_path += np.log10(e[l, index]) -# old = current - -# return log_prob_path diff --git a/tests/test_LS_haploid_multiallelic.py b/tests/test_LS_haploid_multiallelic.py deleted file mode 100644 index 96b8bda..0000000 --- a/tests/test_LS_haploid_multiallelic.py +++ /dev/null @@ -1,163 +0,0 @@ -# Simulation -import itertools - -# Python libraries -import msprime -import numpy as np -import pytest -import tskit - -import lshmm.forward_backward.fb_haploid_variants_samples as fbh -import lshmm.vit_haploid_variants_samples as vh - - -class LSBase: - """Superclass of Li and Stephens tests.""" - - def example_haplotypes(self, ts): - - H = ts.genotype_matrix() - s = H[:, 0].reshape(1, H.shape[0]) - H = H[:, 1:] - - return H, s - - def haplotype_emission(self, mu, m): - # Define the emission probability matrix - e = np.zeros((m, 2)) - e[:, 0] = mu - e[:, 1] = 1 - mu - - return e - - def example_parameters_haplotypes(self, ts, seed=42): - """Returns an iterator over combinations of haplotype, recombination and mutation rates.""" - np.random.seed(seed) - H, s = self.example_haplotypes(ts) - n = H.shape[1] - m = ts.get_num_sites() - - alleles = [] - for variant in ts.variants(): - alleles.append(variant.alleles) - - # Here we have equal mutation and recombination - r = np.zeros(m) + 0.01 - mu = np.zeros(m) + 0.01 - r[0] = 0 - - e = self.haplotype_emission(mu, m) - - yield n, m, alleles, H, s, e, r - - # Mixture of random and extremes - rs = [np.zeros(m) + 0.999, np.zeros(m) + 1e-6, np.random.rand(m)] - - mus = [np.zeros(m) + 0.33, np.zeros(m) + 1e-6, np.random.rand(m) * 0.33] - - e = self.haplotype_emission(mu, m) - - for r, mu in itertools.product(rs, mus): - r[0] = 0 - e = self.haplotype_emission(mu, m) - yield n, m, alleles, H, s, e, r - - def assertAllClose(self, A, B): - """Assert that all entries of two matrices are 'close'""" - assert np.allclose(A, B, rtol=1e-9, atol=0.0) - - # Define a bunch of very small tree-sequences for testing a collection of parameters on - def test_simple_n_10_no_recombination(self): - ts = msprime.sim_ancestry( - samples=10, - recombination_rate=0, - random_seed=42, - sequence_length=10, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-5, random_seed=42) - assert ts.num_sites > 3 - self.verify(ts) - - def test_simple_n_6(self): - ts = msprime.sim_ancestry( - samples=6, - recombination_rate=1e-4, - random_seed=42, - sequence_length=40, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-3, random_seed=42) - assert ts.num_sites > 5 - self.verify(ts) - - def test_simple_n_8(self): - ts = msprime.sim_ancestry( - samples=8, - recombination_rate=1e-4, - random_seed=42, - sequence_length=20, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=42) - assert ts.num_sites > 5 - assert ts.num_trees > 15 - self.verify(ts) - - def test_simple_n_16(self): - ts = msprime.sim_ancestry( - samples=16, - recombination_rate=1e-2, - random_seed=42, - sequence_length=20, - population_size=10000, - ) - ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=42) - assert ts.num_sites > 5 - print(ts.num_trees) - self.verify(ts) - - def verify(self, ts): - raise NotImplementedError() - - -class FBAlgorithmBase(LSBase): - """Base for forwards backwards algorithm tests.""" - - -class TestNonTreeMethodsHap(FBAlgorithmBase): - """Test that we compute the sample likelihoods across all implementations.""" - - def verify(self, ts): - for n, m, alleles, H, s, e, r in self.example_parameters_haplotypes(ts): - F, c, ll = fbh.forwards_ls_hap(n, m, H, s, e, r, norm=False) - B = fbh.backwards_ls_hap(n, m, H, s, e, c, r) - self.assertAllClose(np.log10(np.sum(F * B, 1)), ll * np.ones(m)) - F_tmp, c_tmp, ll_tmp = fbh.forwards_ls_hap(n, m, H, s, e, r, norm=True) - B_tmp = fbh.backwards_ls_hap(n, m, H, s, e, c_tmp, r) - self.assertAllClose(ll, ll_tmp) - self.assertAllClose(np.sum(F_tmp * B_tmp, 1), np.ones(m)) - - -class VitAlgorithmBase(LSBase): - """Base for viterbi algoritm tests.""" - - -class TestNonTreeViterbiHap(VitAlgorithmBase): - """Test that we have the same log-likelihood across all implementations""" - - def verify(self, ts): - for n, m, alleles, H, s, e, r in self.example_parameters_haplotypes(ts): - - V, P, ll = vh.forwards_viterbi_hap_naive(n, m, H, s, e, r) - path = vh.backwards_viterbi_hap(m, V[m - 1, :], P) - ll_check = vh.path_ll_hap(n, m, H, path, s, e, r) - self.assertAllClose(ll, ll_check) - - V_tmp, P_tmp, ll_tmp = vh.forwards_viterbi_hap_lower_mem_rescaling( - n, m, H, s, e, r - ) - path_tmp = vh.backwards_viterbi_hap(m, V_tmp, P_tmp) - ll_check = vh.path_ll_hap(n, m, H, path_tmp, s, e, r) - self.assertAllClose(ll_tmp, ll_check) - self.assertAllClose(ll, ll_tmp) From b228d43c73a2681f5e92d15584cac4dd2fecb231 Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Tue, 30 Aug 2022 18:22:10 +0100 Subject: [PATCH 5/6] now passing lint --- lshmm/api.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/lshmm/api.py b/lshmm/api.py index 9d2f658..6137480 100644 --- a/lshmm/api.py +++ b/lshmm/api.py @@ -225,11 +225,15 @@ def forwards( ) if ploidy == 1: - forward_pass = forwards_ls_hap + forward_function = forwards_ls_hap else: - forward_pass = forward_ls_dip_loop + forward_function = forward_ls_dip_loop - (forward_array, normalisation_factor_from_forward, log_likelihood) = forward_pass( + ( + forward_array, + normalisation_factor_from_forward, + log_likelihood, + ) = forward_function( n, m, reference_panel, query, emissions, recombination_rate, norm=True ) @@ -269,11 +273,11 @@ def backwards( ) if ploidy == 1: - backward_pass = backwards_ls_hap + backward_function = backwards_ls_hap else: - backward_pass = backward_ls_dip_loop + backward_function = backward_ls_dip_loop - backwards_array = backward_pass( + backwards_array = backward_function( n, m, reference_panel, @@ -318,11 +322,11 @@ def viterbi( ) if ploidy == 1: - viterbi_forward_backward = viterbi_hap + viterbi_function = viterbi_hap else: - viterbi_forward_backward = viterbi_dip + viterbi_function = viterbi_dip - most_likely_path, log_likelihood = viterbi_forward_backward( + most_likely_path, log_likelihood = viterbi_function( n, m, reference_panel, query, emissions, recombination_rate ) @@ -359,15 +363,12 @@ def path_ll( ) if ploidy == 1: - viterbi = viterbi_hap + path_ll_function = path_ll_hap else: - viterbi = viterbi_dip + path_ll_function = path_ll_dip - if ploidy == 1: - path_ll = path_ll_hap - else: - path_ll = path_ll_dip - - ll = path_ll(n, m, reference_panel, path, query, emissions, recombination_rate) + ll = path_ll_function( + n, m, reference_panel, path, query, emissions, recombination_rate + ) return ll From c21a1b8059d33359e9d96a03c0d74643f227cfbb Mon Sep 17 00:00:00 2001 From: astheeggeggs Date: Tue, 30 Aug 2022 18:26:07 +0100 Subject: [PATCH 6/6] passed, updating veriosn to 0.0.2, ready to push to pypi --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 95f12c9..c48d56e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = lshmm -version = 0.0.1 +version = 0.0.2 long_description = file: README.md long_description_content_type = text/markdown license = MIT