Skip to content

Commit

Permalink
fine tuning HMM
Browse files Browse the repository at this point in the history
  • Loading branch information
rbpisupati committed Apr 23, 2021
1 parent c4ba20d commit bd61cdb
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 213 deletions.
53 changes: 29 additions & 24 deletions snpmatch/core/genotype_cross.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def get_segregating_snps_parents(self, parents, father):
commonSNPsCHR = np.array(self.g.g_acc.chromosomes)
commonSNPsPOS = np.array(self.g.g_acc.positions)
log.info("done!")
## only considering sites where two parents differ ---
# 1. excluding any site which is heterozygous in either parent
# 2. also excluding sites when it is NA in atleast one parent.
## you do not need a lot of markers to construct the map .. but rather perfect sites
segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
# segSNPsind = np.where((snpsP1 != snpsP2) )[0]
log.info("number of segregating snps between parents: %s", len(segSNPsind))
Expand Down Expand Up @@ -127,13 +131,14 @@ def get_parental_obs(input_gt, snpsP1_gt, snpsP2_gt, polarize = None):
snpsP1_gt_mask = numpy.ma.masked_less(numpy.ma.masked_greater(snpsP1_gt, 1), 0)
snpsP2_gt_mask = numpy.ma.masked_less(numpy.ma.masked_greater(snpsP2_gt, 1), 0)
ebPolarised[np.where((np.equal( ebTarGTs, snpsP1_gt_mask )) & (~snpsP2_gt_mask) )[0] ] = 0 ## 00
ebPolarised[np.where((np.equal( ebTarGTs, snpsP1_gt_mask )) & (snpsP2_gt_mask) )[0] ] = 3 ## 0
ebPolarised[np.where((np.equal( ebTarGTs, snpsP2_gt_mask )) & (~snpsP1_gt_mask) )[0] ] = 2 ## 11
### additional observation points
ebPolarised[np.where((np.equal( ebTarGTs, snpsP1_gt_mask )) & (snpsP2_gt_mask) )[0] ] = 3 ## 0
ebPolarised[np.where((np.equal( ebTarGTs, snpsP2_gt_mask )) & (snpsP1_gt_mask) )[0] ] = 4 ## 1
return(ebPolarised)

@staticmethod
def init_hmm(num_markers, chromosome_size = 115, recomb_rate = 3.3):
def init_hmm(num_markers, chromosome_size = 115, recomb_rate = 3.3, n_iter = 100):
"""
Function to initilize a HMM model
Input:
Expand All @@ -143,7 +148,6 @@ def init_hmm(num_markers, chromosome_size = 115, recomb_rate = 3.3):
"""
from hmmlearn import hmm
### The below transition probability is for a intercross, adapted from R/qtl
log.info("Initialising HMM")
states = ('aa', 'ab', 'bb')
observations = ('00', '01', '11', '0', '1', 'NA')
## assume A. thaliana genome size of 115 Mb
Expand All @@ -153,21 +157,25 @@ def init_hmm(num_markers, chromosome_size = 115, recomb_rate = 3.3):
## given a recombination rate of ~3.3
# probability you see a recombinant is 1 / (100 * recomb_rate)
prob_of_change = ( chromosome_size / num_markers ) * ( 1 / (100 * recomb_rate) )
transmission_prob = {
'aa': {'aa': 1 - prob_of_change, 'ab': prob_of_change/2, 'bb': prob_of_change/2},
'ab': {'aa': prob_of_change/2, 'ab': 1 - prob_of_change, 'bb': prob_of_change/2},
'bb': {'aa': prob_of_change/2, 'ab': prob_of_change/2, 'bb': 1 - prob_of_change}
}
## Since I have 6 possible observed states -- I have made an emmission matrix with such a structure.
emission_prob = {
'aa': {'00': 0.599, '01': 0.1, '11': 0.001, '0': 0.52, '1': 0, 'NA': 0.33},
'ab': {'00': 0.4, '01': 0.8, '11': 0.4, '0': 0.48, '1': 0.48, 'NA': 0.34},
'bb': {'00': 0.001, '01': 0.1, '11': 0.599, '0': 0, '1': 0.52, 'NA': 0.33}
}
model = hmm.MultinomialHMM(n_components=3)
transmission_prob = [
[1 - prob_of_change, prob_of_change/2, prob_of_change/2],
[prob_of_change/2, 1 - prob_of_change, prob_of_change/2],
[prob_of_change/2, prob_of_change/2, 1 - prob_of_change]
]
## Since there are 6 observed states . we need 3x6 matrix
## ('00', '01', '11', '0', '1', 'NA')
## 0, 1, 2, 3, 4, 5
emission_prob = [
[0.65, 0.1, 0.01, 0.5, 0.1, 0.33 ],
[0.34, 0.8, 0.34, 0.4, 0.4, 0.34 ],
[0.01, 0.1, 0.65, 0.1, 0.5, 0.33 ]
]
model = hmm.MultinomialHMM(n_components=3, n_iter = n_iter, algorithm = "viterbi")
model.startprob_ = np.array([0.25, 0.5, 0.25])
model.transmat_ = pd.DataFrame(transmission_prob)
model.emissionprob_ = pd.DataFrame(emission_prob).T
model.emissionprob_ = pd.DataFrame(emission_prob)
log.info("initialising HMM with transition prob")
log.info("%s" % pd.DataFrame(transmission_prob) )
return(model)

def genotype_cross_hmm(self, input_file):
Expand All @@ -184,17 +192,17 @@ def genotype_cross_hmm(self, input_file):
g_chr_names = genome.chrs[pd.Series(self.commonSNPsCHR, dtype = str).apply(genome.get_chr_ind)]
allSNPGenos = pd.DataFrame( - np.ones((0,samples_gt.shape[1]), dtype=int) )
allSNPGenos_raw = pd.DataFrame( - np.ones((0,samples_gt.shape[1]), dtype=int) )
t_model = self.init_hmm( samples_gt.shape[0] )
for ec, eclen in zip(genome.chrs_ids, genome.chrlen):
reqPOSind = np.where(g_chr_names == ec)[0]
reqTARind = np.where( input_chr_names == ec )[0]
ebAccsInds, ebTarInds = self._get_common_positions_ixs(self.commonSNPsPOS, snpvcf['pos'], reqPOSind, reqTARind)
t_model = self.init_hmm( len(ebAccsInds), eclen / 1000000 )
t_genotypes = pd.DataFrame( - np.ones((len(ebAccsInds),samples_gt.shape[1]), dtype=int), index = ec + ":" + pd.Series(self.commonSNPsPOS[ebAccsInds]).astype(str) )
t_genotypes_raw = pd.DataFrame( - np.ones((len(ebAccsInds),samples_gt.shape[1]), dtype=int), index = ec + ":" + pd.Series(self.commonSNPsPOS[ebAccsInds]).astype(str) )
for sample_ix in range(samples_gt.shape[1]):
ebin_gt_polarized = self.get_parental_obs(samples_gt.iloc[ebTarInds,sample_ix].values, self.snpsP1[ebAccsInds], self.snpsP2[ebAccsInds])
t_genotypes_raw.iloc[:, sample_ix] = ebin_gt_polarized
t_genotypes.iloc[:, sample_ix] = t_model.predict(ebin_gt_polarized.reshape((-1, 1)))
t_genotypes.iloc[:, sample_ix] = t_model.decode(ebin_gt_polarized.reshape((-1, 1)), algorithm='viterbi')[1]
allSNPGenos = allSNPGenos.append(t_genotypes)
allSNPGenos_raw = allSNPGenos_raw.append(t_genotypes_raw)
pos_em_cm = pd.Series(allSNPGenos.index, index = allSNPGenos.index).str.replace(":",",").apply(genome.estimated_cM_distance)
Expand Down Expand Up @@ -278,15 +286,12 @@ def write_output_genotype_cross(outfile_str, output_file ):


def uniq_neighbor(a):
sorted_a_count = np.zeros(0, dtype = int)
sorted_a = np.zeros(0, dtype = a.dtype)
for ef_ix in range(0, len(a)):
sorted_a = np.array(a[0], dtype = a.dtype)
sorted_a_count = np.array([1], dtype = int)
for ef_ix in range(1, len(a)):
if a[ef_ix] != a[ef_ix-1]:
sorted_a = np.append(sorted_a, a[ef_ix])
sorted_a_count = np.append(sorted_a_count, 1)
elif np.sum(sorted_a_count) == 0:
sorted_a = np.append(sorted_a, a[ef_ix])
sorted_a_count[-1] += 1
elif a[ef_ix] == a[ef_ix-1]:
sorted_a_count[-1] += 1
return((sorted_a, sorted_a_count))
Expand Down
Loading

0 comments on commit bd61cdb

Please sign in to comment.