HMM for genotype cross, version up

Gregor-Mendel-Institute · Dec 5, 2018 · c5e4723 · c5e4723
1 parent f108d15
commit c5e4723
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 56 deletions.
diff --git a/README.md b/README.md
@@ -68,10 +68,10 @@ VCF file in a default format in the [link](http://gatkforums.broadinstitute.org/
 SNPmatch can be run as bash commands given below. A detailed manual for each command with -h.
 
 ```bash
-snpmatch inbred -i input_file -d db.hdf5 -e db.acc.hdf5 -o output_file
+snpmatch inbred -v -i input_file -d db.hdf5 -e db.acc.hdf5 -o output_file
 # or
-snpmatch parser -i input_file -o input_npz
-snpmatch inbred -i input_npz -d db.hdf5 -e db.acc.hdf5 -o output_file
+snpmatch parser -v -i input_file -o input_npz
+snpmatch inbred -v -i input_npz -d db.hdf5 -e db.acc.hdf5 -o output_file
 ```
 
 ### AraGeno
@@ -101,11 +101,8 @@ It might be easier to parse this file using [json editor](https://docs.python.or
 SNPmatch can be used to identify hybrid individuals when parental strains are present in database. For such individuals, SNPmatch can be run in windows across the genome. The commands used to run are given below
 
 ```bash
-snpmatch cross -d db.hdf5 -e db.acc.hdf5 -i input_file -b window_size_in_bp -o output_file
+snpmatch cross -v -d db.hdf5 -e db.acc.hdf5 -i input_file -b window_size_in_bp -o output_file
 #to identify the windows matching to each parent in a hybrid
-snpmatch genotype_cross -e db.acc.hdf5 -p parent1xparent2 -i input_file -o output_file
-# or if parents have VCF files individually
-snpmatch genotype_cross -p parent1.vcf -q parent2.vcf -i input_file -o output_file
 ```
 
 These scripts are implemented based on the *A. thaliana* genome sizes. But the global variable in csmatch [script](https://github.com/Gregor-Mendel-Institute/SNPmatch/blob/master/snpmatch/core/csmatch.py#L19) can be modified to the corresponding genome sizes.
@@ -131,20 +128,37 @@ Filtering this table by column 7 having 1 would result in homozygous windows.
 The file containing the list of matched strains, list of homozygous windows and strains matched to them and along with a simple interpretation.
 
 
+## Identifying underlying haplotype for a experimental cross
+
+For a given hybird sample and its parents, SNPmatch can determine the underlying haplotype structure (homozygous or heterozygous).
+
+```bash
+snpmatch genotype_cross -v -e db.acc.hdf5 -p "parent1xparent2" -i input_file -o output_file -b window_size
+# or if parents have VCF files individually
+snpmatch genotype_cross -v -p parent1.vcf -q parent2.vcf -i input_file -o output_file -b window_size
+```
+
+One can implement this by considering a Markhof chain (HMM, requires [hmmlearn](https://github.com/hmmlearn/hmmlearn) python package), by running above command using `--hmm`. The starting probabilities are based on mendel segregation (1:2:1, for F2), might be necessary to change them when implementing for higher crosses. The transition probability matrix is adapted from R/qtl (Browman 2009, doi:10.1007/978-0-387-92125-9).
+
+The output file is a tab delimited file as below.
+
+|1|2|3|4|5|6|7|
+|---|---|---|---|---|---|---|
+1|1|300000|14|1114|NA|1.47,1.64,1.00|
+1|300001|600000|19|1248|2|2.46,2.29,1.00|
+1|600001|900000|8|1018|2|nan,3.28,1.00|
+1|900001|1200000|15|1036|2|2.83,2.59,1.00|
+1|1200001|1500000|12|995| 2|2.71,2.71,1.00|
+
+The columns are Chromosome ID, start position of window, end position, number of SNPs from sample in a window, number of segregating SNPs, underlying genotype (0, 1, 2 for homozygous parent1, heterozygous and homozygous parent2), likelihood ratio test statistic for each genotype (or number of SNPs each genotype under HMM).
+
 ## Contributing
 1. Fork it!
 2. Create your feature branch: `git checkout -b my-new-feature`
 3. Commit your changes: `git commit -am 'Add some feature'`
 4. Push to the branch: `git push origin my-new-feature`
 5. Submit a pull request :D
 
-## History
-
-- 1.9.2: Stable version, 24-08-2017
-- 2.0.0: Stable version, 26-01-2018
-- 2.1.0: Stable version, 09-08-2018
-
-
 ## Credits
 
 - Rahul Pisupati (rahul.pisupati[at]gmi.oeaw.ac.at)

diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='SNPmatch',
-    version='2.2.0',
+    version='2.5.0',
     description='A simple python library to identify the most likely strain given the SNPs for a sample',
     long_description=long_description,
     url='https://github.com/Gregor-Mendel-Institute/SNPmatch',

diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
@@ -16,8 +16,8 @@
 from snpmatch.core import simulate
 import logging, logging.config
 
-__version__ = '2.2.0'
-__updated__ = "14.08.2018"
+__version__ = '2.5.0'
+__updated__ = "05.12.2018"
 __date__ = "25.10.2016"
 
 def setLog(logDebug):

diff --git a/snpmatch/core/genotype_cross.py b/snpmatch/core/genotype_cross.py
@@ -17,7 +17,7 @@
 log = logging.getLogger(__name__)
 
 # Arabidopsis chromosome length
-chrlen = np.array((30427671, 19698289, 23459830, 18585056, 26975502, 154478, 154478))
+chrlen = np.array((30427671, 19698289, 23459830, 18585056, 26975502))
 tair_chrs = ['1', '2', '3', '4', '5']
 mean_recomb_rates = [3.4, 3.6, 3.5, 3.8, 3.6]  ## cM/Mb ## Salome, P et al. 2011
 
@@ -149,55 +149,68 @@ def genotype_each_cross(self, input_file, lr_thres):
         log.info("done!")
         return(outfile_str)
 
+    @staticmethod
+    def get_probabilities_ebin(input_gt, snpsP1_gt, snpsP2_gt, error_prob = 0.01):
+        num_snps = len(input_gt)
+        ebTarGTs = parsers.parseGT(input_gt)
+        ebPolarised = np.zeros(num_snps, dtype=int)
+        ebPolarised[np.where(np.equal( ebTarGTs, snpsP1_gt ))[0] ] = 0
+        ebPolarised[np.where(np.equal( ebTarGTs, snpsP2_gt ))[0] ] = 2
+        ebPolarised[np.where(np.equal( ebTarGTs, np.repeat(2, num_snps) ))[0] ] = 1
+        eb_probs = np.repeat(error_prob / 2, num_snps * 3).reshape((-3, 3))
+        for i_ix in range(num_snps):
+            eb_probs[i_ix,ebPolarised[i_ix]] = 1 - error_prob
+        return( eb_probs )
+
+
     def genotype_each_cross_hmm(self, input_file, n_marker_thres):
+        self._inputs = parsers.ParseInputs(inFile = input_file, logDebug = self.logDebug)
+        ## Inputs is the ParseInputs class object
+        self._inputs.get_chr_list()
         from hmmlearn import hmm
-        r_i = float(self.window_size) * 0.38 /1000000
-        assert r_i < 1, "Provide either small window size or check the mean recombination rate"
-        log.info("recombination fraction between windows: %.2f" % r_i)
         ### The below transition probability is for a intercross, adapted from R/qtl
         log.info("running HMM")
-        transprob = np.array( [[ (1-r_i)**2, 2*r_i*(1-r_i), r_i**2 ],
-                               [r_i*(1-r_i), (1-r_i)**2 + r_i**2, r_i*(1-r_i)],
-                               [ r_i**2, 2*r_i*(1-r_i), (1-r_i)**2 ]] )
-        model = hmm.GaussianHMM(n_components=3, covariance_type="full", init_params="mcs")
+        #r_i = float(self.window_size) * 0.38 /1000000
+        r_i = 4000 * 3.8 /1000000
+        e_p = 0.01
+        assert r_i < 1, "Provide either small window size or check the mean recombination rate"
+        log.info("recombination fraction between windows: %.2f" % r_i)
+        transprob = np.array( [[ (1-r_i)**2, 2*r_i*(1-r_i), r_i**2 ], [r_i*(1-r_i), (1-r_i)**2 + r_i**2, r_i*(1-r_i)], [ r_i**2, 2*r_i*(1-r_i), (1-r_i)**2 ]] )
+        log.info("error rate for genotyping: %.2f" % e_p)
+        model = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter = 1000)
         model.startprob_ = np.array([0.25, 0.5, 0.25])
         model.transmat_ = transprob
-        model.means_ = np.array([[1, 0, 0], [0.08, 0.84, 0.08], [0, 0, 1]])
-        model.covars_ = np.tile(np.identity(3), (3, 1, 1))
-        ## Choose a small window that no recombination occur
-        inputs = parsers.ParseInputs(inFile = input_file, logDebug = self.logDebug)
-        ## Inputs is the ParseInputs class object
         log.info("Transision matrix: %s" % transprob)
+        model.means_ = np.array( [ [1, 0, 0], [0, 1, 0], [0, 0, 1] ] )
+        model.covars_ = np.tile(np.identity(3), (3, 1, 1))
+        allSNPProbs = {}
+        allSNPGenos = np.zeros(0, dtype=int)
+        for ec, eclen in itertools.izip(tair_chrs, chrlen):
+            reqPOSind =  np.where(self.commonSNPsCHR == ec)[0]
+            reqPOS = self.commonSNPsPOS[reqPOSind]
+            reqTARind = np.where( self._inputs.g_chrs == ec )[0]
+            perchrTarPos = self._inputs.pos[reqTARind]
+            ebAccsInds_rel = np.where( np.in1d(reqPOS, perchrTarPos) )[0]
+            ebAccsInds = reqPOSind[ ebAccsInds_rel ]
+            ebTarInds = reqTARind[ np.where( np.in1d(perchrTarPos, reqPOS) )[0] ]
+            allSNPProbs[ec] = np.zeros((len(reqPOSind), 3))
+            ebin_prob = self.get_probabilities_ebin(self._inputs.gt[ebTarInds], self.snpsP1[ebAccsInds], self.snpsP2[ebAccsInds])
+            allSNPProbs[ec][ebAccsInds_rel,] = ebin_prob
+            allSNPGenos = np.append(allSNPGenos, model.predict( allSNPProbs[ec] ))
         iter_bins_genome = csmatch.get_bins_arrays(self.commonSNPsCHR, self.commonSNPsPOS, self.window_size)
-        iter_bins_snps = csmatch.get_bins_arrays(inputs.chrs, inputs.pos, self.window_size)
-        allGenotypeProbs = {}
-        for ec in tair_chrs:
-            allGenotypeProbs[ec] = np.zeros((0, 3))
+        iter_bins_snps = csmatch.get_bins_arrays(self._inputs.chrs, self._inputs.pos, self.window_size)
         outfile_str = np.zeros(0, dtype="string")
         for e_b, e_s in itertools.izip(iter_bins_genome, iter_bins_snps):
-            # first snp positions which are segregating and are in this window
-            bin_str = tair_chrs[e_b[0]] + "\t" + str(e_b[1][0]) + "\t" + str(e_b[1][1])
-            reqPOS = self.commonSNPsPOS[e_b[2]]
-            perchrTarPos = inputs.pos[e_s[2]]
-            ebAccsInds = np.array(e_b[2])[ np.where( np.in1d(reqPOS, perchrTarPos) )[0] ]
-            ebTarInds = np.array(e_s[2], dtype=int)[ np.where( np.in1d(perchrTarPos, reqPOS) )[0] ]
-            ebTarGTs = parsers.parseGT(inputs.gt[ebTarInds])
-            if len(ebTarInds) <= n_marker_thres:
-                ebin_prob = np.array((0, 0, 0))
+            bin_str = tair_chrs[e_b[0]] + "\t" + str(e_b[1][0]) + "\t" + str(e_b[1][1]) + "\t" + str(len(e_s[2])) + "\t" + str(len(e_b[2]))
+            t_genos = np.unique(allSNPGenos[e_b[2]])
+            t_genos_nums = pd.Series([ len( np.where( allSNPGenos[e_b[2]] == e )[0] )  for e in [0, 1, 2]]).astype(str).str.cat(sep=",")
+            if len(e_b[2]) > 0:
+                if len(t_genos) == 1:
+                    outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, t_genos[0], t_genos_nums) )
+                else:
+                    outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, 1, t_genos_nums) )
             else:
-                matP1no = float(len(np.where(np.equal( ebTarGTs, self.snpsP1[ebAccsInds] ))[0]))/len(ebTarInds)
-                matP2no = float(len(np.where(np.equal( ebTarGTs, self.snpsP2[ebAccsInds] ))[0]))/len(ebTarInds)
-                matHetno = float(len(np.where(np.equal( ebTarGTs, np.repeat(2, len(ebTarInds)) ))[0]))/len(ebTarInds)
-                ebin_prob = np.array( (matP1no,matHetno,matP2no) )
-            allGenotypeProbs[tair_chrs[e_b[0]]] = np.vstack( (allGenotypeProbs[tair_chrs[e_b[0]]], ebin_prob) )
-            #allGenotypeProbs =
-            outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, len(ebTarInds), len(e_b[2])))
-        allGenos = np.zeros(0)
-        allGenos_prob = np.zeros(0, dtype="string")
-        for ec in tair_chrs:
-            allGenos = np.append(allGenos, model.predict( allGenotypeProbs[ec] ))
-            allGenos_prob = np.append(allGenos_prob, pd.DataFrame(allGenotypeProbs[ec]).apply(lambda x: ','.join(x.round(2).astype(str)), axis=1) )
-        outfile_str  = np.array(pd.Series(outfile_str) + "\t" + pd.Series(allGenos).astype(int).astype(str) + "\t" + pd.Series(allGenos_prob) )
+                outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, 'NA', 'NA' ) )
         return(outfile_str)
 
 

diff --git a/snpmatch/core/parsers.py b/snpmatch/core/parsers.py
@@ -142,6 +142,10 @@ def filter_chr_names(self, g):
         self.chrs_nochr = np.array(pd.Series(self.chrs).str.replace("chr", "", case=False), dtype="string")
         self.filter_inds_ix = np.where( np.in1d( self.chrs_nochr, self.chr_list ) )[0]
 
+    def get_chr_list(self):
+        self.g_chrs = np.char.replace(np.core.defchararray.lower(np.array(self.chrs, dtype="string")), "chr", "")
+        self.g_chrs_uq = np.unique(self.g_chrs)
+
 
 def potatoParser(inFile, logDebug, outFile = "parser"):
     inputs = ParseInputs(inFile, logDebug, outFile)