diff --git a/README.md b/README.md
index 31c5731..f993099 100644
--- a/README.md
+++ b/README.md
@@ -68,10 +68,10 @@ VCF file in a default format in the [link](http://gatkforums.broadinstitute.org/
 SNPmatch can be run as bash commands given below. A detailed manual for each command with -h.
 
 ```bash
-snpmatch inbred -i input_file -d db.hdf5 -e db.acc.hdf5 -o output_file
+snpmatch inbred -v -i input_file -d db.hdf5 -e db.acc.hdf5 -o output_file
 # or
-snpmatch parser -i input_file -o input_npz
-snpmatch inbred -i input_npz -d db.hdf5 -e db.acc.hdf5 -o output_file
+snpmatch parser -v -i input_file -o input_npz
+snpmatch inbred -v -i input_npz -d db.hdf5 -e db.acc.hdf5 -o output_file
 ```
 
 ### AraGeno
@@ -101,11 +101,8 @@ It might be easier to parse this file using [json editor](https://docs.python.or
 SNPmatch can be used to identify hybrid individuals when parental strains are present in database. For such individuals, SNPmatch can be run in windows across the genome. The commands used to run are given below
 
 ```bash
-snpmatch cross -d db.hdf5 -e db.acc.hdf5 -i input_file -b window_size_in_bp -o output_file
+snpmatch cross -v -d db.hdf5 -e db.acc.hdf5 -i input_file -b window_size_in_bp -o output_file
 #to identify the windows matching to each parent in a hybrid
-snpmatch genotype_cross -e db.acc.hdf5 -p parent1xparent2 -i input_file -o output_file
-# or if parents have VCF files individually
-snpmatch genotype_cross -p parent1.vcf -q parent2.vcf -i input_file -o output_file
 ```
 
 These scripts are implemented based on the *A. thaliana* genome sizes. But the global variable in csmatch [script](https://github.com/Gregor-Mendel-Institute/SNPmatch/blob/master/snpmatch/core/csmatch.py#L19) can be modified to the corresponding genome sizes.
@@ -131,6 +128,30 @@ Filtering this table by column 7 having 1 would result in homozygous windows.
 
 The file containing the list of matched strains, list of homozygous windows and strains matched to them and along with a simple interpretation.
 
+## Identifying underlying haplotype for an experimental cross
+
+For a given hybrid sample and its parents, SNPmatch can determine the underlying haplotype structure (homozygous or heterozygous) in windows across the genome.
+
+```bash
+snpmatch genotype_cross -v -e db.acc.hdf5 -p "parent1xparent2" -i input_file -o output_file -b window_size
+# or if parents have VCF files individually
+snpmatch genotype_cross -v -p parent1.vcf -q parent2.vcf -i input_file -o output_file -b window_size
+```
+
+The genotypes can also be modelled with a hidden Markov model (HMM; requires the [hmmlearn](https://github.com/hmmlearn/hmmlearn) python package) by running the above command with `--hmm`. The starting probabilities are based on Mendelian segregation (1:2:1 for an F2); it might be necessary to change them for higher-generation crosses. The transition probability matrix is adapted from R/qtl (Broman 2009, doi:10.1007/978-0-387-92125-9).
+
+The output file is a tab-delimited file as shown below.
+
+|1|2|3|4|5|6|7|
+|---|---|---|---|---|---|---|
+|1|1|300000|14|1114|NA|1.47,1.64,1.00|
+|1|300001|600000|19|1248|2|2.46,2.29,1.00|
+|1|600001|900000|8|1018|2|nan,3.28,1.00|
+|1|900001|1200000|15|1036|2|2.83,2.59,1.00|
+|1|1200001|1500000|12|995|2|2.71,2.71,1.00|
+
+The columns are: chromosome ID, start position of the window, end position, number of sample SNPs in the window, number of segregating SNPs, underlying genotype (0, 1 and 2 for homozygous parent1, heterozygous and homozygous parent2, respectively) and the likelihood ratio test statistic for each genotype (or the number of SNPs supporting each genotype when `--hmm` is used).
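+
+For a quick look at this table, one could load it with pandas, for example as sketched below. This is only an illustration; `output_file` stands in for the actual table written under the `-o` prefix given above.
+
+```python
+import pandas as pd
+
+# column names follow the description above
+cols = ["chr", "start", "end", "num_sample_snps", "num_segregating_snps", "genotype", "score"]
+win = pd.read_csv("output_file", sep="\t", header=None, names=cols)
+
+# tally the genotype calls; windows without informative SNPs ("NA") are read as NaN
+print(win["genotype"].value_counts(dropna=False))
+```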
+
 ## Contributing
 1. Fork it!
 2. Create your feature branch: `git checkout -b my-new-feature`
@@ -138,13 +159,6 @@
 4. Push to the branch: `git push origin my-new-feature`
 5. Submit a pull request :D
 
-## History
-
- 1.9.2: Stable version, 24-08-2017
- 2.0.0: Stable version, 26-01-2018
- 2.1.0: Stable version, 09-08-2018
-
-
 ## Credits
 
 - Rahul Pisupati (rahul.pisupati[at]gmi.oeaw.ac.at)
diff --git a/setup.py b/setup.py
index 2688cc5..13cfbea 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='SNPmatch',
-    version='2.2.0',
+    version='2.5.0',
     description='A simple python library to identify the most likely strain given the SNPs for a sample',
     long_description=long_description,
     url='https://github.com/Gregor-Mendel-Institute/SNPmatch',
diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
index 16e046f..ffca5a0 100644
--- a/snpmatch/__init__.py
+++ b/snpmatch/__init__.py
@@ -16,8 +16,8 @@ from snpmatch.core import simulate
 import logging, logging.config
 
-__version__ = '2.2.0'
-__updated__ = "14.08.2018"
+__version__ = '2.5.0'
+__updated__ = "05.12.2018"
 __date__ = "25.10.2016"
 
 def setLog(logDebug):
diff --git a/snpmatch/core/genotype_cross.py b/snpmatch/core/genotype_cross.py
index 8aff0a9..faf78b3 100644
--- a/snpmatch/core/genotype_cross.py
+++ b/snpmatch/core/genotype_cross.py
@@ -17,7 +17,7 @@ log = logging.getLogger(__name__)
 
 # Arabidopsis chromosome length
-chrlen = np.array((30427671, 19698289, 23459830, 18585056, 26975502, 154478, 154478))
+chrlen = np.array((30427671, 19698289, 23459830, 18585056, 26975502))
 tair_chrs = ['1', '2', '3', '4', '5']
 mean_recomb_rates = [3.4, 3.6, 3.5, 3.8, 3.6] ## cM/Mb ## Salome, P et al. 2011
 
@@ -149,55 +149,68 @@ def genotype_each_cross(self, input_file, lr_thres):
         log.info("done!")
         return(outfile_str)
 
+    @staticmethod
+    def get_probabilities_ebin(input_gt, snpsP1_gt, snpsP2_gt, error_prob = 0.01):
+        num_snps = len(input_gt)
+        ebTarGTs = parsers.parseGT(input_gt)
+        ebPolarised = np.zeros(num_snps, dtype=int)
+        ebPolarised[np.where(np.equal( ebTarGTs, snpsP1_gt ))[0] ] = 0
+        ebPolarised[np.where(np.equal( ebTarGTs, snpsP2_gt ))[0] ] = 2
+        ebPolarised[np.where(np.equal( ebTarGTs, np.repeat(2, num_snps) ))[0] ] = 1
+        eb_probs = np.repeat(error_prob / 2, num_snps * 3).reshape((-3, 3))
+        for i_ix in range(num_snps):
+            eb_probs[i_ix,ebPolarised[i_ix]] = 1 - error_prob
+        return( eb_probs )
+
+    def genotype_each_cross_hmm(self, input_file, n_marker_thres):
+        self._inputs = parsers.ParseInputs(inFile = input_file, logDebug = self.logDebug)
+        ## Inputs is the ParseInputs class object
+        self._inputs.get_chr_list()
         from hmmlearn import hmm
-        r_i = float(self.window_size) * 0.38 /1000000
-        assert r_i < 1, "Provide either small window size or check the mean recombination rate"
-        log.info("recombination fraction between windows: %.2f" % r_i)
         ### The below transition probability is for a intercross, adapted from R/qtl
         log.info("running HMM")
-        transprob = np.array( [[ (1-r_i)**2, 2*r_i*(1-r_i), r_i**2 ],
-            [r_i*(1-r_i), (1-r_i)**2 + r_i**2, r_i*(1-r_i)],
-            [ r_i**2, 2*r_i*(1-r_i), (1-r_i)**2 ]] )
-        model = hmm.GaussianHMM(n_components=3, covariance_type="full", init_params="mcs")
+        #r_i = float(self.window_size) * 0.38 /1000000
+        r_i = 4000 * 3.8 /1000000
+        e_p = 0.01
+        assert r_i < 1, "Provide either small window size or check the mean recombination rate"
+        log.info("recombination fraction between windows: %.2f" % r_i)
+        transprob = np.array( [[ (1-r_i)**2, 2*r_i*(1-r_i), r_i**2 ], [r_i*(1-r_i), (1-r_i)**2 + r_i**2, r_i*(1-r_i)], [ r_i**2, 2*r_i*(1-r_i), (1-r_i)**2 ]] )
+        log.info("error rate for genotyping: %.2f" % e_p)
+        model = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter = 1000)
         model.startprob_ = np.array([0.25, 0.5, 0.25])
         model.transmat_ = transprob
-        model.means_ = np.array([[1, 0, 0], [0.08, 0.84, 0.08], [0, 0, 1]])
-        model.covars_ = np.tile(np.identity(3), (3, 1, 1))
-        ## Choose a small window that no recombination occur
-        inputs = parsers.ParseInputs(inFile = input_file, logDebug = self.logDebug)
-        ## Inputs is the ParseInputs class object
         log.info("Transision matrix: %s" % transprob)
+        model.means_ = np.array( [ [1, 0, 0], [0, 1, 0], [0, 0, 1] ] )
+        model.covars_ = np.tile(np.identity(3), (3, 1, 1))
+        allSNPProbs = {}
+        allSNPGenos = np.zeros(0, dtype=int)
+        for ec, eclen in itertools.izip(tair_chrs, chrlen):
+            reqPOSind = np.where(self.commonSNPsCHR == ec)[0]
+            reqPOS = self.commonSNPsPOS[reqPOSind]
+            reqTARind = np.where( self._inputs.g_chrs == ec )[0]
+            perchrTarPos = self._inputs.pos[reqTARind]
+            ebAccsInds_rel = np.where( np.in1d(reqPOS, perchrTarPos) )[0]
+            ebAccsInds = reqPOSind[ ebAccsInds_rel ]
+            ebTarInds = reqTARind[ np.where( np.in1d(perchrTarPos, reqPOS) )[0] ]
+            allSNPProbs[ec] = np.zeros((len(reqPOSind), 3))
+            ebin_prob = self.get_probabilities_ebin(self._inputs.gt[ebTarInds], self.snpsP1[ebAccsInds], self.snpsP2[ebAccsInds])
+            allSNPProbs[ec][ebAccsInds_rel,] = ebin_prob
+            allSNPGenos = np.append(allSNPGenos, model.predict( allSNPProbs[ec] ))
         iter_bins_genome = csmatch.get_bins_arrays(self.commonSNPsCHR, self.commonSNPsPOS, self.window_size)
-        iter_bins_snps = csmatch.get_bins_arrays(inputs.chrs, inputs.pos, self.window_size)
-        allGenotypeProbs = {}
-        for ec in tair_chrs:
-            allGenotypeProbs[ec] = np.zeros((0, 3))
+        iter_bins_snps = csmatch.get_bins_arrays(self._inputs.chrs, self._inputs.pos, self.window_size)
         outfile_str = np.zeros(0, dtype="string")
         for e_b, e_s in itertools.izip(iter_bins_genome, iter_bins_snps):
-            # first snp positions which are segregating and are in this window
-            bin_str = tair_chrs[e_b[0]] + "\t" + str(e_b[1][0]) + "\t" + str(e_b[1][1])
-            reqPOS = self.commonSNPsPOS[e_b[2]]
-            perchrTarPos = inputs.pos[e_s[2]]
-            ebAccsInds = np.array(e_b[2])[ np.where( np.in1d(reqPOS, perchrTarPos) )[0] ]
-            ebTarInds = np.array(e_s[2], dtype=int)[ np.where( np.in1d(perchrTarPos, reqPOS) )[0] ]
-            ebTarGTs = parsers.parseGT(inputs.gt[ebTarInds])
-            if len(ebTarInds) <= n_marker_thres:
-                ebin_prob = np.array((0, 0, 0))
+            bin_str = tair_chrs[e_b[0]] + "\t" + str(e_b[1][0]) + "\t" + str(e_b[1][1]) + "\t" + str(len(e_s[2])) + "\t" + str(len(e_b[2]))
+            t_genos = np.unique(allSNPGenos[e_b[2]])
+            t_genos_nums = pd.Series([ len( np.where( allSNPGenos[e_b[2]] == e )[0] ) for e in [0, 1, 2]]).astype(str).str.cat(sep=",")
+            if len(e_b[2]) > 0:
+                if len(t_genos) == 1:
+                    outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, t_genos[0], t_genos_nums) )
+                else:
+                    outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, 1, t_genos_nums) )
             else:
-                matP1no = float(len(np.where(np.equal( ebTarGTs, self.snpsP1[ebAccsInds] ))[0]))/len(ebTarInds)
-                matP2no = float(len(np.where(np.equal( ebTarGTs, self.snpsP2[ebAccsInds] ))[0]))/len(ebTarInds)
-                matHetno = float(len(np.where(np.equal( ebTarGTs, np.repeat(2, len(ebTarInds)) ))[0]))/len(ebTarInds)
-                ebin_prob = np.array( (matP1no,matHetno,matP2no) )
-            allGenotypeProbs[tair_chrs[e_b[0]]] = np.vstack( (allGenotypeProbs[tair_chrs[e_b[0]]], ebin_prob) )
-            #allGenotypeProbs =
-            outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, len(ebTarInds), len(e_b[2])))
-        allGenos = np.zeros(0)
-        allGenos_prob = np.zeros(0, dtype="string")
-        for ec in tair_chrs:
-            allGenos = np.append(allGenos, model.predict( allGenotypeProbs[ec] ))
-            allGenos_prob = np.append(allGenos_prob, pd.DataFrame(allGenotypeProbs[ec]).apply(lambda x: ','.join(x.round(2).astype(str)), axis=1) )
-        outfile_str = np.array(pd.Series(outfile_str) + "\t" + pd.Series(allGenos).astype(int).astype(str) + "\t" + pd.Series(allGenos_prob) )
+                outfile_str = np.append(outfile_str, "%s\t%s\t%s" % (bin_str, 'NA', 'NA' ) )
         return(outfile_str)
diff --git a/snpmatch/core/parsers.py b/snpmatch/core/parsers.py
index 8e59be9..d1691d2 100644
--- a/snpmatch/core/parsers.py
+++ b/snpmatch/core/parsers.py
@@ -142,6 +142,10 @@ def filter_chr_names(self, g):
         self.chrs_nochr = np.array(pd.Series(self.chrs).str.replace("chr", "", case=False), dtype="string")
         self.filter_inds_ix = np.where( np.in1d( self.chrs_nochr, self.chr_list ) )[0]
 
+    def get_chr_list(self):
+        self.g_chrs = np.char.replace(np.core.defchararray.lower(np.array(self.chrs, dtype="string")), "chr", "")
+        self.g_chrs_uq = np.unique(self.g_chrs)
+
 def potatoParser(inFile, logDebug, outFile = "parser"):
     inputs = ParseInputs(inFile, logDebug, outFile)
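Taken together, the new HMM path in `genotype_each_cross_hmm` polarises each sample SNP against the two parents, turns it into a near-one-hot emission vector with the error rate spread over the other two states (as in `get_probabilities_ebin`), and lets a three-state GaussianHMM decode the genotype along each chromosome. The following standalone sketch mimics that setup on made-up data; the toy observations and segment lengths are invented, while the start probabilities, transition matrix, error rate and model parameters mirror the values in the patch above.

```python
import numpy as np
from hmmlearn import hmm

r = 4000 * 3.8 / 1000000   # recombination fraction between adjacent SNPs, as in the patch
e = 0.01                   # genotyping error rate

model = hmm.GaussianHMM(n_components=3, covariance_type="full", n_iter=1000)
model.startprob_ = np.array([0.25, 0.5, 0.25])      # 1:2:1 Mendelian expectation for an F2
model.transmat_ = np.array([[(1 - r)**2, 2 * r * (1 - r), r**2],
                            [r * (1 - r), (1 - r)**2 + r**2, r * (1 - r)],
                            [r**2, 2 * r * (1 - r), (1 - r)**2]])
model.means_ = np.identity(3)                       # one "ideal" emission vector per state
model.covars_ = np.tile(np.identity(3), (3, 1, 1))

# toy sample: 15 SNPs matching parent1, 15 heterozygous, 15 matching parent2,
# encoded as near-one-hot vectors the way get_probabilities_ebin encodes them
true_states = np.repeat([0, 1, 2], 15)
obs = np.full((len(true_states), 3), e / 2)
obs[np.arange(len(true_states)), true_states] = 1 - e

decoded = model.predict(obs)
print(np.array_equal(decoded, true_states))         # should print True for this clean toy data
```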