Skip to content

Commit

Permalink
few changes and bugs. version up
Browse files Browse the repository at this point in the history
  • Loading branch information
rbpisupati committed Jan 26, 2018
1 parent 31d4241 commit 01df5ec
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 156 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
- 1.7.2: Stable version, 15-12-2016
- 1.8.2: Stable version, 16-02-2017
- 1.9.2: Stable version, 24-08-2017
- 2.0.0: Stable version, 26-01-2018


## Credits
Expand All @@ -116,4 +117,3 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g

Pisupati, R. *et al.*. Verification of *Arabidopsis* stock collections using SNPmatch, a tool for genotyping high-plexed samples. *Nature Scientific Data* **4**, 170184 (2017).
[doi:10.1038/sdata.2017.184](https://www.nature.com/articles/sdata2017184)

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='SNPmatch',
version='1.9.2',
version='2.0.0',
description='A simple python library to identify the most likely strain given the SNPs for a sample',
long_description=long_description,
url='https://github.com/Gregor-Mendel-Institute/SNPmatch',
Expand Down
14 changes: 10 additions & 4 deletions snpmatch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from snpmatch.core import simulate
import logging, logging.config

__version__ = '1.9.2'
__updated__ = "31.8.2017"
__version__ = '2.0.0'
__updated__ = "26.01.2018"
__date__ = "25.10.2016"

def setLog(logDebug):
Expand Down Expand Up @@ -47,6 +47,7 @@ def get_options(program_license,program_version_message):
inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
inbred_parser.set_defaults(func=snpmatch_inbred)

cross_parser = subparsers.add_parser('cross', help="SNPmatch on the crosses (F2s and F3s) of A. thaliana")
cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
Expand All @@ -55,6 +56,7 @@ def get_options(program_license,program_version_message):
cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
cross_parser.set_defaults(func=snpmatch_cross)

genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
genocross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
Expand All @@ -69,18 +71,21 @@ def get_options(program_license,program_version_message):
parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
parser.add_argument("-o", "--output", dest="outFile", help="output + .npz file is generater required for SNPmatch")
parser.set_defaults(func=snpmatch_parser)

pairparser = subparsers.add_parser('pairsnp', help="pairwise comparison of two snp files")
pairparser.add_argument("-i", "--input_file_1", dest="inFile_1", help="VCF/BED file for the variants in the sample one")
pairparser.add_argument("-j", "--input_file_2", dest="inFile_2", help="VCF/BED file for the variants in the sample two")
pairparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
pairparser.add_argument("-o", "--output", dest="outFile", help="output json file")
pairparser.set_defaults(func=snpmatch_paircomparions)

makedbparser = subparsers.add_parser('makedb', help="Create database files from given VCF, only give biallelic SNPs")
makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains.")
makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains. You can also provide a CSV file which is an intermediate file in the process.")
makedbparser.add_argument("-p", "--bcftools_path", dest="bcfpath", help="path to the bcftools executable. Not necessary if present in BASH PATH", default='')
makedbparser.add_argument("-o", "--out_db_id", dest="db_id", help="output id for database files")
makedbparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
makedbparser.set_defaults(func=makedb_vcf_to_hdf5)

simparser = subparsers.add_parser('simulate', help="Given SNP database, check the genotyping efficiency randomly selecting 'n' number of SNPs")
simparser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
simparser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
Expand Down Expand Up @@ -122,7 +127,8 @@ def snpmatch_parser(args):
if not args['outFile']:
if os.path.isfile(args['inFile'] + ".snpmatch.npz"):
os.remove(args['inFile'] + ".snpmatch.npz")
snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
from snpmatch.core import parsers
parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])

def genotype_cross(args):
#checkARGs(args)
Expand Down
21 changes: 10 additions & 11 deletions snpmatch/core/csmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
import StringIO
import snpmatch
import parsers
import json
import itertools

Expand Down Expand Up @@ -92,10 +93,8 @@ def crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
if(len(TarGTs0[j:j+chunk_size]) > 1):
if(len(TarGTs0[j:j+chunk_size]) >= 1):
NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
elif(len(TarGTs0[j:j+chunk_size]) == 1):
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
TotScoreList = TotScoreList + ScoreList
TotNumInfoSites = TotNumInfoSites + NumInfoSites
writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites)
Expand Down Expand Up @@ -197,7 +196,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
score = 0
numinfo = 0
NumMatSNPs = 0
for ind,echr in enumerate(snpmatch.parseChrName(GenotypeData.chrs)[0]):
for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
perchrTarPos = np.where(snpCHR == echr)[0]
perchrtarSNPpos = snpPOS[perchrTarPos]
start = GenotypeData.chr_regions[ind][0]
Expand All @@ -223,7 +222,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
crossInterpreter(GenotypeData, binLen, outID)

def potatoCrossIdentifier(args):
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
log.info("loading genotype files!")
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
Expand Down Expand Up @@ -252,7 +251,7 @@ def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706):

def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, binLen, outFile, logDebug = True):
## inFile are the SNPs of the sample
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = inFile, logDebug = logDebug)
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = inFile, logDebug = logDebug)
# identifying the segregating SNPs between the accessions
# only selecting 0 or 1
segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
Expand All @@ -272,7 +271,7 @@ def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, b
matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
matchedTarGTs = snpGT[matchedTarInd]
try:
TarGTBinary = snpmatch.parseGT(matchedTarGTs)
TarGTBinary = parsers.parseGT(matchedTarGTs)
TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4
genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd])
genP1no = len(np.where(genP1 == 0)[0])
Expand All @@ -299,8 +298,8 @@ def crossGenotyper(args):
log.info("input files: %s and %s" % (args['parents'], args['father']))
if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
(p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
(p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
(p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
(p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile = args['father'], logDebug = args['logDebug'])
commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
commonSNPsPOS = np.zeros(0, dtype=int)
Expand All @@ -316,8 +315,8 @@ def crossGenotyper(args):
perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
snpsP1 = np.append(snpsP1, parsers.parseGT(p1snpGT[perchrsnpsP1_inds]))
snpsP2 = np.append(snpsP2, parsers.parseGT(p2snpGT[perchrsnpsP2_inds]))
log.info("done!")
else:
parents = args['parents']
Expand Down
2 changes: 1 addition & 1 deletion snpmatch/core/makedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def makedb_from_vcf(args):
log.info('done!')
elif inType == '.csv':
log.info("converting CSV to hdf5!")
makeHDF5s(args['db_id'] + '.csv', args['db_id'])
makeHDF5s(args['inFile'], args['db_id'])
log.info('done!')
else:
die("please provide either a VCF file or a CSV!")
125 changes: 125 additions & 0 deletions snpmatch/core/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import pandas as pd
import numpy as np
import allel
import snpmatch
import logging
import os
import json

log = logging.getLogger(__name__)

def parseGT(snpGT):
first = snpGT[0]
snpBinary = np.zeros(len(snpGT), dtype = "int8")
if first.find('|') != -1:
## GT is phased
separator = "|"
elif first.find('/') != -1:
## GT is not phased
separator = "/"
elif np.char.isdigit(first):
return np.array(np.copy(snpGT), dtype = "int8")
else:
snpmatch.die("unable to parse the format of GT in vcf!")
hetGT = "0" + separator + "1"
refGT = "0" + separator + "0"
altGT = "1" + separator + "1"
nocall = "." + separator + "."
snpBinary[np.where(snpGT == altGT)[0]] = 1
snpBinary[np.where(snpGT == hetGT)[0]] = 2
snpBinary[np.where(snpGT == nocall)[0]] = -1
return snpBinary

def parseChrName(targetCHR):
snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "")
snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast
snpCHR = snpCHROM[snpsREQ]
return (snpCHR, snpsREQ)

def readBED(inFile, logDebug):
log.info("reading the position file")
targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2])
(snpCHR, snpsREQ) = parseChrName(targetSNPs[0])
snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ]
snpGT = np.array(targetSNPs[2])[snpsREQ]
snpBinary = parseGT(snpGT)
snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het
snpWEI[np.where(snpBinary != 0),0] = 0
snpWEI[np.where(snpBinary != 1),2] = 0
snpWEI[np.where(snpBinary != 2),1] = 0
return (snpCHR, snpPOS, snpGT, snpWEI)

def readVcf(inFile, logDebug):
log.info("reading the VCF file")
## We read only one sample from the VCF file
if logDebug:
vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
else:
import StringIO
import sys
sys.stderr = StringIO.StringIO()
vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
#vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
#vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
sys.stderr = sys.__stderr__
(snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
try:
snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
except AttributeError:
snpmatch.die("input VCF file doesnt have required GT field")
snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
if 'calldata/PL' in sorted(vcf.keys()):
snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
snpWEI = snpWEI/(-10)
snpWEI = np.exp(snpWEI)

else:
snpBinary = parseGT(snpGT)
snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het
snpWEI[np.where(snpBinary != 0),0] = 0
snpWEI[np.where(snpBinary != 1),2] = 0
snpWEI[np.where(snpBinary != 2),1] = 0
snpCHR = snpCHR[snpsREQ]
DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
snpPOS = np.array(vcf['variants/POS'][snpsREQ])
return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)

def parseInput(inFile, logDebug, outFile = "parser"):
if outFile == "parser" or not outFile:
outFile = inFile + ".snpmatch"
if os.path.isfile(inFile + ".snpmatch.npz"):
log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz")
snps = np.load(inFile + ".snpmatch.npz")
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
else:
_,inType = os.path.splitext(inFile)
if inType == '.npz':
log.info("loading snpmatch parser file! %s", inFile)
snps = np.load(inFile)
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
else:
log.info('running snpmatch parser!')
if inType == '.vcf':
(DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
elif inType == '.bed':
(snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
DPmean = "NA"
else:
snpmatch.die("input file type %s not supported" % inType)
log.info("creating snpmatch parser file: %s", outFile + '.npz')
np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
NumSNPs = len(snpCHR)
case = 0
note = "Sufficient number of SNPs"
if NumSNPs < snpmatch.snp_thres:
note = "Attention: low number of SNPs provided"
case = 1
snpst = np.unique(snpCHR, return_counts=True)
snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0])))
statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT)
with open(outFile + ".stats.json", "w") as out_stats:
out_stats.write(json.dumps(statdict))
log.info("done!")
return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)
Loading

0 comments on commit 01df5ec

Please sign in to comment.