diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py index 6260270..ac19083 100644 --- a/snpmatch/__init__.py +++ b/snpmatch/__init__.py @@ -128,7 +128,7 @@ def snpmatch_parser(args): if os.path.isfile(args['inFile'] + ".snpmatch.npz"): os.remove(args['inFile'] + ".snpmatch.npz") from snpmatch.core import parsers - parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile']) + parsers.potatoParser(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile']) def genotype_cross(args): #checkARGs(args) diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py index 6779a5b..ef90750 100644 --- a/snpmatch/core/csmatch.py +++ b/snpmatch/core/csmatch.py @@ -5,10 +5,9 @@ import numpy.ma import scipy.stats as st from pygwas.core import genotype -import pandas +import pandas as pd import logging import os -import StringIO import snpmatch import parsers import json @@ -18,225 +17,242 @@ # Arabidopsis chromosome length chrlen = np.array((30427671, 19698289, 23459830, 18585056, 26975502, 154478, 154478)) +tair_chrs = ['1', '2', '3', '4', '5'] -def getBins(g, binLen): - binLen = int(binLen) - ChrIndex = np.zeros(0,dtype="int8") - LenBins = np.zeros(0, dtype="int16") - for i in range(len(g.chrs)): - start = g.chr_regions[i][0] - end = g.chr_regions[i][1] - chr_pos = g.positions[start:end] - for j in range(0, chrlen[i], binLen): - bin_pos = len(np.where((chr_pos >= j) & (chr_pos < j + binLen))[0]) - LenBins = np.append(LenBins, bin_pos) - ChrIndex = np.append(ChrIndex, g.chrs[i]) - return(ChrIndex, LenBins) +def get_bins_echr(real_chrlen, chr_pos, binLen, rel_ix): + ind = 0 + for t in range(1, real_chrlen, binLen): + result = [] + bin_bed = [int(t), int(t) + binLen - 1] + for epos in chr_pos[ind:]: + if epos >= bin_bed[0]: + if epos <= bin_bed[1]: + result.append(ind + rel_ix) + elif epos > bin_bed[1] : + yield((bin_bed, result)) + break + ind = ind + 1 -def getBinsSNPs(g_chrs, g_snppos, binLen): +def get_bins_genome(g, binLen): binLen = int(binLen) - ChrIndex = np.zeros(0,dtype="int8") - LenBins = np.zeros(0, dtype="int16") - chrs = np.unique(g_chrs) - for i in range(len(chrs)): - chr_pos = g_snppos[np.where(g_chrs == chrs[i])[0]] - for j in range(0, chrlen[i], binLen): - bin_pos = len(np.where((chr_pos >= j) & (chr_pos < j + binLen))[0]) - LenBins = np.append(LenBins, bin_pos) - ChrIndex = np.append(ChrIndex, chrs[i]) - return(ChrIndex, LenBins) + g_chrs = np.char.replace(np.core.defchararray.lower(np.array(g.chrs[0:len(tair_chrs)], dtype="string")), "chr", "") + if len(g_chrs) > 7: + snpmatch.die("Please change the genome sizes in csmatch module") + if not np.array_equal(g_chrs, np.array(tair_chrs)): + snpmatch.die("Please change the genome sizes in csmatch module") + for chr_ix in range(len(g_chrs)): + start = g.chr_regions[chr_ix][0] + end = g.chr_regions[chr_ix][1] + chr_pos = g.positions[start:end] + echr_bins = get_bins_echr(chrlen[chr_ix], chr_pos, binLen, start) + for e_bin in echr_bins: + yield((chr_ix, e_bin[0], e_bin[1])) -def writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites): - num_lines = len(GenotypeData.accessions) - (likeliScore, likeliHoodRatio) = snpmatch.calculate_likelihoods(ScoreList, NumInfoSites) - if len(likeliScore) > 0: - NumAmb = np.where(likeliHoodRatio < snpmatch.lr_thres)[0] - if len(NumAmb) >= 1 and len(NumAmb) < num_lines: - try: - nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio > snpmatch.lr_thres)[0]]) - except: - nextLikeli = 1 - for k in NumAmb: - score = float(ScoreList[k])/NumInfoSites[k] - out_file.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), i+1)) +def get_bins_arrays(g_chrs, g_snppos, binLen): + g_chrs = np.char.replace(np.core.defchararray.lower(np.array(g_chrs, dtype="string")), "chr", "") + g_chrs_uq = np.unique(g_chrs) + g_chrs_uq = np.char.replace(np.core.defchararray.lower(np.array(g_chrs_uq[0:len(tair_chrs)], dtype="string")), "chr", "") + if len(g_chrs_uq) > 7: + snpmatch.die("Please change the genome sizes in csmatch module") + if not np.array_equal(g_chrs_uq, np.array(tair_chrs)): + snpmatch.die("Please change the genome sizes in csmatch module") + for chr_ix in range(len(g_chrs_uq)): + chr_pos_ix = np.where(g_chrs == g_chrs_uq[chr_ix])[0] + echr_bins = get_bins_echr(chrlen[chr_ix], g_snppos[chr_pos_ix], binLen, chr_pos_ix[0]) + for e_bin in echr_bins: + yield((chr_ix, e_bin[0], e_bin[1])) -def crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile): - NumSNPs = len(snpCHR) - num_lines = len(GenotypeData.accessions) - NumMatSNPs = 0 - chunk_size = 1000 - TotScoreList = np.zeros(num_lines, dtype="uint32") - TotNumInfoSites = np.zeros(num_lines, dtype="uint32") - (ChrBins, PosBins) = getBins(GenotypeData, binLen) - out_file = open(outFile, 'w') - for i in range(len(PosBins)): - start = np.sum(PosBins[0:i]) - end = start + PosBins[i] - pos = GenotypeData.positions[start:end] - perchrTarPos = np.where(snpCHR == ChrBins[i])[0] - perchrtarSNPpos = snpPOS[perchrTarPos] - matchedAccInd = np.where(np.in1d(pos, perchrtarSNPpos))[0] + start - matchedTarInd = np.where(np.in1d(perchrtarSNPpos, pos))[0] - matchedTarWei = snpWEI[perchrTarPos[matchedTarInd],] - TarGTs0 = np.zeros(len(matchedTarInd), dtype="int8") - TarGTs1 = np.ones(len(matchedTarInd), dtype="int8") + 1 - TarGTs2 = np.ones(len(matchedTarInd), dtype="int8") - NumMatSNPs = NumMatSNPs + len(matchedAccInd) - ScoreList = np.zeros(num_lines, dtype="uint32") - NumInfoSites = np.zeros(num_lines, dtype="uint32") - for j in range(0, len(matchedAccInd), chunk_size): - t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:] - samSNPs0 = np.reshape(np.repeat(TarGTs0[j:j+chunk_size], num_lines), (len(TarGTs0[j:j+chunk_size]),num_lines)) - samSNPs1 = np.reshape(np.repeat(TarGTs1[j:j+chunk_size], num_lines), (len(TarGTs1[j:j+chunk_size]),num_lines)) - samSNPs2 = np.reshape(np.repeat(TarGTs2[j:j+chunk_size], num_lines), (len(TarGTs2[j:j+chunk_size]),num_lines)) - tempScore0 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs0, dtype=int).T, matchedTarWei[j:j+chunk_size,0]).T, axis=0) - tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0) - tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0) - ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2 - if(len(TarGTs0[j:j+chunk_size]) >= 1): - NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) - TotScoreList = TotScoreList + ScoreList - TotNumInfoSites = TotNumInfoSites + NumInfoSites - writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites) - if i % 50 == 0: - log.info("Done analysing %s positions", NumMatSNPs) - out_file.close() - return (TotScoreList, TotNumInfoSites, NumMatSNPs) + +def writeBinData(out_file, e_g, GenotypeData, ScoreList, NumInfoSites): + num_lines = len(GenotypeData.accessions) + (likeliScore, likeliHoodRatio) = snpmatch.GenotyperOutput.calculate_likelihoods(ScoreList, NumInfoSites) + bin_bed = str(GenotypeData.chrs[e_g[0]]) + ',' + str(e_g[1][0]) + ',' + str(e_g[1][1]) + if len(likeliScore) > 0: + NumAmb = np.where(likeliHoodRatio < snpmatch.lr_thres)[0] + if len(NumAmb) >= 1 and len(NumAmb) < num_lines: + try: + nextLikeli = np.nanmin(likeliHoodRatio[np.where(likeliHoodRatio > snpmatch.lr_thres)[0]]) + except: + nextLikeli = 1 + for k in NumAmb: + score = float(ScoreList[k])/NumInfoSites[k] + out_file.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (GenotypeData.accessions[k], int(ScoreList[k]), NumInfoSites[k], score, likeliScore[k], nextLikeli, len(NumAmb), bin_bed)) + +def crossWindower(inputs, GenotypeData, binLen, outFile): + inputs.filter_chr_names(GenotypeData) + num_lines = len(GenotypeData.accessions) + NumMatSNPs = 0 + chunk_size = 1000 + TotScoreList = np.zeros(num_lines, dtype="uint32") + TotNumInfoSites = np.zeros(num_lines, dtype="uint32") + iter_bins_genome = get_bins_genome(GenotypeData, binLen) + iter_bins_snps = get_bins_arrays(inputs.chrs, inputs.pos, binLen) + out_file = open(outFile, 'w') + bin_inds = 0 + winds_chrs = np.zeros(0, dtype = GenotypeData.chrs.dtype) + for e_g, e_s in zip(iter_bins_genome, iter_bins_snps): + g_bin_pos = GenotypeData.positions[e_g[2]] + perchrtarSNPpos = inputs.pos[e_s[2]] + matchedAccInd = np.array(e_g[2], dtype=int)[np.where(np.in1d(g_bin_pos, perchrtarSNPpos))[0]] + matchedTarInd = np.array(e_s[2], dtype=int)[np.where(np.in1d(perchrtarSNPpos, g_bin_pos))[0]] + matchedTarWei = inputs.wei[matchedTarInd,] + TarGTs0 = np.zeros(len(matchedTarInd), dtype="int8") + TarGTs1 = np.ones(len(matchedTarInd), dtype="int8") + 1 + TarGTs2 = np.ones(len(matchedTarInd), dtype="int8") + NumMatSNPs = NumMatSNPs + len(matchedAccInd) + ScoreList = np.zeros(num_lines, dtype="uint32") + NumInfoSites = np.zeros(num_lines, dtype="uint32") + for j in range(0, len(matchedAccInd), chunk_size): + t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:] + samSNPs0 = np.reshape(np.repeat(TarGTs0[j:j+chunk_size], num_lines), (len(TarGTs0[j:j+chunk_size]),num_lines)) + samSNPs1 = np.reshape(np.repeat(TarGTs1[j:j+chunk_size], num_lines), (len(TarGTs1[j:j+chunk_size]),num_lines)) + samSNPs2 = np.reshape(np.repeat(TarGTs2[j:j+chunk_size], num_lines), (len(TarGTs2[j:j+chunk_size]),num_lines)) + tempScore0 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs0, dtype=int).T, matchedTarWei[j:j+chunk_size,0]).T, axis=0) + tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0) + tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0) + ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2 + if(len(TarGTs0[j:j+chunk_size]) >= 1): + NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) + TotScoreList = TotScoreList + ScoreList + TotNumInfoSites = TotNumInfoSites + NumInfoSites + writeBinData(out_file, e_g, GenotypeData, ScoreList, NumInfoSites) + bin_inds += 1 + winds_chrs = np.append( winds_chrs, inputs.chr_list[e_g[0]] ) + if bin_inds % 50 == 0: + log.info("Done analysing %s positions", NumMatSNPs) + out_file.close() + overlap = float(NumMatSNPs)/len(inputs.filter_inds_ix) + result = snpmatch.GenotyperOutput(GenotypeData.accessions, TotScoreList, TotNumInfoSites, overlap, NumMatSNPs, inputs.dp) + result.winds_chrs = winds_chrs + return(result) def getHomoWindows(likeLiwind): - snp_thres_wind = np.nanmean(likeLiwind[2]) - np.std(likeLiwind[2]) - x_info = np.unique(likeLiwind[7]) - homo_wind = np.zeros(0, dtype = "int") - for i in x_info: - eWinds = likeLiwind.iloc[np.where(likeLiwind[7] == i)[0]] - if np.nanmean(eWinds[3]) > snpmatch.prob_thres and np.nanmean(eWinds[2]) > snp_thres_wind: - homo_wind = np.append(homo_wind, i) - return homo_wind + snp_thres_wind = np.nanmean(likeLiwind[2]) - np.std(likeLiwind[2]) + x_info = np.unique(likeLiwind[7]) + homo_wind = np.zeros(0, dtype = "int") + for i in x_info: + eWinds = likeLiwind.iloc[np.where(likeLiwind[7] == i)[0]] + if np.nanmean(eWinds[3]) > snpmatch.prob_thres and np.nanmean(eWinds[2]) > snp_thres_wind: + homo_wind = np.append(homo_wind, i) + return(homo_wind) -def crossInterpreter(GenotypeData, binLen, outID): - ## ScoreFile should be one from crossF1genotyper - ## Output file is from the crossIdentifier - cs_thres = 0.9 - outFile = outID + '.windowscore.txt' - scoreFile = outID + '.scores.txt' - log.info("running cross interpreter!") - num_lines = len(GenotypeData.accessions) - likeLiwind = pandas.read_table(outFile, header=None) - ScoreAcc = pandas.read_table(scoreFile, header=None) - topHitsDict = json.load(open(scoreFile + ".matches.json", 'r')) - if topHitsDict['interpretation']['case'] == 3: - homo_wind = getHomoWindows(likeLiwind) - homo_acc = np.unique(likeLiwind[0][np.where(np.in1d(likeLiwind[7], homo_wind))[0]],return_counts=True) - matches_dict = [(homo_acc[0][i].astype("string"), homo_acc[1][i]) for i in np.argsort(-homo_acc[1])] - topHitsDict['matches'] = matches_dict - f1matches = ScoreAcc.iloc[~np.in1d(ScoreAcc[0], GenotypeData.accessions)].reset_index() - topMatch = np.argsort(f1matches[5])[0] ## top F1 match sorted based on likelihood - if f1matches[3][topMatch] > cs_thres: - mother = f1matches[0][topMatch].split("x")[0] - father = f1matches[0][topMatch].split("x")[1] - topHitsDict['interpretation']['text'] = "Sample may be a F1! or a contamination!" - topHitsDict['interpretation']['case'] = 5 - topHitsDict['parents'] = {'mother': [mother,1], 'father': [father,1]} - topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}} - else: - (ChrBins, PosBins) = getBins(GenotypeData, binLen) - ## Get exactly the homozygous windows with one count - clean = np.unique(likeLiwind[0][np.where(likeLiwind[6] == 1)[0]], return_counts = True) - if len(clean[0]) > 0: ## Check if there are atlease one homozygous window - parents = clean[0][np.argsort(-clean[1])[0:2]].astype("string") - parents_counts = clean[1][np.argsort(-clean[1])[0:2]].astype("int") - xdict = np.array(np.unique(likeLiwind[7]), dtype="int") - ydict = np.repeat("NA", len(xdict)).astype("a25") - if len(parents) == 1: - topHitsDict['interpretation']['text'] = "Sample may be a F2! but only one parent found!" - topHitsDict['interpretation']['case'] = 6 - topHitsDict['parents'] = {'mother': [parents[0], parents_counts[0]], 'father': ["NA", "NA"]} - par1_ind = likeLiwind[7][np.where((likeLiwind[0].astype("string") == parents[0]) & np.in1d(likeLiwind[7], homo_wind))[0]] - ydict[np.where(np.in1d(xdict,par1_ind))[0]] = parents[0] +def crossInterpreter(snpmatch_result, GenotypeData, binLen, outID): + ## ScoreFile should be one from crossF1genotyper + ## Output file is from the crossIdentifier + cs_thres = 0.9 + outFile = outID + '.windowscore.txt' + scoreFile = outID + '.scores.txt' + log.info("running cross interpreter!") + likeLiwind = pandas.read_table(outFile, header=None) + ScoreAcc = pandas.read_table(scoreFile, header=None) + topHitsDict = json.load(open(scoreFile + ".matches.json", 'r')) + if topHitsDict['interpretation']['case'] == 3: + homo_wind = getHomoWindows(likeLiwind) + homo_acc = np.unique(likeLiwind[0][np.where(np.in1d(likeLiwind[7], homo_wind))[0]],return_counts=True) + matches_dict = [(homo_acc[0][i].astype("string"), homo_acc[1][i]) for i in np.argsort(-homo_acc[1])] + topHitsDict['matches'] = matches_dict + f1matches = ScoreAcc.iloc[~np.in1d(ScoreAcc[0], GenotypeData.accessions)].reset_index() + topMatch = np.argsort(f1matches[5])[0] ## top F1 match sorted based on likelihood + if f1matches[3][topMatch] > cs_thres: + mother = f1matches[0][topMatch].split("x")[0] + father = f1matches[0][topMatch].split("x")[1] + topHitsDict['interpretation']['text'] = "Sample may be a F1! or a contamination!" + topHitsDict['interpretation']['case'] = 5 + topHitsDict['parents'] = {'mother': [mother,1], 'father': [father,1]} + topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}} else: - topHitsDict['interpretation']['text'] = "Sample may be a F2!" - topHitsDict['interpretation']['case'] = 6 - topHitsDict['parents'] = {'mother': [parents[0], parents_counts[0]], 'father': [parents[1], parents_counts[1]]} - NumChrs = np.unique(ChrBins, return_counts=True) - chr_bins = dict(('Chr%s' % NumChrs[0][i], NumChrs[1][i]) for i in range(len(NumChrs[0]))) - par1_ind = np.array(likeLiwind[7][np.where((likeLiwind[0].astype("string") == parents[0]) & np.in1d(likeLiwind[7], homo_wind))[0]]) - par2_ind = np.array(likeLiwind[7][np.where((likeLiwind[0].astype("string") == parents[1]) & np.in1d(likeLiwind[7], homo_wind))[0]]) - ydict[np.where(np.in1d(xdict,par1_ind))[0]] = parents[0] - ydict[np.where(np.in1d(xdict,par2_ind))[0]] = parents[1] - xdict = xdict.tolist() - ydict = ydict.tolist() - topHitsDict['genotype_windows'] = {'chr_bins': chr_bins, 'coordinates': {'x': xdict, 'y': ydict}} - else: ## No homozygous window found! - topHitsDict['interpretation']['case'] = 7 - topHitsDict['interpretation']['text'] = "Sample may just be contamination!" - topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}} - topHitsDict['parents'] = {'mother': [None,0], 'father': [None,1]} - with open(outID + ".matches.json", "w") as out_stats: - out_stats.write(json.dumps(topHitsDict)) + ## Get exactly the homozygous windows with one count + clean = np.unique(likeLiwind[0][np.where(likeLiwind[6] == 1)[0]], return_counts = True) + if len(clean[0]) > 0: ## Check if there are atlease one homozygous window + parents = clean[0][np.argsort(-clean[1])[0:2]].astype("string") + parents_counts = clean[1][np.argsort(-clean[1])[0:2]].astype("int") + xdict = np.array(np.unique(likeLiwind[7]), dtype="int") + ydict = np.repeat("NA", len(xdict)).astype("a25") + if len(parents) == 1: + topHitsDict['interpretation']['text'] = "Sample may be a F2! but only one parent found!" + topHitsDict['interpretation']['case'] = 6 + topHitsDict['parents'] = {'mother': [parents[0], parents_counts[0]], 'father': ["NA", "NA"]} + par1_ind = likeLiwind[7][np.where((likeLiwind[0].astype("string") == parents[0]) & np.in1d(likeLiwind[7], homo_wind))[0]] + ydict[np.where(np.in1d(xdict,par1_ind))[0]] = parents[0] + else: + topHitsDict['interpretation']['text'] = "Sample may be a F2!" + topHitsDict['interpretation']['case'] = 6 + topHitsDict['parents'] = {'mother': [parents[0], parents_counts[0]], 'father': [parents[1], parents_counts[1]]} + NumChrs = np.unique(snpmatch_result.winds_chrs, return_counts=True) + chr_bins = dict(( NumChrs[0][i], NumChrs[1][i]) for i in range(len(NumChrs[0]))) + par1_ind = np.array(likeLiwind[7][np.where((likeLiwind[0].astype("string") == parents[0]) & np.in1d(likeLiwind[7], homo_wind))[0]]) + par2_ind = np.array(likeLiwind[7][np.where((likeLiwind[0].astype("string") == parents[1]) & np.in1d(likeLiwind[7], homo_wind))[0]]) + ydict[np.where(np.in1d(xdict,par1_ind))[0]] = parents[0] + ydict[np.where(np.in1d(xdict,par2_ind))[0]] = parents[1] + xdict = xdict.tolist() + ydict = ydict.tolist() + topHitsDict['genotype_windows'] = {'chr_bins': chr_bins, 'coordinates': {'x': xdict, 'y': ydict}} + else: ## No homozygous window found! + topHitsDict['interpretation']['case'] = 7 + topHitsDict['interpretation']['text'] = "Sample may just be contamination!" + topHitsDict['genotype_windows'] = {'chr_bins': None, 'coordinates': {'x': None, 'y': None}} + topHitsDict['parents'] = {'mother': [None,0], 'father': [None,1]} + with open(outID + ".matches.json", "w") as out_stats: + out_stats.write(json.dumps(topHitsDict)) -def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, outID): - ## Get tophit accessions - # sorting based on the final scores - if not outID: - outID = "cross.identifier" - outFile = outID + '.windowscore.txt' - scoreFile = outID + '.scores.txt' - NumSNPs = len(snpCHR) - num_lines = len(GenotypeData.accessions) - (ScoreList, NumInfoSites, NumMatSNPs) = crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile) - probScore = [float(ScoreList[i])/NumInfoSites[i] for i in range(num_lines)] - probScore = np.array(probScore, dtype = float) - snpmatch.print_topHits(scoreFile + ".matches.json", GenotypeData.accessions, ScoreList, NumInfoSites, float(NumMatSNPs)/NumSNPs, NumMatSNPs) - log.info("simulating F1s for top 10 accessions") - Accessions = np.copy(GenotypeData.accessions) - TopHitAccs = np.argsort(-probScore)[0:10] - for (i, j) in itertools.combinations(TopHitAccs, 2): - p1 = GenotypeData_acc.snps[:,i] - p2 = GenotypeData_acc.snps[:,j] - score = 0 - numinfo = 0 - NumMatSNPs = 0 - for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]): - perchrTarPos = np.where(snpCHR == echr)[0] - perchrtarSNPpos = snpPOS[perchrTarPos] - start = GenotypeData.chr_regions[ind][0] - end = GenotypeData.chr_regions[ind][1] - chrpositions = GenotypeData.positions[start:end] - matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start - matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0] - NumMatSNPs = NumMatSNPs + len(matchedTarInd) - gtp1 = p1[matchedAccInd] - gtp2 = p2[matchedAccInd] - matchedTarWEI = snpWEI[perchrTarPos[matchedTarInd],] - homalt = np.where((gtp1 == 1) & (gtp2 == 1))[0] - homref = np.where((gtp1 == 0) & (gtp2 == 0))[0] - het = np.where((gtp1 != -1) & (gtp2 != -1) & (gtp1 != gtp2))[0] - score = score + np.sum(matchedTarWEI[homalt, 2]) + np.sum(matchedTarWEI[homref, 0]) + np.sum(matchedTarWEI[het, 1]) - numinfo = numinfo + len(homalt) + len(homref) + len(het) - ScoreList = np.append(ScoreList, score) - NumInfoSites = np.append(NumInfoSites, numinfo) - acc = GenotypeData.accessions[i] + "x" + GenotypeData.accessions[j] - Accessions = np.append(Accessions, acc) - log.info("writing output!") - snpmatch.print_out_table(scoreFile, Accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean) - crossInterpreter(GenotypeData, binLen, outID) +def crossIdentifier(inputs, GenotypeData, GenotypeData_acc, binLen, outID): + ## Get tophit accessions + # sorting based on the final scores + inputs.filter_chr_names(GenotypeData) + if not outID: + outID = "cross.identifier" + outFile = outID + '.windowscore.txt' + scoreFile = outID + '.scores.txt' + snpmatch_result = crossWindower(inputs, GenotypeData, binLen, outFile) + snpmatch_result.print_json_output( scoreFile + ".matches.json" ) + log.info("simulating F1s for top 10 accessions") + TopHitAccs = np.argsort(-snpmatch_result.probabilies)[0:10] + for (i, j) in itertools.combinations(TopHitAccs, 2): + p1 = GenotypeData_acc.snps[:,i] + p2 = GenotypeData_acc.snps[:,j] + score = 0 + numinfo = 0 + NumMatSNPs = 0 + for ind,echr in enumerate(inputs.chr_list): + perchrTarPos = np.where(inputs.chrs_nochr == echr)[0] + perchrtarSNPpos = inputs.pos[perchrTarPos] + start = GenotypeData.chr_regions[ind][0] + end = GenotypeData.chr_regions[ind][1] + chrpositions = GenotypeData.positions[start:end] + matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start + matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0] + NumMatSNPs = NumMatSNPs + len(matchedTarInd) + gtp1 = p1[matchedAccInd] + gtp2 = p2[matchedAccInd] + matchedTarWEI = inputs.wei[perchrTarPos[matchedTarInd],] + homalt = np.where((gtp1 == 1) & (gtp2 == 1))[0] + homref = np.where((gtp1 == 0) & (gtp2 == 0))[0] + het = np.where((gtp1 != -1) & (gtp2 != -1) & (gtp1 != gtp2))[0] + score = score + np.sum(matchedTarWEI[homalt, 2]) + np.sum(matchedTarWEI[homref, 0]) + np.sum(matchedTarWEI[het, 1]) + numinfo = numinfo + len(homalt) + len(homref) + len(het) + snpmatch_result.scores = np.append(snpmatch_result.scores, score) + snpmatch_result.ninfo = np.append(snpmatch_result.ninfo, numinfo) + snpmatch_result.accs = np.append( snpmatch_result.accs, GenotypeData.accessions[i] + "x" + GenotypeData.accessions[j] ) + log.info("writing output!") + snpmatch_result.print_out_table( scoreFile ) + crossInterpreter(snpmatch_result, GenotypeData, binLen, outID) def potatoCrossIdentifier(args): - (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug']) - log.info("loading genotype files!") - GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) - GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) - log.info("done!") - log.info("running cross identifier!") - crossIdentifier(args['binLen'],snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, GenotypeData_acc, args['outFile']) - log.info("finished!") + inputs = parsers.ParseInputs(inFile = args['inFile'], logDebug = args['logDebug']) + log.info("loading genotype files!") + GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File']) + GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile']) + log.info("done!") + log.info("running cross identifier!") + crossIdentifier(inputs, GenotypeData, GenotypeData_acc, args['binLen'], args['outFile']) + log.info("finished!") def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706): ### Choose lr_thres as 2.706 which is at 0.1 alpha level with 1 degree of freedom pval = 'NA' geno = 'NA' if totalMarkers > 0: - likelihood = snpmatch.calculate_likelihoods([matchedP1, totalMarkers - matchedP1], [totalMarkers, totalMarkers]) + likelihood = snpmatch.GenotyperOutput.calculate_likelihoods([matchedP1, totalMarkers - matchedP1], [totalMarkers, totalMarkers]) if np.max(likelihood[1]) > lr_thres: if likelihood[1][0] == 1: geno = 0 @@ -247,40 +263,39 @@ def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706): else: geno = 0.5 pval = np.max(likelihood[1]) - return (geno, pval) + return(geno, pval) def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, binLen, outFile, logDebug = True): ## inFile are the SNPs of the sample - (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = inFile, logDebug = logDebug) + inputs = parsers.ParseInputs(inFile = inFile, logDebug = logDebug) # identifying the segregating SNPs between the accessions # only selecting 0 or 1 segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0] log.info("number of segregating snps between parents: %s", len(segSNPsind)) - (ChrBins, PosBins) = getBinsSNPs(commonSNPsCHR, commonSNPsPOS, binLen) - log.info("number of bins: %s", len(ChrBins)) + iter_bins_genome = get_bins_arrays(commonSNPsCHR, commonSNPsPOS, binLen) + iter_bins_snps = get_bins_arrays(inputs.chrs, inputs.pos, binLen) + bin_inds = 0 outfile = open(outFile, 'w') - for i in range(len(PosBins)): - start = np.sum(PosBins[0:i]) - end = start + PosBins[i] - # first snp positions which are segregating and are in this window - reqPOSind = segSNPsind[np.where((segSNPsind < end) & (segSNPsind >= start))[0]] - reqPOS = commonSNPsPOS[reqPOSind] - perchrTarPosind = np.where(snpCHR == ChrBins[i])[0] - perchrTarPos = snpPOS[perchrTarPosind] - matchedAccInd = reqPOSind[np.where(np.in1d(reqPOS, perchrTarPos))[0]] - matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]] - matchedTarGTs = snpGT[matchedTarInd] - try: - TarGTBinary = parsers.parseGT(matchedTarGTs) - TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4 - genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd]) - genP1no = len(np.where(genP1 == 0)[0]) - (geno, pval) = getWindowGenotype(genP1no, len(genP1)) - outfile.write("%s\t%s\t%s\t%s\t%s\n" % (i+1, genP1no, len(genP1), geno, pval)) - except: - outfile.write("%s\tNA\tNA\tNA\tNA\n" % (i+1)) - if i % 40 == 0: - log.info("progress: %s windows", i+10) + for e_b in iter_bins_snps: + # first snp positions which are segregating and are in this window + reqPOSind = segSNPsind[ np.where( np.in1d(segSNPsind, e_b[2]) )[0] ] + reqPOS = commonSNPsPOS[reqPOSind] + perchrTarPos = inputs.pos[e_s[2]] + matchedAccInd = reqPOSind[ np.where( np.in1d(reqPOS, perchrTarPos) )[0] ] + matchedTarInd = np.array(e_s[2], dtype=int)[ np.where( np.in1d(perchrTarPos, reqPOS) )[0] ] + matchedTarGTs = inputs.gt[matchedTarInd] + try: + TarGTBinary = parsers.parseGT(matchedTarGTs) + TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4 + genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd]) + genP1no = len(np.where(genP1 == 0)[0]) + (geno, pval) = getWindowGenotype(genP1no, len(genP1)) + outfile.write("%s\t%s\t%s\t%s\t%s\n" % (bin_inds+1, genP1no, len(genP1), geno, pval)) + except: + outfile.write("%s\tNA\tNA\tNA\tNA\n" % (bin_inds+1)) + bin_inds += 1 + if bin_inds % 40 == 0: + log.info("progress: %s windows", bin_inds) log.info("done!") outfile.close() @@ -298,25 +313,25 @@ def crossGenotyper(args): log.info("input files: %s and %s" % (args['parents'], args['father'])) if not os.path.isfile(args['parents']) and os.path.isfile(args['father']): die("either of the input files do not exists, please provide VCF/BED file for parent genotype information") - (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile = args['parents'], logDebug = args['logDebug']) - (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile = args['father'], logDebug = args['logDebug']) - commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR) + p1_snps = parsers.ParseInputs(inFile = args['parents'], logDebug = args['logDebug']) + p2_snps = parsers.ParseInputs(inFile = args['father'], logDebug = args['logDebug']) + commonCHRs_ids = np.union1d(p1_snps.chrs, p2_snps.chrs) commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype) commonSNPsPOS = np.zeros(0, dtype=int) snpsP1 = np.zeros(0, dtype='int8') snpsP2 = np.zeros(0, dtype='int8') for i in commonCHRs_ids: - perchrP1inds = np.where(p1snpCHR == i)[0] - perchrP2inds = np.where(p2snpCHR == i)[0] - perchrPositions = np.union1d(p1snpPOS[perchrP1inds], p2snpPOS[perchrP2inds]) + perchrP1inds = np.where(p1_snps.chrs == i)[0] + perchrP2inds = np.where(p2_snps.chrs == i)[0] + perchrPositions = np.union1d(p1_snps.pos[perchrP1inds], p2_snps.pos[perchrP2inds]) commonSNPsCHR = np.append(commonSNPsCHR, np.repeat(i, len(perchrPositions))) commonSNPsPOS = np.append(commonSNPsPOS, perchrPositions) perchrsnpsP1 = np.repeat(-1, len(perchrPositions)).astype('int8') perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8') - perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0] - perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0] - snpsP1 = np.append(snpsP1, parsers.parseGT(p1snpGT[perchrsnpsP1_inds])) - snpsP2 = np.append(snpsP2, parsers.parseGT(p2snpGT[perchrsnpsP2_inds])) + perchrsnpsP1_inds = np.where(np.in1d(p1_snps.pos[perchrP1inds], perchrPositions))[0] + perchrsnpsP2_inds = np.where(np.in1d(p2_snps.pos[perchrP2inds], perchrPositions))[0] + snpsP1 = np.append(snpsP1, parsers.parseGT(p1_snps.gt[perchrsnpsP1_inds])) + snpsP2 = np.append(snpsP2, parsers.parseGT(p2_snps.gt[perchrsnpsP2_inds])) log.info("done!") else: parents = args['parents'] diff --git a/snpmatch/core/parsers.py b/snpmatch/core/parsers.py index 14c4560..53516b6 100644 --- a/snpmatch/core/parsers.py +++ b/snpmatch/core/parsers.py @@ -31,98 +31,116 @@ def parseGT(snpGT): snpBinary[np.where(snpGT == nocall)[0]] = -1 return snpBinary -def parseChrName(targetCHR): - snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "") - snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast - snpCHR = snpCHROM[snpsREQ] - return (snpCHR, snpsREQ) +class ParseInputs(object): + ## class object for parsing input files for SNPmatch -def readBED(inFile, logDebug): - log.info("reading the position file") - targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2]) - (snpCHR, snpsREQ) = parseChrName(targetSNPs[0]) - snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ] - snpGT = np.array(targetSNPs[2])[snpsREQ] - snpBinary = parseGT(snpGT) - snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het - snpWEI[np.where(snpBinary != 0),0] = 0 - snpWEI[np.where(snpBinary != 1),2] = 0 - snpWEI[np.where(snpBinary != 2),1] = 0 - return (snpCHR, snpPOS, snpGT, snpWEI) + def __init__(self, inFile, logDebug, outFile = "parser" ): + if outFile == "parser" or not outFile: + outFile = inFile + ".snpmatch" + if os.path.isfile(inFile + ".snpmatch.npz"): + log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz") + snps = np.load(inFile + ".snpmatch.npz") + self.load_snp_info(snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp']) + else: + _,inType = os.path.splitext(inFile) + if inType == '.npz': + log.info("loading snpmatch parser file! %s", inFile) + snps = np.load(inFile) + self.load_snp_info(snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp']) + else: + log.info('running snpmatch parser!') + if inType == '.vcf': + (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = self.read_vcf(inFile, logDebug) -def readVcf(inFile, logDebug): - log.info("reading the VCF file") - ## We read only one sample from the VCF file - if logDebug: - vcf = allel.read_vcf(inFile, samples = [0], fields = '*') - else: - import StringIO - import sys - sys.stderr = StringIO.StringIO() - vcf = allel.read_vcf(inFile, samples = [0], fields = '*') - #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray) - #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray) - sys.stderr = sys.__stderr__ - (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM']) - try: - snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0] - except AttributeError: - snpmatch.die("input VCF file doesnt have required GT field") - snpsREQ = snpsREQ[np.where(snpGT != './.')[0]] - snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0] - if 'calldata/PL' in sorted(vcf.keys()): - snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float') - snpWEI = snpWEI/(-10) - snpWEI = np.exp(snpWEI) - else: + elif inType == '.bed': + (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = self.read_bed(inFile, logDebug) + else: + snpmatch.die("input file type %s not supported" % inType) + self.load_snp_info(snpCHR, snpPOS, snpGT, snpWEI, DPmean) + self.save_snp_info(outFile) + self.case_interpret_inputs(outFile + ".stats.json") + log.info("done!") + + def load_snp_info(self, snpCHR, snpPOS, snpGT, snpWEI, DPmean): + self.chrs = snpCHR + self.pos = snpPOS + self.gt = snpGT + self.wei = snpWEI + self.dp = DPmean + + def save_snp_info(self, outFile): + log.info("creating snpmatch parser file: %s", outFile + '.npz') + np.savez(outFile, chr = self.chrs, pos = self.pos, gt = self.gt, wei = self.wei, dp = self.dp) + + def case_interpret_inputs(self, outFile): + NumSNPs = len(self.chrs) + case = 0 + note = "Sufficient number of SNPs" + if NumSNPs < snpmatch.snp_thres: + note = "Attention: low number of SNPs provided" + case = 1 + snpst = np.unique(self.chrs, return_counts=True) + snpdict = dict(('%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0]))) + statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": self.dp} + statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(self.gt) + with open(outFile , "w") as out_stats: + out_stats.write(json.dumps(statdict)) + + @staticmethod + def read_bed(inFile, logDebug): + log.info("reading the position file") + targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2]) + snpCHR = np.array(targetSNPs[0], dtype="string") + snpPOS = np.array(targetSNPs[1], dtype=int) + snpGT = np.array(targetSNPs[2]) snpBinary = parseGT(snpGT) - snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het + snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het snpWEI[np.where(snpBinary != 0),0] = 0 snpWEI[np.where(snpBinary != 1),2] = 0 snpWEI[np.where(snpBinary != 2),1] = 0 - snpCHR = snpCHR[snpsREQ] - if 'calldata/DP' in sorted(vcf.keys()): - DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0]) - else: - DPmean = "NA" - snpPOS = np.array(vcf['variants/POS'][snpsREQ]) - return (DPmean, snpCHR, snpPOS, snpGT, snpWEI) + return((snpCHR, snpPOS, snpGT, snpWEI, "NA")) -def parseInput(inFile, logDebug, outFile = "parser"): - if outFile == "parser" or not outFile: - outFile = inFile + ".snpmatch" - if os.path.isfile(inFile + ".snpmatch.npz"): - log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz") - snps = np.load(inFile + ".snpmatch.npz") - (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp']) - else: - _,inType = os.path.splitext(inFile) - if inType == '.npz': - log.info("loading snpmatch parser file! %s", inFile) - snps = np.load(inFile) - (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp']) + @staticmethod + def read_vcf(inFile, logDebug): + if logDebug: + vcf = allel.read_vcf(inFile, samples = [0], fields = '*') else: - log.info('running snpmatch parser!') - if inType == '.vcf': - (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug) - elif inType == '.bed': - (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug) - DPmean = "NA" - else: - snpmatch.die("input file type %s not supported" % inType) - log.info("creating snpmatch parser file: %s", outFile + '.npz') - np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean) - NumSNPs = len(snpCHR) - case = 0 - note = "Sufficient number of SNPs" - if NumSNPs < snpmatch.snp_thres: - note = "Attention: low number of SNPs provided" - case = 1 - snpst = np.unique(snpCHR, return_counts=True) - snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0]))) - statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean} - statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT) - with open(outFile + ".stats.json", "w") as out_stats: - out_stats.write(json.dumps(statdict)) - log.info("done!") - return (snpCHR, snpPOS, snpGT, snpWEI, DPmean) + import StringIO + import sys + sys.stderr = StringIO.StringIO() + vcf = allel.read_vcf(inFile, samples = [0], fields = '*') + sys.stderr = sys.__stderr__ + try: + snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[:, 0] + except AttributeError: + snpmatch.die("input VCF file doesnt have required GT field") + snpsREQ = np.where((snpGT != './.') & (snpGT != '.|.'))[0] + snpGT = snpGT[snpsREQ] + if 'calldata/PL' in sorted(vcf.keys()): + snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float') + snpWEI = snpWEI/(-10) + snpWEI = np.exp(snpWEI) + else: + snpBinary = parseGT(snpGT) + snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het + snpWEI[np.where(snpBinary != 0),0] = 0 + snpWEI[np.where(snpBinary != 1),2] = 0 + snpWEI[np.where(snpBinary != 2),1] = 0 + snpCHR = np.array(vcf['variants/CHROM'][snpsREQ], dtype="string") + if 'calldata/DP' in sorted(vcf.keys()): + DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0]) + else: + DPmean = "NA" + snpPOS = np.array(vcf['variants/POS'][snpsREQ]) + return((snpCHR, snpPOS, snpGT, snpWEI, DPmean)) + + def filter_chr_names(self, g): + ## provide genotypedata (pygwas genotype object) + self.chr_list = np.array(pd.Series(g.chrs).str.replace("chr", "", case=False), dtype="string") + self.chrs_nochr = np.array(pd.Series(self.chrs).str.replace("chr", "", case=False), dtype="string") + self.filter_inds_ix = np.where( np.in1d( self.chrs_nochr, self.chr_list ) )[0] + + +def potatoParser(inFile, logDebug, outFile = "parser"): + inputs = ParseInputs(inFile, logDebug, outFile) + return(inputs.chrs, inputs.pos, inputs.gt, inputs.wei, inputs.dp) diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py index 23f701d..beeff21 100644 --- a/snpmatch/core/snpmatch.py +++ b/snpmatch/core/snpmatch.py @@ -33,62 +33,79 @@ def likeliTest(n, y): else: return np.nan -def calculate_likelihoods(ScoreList, NumInfoSites): - num_lines = len(ScoreList) - LikeLiHoods = [likeliTest(NumInfoSites[i], int(ScoreList[i])) for i in range(num_lines)] - LikeLiHoods = np.array(LikeLiHoods).astype("float") - TopHit = np.amin(LikeLiHoods) - LikeLiHoodRatios = [LikeLiHoods[i]/TopHit for i in range(num_lines)] - LikeLiHoodRatios = np.array(LikeLiHoodRatios).astype("float") - return (LikeLiHoods, LikeLiHoodRatios) - -def print_out_table(outFile, AccList, ScoreList, NumInfoSites, NumMatSNPs, DPmean): - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) - if outFile: - out = open(outFile, 'w') - for i in range(len(AccList)): - score = float(ScoreList[i])/NumInfoSites[i] - out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (AccList[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) - out.close() - else: - for i in range(len(AccList)): - score = float(ScoreList[i])/NumInfoSites[i] - sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (AccList[i], int(ScoreList[i]), NumInfoSites[i], score, LikeLiHoods[i], LikeLiHoodRatios[i], NumMatSNPs, DPmean)) - -def CaseInterpreter(overlap, NumSNPs, topHits, probScore): - overlap_thres = 0.5 - num_lines = len(probScore) - case = 10 - if len(topHits) == 1: - case = 0 - note = "Unique hit" - elif np.nanmean(probScore[topHits]) > prob_thres: - case = 2 - note = "Ambiguous sample: Accessions in top hits can be really close" - elif overlap > overlap_thres: - case = 3 - note = "Ambiguous sample: Sample might contain mixture of DNA or contamination" - elif overlap < overlap_thres: - case = 4 - note = "Ambiguous sample: Overlap of SNPs is very low, sample may not be in database" - if case > 4: - case = 1 - note = "Ambiguous sample" - return (case, note) - -def print_topHits(outFile, AccList, ScoreList, NumInfoSites, overlap, NumMatSNPs): - num_lines = len(ScoreList) - (LikeLiHoods, LikeLiHoodRatios) = calculate_likelihoods(ScoreList, NumInfoSites) - topHits = np.where(LikeLiHoodRatios < lr_thres)[0] - probScore = [float(ScoreList[i])/NumInfoSites[i] for i in range(num_lines)] - overlapScore = [float(NumInfoSites[i])/NumMatSNPs for i in range(num_lines)] - probScore = np.array(probScore, dtype = float) - sorted_order = topHits[np.argsort(-probScore[topHits])] - (case, note) = CaseInterpreter(overlap, NumMatSNPs, topHits, probScore) - matches_dict = [(str(AccList[i]), probScore[i], int(NumInfoSites[i]), int(overlapScore[i])) for i in sorted_order] - topHitsDict = {'overlap': [overlap, NumMatSNPs], 'matches': matches_dict, 'interpretation':{'case': case, 'text': note}} - with open(outFile, "w") as out_stats: - out_stats.write(json.dumps(topHitsDict)) +class GenotyperOutput(object): + ## class object for main SNPmatch output + + def __init__(self, AccList, ScoreList, NumInfoSites, overlap, NumMatSNPs, DPmean ): + self.accs = np.array(AccList, dtype="string") + self.scores = np.array(ScoreList, dtype="int") + self.ninfo = np.array(NumInfoSites, dtype="int") + self.overlap = overlap + self.num_snps = NumMatSNPs + self.dp = DPmean + + def get_probabilities(self): + probs = [float(self.scores[i])/self.ninfo[i] for i in range(len(self.accs))] + self.probabilies = np.array(probs, dtype="float") + + @staticmethod + def calculate_likelihoods(scores, ninfo, amin = "calc"): + num_lines = len(scores) + LikeLiHoods = [likeliTest(ninfo[i], int(scores[i])) for i in range(num_lines)] + LikeLiHoods = np.array(LikeLiHoods, dtype = "float") + if amin == "calc": + TopHit = np.amin(LikeLiHoods) + else: + TopHit = float(amin) + LikeLiHoodRatios = [LikeLiHoods[i]/TopHit for i in range(num_lines)] + LikeLiHoodRatios = np.array(LikeLiHoodRatios, dtype="float") + return((LikeLiHoods, LikeLiHoodRatios)) + + def get_likelihoods(self, amin = "calc"): + (self.likelis, self.lrts) = self.calculate_likelihoods(self.scores, self.ninfo, amin) + + def print_out_table(self, outFile): + self.get_likelihoods() + self.get_probabilities() + if outFile: + out = open(outFile, 'w') + for i in range(len(self.accs)): + out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (self.accs[i], self.scores[i], self.ninfo[i], self.probabilies[i], self.likelis[i], self.lrts[i], self.num_snps, self.dp)) + out.close() + else: + for i in range(len(self.accs)): + sys.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (self.accs[i], self.scores[i], self.ninfo[i], self.probabilies[i], self.likelis[i], self.lrts[i], self.num_snps, self.dp)) + + def print_json_output(self, outFile): + self.get_likelihoods() + self.get_probabilities() + topHits = np.where(self.lrts < lr_thres)[0] + overlapScore = [float(self.ninfo[i])/self.num_snps for i in range(len(self.accs))] + sorted_order = topHits[np.argsort(-self.probabilies[topHits])] + (case, note) = self.case_interpreter(topHits) + matches_dict = [(self.accs[i], self.probabilies[i], self.ninfo[i], overlapScore[i]) for i in sorted_order] + topHitsDict = {'overlap': [self.overlap, self.num_snps], 'matches': matches_dict, 'interpretation':{'case': case, 'text': note}} + with open(outFile, "w") as out_stats: + out_stats.write(json.dumps(topHitsDict)) + + def case_interpreter(self, topHits): + overlap_thres = 0.5 + case = 1 + note = "Ambiguous sample" + if len(topHits) == 1: + case = 0 + note = "Unique hit" + elif np.nanmean(self.probabilies[topHits]) > prob_thres: + case = 2 + note = "Ambiguous sample: Accessions in top hits can be really close" + elif self.overlap > overlap_thres: + case = 3 + note = "Ambiguous sample: Sample might contain mixture of DNA or contamination" + elif self.overlap < overlap_thres: + case = 4 + note = "Ambiguous sample: Overlap of SNPs is very low, sample may not be in database" + return(case, note) + def getHeterozygosity(snpGT, outFile='default'): snpBinary = parsers.parseGT(snpGT) @@ -101,79 +118,80 @@ def getHeterozygosity(snpGT, outFile='default'): out_stats.write(json.dumps(topHitsDict)) return float(numHets)/len(snpGT) -def genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, hdf5File, hdf5accFile, outFile): - NumSNPs = len(snpCHR) - log.info("loading database files") - GenotypeData = genotype.load_hdf5_genotype_data(hdf5File) - GenotypeData_acc = genotype.load_hdf5_genotype_data(hdf5accFile) - log.info("done!") - num_lines = len(GenotypeData.accessions) - ScoreList = np.zeros(num_lines, dtype="float") - NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32") - NumMatSNPs = 0 - overlappedInds = np.zeros(0, dtype=int) - chunk_size = 1000 - for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]): - perchrTarPos = np.where(snpCHR == echr)[0] - perchrtarSNPpos = snpPOS[perchrTarPos] - log.info("Analysing positions in chromosome %s", echr) - start = GenotypeData.chr_regions[ind][0] - end = GenotypeData.chr_regions[ind][1] - chrpositions = GenotypeData.positions[start:end] - matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start - matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0] - matchedTarWei = snpWEI[perchrTarPos[matchedTarInd],] - TarGTs0 = np.zeros(len(matchedTarInd), dtype="int8") - TarGTs1 = np.ones(len(matchedTarInd), dtype="int8") + 1 - TarGTs2 = np.ones(len(matchedTarInd), dtype="int8") - overlappedInds = np.append(overlappedInds, perchrTarPos[matchedTarInd]) - NumMatSNPs = NumMatSNPs + len(matchedAccInd) - for j in range(0, len(matchedAccInd), chunk_size): - t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:] - samSNPs0 = np.reshape(np.repeat(TarGTs0[j:j+chunk_size], num_lines), (len(TarGTs0[j:j+chunk_size]),num_lines)) - samSNPs1 = np.reshape(np.repeat(TarGTs1[j:j+chunk_size], num_lines), (len(TarGTs1[j:j+chunk_size]),num_lines)) - samSNPs2 = np.reshape(np.repeat(TarGTs2[j:j+chunk_size], num_lines), (len(TarGTs2[j:j+chunk_size]),num_lines)) - tempScore0 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs0, dtype=int).T, matchedTarWei[j:j+chunk_size,0]).T, axis=0) - tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0) - tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0) - ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2 - if(len(TarGTs0[j:j+chunk_size]) >= 1): - NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites - log.info("Done analysing %s positions", NumMatSNPs) - log.info("writing score file!") - overlap = float(NumMatSNPs)/NumSNPs - print_out_table(outFile + '.scores.txt',GenotypeData.accessions, ScoreList, NumInfoSites, NumMatSNPs, DPmean) - if not outFile: - outFile = "genotyper" - print_topHits(outFile + ".matches.json", GenotypeData.accessions, ScoreList, NumInfoSites, overlap, NumMatSNPs) - getHeterozygosity(snpGT[overlappedInds], outFile + ".matches.json") - return (ScoreList, NumInfoSites) +def genotyper(inputs, hdf5File, hdf5accFile, outFile): + log.info("loading database files") + GenotypeData = genotype.load_hdf5_genotype_data(hdf5File) + GenotypeData_acc = genotype.load_hdf5_genotype_data(hdf5accFile) + log.info("done!") + inputs.filter_chr_names(GenotypeData) + num_lines = len(GenotypeData.accessions) + ScoreList = np.zeros(num_lines, dtype="float") + NumInfoSites = np.zeros(len(GenotypeData.accessions), dtype="uint32") + NumMatSNPs = 0 + overlappedInds = np.zeros(0, dtype=int) + chunk_size = 1000 + for ind,echr in enumerate(inputs.chr_list): + perchrTarPos = np.where(inputs.chrs_nochr == echr)[0] + perchrtarSNPpos = inputs.pos[perchrTarPos] + log.info("Analysing positions in chromosome %s", echr) + start = GenotypeData.chr_regions[ind][0] + end = GenotypeData.chr_regions[ind][1] + chrpositions = GenotypeData.positions[start:end] + matchedAccInd = np.where(np.in1d(chrpositions, perchrtarSNPpos))[0] + start + matchedTarInd = np.where(np.in1d(perchrtarSNPpos, chrpositions))[0] + matchedTarWei = inputs.wei[perchrTarPos[matchedTarInd],] + TarGTs0 = np.zeros(len(matchedTarInd), dtype="int8") + TarGTs1 = np.ones(len(matchedTarInd), dtype="int8") + 1 + TarGTs2 = np.ones(len(matchedTarInd), dtype="int8") + overlappedInds = np.append(overlappedInds, perchrTarPos[matchedTarInd]) + NumMatSNPs = NumMatSNPs + len(matchedAccInd) + for j in range(0, len(matchedAccInd), chunk_size): + t1001SNPs = GenotypeData.snps[matchedAccInd[j:j+chunk_size],:] + samSNPs0 = np.reshape(np.repeat(TarGTs0[j:j+chunk_size], num_lines), (len(TarGTs0[j:j+chunk_size]),num_lines)) + samSNPs1 = np.reshape(np.repeat(TarGTs1[j:j+chunk_size], num_lines), (len(TarGTs1[j:j+chunk_size]),num_lines)) + samSNPs2 = np.reshape(np.repeat(TarGTs2[j:j+chunk_size], num_lines), (len(TarGTs2[j:j+chunk_size]),num_lines)) + tempScore0 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs0, dtype=int).T, matchedTarWei[j:j+chunk_size,0]).T, axis=0) + tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0) + tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0) + ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2 + if(len(TarGTs0[j:j+chunk_size]) >= 1): + NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0) # Number of informative sites + log.info("Done analysing %s positions", NumMatSNPs) + log.info("writing score file!") + overlap = float(NumMatSNPs)/len(inputs.filter_inds_ix) + result = GenotyperOutput(GenotypeData.accessions, ScoreList, NumInfoSites, overlap, NumMatSNPs, inputs.dp) + result.print_out_table( outFile + '.scores.txt' ) + if not outFile: + outFile = "genotyper" + result.print_json_output( outFile + ".matches.json" ) + getHeterozygosity(inputs.gt[overlappedInds], outFile + ".matches.json") + return(result) def potatoGenotyper(args): - (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug']) - log.info("running genotyper!") - (ScoreList, NumInfoSites) = genotyper(snpCHR, snpPOS, snpGT, snpWEI, DPmean, args['hdf5File'], args['hdf5accFile'], args['outFile']) - log.info("finished!") + inputs = parsers.ParseInputs(inFile = args['inFile'], logDebug = args['logDebug']) + log.info("running genotyper!") + result = genotyper(inputs, args['hdf5File'], args['hdf5accFile'], args['outFile']) + log.info("finished!") def pairwiseScore(inFile_1, inFile_2, logDebug, outFile): - (snpCHR1, snpPOS1, snpGT1, snpWEI1, DPmean1) = parsers.parseInput(inFile = inFile_1, logDebug = logDebug) - (snpCHR2, snpPOS2, snpGT2, snpWEI2, DPmean2) = parsers.parseInput(inFile = inFile_2, logDebug = logDebug) + inputs_1 = parsers.ParseInputs(inFile = inFile_1, logDebug = logDebug) + inputs_2 = parsers.ParseInputs(inFile = inFile_2, logDebug = logDebug) snpmatch_stats = {} unique_1, unique_2, common, scores = 0, 0, 0, 0 - chrs = np.union1d(snpCHR1, snpCHR2) - for i in chrs: - perchrTarPosInd1 = np.where(snpCHR1 == i)[0] - perchrTarPosInd2 = np.where(snpCHR2 == i)[0] + common_chrs = np.intersect1d(np.unique(inputs_1.chrs), np.unique(inputs_2.chrs)) + for i in common_chrs: + perchrTarPosInd1 = np.where(inputs_1.chrs == i)[0] + perchrTarPosInd2 = np.where(inputs_2.chrs == i)[0] log.info("Analysing chromosome %s positions", i) - perchrtarSNPpos1 = snpPOS1[perchrTarPosInd1] - perchrtarSNPpos2 = snpPOS2[perchrTarPosInd2] + perchrtarSNPpos1 = inputs_1.pos[perchrTarPosInd1] + perchrtarSNPpos2 = inputs_2.pos[perchrTarPosInd2] matchedAccInd1 = np.where(np.in1d(perchrtarSNPpos1, perchrtarSNPpos2))[0] matchedAccInd2 = np.where(np.in1d(perchrtarSNPpos2, perchrtarSNPpos1))[0] unique_1 = unique_1 + len(perchrTarPosInd1) - len(matchedAccInd1) unique_2 = unique_2 + len(perchrTarPosInd2) - len(matchedAccInd2) common = common + len(matchedAccInd1) - scores = scores + np.sum(np.array(snpGT1[matchedAccInd1] == snpGT2[matchedAccInd2], dtype = int)) - snpmatch_stats['unique'] = {"%s" % os.path.basename(inFile_1): [float(unique_1)/len(snpCHR1), len(snpCHR1)], "%s" % os.path.basename(inFile_2): [float(unique_2)/len(snpCHR2), len(snpCHR2)]} + scores = scores + np.sum(np.array(inputs_1.gt[matchedAccInd1] == inputs_2.gt[matchedAccInd2], dtype = int)) + snpmatch_stats['unique'] = {"%s" % os.path.basename(inFile_1): [float(unique_1)/len(inputs_1.chrs), len(inputs_1.chrs)], "%s" % os.path.basename(inFile_2): [float(unique_2)/len(inputs_2.chrs), len(inputs_2.chrs)]} snpmatch_stats['matches'] = [float(scores)/common, common] if not outFile: outFile = "genotyper"