From ee20ba26def575383c2f67934163cf49f711e3e2 Mon Sep 17 00:00:00 2001
From: Karl Kroll
Date: Mon, 8 Feb 2016 16:00:32 -0500
Subject: [PATCH] #7 Rename configuration fields and update the rest of the
 code to match

The per-file 'source' field in the configuration script is renamed to
'format', and 'format' now determines which input parser is used. A new
'source' value determines column names in the output; when a sample has
several files from the same caller, these labels are made unique (e.g.
VarScan.1, VarScan.2). The Variant attribute previously called 'source',
which typically held a filename, has been renamed to 'fn' for clarity.
---
 config.py       |  1 +
 inputs.py       |  8 ++++----
 mucor.py        | 25 +++++++++++++++----------
 mucor_config.py | 32 +++++++++++++++++++++++---------
 variant.py      |  5 +++--
 5 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/config.py b/config.py
index f753d31..00e7045 100644
--- a/config.py
+++ b/config.py
@@ -32,6 +32,7 @@ def __init__(self):
         self.regions = []
         self.filename2samples = {}
         self.source = {}
+        self.format = {}
 
     def __str__(self):
         string_rep = "No. samples: {0}\n\n" + \
diff --git a/inputs.py b/inputs.py
index 206c9a7..779addf 100644
--- a/inputs.py
+++ b/inputs.py
@@ -33,6 +33,7 @@ class Parser(object):
     def __init__(self):
         self.row = ''
         self.source = ''
+        self.format = ''
         self.var = None
         self.fieldId = ''
         self.header = ''
@@ -53,8 +54,7 @@ def __init__(self):
             "GenericGATK":self.parse_GenericGATK,
             "INFOCol":self.parse_INFO_Column }
 
-    def parse(self, row, source):
-        self.source = source
+    def parse(self, row):
         self.row = row
         chrom = row.chrom
         ref = row.ref
@@ -64,9 +64,9 @@ def parse(self, row, source):
         frac = 0.0
         effect, fc = parse_EFC(row.info)
         out = []
-        samples_dict = self.supported_formats[self.source](row.samples)
+        samples_dict = self.supported_formats[self.format](row.samples)
         for sample, vals in samples_dict.items():
-            out.append(Variant(source='', sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
+            out.append(Variant(fn=self.fn, source=self.source, sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
         return out
 
     def parse_MiSeq(self,samples):
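For orientation, here is a minimal, self-contained sketch of the contract this change gives inputs.Parser: callers set 'format', 'source', and 'fn' once per file, and parse() dispatches on 'format' alone. The MiniParser class and its toy row below are illustrative stand-ins, not part of the mucor code.

    class MiniParser(object):
        def __init__(self):
            self.format = ''   # which variant caller produced the file; selects the parse method
            self.source = ''   # label used for output columns, e.g. "VarScan.1"
            self.fn = ''       # originating file name, carried into each variant
            self.supported_formats = {"MiSeq": self.parse_MiSeq}

        def parse(self, row):
            # Dispatch on self.format; a KeyError here means an unsupported
            # caller, which mucor.py catches and reports.
            samples_dict = self.supported_formats[self.format](row["samples"])
            return [dict(fn=self.fn, source=self.source, sample=sample,
                         dp=vals[0], frac=vals[1])
                    for sample, vals in samples_dict.items()]

        def parse_MiSeq(self, samples):
            # Stand-in: the real method extracts depth and frequency per sample.
            return dict(samples)

    p = MiniParser()
    p.format, p.source, p.fn = "MiSeq", "MiSeq", "S1.vcf"
    print(p.parse({"samples": {"S1": (100, 0.42)}}))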
diff --git a/mucor.py b/mucor.py
index e254f37..7a9fa86 100755
--- a/mucor.py
+++ b/mucor.py
@@ -200,6 +200,7 @@ def parseJSON(json_config):
             filename = os.path.basename(j['path'])
             config.filename2samples[filename] = i['id']
             config.source[filename] = j['source']
+            config.format[filename] = j['format']
             config.inputFiles.append(j['path'])
 
     return config
@@ -389,7 +390,7 @@ def annotateDF(grp, databases):
     pos = grp['pos'].unique()[0]
     ref = grp['ref'].unique()[0]
     alt = grp['alt'].unique()[0]
-    var = Variant(source=None, sample=None, pos=HTSeq.GenomicPosition(chrom, pos), ref=ref, alt=alt, frac=None, dp=None, eff=None, fc=None)
+    var = Variant(fn=None, source=None, sample=None, pos=HTSeq.GenomicPosition(chrom, pos), ref=ref, alt=alt, frac=None, dp=None, eff=None, fc=None)
     dbEntries = dbLookup(var, databases)
     return pd.Series(dbEntries)
 
@@ -429,6 +430,7 @@ def integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unr
     fc = var.fc
     sample = var.sample
     source = var.source
+    fn = var.fn
     count = 0   # initialize this database column now to save for later
     freq = 0.0  # initialize this database column now to save for later
 
@@ -439,7 +441,7 @@ def integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unr
 
     for feature in features.split(', '):
         # Removing columns from the following 'columns' list will mask them from output
-        columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source']
-        values  = [ chr, pos, ref, alt, vf, dp, feature, effect, fc, count, freq, sample, source ]
+        columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source','fn']
+        values  = [ chr, pos, ref, alt, vf, dp, feature, effect, fc, count, freq, sample, source, fn ]
         vardata = dict(zip( columns, values ))
         for key in vardata.keys():
@@ -505,9 +507,11 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
             if filterRow(row, fieldId, filters, kind):  # filter rows as they come in, to prevent them from entering the dataframe
                 continue  # this allows us to print the dataframe directly and have consistent output with variant_details.txt, etc.
             source = config.source[ os.path.basename(fn) ]
+            format = config.format[ os.path.basename(fn) ]
             parser = inputs.Parser()
             parser.row = row
             parser.source = source
+            parser.format = format
             parser.fieldId = fieldId
             parser.header = header
             parser.fn = fn
@@ -525,18 +529,21 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
         varReader = HTSeq.VCF_Reader(str(fn))
         varReader.parse_meta()
         varReader.make_info_dict()
+        parser = inputs.Parser()
+        format = config.format[ os.path.basename(fn) ]
+        parser.source = config.source[ os.path.basename(fn) ]
+        parser.fn = os.path.basename(fn)
+        parser.format = format
         for row in varReader:
             if row.filter not in filters:
                 continue
             if regions and not inRegionDict(row.pos.chrom, int(row.pos.pos), int(row.pos.pos), regionDict ):
                 continue
             row.unpack_info(varReader.infodict)
-            parser = inputs.Parser()
-            source = config.source[ os.path.basename(fn) ]
             try:
-                samps = parser.parse(row, source)
+                samps = parser.parse(row)
             except KeyError:
-                throwWarning("parsing " + fn + " as '" + source + "'")
+                throwWarning("parsing " + fn + " as '" + format + "'")
                 print("Cannot parse file from an unsupported or unknown variant caller. \nPlease use supported variant software, or compose an input module compatible with inputs.py")
                 print("Options include: " + str(parser.supported_formats.keys()).replace("'",""))
                 break
@@ -549,9 +556,7 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
                 pass
             if not var or var.sample not in config.samples: # this sample has no mutation data at the given location, or this sample was not specified as a sample of interest in the JSON config
                 continue
-            var.source = os.path.basename(fn)
-
             varD, unrecognizedContigs, unrecognizedMutations = integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unrecognizedMutations)
     else:
         throwWarning("Unable to parse file with extension '{0}': {1}".format(kind, fn))
 
@@ -561,7 +566,7 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
         totalTime = time.clock() - startTime
         print("{0:02d}:{1:02d}\t{2}".format(int(totalTime/60), int(totalTime % 60), fn))
 
-    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source']
+    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source','fn']
     # Transform the data frame dictionary into a pandas DF. Major speed increase compared to appending variants to the DF while reading the input files.
     try:
        varDF = pd.DataFrame(varD, columns=columns)
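The new 'fn' column flows through the column-oriented accumulator that feeds pd.DataFrame above. A hedged sketch of that pattern, with a fabricated example row:

    # Sketch of how mucor.py accumulates per-variant values into a dict of
    # column lists and then builds the DataFrame in one shot. The example
    # row below is made up for illustration.
    import pandas as pd

    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc',
               'count','freq','sample','source','fn']
    varD = {key: [] for key in columns}

    # integrateVar() appends one value per column for each variant/feature pair
    row = dict(zip(columns, ['chr17', 7577120, 'C', 'T', 0.42, 138, 'TP53',
                             'missense_variant', 'NONSYNONYMOUS', 0, 0.0,
                             'S1', 'VarScan.1', 'S1.varscan.vcf']))
    for key, value in row.items():
        varD[key].append(value)

    varDF = pd.DataFrame(varD, columns=columns)  # one column per key, in order
    print(varDF)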
diff --git a/mucor_config.py b/mucor_config.py
index 0361fc9..4f22639 100755
--- a/mucor_config.py
+++ b/mucor_config.py
@@ -169,9 +169,9 @@ def blankJSONDict():
                 full_path = os.path.join(root, dirs, files)
                 source = "VarScan"
                 if os.path.splitext(files)[1] == str(".vcf") or files.endswith(".vcf.gz"):
-                    tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':False, 'source':source} )
+                    tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':False, 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1] == str(".out"):
-                    tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1] == str(".bam"):
                     try:
                         tmpSampleDict['files'].append({'type':'bam', 'path':str(full_path)} )
@@ -180,9 +180,9 @@ def blankJSONDict():
                         tmpSampleDict['files'].append({'type':'bam', 'path':str(full_path)})
                 # Not sure if these still work
                 elif os.path.splitext(files)[1].lower() == str(".maf"):
-                    tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1].lower() == str(".gvf"):
-                    tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'format':source, 'source':source} )
                 else: # If not a VCF, MAF, GVF, or Mutect .out type, ignore it. Uncomment the following line to see the names of files that are being ignored
                     # print("Found an unsupported file type " + str(full_path) + " for sample " + str(sid))
 
@@ -204,9 +204,9 @@ def processFile(full_path, tmpSampleDict):
             throwWarning(full_path)
             print("Malformed or empty VCF file: {0}".format(full_path))
         '''
-        tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':DetectSnpEffStatus(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':DetectSnpEffStatus(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1] == str(".out"):
-        tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1] == str(".bam"):
         try:
             tmpSampleDict['files'].append({'type':'bam','path':str(full_path)} )
@@ -216,9 +216,9 @@ def processFile(full_path, tmpSampleDict):
             tmpSampleDict['files'].append({'type':'bam','path':str(full_path)} )
     # Not sure if these still work
     elif os.path.splitext(full_path)[1].lower() == str(".maf"):
-        tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1].lower() == str(".gvf"):
-        tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'format':source, 'source':source} )
     else: # If not a VCF, MAF, GVF, or Mutect .out type, ignore it. Uncomment the following line to see the names of files that are being ignored
         # print("Found an unsupported file type " + str(full_path) + " for sample " + str(sid))
 
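For reference, a sketch of what one sample entry in the generated JSON config looks like after this change; the ids and paths below are invented for illustration, and the 'source' suffixes assume two files from the same caller:

    # 'format' drives parser selection; 'source' labels output columns and
    # is uniquified per sample (see the pass in getJSONDict() below).
    import json

    sample_entry = {
        "id": "S1",
        "files": [
            {"type": "vcf", "path": "/data/S1.a.vcf", "snpeff": False,
             "format": "VarScan", "source": "VarScan.1"},
            {"type": "vcf", "path": "/data/S1.b.vcf", "snpeff": False,
             "format": "VarScan", "source": "VarScan.2"},
        ],
    }
    print(json.dumps({"samples": [sample_entry]}, indent=2))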
@@ -353,8 +353,22 @@ def getJSONDict(args):
                     if sid in set(varReader.sampleids):
                         tmpSampleDict = processFile(full_path, tmpSampleDict)
         # uniquify the list of files associated with the sample, in case they were supplied directly via (-i) and found while directory crawling (-d)
-        tmpSampleDict['files'] = { v['path']:v for v in tmpSampleDict['files']}.values()
+        # Note: the files must be in a deterministic order whenever a sample has more than one file from the same caller, so sort them by path
+        tmpSampleDict['files'] = sorted({ v['path']:v for v in tmpSampleDict['files']}.values(), key=lambda v: v['path'])
         json_dict['samples'].append(tmpSampleDict)
+
+    # uniquify the sources
+    # if the "source" field is to be used to name output columns, the labels must be unique
+    # this code appends integer counts to repeated sources, e.g. VarScan.1, VarScan.2
+    for sample in range(len(json_dict['samples'])):
+        countDict = defaultdict(int)
+        incDict = {x['format']:1 for x in json_dict['samples'][sample]['files']}
+        for x in range(len(json_dict['samples'][sample]['files'])):
+            countDict[json_dict['samples'][sample]['files'][x]['format']] += 1
+        for x in range(len(json_dict['samples'][sample]['files'])):
+            if countDict[json_dict['samples'][sample]['files'][x]['format']] > 1:
+                json_dict['samples'][sample]['files'][x]['source'] += "." + str(incDict[json_dict['samples'][sample]['files'][x]['format']])
+                incDict[json_dict['samples'][sample]['files'][x]['format']] += 1
     return json_dict
 
 def inconsistentFilesPerSample(json_dict):
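The same de-duplication pass, restated as a standalone sketch (the function name and sample data are invented):

    # Rewrites repeated 'source' labels as VarScan.1, VarScan.2, ... so that
    # output columns stay unique; mirrors the loops in getJSONDict() above.
    from collections import defaultdict

    def uniquify_sources(files):
        counts = defaultdict(int)
        for f in files:
            counts[f['format']] += 1      # how many files share each format
        nxt = defaultdict(lambda: 1)      # next suffix to hand out per format
        for f in files:
            if counts[f['format']] > 1:   # only suffix when there is a clash
                f['source'] += "." + str(nxt[f['format']])
                nxt[f['format']] += 1
        return files

    files = [{'format': 'VarScan', 'source': 'VarScan'},
             {'format': 'VarScan', 'source': 'VarScan'},
             {'format': 'MuTect', 'source': 'MuTect'}]
    print(uniquify_sources(files))
    # -> VarScan.1 and VarScan.2; the single MuTect entry is left unchanged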
diff --git a/variant.py b/variant.py
index 8642d2f..7821b0e 100644
--- a/variant.py
+++ b/variant.py
@@ -21,8 +21,9 @@
 class Variant:
     '''Data about SNV and Indels'''
 
-    def __init__(self,source,sample,pos,ref,alt,frac,dp, eff,fc):
-        self.source = source # source of variant - typically a filename
+    def __init__(self, fn, source, sample, pos, ref, alt, frac, dp, eff, fc):
+        self.fn = fn         # originating file name for this variant
+        self.source = source # label used to name the output column
         self.sample = sample # sample ID, as defined in the VCF
         self.pos = pos       # HTSeq.GenomicPosition
         self.ref = ref
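Finally, a hedged usage sketch of the revised Variant signature; the class stub below mirrors variant.py, the values are made up, and a plain tuple stands in for HTSeq.GenomicPosition:

    # Illustrative stub mirroring the new Variant signature; the real class
    # lives in variant.py and takes an HTSeq.GenomicPosition for 'pos'.
    class Variant(object):
        def __init__(self, fn, source, sample, pos, ref, alt, frac, dp, eff, fc):
            self.fn = fn          # originating file name
            self.source = source  # output column label
            self.sample = sample
            self.pos = pos
            self.ref, self.alt = ref, alt
            self.frac, self.dp = frac, dp
            self.eff, self.fc = eff, fc

    # Note the split this patch makes: one file ('fn'), one label ('source').
    var = Variant(fn='S1.varscan.vcf', source='VarScan.1', sample='S1',
                  pos=('chr17', 7577120), ref='C', alt='T',
                  frac=0.42, dp=138, eff='missense_variant', fc='NONSYNONYMOUS')
    print(var.fn, var.source, var.sample)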