Commit

#7 Changed fields in the configuration script and modified the rest of the code accordingly. 'source' is renamed to 'format', and the new value of 'source' is used to determine column names in the output. The output value previously called 'source' has been renamed to 'fn' for clarity.

Karl Kroll committed Feb 8, 2016
1 parent 9fb306d commit ee20ba2
Showing 5 changed files with 51 additions and 27 deletions.
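
For reference, each per-file entry in the JSON config now carries both keys. A minimal sketch of one entry as mucor_config.py would emit it after this change (the path and caller name are hypothetical):

    # Hypothetical file entry: 'format' selects the parser in inputs.py,
    # while 'source' labels this file's columns in the output.
    {'type': 'vcf', 'path': '/data/sample1.vcf', 'snpeff': False,
     'format': 'VarScan', 'source': 'VarScan'}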
1 change: 1 addition & 0 deletions config.py
@@ -32,6 +32,7 @@ def __init__(self):
         self.regions = []
         self.filename2samples = {}
         self.source = {}
+        self.format = {}
 
     def __str__(self):
         string_rep = "No. samples: {0}\n\n" + \
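
Config now tracks two parallel per-file dictionaries. A minimal sketch of how parseJSON in mucor.py populates them (filename and values hypothetical):

    # Hypothetical entries; keys are input file basenames.
    config.source['s1.vcf'] = 'VarScan.1'  # label used for output columns
    config.format['s1.vcf'] = 'VarScan'    # key into Parser.supported_formats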
8 changes: 4 additions & 4 deletions inputs.py
@@ -33,6 +33,7 @@ class Parser(object):
     def __init__(self):
         self.row = ''
         self.source = ''
+        self.format = ''
         self.var = None
         self.fieldId = ''
         self.header = ''
@@ -53,8 +54,7 @@ def __init__(self):
             "GenericGATK":self.parse_GenericGATK,
             "INFOCol":self.parse_INFO_Column }
 
-    def parse(self, row, source):
-        self.source = source
+    def parse(self, row):
         self.row = row
         chrom = row.chrom
         ref = row.ref
@@ -64,9 +64,9 @@ def parse(self, row, source):
         frac = 0.0
         effect, fc = parse_EFC(row.info)
         out = []
-        samples_dict = self.supported_formats[self.source](row.samples)
+        samples_dict = self.supported_formats[self.format](row.samples)
         for sample, vals in samples_dict.items():
-            out.append(Variant(source='', sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
+            out.append(Variant(fn=self.fn, source=self.source, sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
         return out
 
     def parse_MiSeq(self,samples):
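
The caller format no longer travels through parse(); it is set as an attribute beforehand, along with the source label and filename that now flow into each Variant. A minimal usage sketch (the row object and filename are assumed):

    # Sketch of the new calling convention; 'row' is assumed to be an
    # HTSeq VCF record with its INFO field already unpacked.
    parser = inputs.Parser()
    parser.format = 'VarScan'     # selects the entry in supported_formats
    parser.source = 'VarScan'     # copied into each Variant's 'source'
    parser.fn = 'example.vcf'     # hypothetical filename, copied into 'fn'
    variants = parser.parse(row)  # returns a list of Variant objects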
30 changes: 18 additions & 12 deletions mucor.py
@@ -200,6 +200,7 @@ def parseJSON(json_config):
             filename = os.path.basename(j['path'])
             config.filename2samples[filename] = i['id']
             config.source[filename] = j['source']
+            config.format[filename] = j['format']
             config.inputFiles.append(j['path'])
 
     return config
@@ -389,7 +390,7 @@ def annotateDF(grp, databases):
     pos = grp['pos'].unique()[0]
     ref = grp['ref'].unique()[0]
     alt = grp['alt'].unique()[0]
-    var = Variant(source=None, sample=None, pos=HTSeq.GenomicPosition(chrom, pos), ref=ref, alt=alt, frac=None, dp=None, eff=None, fc=None)
+    var = Variant(fn=None, source=None, sample=None, pos=HTSeq.GenomicPosition(chrom, pos), ref=ref, alt=alt, frac=None, dp=None, eff=None, fc=None)
     dbEntries = dbLookup(var, databases)
     return pd.Series(dbEntries)
 
@@ -429,6 +430,7 @@ def integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unr
     fc = var.fc
     sample = var.sample
     source = var.source
+    fn = var.fn
     count = 0 # initialize this database column now to save for later
     freq = 0.0 # initialize this database column now to save for later
 
@@ -439,8 +441,8 @@ def integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unr
 
     for feature in features.split(', '):
         # Removing columns from the following 'columns' list will mask them from output
-        columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source']
-        values = [ chr, pos, ref, alt, vf, dp, feature, effect, fc , count, freq, sample, source ]
+        columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source','fn' ]
+        values = [ chr, pos, ref, alt, vf, dp, feature, effect, fc , count, freq, sample , source , fn ]
 
         vardata = dict(zip( columns, values ))
         for key in vardata.keys():
@@ -505,9 +507,11 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
                 if filterRow(row, fieldId, filters, kind): # filter rows as they come in, to prevent them from entering the dataframe
                     continue # this allows us to print the dataframe directly and have consistent output with variant_details.txt, etc.
                 source = config.source[ os.path.basename(fn) ]
+                format = config.format[ os.path.basename(fn) ]
                 parser = inputs.Parser()
                 parser.row = row
                 parser.source = source
+                parser.format = format
                 parser.fieldId = fieldId
                 parser.header = header
                 parser.fn = fn
@@ -525,18 +529,21 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
             varReader = HTSeq.VCF_Reader(str(fn))
             varReader.parse_meta()
             varReader.make_info_dict()
+            parser = inputs.Parser()
+            format = config.format[ os.path.basename(fn) ]
+            parser.source = config.source[ os.path.basename(fn) ]
+            parser.fn = os.path.basename(fn)
+            parser.format = format
             for row in varReader:
                 if row.filter not in filters:
                     continue
                 if regions and not inRegionDict(row.pos.chrom, int(row.pos.pos), int(row.pos.pos), regionDict ):
                     continue
                 row.unpack_info(varReader.infodict)
-                parser = inputs.Parser()
-                source = config.source[ os.path.basename(fn) ]
                 try:
-                    samps = parser.parse(row, source)
+                    samps = parser.parse(row)
                 except KeyError:
-                    throwWarning("parsing " + fn + " as '" + source + "'")
+                    throwWarning("parsing " + fn + " as '" + format + "'")
                     print("Cannot parse file from an unsupported or unknown variant caller. \nPlease use supported variant software, or compose an input module compatible with inputs.py")
                     print("Options include: " + str(parser.supported_formats.keys()).replace("'",""))
                     break
@@ -549,9 +556,7 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
                     pass
                 if not var or var.sample not in config.samples:
                     # this sample has no mutation data at the given location, or this sample was not specified as a sample of interest in the JSON config
-                    continue
-                var.source = os.path.basename(fn)
-
+                    continue
                 varD, unrecognizedContigs, unrecognizedMutations = integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unrecognizedMutations)
         else:
             throwWarning("Unable to parse file with extension '{0}': {1}".format(kind, fn))
@@ -561,7 +566,7 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
     totalTime = time.clock() - startTime
     print("{0:02d}:{1:02d}\t{2}".format(int(totalTime/60), int(totalTime % 60), fn))
 
-    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source']
+    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source','fn']
     # Transform data frame dictionary into pandas DF. Major speed increase compared to appending variants to the DF while reading the input files.
     try:
         varDF = pd.DataFrame(varD, columns=columns)
@@ -623,7 +628,8 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
 
     #stop() # this command throws a warning
     varDF.replace('', '?', inplace=True)
-
+    from pdb import set_trace as stop
+    stop()
     return varDF, knownFeatures, gas
 
 def printOutput(config, outputDirName, varDF):
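
The net effect on the variant DataFrame is one extra column: 'fn' records the originating file while 'source' keeps the configurable label. A minimal sketch of the frame construction with the new column list (the varD contents are hypothetical):

    import pandas as pd

    # Hypothetical data for one variant reported by two VarScan files;
    # 'source' carries the uniquified label, 'fn' the actual filename.
    varD = {'chr': ['chr1', 'chr1'], 'pos': [12345, 12345],
            'sample': ['s1', 's1'],
            'source': ['VarScan.1', 'VarScan.2'],
            'fn': ['s1.snp.vcf', 's1.indel.vcf']}
    varDF = pd.DataFrame(varD, columns=['chr', 'pos', 'sample', 'source', 'fn'])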
34 changes: 25 additions & 9 deletions mucor_config.py
@@ -169,9 +169,9 @@ def blankJSONDict():
                 full_path = os.path.join(root, dirs, files)
                 source = "VarScan"
                 if os.path.splitext(files)[1] == str(".vcf") or files.endswith(".vcf.gz"):
-                    tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':False, 'source':source} )
+                    tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':False, 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1] == str(".out"):
-                    tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1] == str(".bam"):
                     try:
                         tmpSampleDict['files'].append({'type':'bam', 'path':str(full_path)} )
@@ -180,9 +180,9 @@ def blankJSONDict():
                         tmpSampleDict['files'].append({'type':'bam', 'path':str(full_path)})
                 # Not sure if these still work
                 elif os.path.splitext(files)[1].lower() == str(".maf"):
-                    tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1].lower() == str(".gvf"):
-                    tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'format':source, 'source':source} )
                 else:
                     # If not a VCF, MAF, GVF, or Mutect .out type, ignore it. Uncomment the following line to see the names of files that are being ignored
                     # print("Found an unsupported file type " + str(full_path) + " for sample " + str(sid))
@@ -204,9 +204,9 @@ def processFile(full_path, tmpSampleDict):
         throwWarning(full_path)
         print("Malformed or empty VCF file: {0}".format(full_path))
         '''
-        tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':DetectSnpEffStatus(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':DetectSnpEffStatus(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1] == str(".out"):
-        tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1] == str(".bam"):
         try:
             tmpSampleDict['files'].append({'type':'bam','path':str(full_path)} )
@@ -216,9 +216,9 @@ def processFile(full_path, tmpSampleDict):
             tmpSampleDict['files'].append({'type':'bam','path':str(full_path)} )
     # Not sure if these still work
     elif os.path.splitext(full_path)[1].lower() == str(".maf"):
-        tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1].lower() == str(".gvf"):
-        tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'format':source, 'source':source} )
     else:
         # If not a VCF, MAF, GVF, or Mutect .out type, ignore it. Uncomment the following line to see the names of files that are being ignored
         # print("Found an unsupported file type " + str(full_path) + " for sample " + str(sid))
@@ -335,6 +335,8 @@ def getJSONDict(args):
         # directory crawl searching for sample name in the file name/path
         if args['project_directory']:
             for root, dirs, files in os.walk(args['project_directory']):
+                #from pdb import set_trace as stop
+                #stop()
                 for i in files:
                     if re.search(r"\b" + re.escape(sid) + r"\b", i):
                         # be careful with sample names here. "U-23" will catch "U-238" etc. Occasional cases can be resolved by manually editing the JSON config file
@@ -353,8 +355,22 @@ def getJSONDict(args):
                         if sid in set(varReader.sampleids):
                             tmpSampleDict = processFile(full_path, tmpSampleDict)
         # uniquify the list of files associated with the sample, in case they were supplied directly via (-i) and found while directory crawling (-d)
-        tmpSampleDict['files'] = { v['path']:v for v in tmpSampleDict['files']}.values()
+        # Note that these must be sorted if a sample will have more than 1 file associated with any "source"
+        tmpSampleDict['files'] = sorted({ v['path']:v for v in tmpSampleDict['files']}.values())
         json_dict['samples'].append(tmpSampleDict)
 
+    # uniquify the sources
+    # if the "source" field is to be used to name output columns, they must be unique
+    # this code will add integer counts to the ends of repeated sources, e.g. VarScan.1, VarScan.2
+    for sample in range(len(json_dict['samples'])):
+        countDict = defaultdict(int)
+        incDict = {x['format']:1 for x in json_dict['samples'][sample]['files']}
+        for x in range(len(json_dict['samples'][sample]['files'])):
+            countDict[json_dict['samples'][sample]['files'][x]['format']] += 1
+        for x in range(len(json_dict['samples'][sample]['files'])):
+            if countDict[json_dict['samples'][sample]['files'][x]['format']] > 1:
+                json_dict['samples'][sample]['files'][x]['source'] += str("." + str(incDict[json_dict['samples'][sample]['files'][x]['format']]))
+                incDict[json_dict['samples'][sample]['files'][x]['format']] += 1
     return json_dict
 
 def inconsistentFilesPerSample(json_dict):
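
The source-uniquifying pass can be exercised on its own. A minimal sketch with hypothetical file entries, assuming the same defaultdict import mucor_config.py uses:

    from collections import defaultdict

    # Hypothetical sample with two VarScan files and one MuTect file.
    files = [{'format': 'VarScan', 'source': 'VarScan'},
             {'format': 'VarScan', 'source': 'VarScan'},
             {'format': 'MuTect', 'source': 'MuTect'}]
    countDict = defaultdict(int)
    incDict = {f['format']: 1 for f in files}
    for f in files:
        countDict[f['format']] += 1
    for f in files:
        if countDict[f['format']] > 1:
            f['source'] += '.' + str(incDict[f['format']])
            incDict[f['format']] += 1
    print([f['source'] for f in files])  # ['VarScan.1', 'VarScan.2', 'MuTect']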
5 changes: 3 additions & 2 deletions variant.py
@@ -21,8 +21,9 @@
 
 class Variant:
     '''Data about SNV and Indels'''
-    def __init__(self,source,sample,pos,ref,alt,frac,dp, eff,fc):
-        self.source = source # source of variant - typically a filename
+    def __init__(self,fn, source,sample,pos,ref,alt,frac,dp, eff,fc):
+        self.fn = fn # source of variant - typically a filename
+        self.source = source
         self.sample = sample # sample ID, as defined in the VCF
         self.pos = pos # HTSeq.GenomicPosition
         self.ref = ref
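
A Variant is now constructed with both identifiers up front. A minimal sketch (all field values hypothetical, assuming variant.py is importable):

    import HTSeq
    from variant import Variant

    # 'fn' holds the originating filename; 'source' holds the output label.
    var = Variant(fn='s1.vcf', source='VarScan', sample='s1',
                  pos=HTSeq.GenomicPosition('chr1', 12345),
                  ref='A', alt='G', frac=0.42, dp=100,
                  eff='missense_variant', fc='MODERATE')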
