Commit

#7 Changed fields in the configuration script and modified the rest of the code accordingly. 'source' is renamed to 'format', and the new value of 'source' is used to determine column names in the output. The output value previously called 'source' has been renamed to 'fn' for clarity.

Karl Kroll committed Feb 8, 2016
1 parent 9fb306d commit ee20ba2
Showing 5 changed files with 51 additions and 27 deletions.
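
For reference, each per-file entry in the JSON config now carries both keys. A minimal sketch of one entry as mucor_config.py would emit it after this change (the path and caller name are hypothetical):

    # Hypothetical file entry: 'format' selects the parser in inputs.py,
    # while 'source' labels this file's columns in the output.
    {'type': 'vcf', 'path': '/data/sample1.vcf', 'snpeff': False,
     'format': 'VarScan', 'source': 'VarScan'}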
1 change: 1 addition & 0 deletions config.py
@@ -32,6 +32,7 @@ def __init__(self):
         self.regions = []
         self.filename2samples = {}
         self.source = {}
+        self.format = {}
 
     def __str__(self):
         string_rep = "No. samples: {0}\n\n" + \
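
Config now tracks two parallel per-file dictionaries. A minimal sketch of how parseJSON in mucor.py populates them (filename and values hypothetical):

    # Hypothetical entries; keys are input file basenames.
    config.source['s1.vcf'] = 'VarScan.1'  # label used for output columns
    config.format['s1.vcf'] = 'VarScan'    # key into Parser.supported_formats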
8 changes: 4 additions & 4 deletions inputs.py
@@ -33,6 +33,7 @@ class Parser(object):
     def __init__(self):
         self.row = ''
         self.source = ''
+        self.format = ''
         self.var = None
         self.fieldId = ''
         self.header = ''
@@ -53,8 +54,7 @@ def __init__(self):
             "GenericGATK":self.parse_GenericGATK,
             "INFOCol":self.parse_INFO_Column }
 
-    def parse(self, row, source):
-        self.source = source
+    def parse(self, row):
         self.row = row
         chrom = row.chrom
         ref = row.ref
@@ -64,9 +64,9 @@ def parse(self, row, source):
         frac = 0.0
         effect, fc = parse_EFC(row.info)
         out = []
-        samples_dict = self.supported_formats[self.source](row.samples)
+        samples_dict = self.supported_formats[self.format](row.samples)
         for sample, vals in samples_dict.items():
-            out.append(Variant(source='', sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
+            out.append(Variant(fn=self.fn, source=self.source, sample=sample, pos=pos, ref=ref, alt=alt, frac=vals[1], dp=vals[0], eff=effect, fc=fc))
         return out
 
     def parse_MiSeq(self,samples):
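
The caller format no longer travels through parse(); it is set as an attribute beforehand, along with the source label and filename that now flow into each Variant. A minimal usage sketch (the row object and filename are assumed):

    # Sketch of the new calling convention; 'row' is assumed to be an
    # HTSeq VCF record with its INFO field already unpacked.
    parser = inputs.Parser()
    parser.format = 'VarScan'     # selects the entry in supported_formats
    parser.source = 'VarScan'     # copied into each Variant's 'source'
    parser.fn = 'example.vcf'     # hypothetical filename, copied into 'fn'
    variants = parser.parse(row)  # returns a list of Variant objects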
30 changes: 18 additions & 12 deletions mucor.py
@@ -200,6 +200,7 @@ def parseJSON(json_config):
             filename = os.path.basename(j['path'])
             config.filename2samples[filename] = i['id']
             config.source[filename] = j['source']
+            config.format[filename] = j['format']
             config.inputFiles.append(j['path'])
 
     return config
@@ -389,7 +390,7 @@ def annotateDF(grp, databases):
     pos = grp['pos'].unique()[0]
     ref = grp['ref'].unique()[0]
     alt = grp['alt'].unique()[0]
-    var = Variant(source=None, sample=None, pos=HTSeq.GenomicPosition(chrom, pos), ref=ref, alt=alt, frac=None, dp=None, eff=None, fc=None)
+    var = Variant(fn=None, source=None, sample=None, pos=HTSeq.GenomicPosition(chrom, pos), ref=ref, alt=alt, frac=None, dp=None, eff=None, fc=None)
     dbEntries = dbLookup(var, databases)
     return pd.Series(dbEntries)
 
@@ -429,6 +430,7 @@ def integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unr
     fc = var.fc
     sample = var.sample
     source = var.source
+    fn = var.fn
     count = 0 # initialize this database column now to save for later
     freq = 0.0 # initialize this database column now to save for later
 
@@ -439,8 +441,8 @@ def integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unr
 
     for feature in features.split(', '):
         # Removing columns from the following 'columns' list will mask them from output
-        columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source']
-        values = [ chr, pos, ref, alt, vf, dp, feature, effect, fc , count, freq, sample, source ]
+        columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source','fn' ]
+        values = [ chr, pos, ref, alt, vf, dp, feature, effect, fc , count, freq, sample , source , fn ]
 
         vardata = dict(zip( columns, values ))
         for key in vardata.keys():
@@ -505,9 +507,11 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
                 if filterRow(row, fieldId, filters, kind): # filter rows as they come in, to prevent them from entering the dataframe
                     continue # this allows us to print the dataframe directly and have consistent output with variant_details.txt, etc.
                 source = config.source[ os.path.basename(fn) ]
+                format = config.format[ os.path.basename(fn) ]
                 parser = inputs.Parser()
                 parser.row = row
                 parser.source = source
+                parser.format = format
                 parser.fieldId = fieldId
                 parser.header = header
                 parser.fn = fn
@@ -525,18 +529,21 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
             varReader = HTSeq.VCF_Reader(str(fn))
             varReader.parse_meta()
             varReader.make_info_dict()
+            parser = inputs.Parser()
+            format = config.format[ os.path.basename(fn) ]
+            parser.source = config.source[ os.path.basename(fn) ]
+            parser.fn = os.path.basename(fn)
+            parser.format = format
             for row in varReader:
                 if row.filter not in filters:
                     continue
                 if regions and not inRegionDict(row.pos.chrom, int(row.pos.pos), int(row.pos.pos), regionDict ):
                     continue
                 row.unpack_info(varReader.infodict)
-                parser = inputs.Parser()
-                source = config.source[ os.path.basename(fn) ]
                 try:
-                    samps = parser.parse(row, source)
+                    samps = parser.parse(row)
                 except KeyError:
-                    throwWarning("parsing " + fn + " as '" + source + "'")
+                    throwWarning("parsing " + fn + " as '" + format + "'")
                     print("Cannot parse file from an unsupported or unknown variant caller. \nPlease use supported variant software, or compose an input module compatible with inputs.py")
                     print("Options include: " + str(parser.supported_formats.keys()).replace("'",""))
                     break
@@ -549,9 +556,7 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
                     pass
                 if not var or var.sample not in config.samples:
                     # this sample has no mutation data at the given location, or this sample was not specified as a sample of interest in the JSON config
-                    continue
-                var.source = os.path.basename(fn)
-
+                    continue
                 varD, unrecognizedContigs, unrecognizedMutations = integrateVar(var, varD, config, gas, knownFeatures, unrecognizedContigs, unrecognizedMutations)
         else:
             throwWarning("Unable to parse file with extension '{0}': {1}".format(kind, fn))
@@ -561,7 +566,7 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
     totalTime = time.clock() - startTime
     print("{0:02d}:{1:02d}\t{2}".format(int(totalTime/60), int(totalTime % 60), fn))
 
-    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source']
+    columns = ['chr','pos','ref','alt','vf','dp','feature','effect','fc','count','freq','sample','source','fn']
     # Transform data frame dictionary into pandas DF. Major speed increase compared to appending variants to the DF while reading the input files.
     try:
         varDF = pd.DataFrame(varD, columns=columns)
@@ -623,7 +628,8 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t
 
     #stop() # this command throws a warning
     varDF.replace('', '?', inplace=True)
-
+    from pdb import set_trace as stop
+    stop()
     return varDF, knownFeatures, gas
 
 def printOutput(config, outputDirName, varDF):
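
The net effect on the variant DataFrame is one extra column: 'fn' records the originating file while 'source' keeps the configurable label. A minimal sketch of the frame construction with the new column list (the varD contents are hypothetical):

    import pandas as pd

    # Hypothetical data for one variant reported by two VarScan files;
    # 'source' carries the uniquified label, 'fn' the actual filename.
    varD = {'chr': ['chr1', 'chr1'], 'pos': [12345, 12345],
            'sample': ['s1', 's1'],
            'source': ['VarScan.1', 'VarScan.2'],
            'fn': ['s1.snp.vcf', 's1.indel.vcf']}
    varDF = pd.DataFrame(varD, columns=['chr', 'pos', 'sample', 'source', 'fn'])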
34 changes: 25 additions & 9 deletions mucor_config.py
@@ -169,9 +169,9 @@ def blankJSONDict():
                 full_path = os.path.join(root, dirs, files)
                 source = "VarScan"
                 if os.path.splitext(files)[1] == str(".vcf") or files.endswith(".vcf.gz"):
-                    tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':False, 'source':source} )
+                    tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':False, 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1] == str(".out"):
-                    tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1] == str(".bam"):
                     try:
                         tmpSampleDict['files'].append({'type':'bam', 'path':str(full_path)} )
@@ -180,9 +180,9 @@ def blankJSONDict():
                         tmpSampleDict['files'].append({'type':'bam', 'path':str(full_path)})
                 # Not sure if these still work
                 elif os.path.splitext(files)[1].lower() == str(".maf"):
-                    tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'format':source, 'source':source} )
                 elif os.path.splitext(files)[1].lower() == str(".gvf"):
-                    tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'source':source} )
+                    tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'format':source, 'source':source} )
                 else:
                     # If not a VCF, MAF, GVF, or Mutect .out type, ignore it. Uncomment the following line to see the names of files that are being ignored
                     # print("Found an unsupported file type " + str(full_path) + " for sample " + str(sid))
@@ -204,9 +204,9 @@ def processFile(full_path, tmpSampleDict):
         throwWarning(full_path)
         print("Malformed or empty VCF file: {0}".format(full_path))
         '''
-        tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':DetectSnpEffStatus(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'vcf', 'path':str(full_path), 'snpeff':DetectSnpEffStatus(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1] == str(".out"):
-        tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'mutect', 'path':str(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1] == str(".bam"):
         try:
             tmpSampleDict['files'].append({'type':'bam','path':str(full_path)} )
@@ -216,9 +216,9 @@ def processFile(full_path, tmpSampleDict):
             tmpSampleDict['files'].append({'type':'bam','path':str(full_path)} )
     # Not sure if these still work
     elif os.path.splitext(full_path)[1].lower() == str(".maf"):
-        tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'maf', 'path':str(full_path), 'format':source, 'source':source} )
     elif os.path.splitext(full_path)[1].lower() == str(".gvf"):
-        tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'source':source} )
+        tmpSampleDict['files'].append({'type':'gvf', 'path':str(full_path), 'format':source, 'source':source} )
     else:
         # If not a VCF, MAF, GVF, or Mutect .out type, ignore it. Uncomment the following line to see the names of files that are being ignored
         # print("Found an unsupported file type " + str(full_path) + " for sample " + str(sid))
@@ -335,6 +335,8 @@ def getJSONDict(args):
         # directory crawl searching for sample name in the file name/path
         if args['project_directory']:
             for root, dirs, files in os.walk(args['project_directory']):
+                #from pdb import set_trace as stop
+                #stop()
                 for i in files:
                     if re.search(r"\b" + re.escape(sid) + r"\b", i):
                         # be careful with sample names here. "U-23" will catch "U-238" etc. Occasional cases can be resolved by manually editing the JSON config file
@@ -353,8 +355,22 @@ def getJSONDict(args):
                         if sid in set(varReader.sampleids):
                             tmpSampleDict = processFile(full_path, tmpSampleDict)
         # uniquify the list of files associated with the sample, in case they were supplied directly via (-i) and found while directory crawling (-d)
-        tmpSampleDict['files'] = { v['path']:v for v in tmpSampleDict['files']}.values()
+        # Note that these must be sorted if a sample will have more than 1 file associated with any "source"
+        tmpSampleDict['files'] = sorted({ v['path']:v for v in tmpSampleDict['files']}.values())
         json_dict['samples'].append(tmpSampleDict)
 
+    # uniquify the sources
+    # if the "source" field is to be used to name output columns, they must be unique
+    # this code will add integer counts to the ends of repeated sources, e.g. VarScan.1, VarScan.2
+    for sample in range(len(json_dict['samples'])):
+        countDict = defaultdict(int)
+        incDict = {x['format']:1 for x in json_dict['samples'][sample]['files']}
+        for x in range(len(json_dict['samples'][sample]['files'])):
+            countDict[json_dict['samples'][sample]['files'][x]['format']] += 1
+        for x in range(len(json_dict['samples'][sample]['files'])):
+            if countDict[json_dict['samples'][sample]['files'][x]['format']] > 1:
+                json_dict['samples'][sample]['files'][x]['source'] += str("." + str(incDict[json_dict['samples'][sample]['files'][x]['format']]))
+                incDict[json_dict['samples'][sample]['files'][x]['format']] += 1
     return json_dict
 
 def inconsistentFilesPerSample(json_dict):
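
The source-uniquifying pass can be exercised on its own. A minimal sketch with hypothetical file entries, assuming the same defaultdict import mucor_config.py uses:

    from collections import defaultdict

    # Hypothetical sample with two VarScan files and one MuTect file.
    files = [{'format': 'VarScan', 'source': 'VarScan'},
             {'format': 'VarScan', 'source': 'VarScan'},
             {'format': 'MuTect', 'source': 'MuTect'}]
    countDict = defaultdict(int)
    incDict = {f['format']: 1 for f in files}
    for f in files:
        countDict[f['format']] += 1
    for f in files:
        if countDict[f['format']] > 1:
            f['source'] += '.' + str(incDict[f['format']])
            incDict[f['format']] += 1
    print([f['source'] for f in files])  # ['VarScan.1', 'VarScan.2', 'MuTect']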
5 changes: 3 additions & 2 deletions variant.py
@@ -21,8 +21,9 @@
 
 class Variant:
     '''Data about SNV and Indels'''
-    def __init__(self,source,sample,pos,ref,alt,frac,dp, eff,fc):
-        self.source = source # source of variant - typically a filename
+    def __init__(self,fn, source,sample,pos,ref,alt,frac,dp, eff,fc):
+        self.fn = fn # source of variant - typically a filename
+        self.source = source
         self.sample = sample # sample ID, as defined in the VCF
         self.pos = pos # HTSeq.GenomicPosition
         self.ref = ref
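
A Variant is now constructed with both identifiers up front. A minimal sketch (all field values hypothetical, assuming variant.py is importable):

    import HTSeq
    from variant import Variant

    # 'fn' holds the originating filename; 'source' holds the output label.
    var = Variant(fn='s1.vcf', source='VarScan', sample='s1',
                  pos=HTSeq.GenomicPosition('chr1', 12345),
                  ref='A', alt='G', frac=0.42, dp=100,
                  eff='missense_variant', fc='MODERATE')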
