Skip to content

Commit

Permalink
#7 integrated SampleVariantDetails format in outputs. minor update to…
Browse files Browse the repository at this point in the history
… config to detect unknown file formats
  • Loading branch information
Karl Kroll committed Feb 10, 2016
1 parent ee20ba2 commit 730d43f
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 5 deletions.
2 changes: 0 additions & 2 deletions mucor.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,8 +628,6 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t

#stop() # this command throws a warning
varDF.replace('', '?', inplace=True)
from pdb import set_trace as stop
stop()
return varDF, knownFeatures, gas

def printOutput(config, outputDirName, varDF):
Expand Down
18 changes: 17 additions & 1 deletion mucor_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def getJSONDict(args):
# Output formats
json_dict['outputFormats'] = []
for i in str(args['output_type']).split(','):
json_dict['outputFormats'].append(str(i)) #.lower() was here for some reason
json_dict['outputFormats'].append(str(i)) #.lower() can be used here to make format identifiers insensitive to capitalization

# Samples and associated variant files
samples = []
Expand Down Expand Up @@ -471,6 +471,22 @@ def main():
if os.path.exists(args['json_config_output']):
abortWithMessage("JSON config file {0} already exists.".format(args['json_config_output']))

# Are the specified output formats known?
knownTypes = sorted(output.Writer().file_names.keys())
anyWrong = 0
for inputType in str(args['output_type']).split(','):
if inputType not in knownTypes:
throwWarning("{0} is not known!".format(inputType))
anyWrong += 1
if len(str(args['output_type']).split(',')) == anyWrong:
# quit if none of the input types were known
abortWithMessage("No known output file types selected!")
elif anyWrong:
# if 1 or more of the input types were known, write config file w/ user-defined parameters
# prompting them to fix the config. Mucor will crash at output if it is run with improperly defined output types
print("Please correct your JSON config, selecting from the following output types:")
print(knownTypes)

# Does the given output directory exist and contain output already?
if os.path.exists(args['output_directory']) and [x for x in os.listdir(args['output_directory']) if x in output.Writer().file_names.values() ]:
abortWithMessage("The directory {0} already exists and contains output. Will not overwrite.".format(args['output_directory']))
Expand Down
93 changes: 91 additions & 2 deletions output.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,11 @@ def __init__(self):
self.supported_formats = { "default": self.Default,
"counts": self.Counts,
"txt": self.VariantDetails,
"longtxt": self.LongVariantDetails,
"xls": self.VariantDetails,
"longtxt": self.LongVariantDetails,
"longxls": self.LongVariantDetails,
"svdtxt": self.SampleVariantDetails,
"svdxls": self.SampleVariantDetails,
"bed":self.VariantBed,
"featXsamp": self.FeatureXSample,
"mutXsamp": self.Feature_and_MutationXSample,
Expand All @@ -54,9 +56,11 @@ def __init__(self):

self.file_names = { "counts": "counts.txt",
"txt": "variant_details.txt",
"longtxt": "long_variant_details.txt",
"xls": "variant_details.xlsx",
"longtxt": "long_variant_details.txt",
"longxls": "long_variant_details.xlsx",
"svdtxt": "sample_variant_details.txt",
"svdxls": "sample_variant_details.xlsx",
"bed": "variant_locations.bed",
"featXsamp": "feature_by_sample.xlsx",
"mutXsamp": "feature_and_mutation_by_sample.xlsx",
Expand Down Expand Up @@ -294,6 +298,67 @@ def VariantDetails(self):

return True

def SampleVariantDetails(self):
'''
Print all information about each mutation,
without combining all mutations from different samples
Note: chrom, position, ref, alt, feature, and sample are all required to uniquely identify a mutation
indels may have the same chr, pos, but different ref/alt
Output: sample_variant_details.txt, sample_variant_details.xls
Note: switching the pandas ExcelWriter file extension to xlsx instead of xls requires openpyxl
'''

outputDirName = self.outputDirName

varDF = self.data
if 'svdtxt' in self.config.outputFormats and 'svdtxt' not in self.attempted_formats:
svdtxt = bool(True)
self.attempted_formats.append('svdtxt')
else:
svdtxt = bool(False) # User has not opted for this output or this output has already been run
if 'svdxls' in self.config.outputFormats and 'svdxls' not in self.attempted_formats:
svdxls = bool(True)
self.attempted_formats.append('svdxls')
else:
svdxls = bool(False) # User has not opted for this output or this output has already been run

try:
if svdtxt:
outputFileName = self.file_names['svdtxt']
ofSampleVariantDetailsTXT = open(outputDirName + "/" + outputFileName, 'w+')
if svdxls:
outputFileName = self.file_names['svdxls']
ofSampleVariantDetailsXLS = pd.ExcelWriter(str(outputDirName) + '/' + outputFileName)
except:
abortWithMessage("Error opening output files in {0}/".format(outputDirName))

if svdtxt or svdxls:
# add some new columns
sources = sorted(varDF['source'].unique(), reverse=True)
for source in sources:
for col in ['dp','vf']:
this_name = source + "_" + col
varDF.insert(4,this_name,None)
varDF.loc[varDF[varDF['source']==source].index,this_name] = varDF[varDF['source']==source][col]
outcols = [x for x in varDF.columns if x not in ['vf','dp']]
# Group by (chr, pos, ref, alt, feature)
grouped = varDF[outcols].groupby(['chr', 'pos', 'ref', 'alt', 'feature', 'sample'])
# apply collapsing function to each pandas group
out = grouped.apply(collapseSVD)
out.reset_index(drop=True,inplace=True)
if svdtxt:
# print the new, collapsed dataframe to a file
mySort(out, ['feature','pos']).to_csv(ofSampleVariantDetailsTXT, sep='\t', na_rep='?', index=False)
print("\t{0}: {1} rows".format(ofSampleVariantDetailsTXT.name, len(out)))
if svdxls:
# print the new, collapsed dataframe to file a
mySort(out, ['feature','pos']).to_excel(ofSampleVariantDetailsXLS, 'Variant Details', na_rep='?', index=False)
ofSampleVariantDetailsXLS.save()
print("\t{0}: {1} rows".format(str(outputDirName + '/' + outputFileName), len(out)))

return True

def LongVariantDetails(self):
'''
Similar to printVariantDetails above, but writes each instance
Expand Down Expand Up @@ -426,6 +491,30 @@ def All(self):
# do not write output, but used by writers #
############################################

def collapseSVD(group):
'''
Pandas operations to support the SampleVariantDetails family of functions.
Collapses variant rows that share the same contig, position, ref allele, alt allele, feature, and sample.
Input: a pandas groupby object
Output: a pandas dataframe object
'''
if len(group)==1:
#nothing to do here
return group
outvals = []
# sort group to maintain consistency when concatenating source and fn columns
group = mySort(group, columns='source')
for column in group.columns:
uniques = group[column].replace("?",np.nan).dropna().unique()
if len(uniques) == 1:
outvals.append(uniques[0])
elif len(uniques) == 0:
outvals.append('?')
else:
outvals.append(", ".join([x for x in uniques ] ))
outD = pd.DataFrame( dict(zip(group.columns,outvals)), index=[0])[group.columns]
return outD

def collapseVariantDetails(group):
'''
Pandas operations to support the VariantDetails family of functions.
Expand Down

0 comments on commit 730d43f

Please sign in to comment.