From 730d43fa409131123cfec07dbb9056d17dafded8 Mon Sep 17 00:00:00 2001 From: Karl Kroll Date: Wed, 10 Feb 2016 13:01:00 -0500 Subject: [PATCH] #7 integrated SampleVariantDetails format in outputs. minor update to config to detect unknown file formats --- mucor.py | 2 -- mucor_config.py | 18 +++++++++- output.py | 93 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/mucor.py b/mucor.py index 7a9fa86..1c18f90 100755 --- a/mucor.py +++ b/mucor.py @@ -628,8 +628,6 @@ def parseVariantFiles(config, knownFeatures, gas, databases, filters, regions, t #stop() # this command throws a warning varDF.replace('', '?', inplace=True) - from pdb import set_trace as stop - stop() return varDF, knownFeatures, gas def printOutput(config, outputDirName, varDF): diff --git a/mucor_config.py b/mucor_config.py index 4f22639..af7bffa 100755 --- a/mucor_config.py +++ b/mucor_config.py @@ -316,7 +316,7 @@ def getJSONDict(args): # Output formats json_dict['outputFormats'] = [] for i in str(args['output_type']).split(','): - json_dict['outputFormats'].append(str(i)) #.lower() was here for some reason + json_dict['outputFormats'].append(str(i)) #.lower() can be used here to make format identifiers insensitive to capitalization # Samples and associated variant files samples = [] @@ -471,6 +471,22 @@ def main(): if os.path.exists(args['json_config_output']): abortWithMessage("JSON config file {0} already exists.".format(args['json_config_output'])) + # Are the specified output formats known? 
+ knownTypes = sorted(output.Writer().file_names.keys()) + anyWrong = 0 + for inputType in str(args['output_type']).split(','): + if inputType not in knownTypes: + throwWarning("{0} is not known!".format(inputType)) + anyWrong += 1 + if len(str(args['output_type']).split(',')) == anyWrong: + # quit if none of the input types were known + abortWithMessage("No known output file types selected!") + elif anyWrong: + # if 1 or more of the input types were known, write config file w/ user-defined parameters + # prompting them to fix the config. Mucor will crash at output if it is run with improperly defined output types + print("Please correct your JSON config, selecting from the following output types:") + print(knownTypes) + # Does the given output directory exist and contain output already? if os.path.exists(args['output_directory']) and [x for x in os.listdir(args['output_directory']) if x in output.Writer().file_names.values() ]: abortWithMessage("The directory {0} already exists and contains output. 
Will not overwrite.".format(args['output_directory'])) diff --git a/output.py b/output.py index 0870df3..ac68589 100644 --- a/output.py +++ b/output.py @@ -41,9 +41,11 @@ def __init__(self): self.supported_formats = { "default": self.Default, "counts": self.Counts, "txt": self.VariantDetails, - "longtxt": self.LongVariantDetails, "xls": self.VariantDetails, + "longtxt": self.LongVariantDetails, "longxls": self.LongVariantDetails, + "svdtxt": self.SampleVariantDetails, + "svdxls": self.SampleVariantDetails, "bed":self.VariantBed, "featXsamp": self.FeatureXSample, "mutXsamp": self.Feature_and_MutationXSample, @@ -54,9 +56,11 @@ def __init__(self): self.file_names = { "counts": "counts.txt", "txt": "variant_details.txt", - "longtxt": "long_variant_details.txt", "xls": "variant_details.xlsx", + "longtxt": "long_variant_details.txt", "longxls": "long_variant_details.xlsx", + "svdtxt": "sample_variant_details.txt", + "svdxls": "sample_variant_details.xlsx", "bed": "variant_locations.bed", "featXsamp": "feature_by_sample.xlsx", "mutXsamp": "feature_and_mutation_by_sample.xlsx", @@ -294,6 +298,67 @@ def VariantDetails(self): return True
+    def SampleVariantDetails(self):
+        '''
+        Print all information about each mutation,
+        without combining all mutations from different samples
+        Note: chrom, position, ref, alt, feature, and sample are all required to uniquely identify a mutation
+              indels may have the same chr, pos, but different ref/alt
+        Output: sample_variant_details.txt, sample_variant_details.xlsx
+        Note: switching the pandas ExcelWriter file extension to xlsx instead of xls requires openpyxl
+        Returns: True, whether or not either file was written
+        '''
+        outputDirName = self.outputDirName
+        # NOTE(review): varDF is self.data itself (no copy), so the columns inserted below persist on self.data
+        varDF = self.data
+
+        # a format is written only if the user opted for it and it has not already been run
+        todo = {}
+        for fmt in ('svdtxt', 'svdxls'):
+            todo[fmt] = fmt in self.config.outputFormats and fmt not in self.attempted_formats
+            if todo[fmt]:
+                self.attempted_formats.append(fmt)
+        svdtxt, svdxls = todo['svdtxt'], todo['svdxls']
+        if not (svdtxt or svdxls):
+            return True
+
+        try:
+            if svdtxt:
+                txtPath = outputDirName + "/" + self.file_names['svdtxt']
+                ofSampleVariantDetailsTXT = open(txtPath, 'w+')
+            if svdxls:
+                xlsPath = str(outputDirName) + '/' + self.file_names['svdxls']
+                ofSampleVariantDetailsXLS = pd.ExcelWriter(xlsPath)
+        except Exception:
+            # Exception rather than a bare except, so KeyboardInterrupt/SystemExit are not swallowed
+            abortWithMessage("Error opening output files in {0}/".format(outputDirName))
+
+        # add some new columns
+        sources = sorted(varDF['source'].unique(), reverse=True)
+        for source in sources:
+            for col in ['dp','vf']:
+                this_name = source + "_" + col
+                varDF.insert(4,this_name,None)
+                varDF.loc[varDF[varDF['source']==source].index,this_name] = varDF[varDF['source']==source][col]
+        outcols = [x for x in varDF.columns if x not in ['vf','dp']]
+        # Group by (chr, pos, ref, alt, feature, sample)
+        grouped = varDF[outcols].groupby(['chr', 'pos', 'ref', 'alt', 'feature', 'sample'])
+        # apply collapsing function to each pandas group
+        out = grouped.apply(collapseSVD)
+        out.reset_index(drop=True,inplace=True)
+        if svdtxt:
+            # print the new, collapsed dataframe to a file, then close it so buffered rows reach disk
+            mySort(out, ['feature','pos']).to_csv(ofSampleVariantDetailsTXT, sep='\t', na_rep='?', index=False)
+            ofSampleVariantDetailsTXT.close()
+            print("\t{0}: {1} rows".format(ofSampleVariantDetailsTXT.name, len(out)))
+        if svdxls:
+            # print the new, collapsed dataframe to a file; close() writes the workbook (save() is gone in pandas 2)
+            mySort(out, ['feature','pos']).to_excel(ofSampleVariantDetailsXLS, sheet_name='Variant Details', na_rep='?', index=False)
+            ofSampleVariantDetailsXLS.close()
+            print("\t{0}: {1} rows".format(xlsPath, len(out)))
+
+        return True
+
 def LongVariantDetails(self): ''' Similar to printVariantDetails above, but writes each instance @@ -426,6 +491,30 @@ def All(self): # do not write output, but used by writers # ############################################ 
+def collapseSVD(group):
+    '''
+    Pandas operations to support the SampleVariantDetails family of functions.
+    Collapses variant rows that share the same contig, position, ref allele, alt allele, feature, and sample.
+    Input: a pandas dataframe object holding the rows of one group
+    Output: a pandas dataframe object (the group itself if it has one row, otherwise one collapsed row)
+    '''
+    if len(group)==1:
+        return group  # nothing to do here
+    outvals = []
+    # sort group to maintain consistency when concatenating source and fn columns
+    group = mySort(group, columns='source')
+    for column in group.columns:
+        uniques = group[column].replace("?",np.nan).dropna().unique()
+        if len(uniques) == 1:
+            outvals.append(uniques[0])
+        elif len(uniques) == 0:
+            outvals.append('?')
+        else:
+            # str() each value first: join() raises TypeError on non-string items (e.g. numeric columns)
+            outvals.append(", ".join([str(x) for x in uniques]))
+    outD = pd.DataFrame( dict(zip(group.columns,outvals)), index=[0])[group.columns]
+    return outD
+
 def collapseVariantDetails(group): ''' Pandas operations to support the VariantDetails family of functions.