forked from ndfriedman/myMskccUtils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculate_nucleotide_mutation_spectra_from_maf.py
73 lines (59 loc) · 3.31 KB
/
calculate_nucleotide_mutation_spectra_from_maf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#written by noah friedman
#takes an input maf and returns a mutation spectra file
#the mutation spectra file is a 97 col tsv: Tumor_Sample_Barcode plus 96 quadnuc (ACTG etc)
#usage python calculate_nucleotide_mutation_spectra_from_maf.py inputMaf path/to/output/filename.tsv
#versioning: python 2.7.1, pandas 0.19.2
import sys
import pandas as pd
#UTILITY function to merge dicitonaries
def Merge(dict1, dict2):
z = dict2.copy()
z.update(dict1)
return z
#df['quadNuc'] = df.apply(lambda row: mutationSigUtils.create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)
def create_reference_four_nuc(refTri, refAllele, altAllele, variantType):
#properly invert when needed
def invert_allele_and_ref_tri(altAllele):
nucleotideDict = {'A': 'T', 'G': 'C', 'C':'G', 'T':'A'}
return nucleotideDict[altAllele]
if variantType != 'SNP': return None #there is no reference trinuc for non snps
if not isinstance(refTri, basestring): return None #if the ref tri is not a string we better return none
if len(refTri) < 3: return None # if the ref tri is less than length 3 (at the end of an exon), we cant do anything
refTri = str(refTri) #just make sure the ref tri is a string here to avoid funny business
refAlleleFromReftri = refTri[1]
alt = altAllele
if refAlleleFromReftri != refAllele:
alt = invert_allele_and_ref_tri(altAllele)
quadNuc = refTri[:2] + alt + refTri[2]
return quadNuc
#main function that returns a dataframe with mutation spectra
def count_quad_nuc_sprectra_from_maf(maf):
listOfDataframes = []
if ['Ref_Tri'] not in maf.columns.values:
print 'error, the maf we use needs to have a column called ref tri'
return
allBases = ['A', 'C', 'G', 'T']
changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures
maf['quadNuc'] = maf.apply(lambda row: create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)
allCases = set(maf['Tumor_Sample_Barcode'])
for quadNuc in allQuadNucs:
qMaf = maf[maf['quadNuc'] == quadNuc]
#Convluted (but hopefully somewhat more efficient) code to count the number of times a trinucleotide occurs in each case
valueCountsDict = dict(qMaf['Tumor_Sample_Barcode'].value_counts()) #cases where the trincleotide occurs
zeroBarcodes = allCases - set(valueCountsDict.keys())
casesWhereQuadNucDoesNotOccur = {key:value for (key, value) in [(barcode, 0) for barcode in zeroBarcodes]}#create another dictionary to mark other cases as 0
allCasesDictionary = Merge(valueCountsDict, casesWhereQuadNucDoesNotOccur)
df = pd.DataFrame(allCasesDictionary.items())
df = df.rename(columns={0:'Tumor_Sample_Barcode', 1:quadNuc}) #fix the column names of this df we have constructed
listOfDataframes.append(df) #store all these columns so we can merge them later
df = listOfDataframes[0]
for df_ in listOfDataframes[1:]:
df = df.merge(df_, on='Tumor_Sample_Barcode')
return df
mafPath = sys.argv[1]
writePath = sys.argv[2]
maf = pd.read_table(mafPath)
spectraDf = count_quad_nuc_sprectra_from_maf(maf)
print 'writing file to ', writePath
spectraDf.to_csv(writePath, index=False, sep='\t')