-
Notifications
You must be signed in to change notification settings - Fork 1
/
maf_subsetter.py
80 lines (51 loc) · 2.54 KB
/
maf_subsetter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#written by Noah Friedman
#small utility script for a la carte subsetting (by cancer type) and merging of MAF files
import sys
import argparse
import os
import pandas as pd
import numpy as np
from collections import Counter
sys.path.append('/ifs/work/taylorlab/friedman/')
def get_cases_for_cancer_type(cancerType, cancerTypeListDir = '/ifs/work/taylorlab/friedman/msk-impact/msk-impact/case_lists'):
    """Return the set of case IDs listed in a cancer type's case-list file.

    The file ``case_list_<cancerType>.txt`` is read from ``cancerTypeListDir``
    and the tab-separated fields on its fifth line are returned.

    Args:
        cancerType: Name used to build the file name, e.g. 'Bladder_Cancer'.
        cancerTypeListDir: Directory that holds the case-list files.

    Returns:
        A set of non-empty, whitespace-trimmed strings, one per field.

    Raises:
        IOError/OSError: If the case-list file cannot be opened.
        IndexError: If the file has fewer than five lines.
    """
    fileName = 'case_list_' + cancerType + '.txt'
    path = os.path.join(cancerTypeListDir, fileName)
    # Context manager so the handle is closed even if reading fails.
    with open(path) as f:
        lines = f.readlines()

    # Drop the line terminator; otherwise the last ID ends in '\n' and can
    # never match a sample barcode.
    idLine = lines[4].rstrip('\r\n')

    # NOTE(review): cBioPortal-style case lists label this line with
    # 'case_list_ids:'. The label is removed when present so it does not get
    # glued onto the first ID -- confirm the files in cancerTypeListDir use
    # that layout.
    label = 'case_list_ids:'
    if idLine.startswith(label):
        idLine = idLine[len(label):]

    # Skip empty fields produced by blank lines or stray/trailing tabs.
    return set(field.strip() for field in idLine.split('\t') if field.strip())
def _position_id(mafDf):
    """Return a Series of '<Chromosome>_<Start_Position>' keys, one per row."""
    # Column-wise string concatenation: avoids a Python-level call per row
    # and also works when the frame has no rows.
    return mafDf['Chromosome'].astype(str) + '_' + mafDf['Start_Position'].astype(str)


def main():
    """Command-line entry point: subset a MAF by cancer type or merge two MAFs.

    Modes (``--mode``):
        subsetByCancerType: read ``--inputMaf`` (skipping its first row), keep
            rows whose ``Tumor_Sample_Barcode`` is in the case list(s) of the
            requested cancer type(s), and write the result as a tab-separated
            file to ``--outputDir``/``--outputFilename``.
        mashMafs: read ``--inputMaf`` (no rows skipped) and ``--inputMaf2``
            (first row skipped), inner-join them on a
            '<Chromosome>_<Start_Position>' key and write the result to
            'facetsAndTrinucAndHotspotsAndSignatures.maf' in the current
            working directory. ``--outputDir`` and ``--outputFilename`` are
            not used in this mode.

    ``--cancerTypes`` is a comma-separated list of cancer type names; when it
    is empty, 'Bladder_Cancer' is used.
    """
    parser = argparse.ArgumentParser(description='Arg parser for this script')
    parser.add_argument('--mode', help='mode to run the script in', default='subsetByCancerType',
                        choices=['subsetByCancerType', 'mashMafs'])
    parser.add_argument('--cancerTypes', default='',
                        help='comma separated cancer types to keep (default: Bladder_Cancer)')
    parser.add_argument('--inputMaf', default='/ifs/work/taylorlab/friedman/myUtils/FilteredMafWithHospotAndSignatures.maf')
    parser.add_argument('--inputMaf2', default='/ifs/work/taylorlab/friedman/myAdjustedDataFiles/impactMafs/data_mutations_extended_mafAnno.maf')
    parser.add_argument('--outputFilename', default='adjustedMaf.maf')
    parser.add_argument('--outputDir', default='/ifs/work/taylorlab/friedman/myAdjustedDataFiles/adjustedSubsetMafs')
    args = parser.parse_args()

    if args.mode == 'subsetByCancerType':
        # Honour --cancerTypes; fall back to the previously hard-coded type
        # so invocations without the flag keep producing the same subset.
        cancerTypes = [name.strip() for name in args.cancerTypes.split(',') if name.strip()]
        if not cancerTypes:
            cancerTypes = ['Bladder_Cancer']
        cases = set()
        for cancerType in cancerTypes:
            cases.update(get_cases_for_cancer_type(cancerType))

        # The MAF is only loaded in the mode that needs it.
        mafDf = pd.read_table(args.inputMaf, skiprows=[0])
        mafDf = mafDf[mafDf['Tumor_Sample_Barcode'].isin(cases)]

        writePath = os.path.join(args.outputDir, args.outputFilename)
        # Single-argument print works on Python 2 and 3; the join reproduces
        # the space that the print statement put between its two operands.
        print(' '.join(['writing file to ', writePath]))
        mafDf.to_csv(writePath, sep='\t', index=False)

    elif args.mode == 'mashMafs':  # mode to merge two mafs
        mafDf1 = pd.read_table(args.inputMaf)
        mafDf2 = pd.read_table(args.inputMaf2, skiprows=[0])
        mafDf1['idCol'] = _position_id(mafDf1)
        mafDf2['idCol'] = _position_id(mafDf2)

        # Row counts of both inputs, for a quick sanity check of the merge.
        print(len(mafDf1['idCol']))
        print(len(mafDf2['idCol']))

        # Inner join: only positions present in both files survive. Columns
        # shared by both inputs get pandas' default _x/_y suffixes.
        mergedDf = mafDf1.merge(mafDf2, on='idCol')

        # Number of distinct position keys the two inputs have in common.
        print(len(set(mafDf1['idCol']) & set(mafDf2['idCol'])))
        mergedDf.to_csv('facetsAndTrinucAndHotspotsAndSignatures.maf', index=False, sep='\t')
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()