-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_and_process_xls.py
246 lines (208 loc) · 9.95 KB
/
merge_and_process_xls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#Written by Noah Friedman, edited by Charlie Curnin.
"""
Tools for processing the final xls for use by GCs and visualization
"""
import pandas as pd
import numpy as np
import sys
import os
reload(sys)
sys.setdefaultencoding('utf-8')
sys.setdefaultencoding("latin-1") #ingenuity exports are encoded this way I think
#a dict mapping the column name we want for the final xls to [columnNamePipelineXls, columnNameUserXls]
columnMappings = {'CHROM': ['CHROM', 'Chromosome'], 'POS': ['POS', 'Position']}
#I do this with inefficient looping over both spreadsheets
#and looping over DFs
#this will not work if we have big spreadsheets, in that case we need to implement a pandas merge
def merge_and_add_columns(dfUser, dfSTMP, userColumnsToAdd):
colNames = dfUser.columns.tolist() + userColumnsToAdd
numRows = len(dfUser.index)
returnMergedDf = pd.DataFrame(np.empty((numRows, len(colNames)), dtype=str), columns=colNames)
for index, row in dfUser.iterrows():
print(row)
#Copy data from user sheet
for column in dfUser.columns.tolist():
print(index, column, row[column])
returnMergedDf.set_value(index, column, row[column])
#Copy data from STMP-generated sheet, if we find the variant
dfUserVariantKey = str(row["Chromosome"]) + ':' + str(row["Position"])
for idx, r in dfSTMP.iterrows():
dfSTMPVariantKey = str(r["CHROM"]) + ':' + str(r["POS"])
if dfUserVariantKey == dfSTMPVariantKey: #Merge the rows
print("match for", dfSTMPVariantKey)
#copy over values from stmp
for column in userColumnsToAdd:
returnMergedDf.set_value(index, column, r[column])
return returnMergedDf
#Converts a value to float iff it can be
def convert_to_float_or_zero(i):
try:
float(i)
return float(i)
except:
return 0
def add_allele_freq_summary_column(df):
print(df)
alleleFreqCols = ['AF_EAS', 'AF_NFE', 'AF_SAS', 'AF_AMR', 'AF_AFR']
df['GNOMAD_Max_Allele_Freq'] = np.empty(len(df.index))
df['GNOMAD_Max_Allele_Freq_POP'] = np.empty(len(df.index))
df['GNOMAD_Max_Allele_Freq_POP'] = df['GNOMAD_Max_Allele_Freq_POP'].astype(str) #this will be a column of strings
for index, row in df.iterrows():
freqs = [convert_to_float_or_zero(row[i]) for i in alleleFreqCols]
population = alleleFreqCols[freqs.index(max(freqs))][3:]
freq = max(freqs)
#ALERT todo: include the population from which the max freq comes
#df.set_value(index, 'Max_Allele_Freq', str(population) + ':' + str(freq))
df.set_value(index, 'GNOMAD_Max_Allele_Freq', freq)
df.set_value(index, 'GNOMAD_Max_Allele_Freq_POP', population)
#print df
def fix_ref_or_alt_column_for_indels(df, refKey, altKey):
for index, row in df.iterrows():
if type(row[altKey]) == float:
df.set_value(index, altKey, '_')
if type(row[refKey]) == float:
df.set_value(index, refKey, '_')
def add_tier_column():
return 0
#read xls sheets and create a dictionary mapping sheet names to dictionaries
#it also returns a list of sheetNames
def read_xls_sheets(xlsName):
xls = pd.ExcelFile(xlsName)
sheetNames = xls.sheet_names
sheetDict = dict()
for sheet in sheetNames:
sheetDict[sheet] = xls.parse(sheet)
return sheetDict, sheetNames
def sort_sheets(df):
#print df
sys.exit()
#print df.sort([''])
#arbitrary function for fixing issues with STMP/ingenuity inconsistency
#basically if there is a stmp pos that is close to or equal to ingenuity pos overwrite the ingenuity pos with the stmp pos
def find_closest_match_for_pos(stmpDf, ingenuityChrom, ingenuityPos):
minDistance = 100000 #default distance is arbitrary big number
newChrom, newPos, newRef, newAlt = -1, -1, -1, -1
for idx, r in stmpDf.iterrows():
stmpChrom, stmpPos = str(r['CHROM']), str(r['POS'])
stmpRef, stmpAlt = str(r['REF']), str(r['ALT'])
if stmpChrom == ingenuityChrom:
distance = abs(int(stmpPos) - int(ingenuityPos))
if distance < minDistance:
minDistance = distance
newChrom = stmpChrom
newPos = stmpPos
newRef = stmpRef
newAlt = stmpAlt
return newChrom, newPos, newRef, newAlt
#in order to properly merge with the ingenuity XLS, we need to right align our STMP output
#not generic function iterates over both dfs
def realign_ingenuity_sheet(ingenuityDf, stmpDf):
for index, row in ingenuityDf.iterrows():
ingenuityChrom, ingenuityPos, ingenuityRef, ingenuityAlt = str(row['Chromosome']), str(row['Position']), str(row['Reference Allele']), str(row['Sample Allele'])
newChrom, newPos, newRef, newAlt = find_closest_match_for_pos(stmpDf, ingenuityChrom, ingenuityPos)
if newChrom > 0: #if we found a match please set it up
ingenuityDf.set_value(index, 'Chromosome', newChrom)
ingenuityDf.set_value(index, 'Position', newPos)
ingenuityDf.set_value(index, 'Reference Allele', newRef)
ingenuityDf.set_value(index, 'Sample Allele', newAlt)
return ingenuityDf
def merge_columns_across_spreadsheets(spreadSheetPipeline, spreadSheetUser, outputDir, udnId):
sheetDictPipeline, sheetDictPipelineNames = read_xls_sheets(spreadSheetPipeline)
sheetDictUser, sheetDictUserNames = read_xls_sheets(spreadSheetUser)
#fix_ref_or_alt_column_for_indels(sheetDictUser[sheetDictUserNames[0]], 'Reference Allele', 'Sample Allele') #this code fixes the syntax of indels for the user inputed sheet
#fix_ref_or_alt_column_for_indels(sheetDictPipeline[sheetDictPipelineNames[1]], 'REF', 'ALT') #this code fixes the syntax of indels for the pipeline inputed sheet
sheetDictUser[sheetDictUserNames[0]] = realign_ingenuity_sheet(sheetDictUser[sheetDictUserNames[0]], sheetDictPipeline[sheetDictPipelineNames[1]])
#We expect the user's xls to be just a single sheet output from ingenuity. If its not, that breaks our code and we exit
if len(sheetDictUser) != 1:
print 'error we expect an excel sheet from the user with a single sheet'
sys.exit()
#this variable is never used (and user sheet should be first argument)
#mergedDfPipeline = merge_and_add_columns(sheetDictPipeline[sheetDictPipelineNames[1]], sheetDictUser[sheetDictUserNames[0]], [ #indicies indicate where we can find the two actual data spreadsheets
#'Transcript ID', 'Transcript Variant', 'Protein Variant', 'Gene Region', 'Gene Symbol'], 0, 1) #list of columns to add from the user uploaded columns
colsToAddToDf = ['AF_EAS', 'AF_NFE', 'AF_SAS', 'AF_AMR', 'AF_AFR',
'NC', 'NI', 'NA', 'ESP_AF_POPMAX', 'KG_AF_POPMAX',
'SD', 'SF', 'QUAL', 'ID', 'FILTER', 'GT', 'NJ', 'SX', 'GI',
'AN_AFR', 'AN_AMR', 'AN_ASJ', 'AN_EAS', 'AN_FIN', 'AN_NFE', 'AN_OTH', 'AN_SAS',
'clinvar_pathogenic', 'KG_AF_GLOBAL', 'KG_AC', 'POPMAX', 'AN_POPMAX',
'AC_POPMAX', 'AF', 'AN', 'AN_Female', 'AN_Male']
colsFinal = []
for x in colsToAddToDf: #fix the columns in case they are missing
print(x)
if x in sheetDictPipeline[sheetDictPipelineNames[1]].columns.tolist():
print("appending", x)
colsFinal.append(x)
# the 0s /1s relate to is it the GC xls first or the pipeline xls first
mergedDfUser = merge_and_add_columns(sheetDictUser[sheetDictUserNames[0]], sheetDictPipeline[sheetDictPipelineNames[1]], colsFinal)
add_allele_freq_summary_column(mergedDfUser)
#if False:
# sort_sheets(mergedDfUser)
#Save everything to an XLSX
outputXlsxName = os.path.join(outputDir, udnId + '_merged.xlsx')
writer = pd.ExcelWriter(outputXlsxName,options={'encoding':'latin-1'})
#sheetDictPipeline[sheetDictPipelineNames[0]].to_excel(writer, 'Column Descriptions', index = False)
mergedDfUser.to_excel(writer,'Sheet1', index = False)
writer.save()
return outputXlsxName
renameDict = {'SX': 'SwissProtExpression', 'GI': 'ProximalGeneInfo', 'SD': 'SwissProtDiseaseAssociation', 'mTaster': 'MutationTaster', 'SF': 'SwissProtFunction', 'phylop': 'phyloP', 'NI': 'MutationTasterPVal', 'sift': 'Sift', 'GNOMAD_Max_Allele_Freq': 'GNOMADMaxAlleleFreq', 'POPMAX': 'ExacPopmax', 'AN_POPMAX': 'ExacANPopmax', 'AC_POPMAX': 'ExacACPopmax', 'AF':'ExacAf', 'AN':'ExacAn', 'AN_Female': 'GNOMAD_AN_FEMALE', 'AN_Male': 'GNOMAD_AN_MALE'}
#Changes column headers to make them more clear
def make_cols_human_readable(df):
colsToRename = renameDict
for key, value in colsToRename.items():
#Don't rename a column that's not actually there
if key not in df.columns.tolist():
del colsToRename[key]
df = df.rename(columns=colsToRename)
return df
#If any cell has a comma and the value before the comma is a float, just take the first half
def clean_up_comma_separated_values(df):
cols = df.columns
for index, row in df.iterrows():
for col in cols:
commaIdx = str(row[col]).find(',')
if commaIdx > 0:
val = row[col][:commaIdx]
try:
float(val)
df.set_value(index, col, val)
except:
pass
return df
def clean_cell_values(df):
df = clean_up_comma_separated_values(df)
valsToMarkAsEmpty = ['no value', '.']
emptyValue = ''
for col in df.columns:
if col == 'CADD Score':
df[col] = df[col].replace(['< 10'], 1) #clean CADD score
df[col] = df[col].replace(valsToMarkAsEmpty, emptyValue) #normalize empty columns
#clean up commas separated values
return df
orderedCols = ['Chromosome', 'Position', 'Reference Allele', 'Sample Allele', 'Variation Type',
'Gene Region', 'Gene Symbol', 'Transcript ID', 'Transcript Variant', 'Protein Variant',
'Translation Impact','GNOMAD_Max_Allele_Freq']
def sort_cols(df):
#inefficient way to order the columns
firstCols = [] #all columns whose order we care about
lastCols = [] #put all columns we dont specify specifically at the end
for col in orderedCols:
if col in df.columns.tolist():
firstCols.append(col)
for col in df.columns.tolist():
if col not in firstCols:
lastCols.append(col)
sortedCols = firstCols + lastCols
df = df.reindex_axis(sortedCols, axis=1)
return df
#function to rename columns, reorder columns etc in xls data
def improve_legibility_of_xls(xlsName):
xls = pd.ExcelFile(xlsName)
df = xls.parse(xls.sheet_names[0])
#to imporve legibility we do two things: make columns readable and sort columns
df = make_cols_human_readable(df)
df = sort_cols(df)
df = clean_cell_values(df)
outputXlsxName = xlsName
writer = pd.ExcelWriter(outputXlsxName,options={'encoding':'latin-1'})
df.to_excel(writer,'Sheet1', index = False)
writer.save()
return outputXlsxName