forked from ndfriedman/myMskccUtils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsignature_landscape_plot_prep_util.py
117 lines (90 loc) · 3.89 KB
/
signature_landscape_plot_prep_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#written by Noah Friedman
#a suite of processing functions to prepare myriad data into the formats I need to make signature landscape plots
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
from collections import Counter
sys.path.append('/ifs/work/taylorlab/friedman/')
#FUNCTIONALITY for interactive python on my desktop:
#pathPrefix stays empty unless the interpreter was started from the desktop mount point,
#in which case it holds that mount point (presumably to be prepended to file paths -- confirm against callers)
pathPrefix = ''
if os.getcwd() == '/Users/friedman/Desktop/mnt':
    pathPrefix = '/Users/friedman/Desktop/mnt'
def find_second_most_common(row, primarySig, returnMode,
    sigNamesToSpecify = frozenset(['mean_1', 'mean_3', 'mean_4', 'mean_7', 'mean_10', 'mean_11','mean_14', 'mean_17', 'mean_MMR', 'mean_APOBEC']) #a set of signatures we actually mark on the chart
    ):
    """Return the strongest signature in `row` other than `primarySig`.

    Only entries whose label contains 'mean' are treated as signatures.

    row -- pandas Series-like record (it is indexed with a list of labels
        and must support .keys() and .to_dict()).
    primarySig -- label of the signature to skip.
    returnMode -- 'name' returns the label of the chosen signature, or
        'other' when that label is not in sigNamesToSpecify; any other
        value returns the chosen signature's magnitude.
    sigNamesToSpecify -- labels that are reported by name.

    Ties are resolved in favour of the signature that appears first in
    `row`. NaN magnitudes rank below every real value.

    Raises IndexError when `row` holds no signature other than primarySig.
    """
    signatureColumns = [label for label in row.keys() if 'mean' in label]
    magnitudes = row[signatureColumns].to_dict()
    candidates = [(name, value) for name, value in magnitudes.items() if name != primarySig]
    if not candidates:
        raise IndexError('row has no signature column other than %r' % (primarySig,))

    #NaN compares False against everything, which would make the ranking arbitrary,
    #so every NaN entry is explicitly ranked below the real values
    def rank(item):
        value = item[1]
        return (not math.isnan(value), value)

    sigName, sigMagnitude = max(candidates, key=rank)
    if returnMode != 'name':
        return sigMagnitude
    return sigName if sigName in sigNamesToSpecify else 'other'
#define an ordering for the ggplot plot based on the first signature to exceed the clipping threshold
#TODO: add functionality for low mutation burden
def ordering_function_clip_mode(row, clipThresh = .15, sigsToOrderBy = ('mean_MMR', 'mean_1', 'mean_APOBEC', 'mean_3', 'mean_4', 'mean_7', 'mean_10')):
    """Return a sort key for `row` based on the first qualifying signature.

    Signatures are checked in the order given by sigsToOrderBy. A signature
    qualifies when its value in `row` is strictly greater than clipThresh
    and it is either 'mean_MMR' or equal to row['otherPredominantSigName'].

    row -- mapping-like record (dict or pandas Series) holding one value
        per signature plus 'otherPredominantSigName'.
    clipThresh -- value a signature must exceed to qualify.
    sigsToOrderBy -- signature labels, highest priority first.

    Returns (number of signatures - position of the qualifying signature)
    plus that signature's value, or 0 when no signature qualifies.
    """
    nSigs = len(sigsToOrderBy)
    for position, curSigToConsider in enumerate(sigsToOrderBy):
        #earlier signatures get a larger integer part, so they sort ahead of later ones
        if row[curSigToConsider] > clipThresh and (curSigToConsider == row['otherPredominantSigName'] or curSigToConsider == 'mean_MMR'):
            return (nSigs - position) + row[curSigToConsider]
    return 0
#an ordering function for a df
def ordering_function_dom_sig_mode(row, domSig, #the signature to have as the primary ordering
    ageSigReOrderMode = False, sigsToOrderBy = ('mean_APOBEC', 'mean_3', 'mean_4', 'mean_7', 'mean_10', 'mean_14', 'mean_11', 'mean_17')):
    """Return a sort key for `row` with `domSig` as the primary ordering.

    row -- mapping-like record (dict or pandas Series) holding 'Nmut',
        'otherPredominantSigName', 'otherPredominantSigMagnitude' and one
        value per signature label.
    domSig -- label of the signature that ranks first.
    ageSigReOrderMode -- when True, rows whose other predominant signature
        is the age signature are ordered by row[domSig] instead of
        row['mean_1'].
    sigsToOrderBy -- remaining signature labels, highest priority first.

    Return value, in decreasing rank (n = len(sigsToOrderBy)):
      -1                   'Nmut' is below 10 or NaN
      n + 2 + row[domSig]  domSig exceeds 'otherPredominantSigMagnitude'
      n + 1 + value        other predominant signature is 'mean_1', 'Age' or '1'
      n - i + row[sig]     other predominant signature is sigsToOrderBy[i]
      0                    none of the above
    """
    nMut = row['Nmut']
    if nMut < 10 or math.isnan(nMut): return -1 #cases with less than 10 mutations go at the far side

    topRank = len(sigsToOrderBy) + 2
    if row[domSig] > row['otherPredominantSigMagnitude']:
        return topRank + row[domSig]

    otherSigName = row['otherPredominantSigName']
    if otherSigName in ('mean_1', 'Age', '1'):
        #if age sig reorder mode, reorder the age sig column by the dominant signature
        ageOrderingValue = row[domSig] if ageSigReOrderMode else row['mean_1']
        return (topRank - 1) + ageOrderingValue

    for position, curSigToConsider in enumerate(sigsToOrderBy):
        if curSigToConsider == otherSigName:
            return (topRank - 2 - position) + row[curSigToConsider]
    return 0
def ordering_function_two_signatures_mode(row, ordering=None):
    """Return a sort key for `row` from its dominant signature.

    row -- mapping-like record (dict or pandas Series) holding 'Nmut',
        'dominantSignatureName' and one value per signature label.
    ordering -- signature labels, highest priority first. None (the
        default) is treated as an empty ordering.

    Returns -1 when 'Nmut' is below 10 or NaN (the same rule
    ordering_function_dom_sig_mode applies); otherwise
    (len(ordering) - i) + row[sig] when row['dominantSignatureName'] equals
    ordering[i]; otherwise 0.
    """
    #the declared default used to crash on len(None); no ordering means nothing can match
    if ordering is None:
        ordering = ()
    nMut = row['Nmut']
    if nMut < 10 or math.isnan(nMut): return -1
    nSigs = len(ordering)
    for position, curSigToConsider in enumerate(ordering):
        if curSigToConsider == row['dominantSignatureName']:
            return (nSigs - position) + row[curSigToConsider]
    return 0
def temp_func():
    """Placeholder with no logic yet; always returns 0."""
    return 0
#functions for data prep
def rename_column_for_aesthetic_purposes():
    """Unimplemented stub: takes no arguments, renames nothing, and always returns 0."""
    return 0
def rename_columns_from_philip_data():
    """Unimplemented stub: takes no arguments, renames nothing, and always returns 0."""
    return 0