forked from benslice/jeder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Figure4.py
163 lines (127 loc) · 7.03 KB
/
Figure4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# =============================================================================
""" Figure 4 (How many replicate screens is necessary for JEDER to run well?)
Generate consensus for 3,4,5,7,10,15, and 21 screens
Caluclate a precision and recall vs number of screens heatmap at different lfc cutoffs (Same from 1 should be fine)
Make a commentary about how many screens is necessary.
"""
# =============================================================================
import sys
import os
import math
import numpy as np
import pandas as pd
import h5py
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from jeder import evaluation_table, vec_precision, vec_recall, fpr_convert
from jeder import parse_hitspec, eval_expression
import pickle as pickle
import random
## Data for LFC < -1.0
data_dir = '/media/mahfuz/2A8B1AF404AA6CF6/CRISPR/MCMC_Jeder/Jeder_Manuscript/Analysis/WT_Minimal_analysis/'
input_files = ['lfc_WT_min_rep_3_seed_40.txt', 'lfc_WT_min_rep_4_seed_40.txt', 'lfc_WT_min_rep_5_seed_40.txt', 'lfc_WT_min_rep_7_seed_40.txt', 'lfc_WT_min_rep_10_seed_40.txt', 'lfc_WT_min_rep_15_seed_40.txt', 'lfc_WT_min_rep_21_seed_40.txt']
consensus_files = ['results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.05.0.09.3rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.07.0.11.4rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.07.0.11.5rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.7rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.10rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.15rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.21rep.hdf5']
number_of_reps = [3, 4, 5, 7, 10, 15, 21]
lfc_cutoff = [-3, -2.5, -2, -1.75, -1.5, -1.25, -1, -0.75, -0.5, -0.25, 0, 0.5, 1, 2]
rows = len(number_of_reps)
cols = len(lfc_cutoff)
# For individual replicates for the different sets (3,5,7,10,15,21), calculate stats against their consensus
screen_names = []
TP_matrix = np.zeros((sum(number_of_reps), cols))
FP_matrix = np.zeros((sum(number_of_reps), cols))
FN_matrix = np.zeros((sum(number_of_reps), cols))
TN_matrix = np.zeros((sum(number_of_reps), cols))
num_ess = np.zeros((sum(number_of_reps), cols))
indices = list(np.cumsum(number_of_reps))
start_ind = 0
for i in range(len(number_of_reps)):
print('Numer of Reps: {}'.format(number_of_reps[i]))
input_file = data_dir + 'Input/' + input_files[i]
output_file = data_dir + 'Output/' + consensus_files[i]
valid_ind = range(start_ind, indices[i])
hf = h5py.File(output_file, 'r')
vec_std = np.round(hf['vec_mean']).astype(np.bool) # standard (17804 genes )
# Remove frequent flyer genes from standard
df = pd.read_table(input_file)
input_df = df.pivot_table(index='repid', columns='expid', values='lfc', fill_value=False)
screen_names.extend(input_df.index.to_list())
for j in range(cols):
hits = np.ones(df.shape[0], dtype=np.bool)
hits = hits & (df['lfc'] < lfc_cutoff[j])
df['jeder_hits'] = hits
tmp_df = df.pivot_table(index='repid', columns='expid', values='jeder_hits', fill_value=False)
num_ess[valid_ind,j] = np.sum(tmp_df, axis = 1)
y_truth = np.tile(vec_std, (tmp_df.shape[0],1))
TP_matrix[valid_ind,j] = np.sum(y_truth & tmp_df, axis=1)
TN_matrix[valid_ind,j] = np.sum((y_truth == False) & (tmp_df == False), axis=1)
FP_matrix[valid_ind,j] = np.sum(tmp_df, axis=1) - TP_matrix[valid_ind,j]
FN_matrix[valid_ind,j] = np.sum(y_truth, axis=1) - TP_matrix[valid_ind,j]
start_ind = indices[i]
# ====================== Save data using pickle ======================
with open('TP_FP_neg_individual_screens.pkl', 'wb') as f: # Python 3: open(..., 'wb')
pickle.dump([TP_matrix, FP_matrix, FN_matrix, TN_matrix, number_of_reps, num_ess, screen_names], f)
with open('TP_FP_neg_individual_screens.pkl', 'rb') as f: # Python 3: open(..., 'rb')
TP_matrix, FP_matrix, FN_matrix, TN_matrix, number_of_reps, num_ess, screen_names = pickle.load(f)
# ====================== Plot Precision, recall, degree ======================
screen_TP = np.zeros((len(number_of_reps), cols)) # TP sum at a cutoff
screen_FP = np.zeros((len(number_of_reps), cols))
screen_TN = np.zeros((len(number_of_reps), cols))
screen_FN = np.zeros((len(number_of_reps), cols))
screen_precision = np.zeros((len(number_of_reps), cols))
screen_recall = np.zeros((len(number_of_reps), cols))
screen_essentials = np.zeros((len(number_of_reps), cols))
indices = list(np.cumsum(number_of_reps))
start_ind = 0
for i in range(len(number_of_reps)):
screen_id = range(start_ind, indices[i])
screen_TP[i,:] = np.nansum(TP_matrix[screen_id,:], axis = 0) # TP sum at a cutoff
screen_FP[i,:] = np.nansum(FP_matrix[screen_id,:], axis = 0)
screen_TN[i,:] = np.nansum(TN_matrix[screen_id,:], axis = 0)
screen_FN[i,:] = np.nansum(FN_matrix[screen_id,:], axis = 0)
screen_precision[i,:] = np.round(screen_TP[i,:] / (screen_TP[i,:] + screen_FP[i,:]), 4)
screen_recall[i,:] = np.round(screen_TP[i,:] / (screen_TP[i,:] + screen_FN[i,:]), 4)
screen_essentials[i,:] = np.round(np.nanmean(num_ess[screen_id,:], axis = 0))
start_ind = indices[i]
'''
# Plot a table
fig, ax = plt.subplots()
fig.patch.set_visible(False) # hide axes
ax.axis('off')
ax.axis('tight')
ax.table(cellText=global_precision, cellLoc = 'left',
colLabels=col_header, colLoc = 'left',
rowLabels=row_header, loc='center')
fig.tight_layout()
plt.show()
'''
# ============== plot a heatmap with annotation ==============
# https://seaborn.pydata.org/generated/seaborn.heatmap.html
row_header = list(map(str, number_of_reps)) # Number of replicates
# row_header[0] = '# of screens = ' + row_header[0]
col_header = list(map(str, lfc_cutoff)) # LFC Cutoff
# col_header[0] = 'LFC < ' + col_header[0]
df_prec = pd.DataFrame(screen_precision, columns = col_header, index = row_header)
df_recall = pd.DataFrame(screen_recall, columns = col_header, index = row_header)
# cmap = 'coolwarm', 'gist_yarg' (for grayscale)
# fig, ((ax, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8, 6))
fig, ax = plt.subplots(2,1, figsize=(8, 6), sharey=True)
fig.suptitle('Number of Replicates vs Performance', fontsize=16)
# Precision
sns.heatmap(df_prec.transpose(), annot=True, annot_kws={"size": 8}, linewidths=.5, cmap="gist_yarg", robust = True, ax=ax[0])
ax[0].set_title('Precision')
ax[0].set_xticks([])
ax[0].tick_params('y', labelrotation=0)
ax[0].set(xlabel='', ylabel='LFC cutoff (<)')
# plt.yticks(rotation=0)
# Degree
# sns.heatmap(df_ess, annot=True, annot_kws={"size": 8}, fmt=".0f", linewidths=.5, cmap="coolwarm", robust = True, ax=ax[0,1])
# ax[0,1].set_title('qGI degree')
# Recall
sns.heatmap(df_recall.transpose(), annot=True, annot_kws={"size": 8}, linewidths=.5, cmap="gist_yarg", robust = True, ax=ax[1])
ax[1].set_title('Recall')
# ax[-1, -1].axis('off') # Make last one blank
ax[1].tick_params('y', labelrotation=0)
ax[1].set(xlabel='# of replicates', ylabel='LFC cutoff (<)')
# plt.show()
plt.savefig('NUmber_of_replicates_vs_Performance_BW.png', dpi=300)