Figure4.py

# =============================================================================
""" Figure 4 (How many replicate screens is necessary for JEDER to run well?)
    Generate consensus for 3,4,5,7,10,15, and 21 screens
    Caluclate a precision and recall vs number of screens heatmap at different lfc cutoffs (Same from 1 should be fine)
    Make a commentary about how many screens is necessary.
"""
# =============================================================================
import sys
import os
import math
import numpy as np
import pandas as pd
import h5py
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

from jeder import evaluation_table, vec_precision, vec_recall, fpr_convert
from jeder import parse_hitspec, eval_expression

import pickle as pickle
import random


## Data for LFC < -1.0
data_dir = '/media/mahfuz/2A8B1AF404AA6CF6/CRISPR/MCMC_Jeder/Jeder_Manuscript/Analysis/WT_Minimal_analysis/'
input_files = ['lfc_WT_min_rep_3_seed_40.txt', 'lfc_WT_min_rep_4_seed_40.txt', 'lfc_WT_min_rep_5_seed_40.txt', 'lfc_WT_min_rep_7_seed_40.txt', 'lfc_WT_min_rep_10_seed_40.txt', 'lfc_WT_min_rep_15_seed_40.txt', 'lfc_WT_min_rep_21_seed_40.txt']
consensus_files = ['results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.05.0.09.3rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.07.0.11.4rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.07.0.11.5rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.7rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.10rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.15rep.hdf5', 'results.lfc.neg.1.0.FPR.0.005.0.009.FNR.0.08.0.12.21rep.hdf5']

number_of_reps = [3, 4, 5, 7, 10, 15, 21]
lfc_cutoff = [-3, -2.5, -2, -1.75, -1.5, -1.25, -1, -0.75, -0.5, -0.25, 0, 0.5, 1, 2]

rows = len(number_of_reps)
cols = len(lfc_cutoff)

# For individual replicates for the different sets (3,5,7,10,15,21), calculate stats against their consensus
screen_names = []
TP_matrix = np.zeros((sum(number_of_reps), cols))
FP_matrix = np.zeros((sum(number_of_reps), cols))
FN_matrix = np.zeros((sum(number_of_reps), cols))
TN_matrix = np.zeros((sum(number_of_reps), cols))
num_ess = np.zeros((sum(number_of_reps), cols))

indices = list(np.cumsum(number_of_reps))
start_ind = 0

for i in range(len(number_of_reps)):
    print('Numer of Reps: {}'.format(number_of_reps[i]))    
    input_file = data_dir + 'Input/' + input_files[i]
    output_file = data_dir + 'Output/' + consensus_files[i]
    
    valid_ind = range(start_ind, indices[i])
    
    hf = h5py.File(output_file, 'r')
    vec_std = np.round(hf['vec_mean']).astype(np.bool) # standard (17804 genes )
    
    # Remove frequent flyer genes from standard
    df = pd.read_table(input_file)
    input_df = df.pivot_table(index='repid', columns='expid', values='lfc', fill_value=False)
    screen_names.extend(input_df.index.to_list())

    for j in range(cols):    
        hits = np.ones(df.shape[0], dtype=np.bool)
        hits = hits & (df['lfc'] < lfc_cutoff[j])
        
        df['jeder_hits'] = hits
        tmp_df = df.pivot_table(index='repid', columns='expid', values='jeder_hits', fill_value=False)
        
        num_ess[valid_ind,j] = np.sum(tmp_df, axis = 1)
        y_truth = np.tile(vec_std, (tmp_df.shape[0],1))
        TP_matrix[valid_ind,j] = np.sum(y_truth & tmp_df, axis=1)
        TN_matrix[valid_ind,j] = np.sum((y_truth == False) & (tmp_df == False), axis=1)
        FP_matrix[valid_ind,j] = np.sum(tmp_df, axis=1) - TP_matrix[valid_ind,j]
        FN_matrix[valid_ind,j] = np.sum(y_truth, axis=1) - TP_matrix[valid_ind,j]
    
    start_ind = indices[i]
    
# ====================== Save data using pickle ======================
with open('TP_FP_neg_individual_screens.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([TP_matrix, FP_matrix, FN_matrix, TN_matrix, number_of_reps, num_ess, screen_names], f)
    
with open('TP_FP_neg_individual_screens.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
    TP_matrix, FP_matrix, FN_matrix, TN_matrix, number_of_reps, num_ess, screen_names = pickle.load(f)
    
    
#  ====================== Plot Precision, recall, degree ======================
screen_TP = np.zeros((len(number_of_reps), cols)) # TP sum at a cutoff
screen_FP = np.zeros((len(number_of_reps), cols))
screen_TN = np.zeros((len(number_of_reps), cols))
screen_FN = np.zeros((len(number_of_reps), cols))

screen_precision = np.zeros((len(number_of_reps), cols))
screen_recall = np.zeros((len(number_of_reps), cols))
screen_essentials = np.zeros((len(number_of_reps), cols))

indices = list(np.cumsum(number_of_reps))
start_ind = 0

for i in range(len(number_of_reps)):    
    screen_id = range(start_ind, indices[i])
    screen_TP[i,:] = np.nansum(TP_matrix[screen_id,:], axis = 0) # TP sum at a cutoff
    screen_FP[i,:] = np.nansum(FP_matrix[screen_id,:], axis = 0)
    screen_TN[i,:] = np.nansum(TN_matrix[screen_id,:], axis = 0)
    screen_FN[i,:] = np.nansum(FN_matrix[screen_id,:], axis = 0)
    
    screen_precision[i,:] = np.round(screen_TP[i,:] / (screen_TP[i,:] + screen_FP[i,:]), 4)
    screen_recall[i,:] = np.round(screen_TP[i,:] / (screen_TP[i,:] + screen_FN[i,:]), 4)
    screen_essentials[i,:] = np.round(np.nanmean(num_ess[screen_id,:], axis = 0))
    
    start_ind = indices[i]


'''
# Plot a table
fig, ax = plt.subplots()
fig.patch.set_visible(False) # hide axes
ax.axis('off')
ax.axis('tight')

ax.table(cellText=global_precision, cellLoc = 'left', 
         colLabels=col_header, colLoc = 'left', 
         rowLabels=row_header, loc='center')
fig.tight_layout()
plt.show()
'''
 
# ============== plot a heatmap with annotation ==============
# https://seaborn.pydata.org/generated/seaborn.heatmap.html
row_header = list(map(str, number_of_reps)) # Number of replicates
# row_header[0] = '# of screens = ' + row_header[0]
col_header = list(map(str, lfc_cutoff)) # LFC Cutoff
# col_header[0] = 'LFC < ' + col_header[0]

df_prec = pd.DataFrame(screen_precision, columns = col_header, index = row_header)
df_recall = pd.DataFrame(screen_recall, columns = col_header, index = row_header)

# cmap = 'coolwarm', 'gist_yarg' (for grayscale)
# fig, ((ax, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8, 6))
fig, ax = plt.subplots(2,1, figsize=(8, 6), sharey=True)
fig.suptitle('Number of Replicates vs Performance', fontsize=16)

# Precision
sns.heatmap(df_prec.transpose(), annot=True, annot_kws={"size": 8}, linewidths=.5, cmap="gist_yarg", robust = True, ax=ax[0])
ax[0].set_title('Precision')
ax[0].set_xticks([])
ax[0].tick_params('y', labelrotation=0)
ax[0].set(xlabel='', ylabel='LFC cutoff (<)')
# plt.yticks(rotation=0)

# Degree
# sns.heatmap(df_ess, annot=True, annot_kws={"size": 8}, fmt=".0f", linewidths=.5, cmap="coolwarm", robust = True, ax=ax[0,1])
# ax[0,1].set_title('qGI degree')

# Recall
sns.heatmap(df_recall.transpose(), annot=True, annot_kws={"size": 8}, linewidths=.5, cmap="gist_yarg", robust = True, ax=ax[1])
ax[1].set_title('Recall')
# ax[-1, -1].axis('off') # Make last one blank
ax[1].tick_params('y', labelrotation=0)
ax[1].set(xlabel='# of replicates', ylabel='LFC cutoff (<)')

# plt.show()
plt.savefig('NUmber_of_replicates_vs_Performance_BW.png', dpi=300)