-
Notifications
You must be signed in to change notification settings - Fork 0
/
step6_geneDiff.py
67 lines (52 loc) · 2.94 KB
/
step6_geneDiff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"
"""
Description: Mimic of notebook code for pipeline work, please see step3 in Jun notebook archive
"""
import os
from scipy.stats import ranksums
import pandas as pd
from libraries.metaHandler import metaExt
## call previous step for calling internal function
from libraries.statFunction import StatHandler
import itertools
import argparse
def _extract_geneSignature(actInput, msigDBPath):
#Data loading
df = pd.read_csv(actInput, engine='c', index_col=0) # activation score data for extracting gene signature
# MsigDB Parsing
gmt_arr = [] # gmt parsing array
with open(msigDBPath, 'r') as infile:
for line in infile:
gmt_value = line.strip().split("\t") # splitting line
sig_names = gmt_value[0] # signature name
gene_list = gmt_value[2:] # gene list
gmt_arr.append([sig_names]+gene_list)
gmt_ext_arr = [x[1:] for x in gmt_arr if x[0] in df.index.tolist()] # Selected signature genes
gmt_ext_arr = list(itertools.chain(*gmt_ext_arr))
gmt_ext_arr = list(set(gmt_ext_arr)) # remove duplicated
return gmt_ext_arr
parser = argparse.ArgumentParser(prog='step6_geneDiff.py')
# Input data
parser.add_argument('-t','--type', dest='resultType', default='RR,CIS',\
help='Result type, ex: long, healthy, "RR,CIS" ')
args = parser.parse_args()
if __name__ == "__main__":
SharedFilePath = os.environ['efspoint'] # Main data path here, goes to EFS volume
metaName = os.environ['metafile'] # EPIC_HCvB_metadata_baseline_updated-share.csv
msigFile = os.environ['msigDBPATH'] # msigdb.v7.4.entrez.gmt
step1Input = os.environ['startFile'] # counts_vst_CD4.csv
inputExprFile = SharedFilePath+os.path.basename(step1Input).replace('.csv', '.step2.csv') # replace to step2 input, expression
inputFile = SharedFilePath+os.path.basename(step1Input).replace('.csv', '.step5.csv') # replace to step5 input
df_expr = pd.read_csv(SharedFilePath+inputExprFile, engine='c', index_col=0) # get expr
meta_data = pd.read_csv(SharedFilePath+metaName) # metadata
longDD_samples, shortDD_samples = metaExt._LoadDiseaseDuration(df_expr, meta_data, args.resultType) # get samples for LDD SDD
geneList = _extract_geneSignature(inputFile, SharedFilePath+msigFile) # Extracting genes
print("Total extracted genes: "+ str(len(geneList)))
gene_intersected = list(set(geneList).intersection(df_expr.index.tolist())) # intersected between expr and actScore sig
df_expr = df_expr[longDD_samples+shortDD_samples].loc[gene_intersected].dropna() # selected expr only
outputFile = SharedFilePath+os.path.basename(step1Input).replace('.csv', '.step6.csv') # replace to step6 output
ranksumSig = StatHandler.calculate_ranksum(df_expr, shortDD_samples, longDD_samples) # get ranksum result
df_expr.loc[ranksumSig["Names"].values.tolist()].to_csv(outputFile) # Writing