forked from ChrisLiu95/Cancer-prediction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_miRNA_matrix.py
94 lines (71 loc) · 2.63 KB
/
gen_miRNA_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# copyright: [email protected]
import pandas as pd
import hashlib
import os
from utils import logger
def file_as_bytes(file):
    '''
    Read everything from an already-open file object and close it afterwards.

    The object is used as a context manager, so it is released as soon as
    the read finishes (or fails).
    '''
    with file:
        contents = file.read()
    return contents
def extractMatrix(dirname):
    '''
    Build the miRNA count matrix from a directory of per-sample folders.

    Every sub-directory of ``dirname`` whose name contains "-" is treated as
    a file_id folder. Inside it, every file whose name contains both "-" and
    "counts" is read as a tab-separated table with the two columns "RNA" and
    "counts".

    Parameters
    ----------
    dirname : str
        Directory holding one sub-directory per file_id.

    Returns
    -------
    pandas.DataFrame
        One row per counts file: a "file_id" column followed by one column
        per miRNA, named after the "RNA" column of the first file read.
        When no counts file is found, the frame is empty and only has the
        "file_id" column.
    '''
    feature_columns = ["RNA", "counts"]
    # Initialised up front so an input without any counts file yields an
    # empty matrix instead of failing on an unbound name.
    miRNA_IDs = []
    miRNA_data = []

    for idname in os.listdir(dirname):
        idpath = os.path.join(dirname, idname)
        # Only id folders are scanned; a stray regular file whose name
        # happens to contain "-" cannot be listed and is skipped.
        if "-" not in idname or not os.path.isdir(idpath):
            continue

        for filename in os.listdir(idpath):
            # Keep only the miRNA counts files.
            if "-" not in filename or "counts" not in filename:
                continue

            filepath = os.path.join(idpath, filename)
            df = pd.read_csv(filepath, sep="\t", names=feature_columns)

            if not miRNA_data:
                # The first file read defines the miRNA column names.
                miRNA_IDs = df.RNA.values.tolist()

            miRNA_data.append([idname] + df.counts.values.tolist())

    print("Total {} RNA in files " .format(len(miRNA_data)))

    columns = ["file_id"] + miRNA_IDs
    return pd.DataFrame(miRNA_data, columns=columns)
def extractLabel(inputfile):
    '''
    Read the tab-separated metadata file and return the tumor samples.

    A "label" column is derived from "cases.0.samples.0.sample_type":
    rows whose sample type contains "Normal" get 0, rows containing "Tumor"
    get 1, and any other row keeps its original sample-type value. The
    "site" column is a copy of "cases.0.project.primary_site". The number
    of Normal and Tumor rows is logged, then only rows with label 1 are kept.

    Parameters
    ----------
    inputfile : str
        Path of the tab-separated metadata file.

    Returns
    -------
    pandas.DataFrame
        The columns "file_id", "label" and "site" for the tumor rows only.
    '''
    df = pd.read_csv(inputfile, sep="\t")
    sample_type = df['cases.0.samples.0.sample_type']

    # na=False makes rows with a missing sample type match neither class;
    # without it the mask contains NaN and cannot be used for .loc indexing.
    is_normal = sample_type.str.contains("Normal", na=False)
    is_tumor = sample_type.str.contains("Tumor", na=False)

    # Object dtype lets the integer labels sit next to the sample-type
    # strings that are left in place for rows matching neither class.
    df['label'] = sample_type.astype(object)
    df['site'] = df['cases.0.project.primary_site']
    df.loc[is_normal, 'label'] = 0
    # Assigned last, so a sample type matching both patterns ends up as 1.
    df.loc[is_tumor, 'label'] = 1

    normal_count = int(is_normal.sum())
    tumor_count = int(is_tumor.sum())
    logger.info("{} Normal samples, {} Tumor samples ".format(normal_count,tumor_count))

    # Only tumor rows are returned; normal and unclassified rows are dropped.
    df = df[df.label == 1]
    columns = ['file_id', 'label', 'site']
    return df[columns]
if __name__ == '__main__':
    # Root folder of the downloaded data -- machine specific, edit before use.
    data_dir = "/Users/Tony/Desktop/tmp/"

    # Inputs: the folder of per-file_id sub-directories and the metadata
    # table describing them. Edit both to match the data set being processed.
    dirname = "{}OtherType".format(data_dir)
    label_file = "{}files_meta_OtherType.tsv".format(data_dir)

    # Output: where the merged matrix is written. Edit as needed.
    outputfile = "{}RNA_matrix_Brain_Counts.csv".format(data_dir)

    # Build the two tables.
    label_df = extractLabel(label_file)
    matrix_df = extractMatrix(dirname)

    # Join them on file_id.
    # NOTE(review): how="right" keeps every row of the count matrix, so a
    # file_id with no row in label_df ends up with empty label/site values
    # -- confirm this is intended.
    result = label_df.merge(matrix_df, on='file_id', how="right")

    # Write the merged table without the pandas index column.
    result.to_csv(outputfile, index=False)