# gen_miRNA_matrix.py

# copyright: yueshi@usc.edu
import pandas as pd 
import hashlib
import os 
from utils import logger
def file_as_bytes(file):
    '''
    Return everything that can be read from an already opened file object.

    The object is used as a context manager, so it is released (closed, for
    standard file objects) once the content has been read.
    '''
    with file:
        content = file.read()
    return content

def extractMatrix(dirname):
    '''
    return a dataframe of the miRNA matrix, each row is the miRNA counts for a file_id

    dirname is scanned for sub-directories whose name contains "-"; inside
    each of them every file whose name contains both "-" and "counts" is read
    as a header-less, tab-separated table with the two columns RNA / counts.

    The first column of the returned dataframe is "file_id" (the name of the
    sub-directory); the remaining column names are the RNA identifiers taken
    from the first file that was read. When no counts file is found the
    result is an empty dataframe that only has the "file_id" column.
    '''
    miRNA_data = []
    # Stays empty when nothing is found, so the function returns an empty
    # frame instead of failing on an unbound name.
    miRNA_IDs = []
    featureColumns = ["RNA", "counts"]

    # os.listdir() gives no ordering guarantee; sort so that the row order
    # (and the file that provides the column names) is reproducible.
    for idname in sorted(os.listdir(dirname)):
        idpath = os.path.join(dirname, idname)

        # Only id directories are of interest; a plain file with "-" in its
        # name must not be handed to os.listdir().
        if "-" not in idname or not os.path.isdir(idpath):
            continue

        # all the files in each id directory
        for filename in sorted(os.listdir(idpath)):
            # check the miRNA file
            if "-" not in filename or "counts" not in filename:
                continue

            filepath = os.path.join(idpath, filename)
            df = pd.read_csv(filepath, sep="\t", names=featureColumns)

            if not miRNA_data:
                # the first file read provides the miRNA_IDs (column names)
                miRNA_IDs = df.RNA.values.tolist()

            miRNA_data.append([idname] + df.counts.values.tolist())

    print("Total {} RNA in files " .format(len(miRNA_data)))
    columns = ["file_id"] + miRNA_IDs
    return pd.DataFrame(miRNA_data, columns=columns)

def extractLabel(inputfile):
    '''
    return a dataframe with the columns file_id, label and site for the Tumor samples

    inputfile is a tab-separated table that must provide the columns
    "file_id", "cases.0.samples.0.sample_type" and
    "cases.0.project.primary_site".

    A sample type containing "Normal" is labelled 0, one containing "Tumor"
    is labelled 1 (Tumor wins when both words occur). Only the rows labelled
    1 are returned; the number of Normal and Tumor samples is logged.
    '''
    df = pd.read_csv(inputfile, sep="\t")

    # object dtype: the label column holds the integer labels next to the
    # untouched sample-type strings until the rows are filtered below.
    sample_type = df['cases.0.samples.0.sample_type'].astype(object)

    # na=False: a row without sample type belongs to neither class. Without
    # it the masks contain NaN and cannot be used for boolean indexing.
    is_normal = sample_type.str.contains("Normal", na=False)
    is_tumor = sample_type.str.contains("Tumor", na=False)

    df['label'] = sample_type
    df['site'] = df['cases.0.project.primary_site']
    df.loc[is_normal, 'label'] = 0
    # assigned last on purpose, so that Tumor overrides Normal
    df.loc[is_tumor, 'label'] = 1

    normal_count = int(is_normal.sum())
    tumor_count = int(is_tumor.sum())
    logger.info("{} Normal samples, {} Tumor samples ".format(normal_count,tumor_count))

    # keep the Tumor samples only
    df = df[df.label == 1]
    columns = ['file_id','label','site']
    return df[columns]

if __name__ == '__main__':

    # Root folder of the data set; adapt it to the local machine.
    data_dir = "/Users/Tony/Desktop/tmp/"

    # Inputs: the folder with one sub-directory per file id, and the
    # tab-separated metadata table. Adapt both names to the data set in use.
    dirname = os.path.join(data_dir, "OtherType")
    label_file = os.path.join(data_dir, "files_meta_OtherType.tsv")

    # Output: the csv file the merged table is written to; adapt as needed.
    outputfile = os.path.join(data_dir, "RNA_matrix_Brain_Counts.csv")

    # Build the count matrix first, then the labels.
    matrix_df = extractMatrix(dirname)
    label_df = extractLabel(label_file)

    # Right join on file_id: every row of the count matrix is kept, with
    # label and site filled in where the label table has that file_id.
    result = label_df.merge(matrix_df, how="right", on="file_id")

    # Write the merged table without the dataframe index.
    result.to_csv(outputfile, index=False)