tRNA analysis MINTmap implemented

HCGB-IGTP · Oct 18, 2021 · 84f0ecf · 84f0ecf
1 parent cb16406
commit 84f0ecf
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 25 deletions.
diff --git a/XICRA_pip/XICRA/modules/tRNA.py b/XICRA_pip/XICRA/modules/tRNA.py
@@ -183,10 +183,24 @@ def run_tRNA(options):
     ## debugging messages
     if options.debug:
         print (results_df)
-
+        
     ## merge all parse gtf files created
     print ("+ Summarize tRNA analysis for all samples...")
-    generate_DE.generate_DE(results_df, options.debug, expression_folder,  default_name="tRNA_expression-")
+
+    if 'mintmap' in options.soft_name:
+        results_df = results_df.set_index('type')
+
+        ## ambiguous tRFs: rows whose 'type' index contains "amb" (NOTE: the message printed below says "exclusive")
+        print ("\n\n+ Parsing exclusive tRNA analysis for all samples...")
+        generate_DE.generate_DE(results_df.filter(like="amb", axis=0).set_index('name'), 
+                                options.debug, expression_folder,  type_analysis="tRF-amb")
+
+        ## exclusive tRFs: rows whose 'type' index contains "exc" (NOTE: the message printed below says "ambiguous")
+        print ("\n\n+ Parsing ambiguous tRNA analysis for all samples...")
+        generate_DE.generate_DE(results_df.filter(like="exc", axis=0).set_index('name'), 
+                                options.debug, expression_folder,  type_analysis="tRF-exc")
+    else:
+        generate_DE.generate_DE(results_df, options.debug, expression_folder,  type_analysis="tRNA")
 
     print ("\n*************** Finish *******************")
     start_time_partial = functions.time_functions.timestamp(start_time_total)
@@ -211,6 +225,7 @@ def tRNA_analysis(reads, folder, name, threads, soft_list, species, database, De
             ## save results in dataframe
             filename_amb = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_amb.tsv')
             filename_exc = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_exc.tsv')
+
             results_df.loc[len(results_df)] = name, soft, "amb", filename_amb
             results_df.loc[len(results_df)] = name, soft, "exc", filename_exc
 
diff --git a/XICRA_pip/XICRA/scripts/MINTMap_caller.py b/XICRA_pip/XICRA/scripts/MINTMap_caller.py
@@ -69,7 +69,6 @@ def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug):
         print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'MINTmap call'), 'yellow'))
     else:
         # Call MINTMap_analysis
-        print ("\n+ Run MINTmap: ")
         codeReturn = MINTmap(reads, path_folder, name, num_threads, species_code, Debug)
         os.chdir(path_here)
 
@@ -110,7 +109,6 @@ def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug):
     if functions.files_functions.is_non_zero_file(amb_file) and functions.files_functions.is_non_zero_file(exc_file):
         filename_stamp = path_folder + '/.success_all'
         functions.time_functions.print_time_stamp(filename_stamp)
-
 
     return(True)
 
@@ -139,9 +137,7 @@ def parse_tRF(pathFile, sample_name, matrix_folder, ident, Debug):
         if Debug:    
             HCGB_aes.debug_message("MINTmap file: " + pathFile, "yellow")
 
-
         for line in expression_lines:
-
             # ------------------------------ #
             # Example line:
             # tRF-31-87R8WP9N1EWJ0    TCCCTGGTGGTCTAGTGGTTAGGATTCGGCG    5'-tRF    921    7026.67    452.60    na    [email protected], [email protected]

diff --git a/XICRA_pip/XICRA/scripts/generate_DE.py b/XICRA_pip/XICRA/scripts/generate_DE.py
@@ -12,28 +12,34 @@
 from sys import argv
 import pandas as pd
 import csv
+from termcolor import colored
 
 from HCGB import functions
+import HCGB.functions.aesthetics_functions as HCGB_aes
 
 ####################
-def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_expression-'):
+def generate_DE(dataframe_results, Debug, outfolder, type_analysis='miRNA'):
 	"""Builds final expression matrices comparing all samples.
 	
 		Generates three .csv for each software used:
 		miRNA_expression-soft_name_dup.csv: counts with duplicated reads for each sample
 		miRNA_expression-soft_name.csv: final matrix, counts of each isomiR (with miRNA and variant info) without duplicated reads 
 		miRNA_expression-soft_name_seq.csv: table with the miRTop identifier and the corresponding DNA sequence
 		
-		miRNA_expression- is the default analysis but other can be provided such as tRNA, piRNA, etc
+		miRNA is the default type_analysis, but others can be provided, such as tRNA, piRNA, etc.; it prefixes the output file names (<type_analysis>_expression-<soft_name>)
 		
 		:param dataframe_results: dataframe with the paths of the outputs of each sample and software
 		:param Debug: display complete log
 		:param outfolder: output folder
 		
 	    :returns: None
 	"""
+
 	## get results dictionary for each software employed 
 	soft_list = dataframe_results.soft.unique()
+
+	dataframe_results = dataframe_results.reset_index()
+
 	## debugging messages
 	if Debug:
 		print ("## Debug:")
@@ -56,32 +62,33 @@ def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_express
 			print ("dict_files")
 			print (dict_files)
 
-		## get data
-		(all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug)
-
-		## discard duplicate UIDs if any
-		all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data)
+		## get data and discard duplicate UIDs if any
+		(all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug, type_analysis=type_analysis)
+		all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data, type_res=type_analysis)
+
 
 		## dump data in folder provided
-		csv_outfile = os.path.join(outfolder, default_name, + soft_name)
+		csv_outfile = os.path.join(outfolder, type_analysis + '_expression-' + soft_name)
 		all_data_filtered.to_csv(csv_outfile + ".csv", quoting=csv.QUOTE_NONNUMERIC)
 		all_data_duplicated.to_csv(csv_outfile + '_dup.csv', quoting=csv.QUOTE_NONNUMERIC)
 		all_seqs.to_csv(csv_outfile + '_seq.csv', quoting=csv.QUOTE_NONNUMERIC)
 
 ####################
-def discard_UID_duplicated(df_data):
+def discard_UID_duplicated(df_data, type_res="miRNA"):
 	"""
 	"""
+
 	## get data index
 	df_data['ID'] = df_data.index
 	new_data = df_data.filter(['ID'], axis=1)	
 
 	# split ID (hsa-let-7a-2-3p&NA&qNkjr6Ov2) into miRNA, variant and UID
 	tmp = new_data['ID'].str.split('&', expand = True)
-	new_data['miRNA']  = tmp[0]
+
+	new_data[type_res]  = tmp[0]
 	new_data['variant']  = tmp[1]
 	new_data['UID']  = tmp[2]
-
+	
 	## count 
 	count_groups = new_data.groupby('UID').count()
 	## print to file?
@@ -133,6 +140,9 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
 	all_data = pd.DataFrame()
 	seq_all_data = pd.DataFrame()
 	for sample, this_file in dict_files.items():
+
+		new_data=pd.DataFrame()
+
 		print ('+ Reading information from sample: ', sample)	
 
 		## 
@@ -181,22 +191,36 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
 				new_data = new_data.set_index('unique_id')
 
 		####
-		elif type_analysis=="tRNA":
+		elif "tRF" in type_analysis: ## tRF-amb; tRF-exc, tRF
+
+			if Debug:
+				HCGB_aes.debug_message(type_analysis + " analysis: ", color)
 
 			## ------------------------------------------ ##
 			## Create matrix for tRNA results
 			## ------------------------------------------ ##
 			## UID	Read	tRNA	variant	ident	expression	soft\n'
 
 			data['variant'].fillna('NA', inplace=True)
-			data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1)
-
+
 			## parse according to software
 			if (soft_name == 'mintmap'):
-				new_data = data.filter(['unique_id', 'mintmap'], axis=1)
+				data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1)
+				new_data = data.filter(['unique_id', 'expression'], axis=1)
 				new_data = new_data.set_index('unique_id')
-				new_data = new_data.rename(columns={'mintmap': sample})
-
+				new_data = new_data.rename(columns={'expression': sample})
+
+			## TODO
+			#else:
+			#	data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1)
+			#	new_data = data.filter(['unique_id', 'XXXX'], axis=1) ## TODO
+			#	new_data = new_data.set_index('unique_id')			  ## TODO
+			#	new_data = new_data.rename(columns={'XXXX': sample})  ## TODO
+
+		else:
+			print()
+			## TODO: add handling for new analysis types here (this branch currently only prints a blank line)
+
 		## sequence information
 		seq_data = data.filter(['UID', 'Read'], axis=1)	
 		seq_data = seq_data.set_index('UID')
@@ -216,8 +240,7 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
 		print (all_data)
 		print ("*** DEBUG: data for sequences all samples ***")
 		print (seq_all_data)
-
-
+
 	return (all_data, seq_all_data)	
 
 ######