From 84f0ecf6856e0e0ebdad74147693dd0b23354b7b Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Mon, 18 Oct 2021 17:36:19 +0200 Subject: [PATCH] tRNA analysis MINTmap implemented --- XICRA_pip/XICRA/modules/tRNA.py | 19 ++++++- XICRA_pip/XICRA/scripts/MINTMap_caller.py | 4 -- XICRA_pip/XICRA/scripts/generate_DE.py | 61 ++++++++++++++++------- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/XICRA_pip/XICRA/modules/tRNA.py b/XICRA_pip/XICRA/modules/tRNA.py index 70b0205..aff771a 100644 --- a/XICRA_pip/XICRA/modules/tRNA.py +++ b/XICRA_pip/XICRA/modules/tRNA.py @@ -183,10 +183,24 @@ def run_tRNA(options): ## debugging messages if options.debug: print (results_df) - + ## merge all parse gtf files created print ("+ Summarize tRNA analysis for all samples...") - generate_DE.generate_DE(results_df, options.debug, expression_folder, default_name="tRNA_expression-") + + if 'mintmap' in options.soft_name: + results_df = results_df.set_index('type') + + ## exclusive tRFs + print ("\n\n+ Parsing exclusive tRNA analysis for all samples...") + generate_DE.generate_DE(results_df.filter(like="amb", axis=0).set_index('name'), + options.debug, expression_folder, type_analysis="tRF-amb") + + ## amb tRFs + print ("\n\n+ Parsing ambiguous tRNA analysis for all samples...") + generate_DE.generate_DE(results_df.filter(like="exc", axis=0).set_index('name'), + options.debug, expression_folder, type_analysis="tRF-exc") + else: + generate_DE.generate_DE(results_df, options.debug, expression_folder, type_analysis="tRNA") print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) @@ -211,6 +225,7 @@ def tRNA_analysis(reads, folder, name, threads, soft_list, species, database, De ## save results in dataframe filename_amb = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_amb.tsv') filename_exc = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_exc.tsv') + results_df.loc[len(results_df)] = name, soft, "amb", filename_amb results_df.loc[len(results_df)] = name, soft, "exc", filename_exc \ No newline at end of file diff --git a/XICRA_pip/XICRA/scripts/MINTMap_caller.py b/XICRA_pip/XICRA/scripts/MINTMap_caller.py index b234f06..6c77594 100644 --- a/XICRA_pip/XICRA/scripts/MINTMap_caller.py +++ b/XICRA_pip/XICRA/scripts/MINTMap_caller.py @@ -69,7 +69,6 @@ def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug): print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'MINTmap call'), 'yellow')) else: # Call MINTMap_analysis - print ("\n+ Run MINTmap: ") codeReturn = MINTmap(reads, path_folder, name, num_threads, species_code, Debug) os.chdir(path_here) @@ -110,7 +109,6 @@ def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug): if functions.files_functions.is_non_zero_file(amb_file) and functions.files_functions.is_non_zero_file(exc_file): filename_stamp = path_folder + '/.success_all' functions.time_functions.print_time_stamp(filename_stamp) - return(True) @@ -139,9 +137,7 @@ def parse_tRF(pathFile, sample_name, matrix_folder, ident, Debug): if Debug: HCGB_aes.debug_message("MINTmap file: " + pathFile, "yellow") - for line in expression_lines: - # ------------------------------ # # Example line: # tRF-31-87R8WP9N1EWJ0 TCCCTGGTGGTCTAGTGGTTAGGATTCGGCG 5'-tRF 921 7026.67 452.60 na trna77_GluCTC_6_+_28949976_28950047@1.31.31, trna80_GluCTC_1_-_161417018_161417089@1.31.31 diff --git a/XICRA_pip/XICRA/scripts/generate_DE.py b/XICRA_pip/XICRA/scripts/generate_DE.py index 50fc48c..ccb1aaf 100644 --- a/XICRA_pip/XICRA/scripts/generate_DE.py +++ b/XICRA_pip/XICRA/scripts/generate_DE.py @@ -12,11 +12,13 @@ from sys import argv import pandas as pd import csv +from termcolor import colored from HCGB import functions +import HCGB.functions.aesthetics_functions as HCGB_aes #################### -def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_expression-'): +def generate_DE(dataframe_results, Debug, outfolder, type_analysis='miRNA'): """Builds final expression matrices comparing all samples. Generates three .csv for each software used: @@ -24,7 +26,7 @@ def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_express miRNA_expression-soft_name.csv: final matrix, counts of each isomiR (with miRNA and variant info) without duplicated reads miRNA_expression-soft_name_seq.csv: table with the miRTop identifier and the corresponding DNA sequence - miRNA_expression- is the default analysis but other can be provided such as tRNA, piRNA, etc + miRNA is the default analysis but other can be provided such as tRNA, piRNA, etc :param dataframe_results: dataframe with the paths of the outputs of each sample and software :param Debug: display complete log @@ -32,8 +34,12 @@ def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_express :returns: None """ + ## get results dictionary for each software employed soft_list = dataframe_results.soft.unique() + + dataframe_results = dataframe_results.reset_index() + ## debugging messages if Debug: print ("## Debug:") @@ -56,32 +62,33 @@ def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_express print ("dict_files") print (dict_files) - ## get data - (all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug) - - ## discard duplicate UIDs if any - all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data) + ## get data and discard duplicate UIDs if any + (all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug, type_analysis=type_analysis) + all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data, type_res=type_analysis) + ## dump data in folder provided - csv_outfile = os.path.join(outfolder, default_name, + soft_name) + csv_outfile = os.path.join(outfolder, type_analysis + '_expression-' + soft_name) all_data_filtered.to_csv(csv_outfile + ".csv", quoting=csv.QUOTE_NONNUMERIC) all_data_duplicated.to_csv(csv_outfile + '_dup.csv', quoting=csv.QUOTE_NONNUMERIC) all_seqs.to_csv(csv_outfile + '_seq.csv', quoting=csv.QUOTE_NONNUMERIC) #################### -def discard_UID_duplicated(df_data): +def discard_UID_duplicated(df_data, type_res="miRNA"): """ """ + ## get data index df_data['ID'] = df_data.index new_data = df_data.filter(['ID'], axis=1) # split ID (hsa-let-7a-2-3p&NA&qNkjr6Ov2) into miRNA, variant and UID tmp = new_data['ID'].str.split('&', expand = True) - new_data['miRNA'] = tmp[0] + + new_data[type_res] = tmp[0] new_data['variant'] = tmp[1] new_data['UID'] = tmp[2] - + ## count count_groups = new_data.groupby('UID').count() ## print to file? @@ -133,6 +140,9 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"): all_data = pd.DataFrame() seq_all_data = pd.DataFrame() for sample, this_file in dict_files.items(): + + new_data=pd.DataFrame() + print ('+ Reading information from sample: ', sample) ## @@ -181,7 +191,10 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"): new_data = new_data.set_index('unique_id') #### - elif type_analysis=="tRNA": + elif "tRF" in type_analysis: ## tRF-amb; tRF-exc, tRF + + if Debug: + HCGB_aes.debug_message(type_analysis + " analysis: ", color) ## ------------------------------------------ ## ## Create matrix for tRNA results @@ -189,14 +202,25 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"): ## UID Read tRNA variant ident expression soft\n' data['variant'].fillna('NA', inplace=True) - data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1) - + ## parse according to software if (soft_name == 'mintmap'): - new_data = data.filter(['unique_id', 'mintmap'], axis=1) + data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1) + new_data = data.filter(['unique_id', 'expression'], axis=1) new_data = new_data.set_index('unique_id') - new_data = new_data.rename(columns={'mintmap': sample}) - + new_data = new_data.rename(columns={'expression': sample}) + + ## TODO + #else: + # data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1) + # new_data = data.filter(['unique_id', 'XXXX'], axis=1) ## TODO + # new_data = new_data.set_index('unique_id') ## TODO + # new_data = new_data.rename(columns={'XXXX': sample}) ## TODO + + else: + print() + ## add new + ## sequence information seq_data = data.filter(['UID', 'Read'], axis=1) seq_data = seq_data.set_index('UID') @@ -216,8 +240,7 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"): print (all_data) print ("*** DEBUG: data for sequences all samples ***") print (seq_all_data) - - + return (all_data, seq_all_data) ######