Skip to content

Commit

Permalink
tRNA analysis MINTmap implemented
Browse files Browse the repository at this point in the history
  • Loading branch information
JFsanchezherrero committed Oct 18, 2021
1 parent cb16406 commit 84f0ecf
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 25 deletions.
19 changes: 17 additions & 2 deletions XICRA_pip/XICRA/modules/tRNA.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,24 @@ def run_tRNA(options):
## debugging messages
if options.debug:
print (results_df)

## merge all parse gtf files created
print ("+ Summarize tRNA analysis for all samples...")
generate_DE.generate_DE(results_df, options.debug, expression_folder, default_name="tRNA_expression-")

if 'mintmap' in options.soft_name:
results_df = results_df.set_index('type')

## ambiguous tRFs: rows whose 'type' index label contains "amb" (NOTE: the message printed below says "exclusive")
print ("\n\n+ Parsing exclusive tRNA analysis for all samples...")
generate_DE.generate_DE(results_df.filter(like="amb", axis=0).set_index('name'),
options.debug, expression_folder, type_analysis="tRF-amb")

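## exclusive tRFs: rows whose 'type' index label contains "exc" (NOTE: the message printed below says "ambiguous")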
print ("\n\n+ Parsing ambiguous tRNA analysis for all samples...")
generate_DE.generate_DE(results_df.filter(like="exc", axis=0).set_index('name'),
options.debug, expression_folder, type_analysis="tRF-exc")
else:
generate_DE.generate_DE(results_df, options.debug, expression_folder, type_analysis="tRNA")

print ("\n*************** Finish *******************")
start_time_partial = functions.time_functions.timestamp(start_time_total)
Expand All @@ -211,6 +225,7 @@ def tRNA_analysis(reads, folder, name, threads, soft_list, species, database, De
## save results in dataframe
filename_amb = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_amb.tsv')
filename_exc = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_exc.tsv')

results_df.loc[len(results_df)] = name, soft, "amb", filename_amb
results_df.loc[len(results_df)] = name, soft, "exc", filename_exc

4 changes: 0 additions & 4 deletions XICRA_pip/XICRA/scripts/MINTMap_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug):
print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'MINTmap call'), 'yellow'))
else:
# Call MINTMap_analysis
print ("\n+ Run MINTmap: ")
codeReturn = MINTmap(reads, path_folder, name, num_threads, species_code, Debug)
os.chdir(path_here)

Expand Down Expand Up @@ -110,7 +109,6 @@ def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug):
if functions.files_functions.is_non_zero_file(amb_file) and functions.files_functions.is_non_zero_file(exc_file):
filename_stamp = path_folder + '/.success_all'
functions.time_functions.print_time_stamp(filename_stamp)


return(True)

Expand Down Expand Up @@ -139,9 +137,7 @@ def parse_tRF(pathFile, sample_name, matrix_folder, ident, Debug):
if Debug:
HCGB_aes.debug_message("MINTmap file: " + pathFile, "yellow")


for line in expression_lines:

# ------------------------------ #
# Example line:
# tRF-31-87R8WP9N1EWJ0 TCCCTGGTGGTCTAGTGGTTAGGATTCGGCG 5'-tRF 921 7026.67 452.60 na [email protected], [email protected]
Expand Down
61 changes: 42 additions & 19 deletions XICRA_pip/XICRA/scripts/generate_DE.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,34 @@
from sys import argv
import pandas as pd
import csv
from termcolor import colored

from HCGB import functions
import HCGB.functions.aesthetics_functions as HCGB_aes

####################
def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_expression-'):
def generate_DE(dataframe_results, Debug, outfolder, type_analysis='miRNA'):
"""Builds final expression matrices comparing all samples.
Generates three .csv files for each software used:
miRNA_expression-soft_name_dup.csv: counts with duplicated reads for each sample
miRNA_expression-soft_name.csv: final matrix, counts of each isomiR (with miRNA and variant info) without duplicated reads
miRNA_expression-soft_name_seq.csv: table with the miRTop identifier and the corresponding DNA sequence
miRNA_expression- is the default analysis but other can be provided such as tRNA, piRNA, etc
miRNA is the default analysis, but others can be provided, such as tRNA, piRNA, etc.
:param dataframe_results: dataframe with the paths of the outputs of each sample and software
:param Debug: display complete log
:param outfolder: output folder
:returns: None
"""

## get results dictionary for each software employed
soft_list = dataframe_results.soft.unique()

dataframe_results = dataframe_results.reset_index()

## debugging messages
if Debug:
print ("## Debug:")
Expand All @@ -56,32 +62,33 @@ def generate_DE(dataframe_results, Debug, outfolder, default_name='miRNA_express
print ("dict_files")
print (dict_files)

## get data
(all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug)

## discard duplicate UIDs if any
all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data)
## get data and discard duplicate UIDs if any
(all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug, type_analysis=type_analysis)
all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data, type_res=type_analysis)


## dump data in folder provided
csv_outfile = os.path.join(outfolder, default_name, + soft_name)
csv_outfile = os.path.join(outfolder, type_analysis + '_expression-' + soft_name)
all_data_filtered.to_csv(csv_outfile + ".csv", quoting=csv.QUOTE_NONNUMERIC)
all_data_duplicated.to_csv(csv_outfile + '_dup.csv', quoting=csv.QUOTE_NONNUMERIC)
all_seqs.to_csv(csv_outfile + '_seq.csv', quoting=csv.QUOTE_NONNUMERIC)

####################
def discard_UID_duplicated(df_data):
def discard_UID_duplicated(df_data, type_res="miRNA"):
"""
"""

## get data index
df_data['ID'] = df_data.index
new_data = df_data.filter(['ID'], axis=1)

# split ID (hsa-let-7a-2-3p&NA&qNkjr6Ov2) into miRNA, variant and UID
tmp = new_data['ID'].str.split('&', expand = True)
new_data['miRNA'] = tmp[0]

new_data[type_res] = tmp[0]
new_data['variant'] = tmp[1]
new_data['UID'] = tmp[2]

## count
count_groups = new_data.groupby('UID').count()
## print to file?
Expand Down Expand Up @@ -133,6 +140,9 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
all_data = pd.DataFrame()
seq_all_data = pd.DataFrame()
for sample, this_file in dict_files.items():

new_data=pd.DataFrame()

print ('+ Reading information from sample: ', sample)

##
Expand Down Expand Up @@ -181,22 +191,36 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
new_data = new_data.set_index('unique_id')

####
elif type_analysis=="tRNA":
elif "tRF" in type_analysis: ## tRF-amb; tRF-exc, tRF

if Debug:
HCGB_aes.debug_message(type_analysis + " analysis: ", color)

## ------------------------------------------ ##
## Create matrix for tRNA results
## ------------------------------------------ ##
## UID Read tRNA variant ident expression soft\n'

data['variant'].fillna('NA', inplace=True)
data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1)


## parse according to software
if (soft_name == 'mintmap'):
new_data = data.filter(['unique_id', 'mintmap'], axis=1)
data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1)
new_data = data.filter(['unique_id', 'expression'], axis=1)
new_data = new_data.set_index('unique_id')
new_data = new_data.rename(columns={'mintmap': sample})

new_data = new_data.rename(columns={'expression': sample})

## TODO
#else:
# data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1)
# new_data = data.filter(['unique_id', 'XXXX'], axis=1) ## TODO
# new_data = new_data.set_index('unique_id') ## TODO
# new_data = new_data.rename(columns={'XXXX': sample}) ## TODO

else:
print()
## add new

## sequence information
seq_data = data.filter(['UID', 'Read'], axis=1)
seq_data = seq_data.set_index('UID')
Expand All @@ -216,8 +240,7 @@ def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"):
print (all_data)
print ("*** DEBUG: data for sequences all samples ***")
print (seq_all_data)



return (all_data, seq_all_data)

######
Expand Down

0 comments on commit 84f0ecf

Please sign in to comment.