diff --git a/XICRA_pip/XICRA/config/software/dependencies.csv b/XICRA_pip/XICRA/config/software/dependencies.csv index c7146e7..43af1d0 100644 --- a/XICRA_pip/XICRA/config/software/dependencies.csv +++ b/XICRA_pip/XICRA/config/software/dependencies.csv @@ -6,7 +6,7 @@ Rscript,--version,.*version\s([0-9\.]+).*,3.5.1,Rscript java,,,na,java python,--version,([0-9\.]+),3.6,python perl,--version,v([0-9]\.[0-9]+\.[0-9]),5.18.1,perl -make,--version,GNU Make ([0-9]\.[0-9].*),4.0,make +make,--version,GNU Make ([0-9]\.[0-9].*),4,make git,--version,git version ([0-9]\.[0-9]+\.[0-9]),2.1.0,git fastqjoin,,,na,fastq-join sRNAbench,-h,([0-9\.]+),1.5,sRNAbench.jar @@ -14,4 +14,5 @@ miRTop,--version,([0-9\.]+).*,0.4.23,mirtop optimir,,,na,optimir miraligner,,,na,miraligner.jar STAR,--version,([0-9\.]+).*,2.6.1,STAR -featureCounts,-v,v([0-9\.]+).*,1.5.1,featureCounts \ No newline at end of file +featureCounts,-v,v([0-9\.]+).*,1.5.1,featureCounts +MINTmap,,,na,MINTmap diff --git a/XICRA_pip/XICRA/modules/__init__.py b/XICRA_pip/XICRA/modules/__init__.py index 6cd4cc4..4aedaaa 100644 --- a/XICRA_pip/XICRA/modules/__init__.py +++ b/XICRA_pip/XICRA/modules/__init__.py @@ -7,7 +7,8 @@ 'miRNA', 'prep', 'qc', - 'trimm' + 'trimm', + 'tRNA' ] from XICRA.modules import * diff --git a/XICRA_pip/XICRA/modules/help_XICRA.py b/XICRA_pip/XICRA/modules/help_XICRA.py index e42f6e2..5b1f601 100644 --- a/XICRA_pip/XICRA/modules/help_XICRA.py +++ b/XICRA_pip/XICRA/modules/help_XICRA.py @@ -7,10 +7,8 @@ Help messages for different scripts, modules """ from termcolor import colored - from HCGB import functions - ############### def help_fastq_format(): """ @@ -153,18 +151,27 @@ def help_fastq_format(): print ("name_L00x_R2.fastq\tname_L00x_R2.fq\nname_L00x_R2.fastq.gz\tname_L00x_R2.fq.gz") print ("\n") - +############### def project_help(): return () +############### def multiqc_help(): return () +############### def print_help_adapters(): return() +############### def help_join_reads(): return () +############### def help_miRNA(): return () + +############### +def help_tRNA(): + return () + diff --git a/XICRA_pip/XICRA/modules/miRNA.py b/XICRA_pip/XICRA/modules/miRNA.py index 8dd7563..471eb64 100644 --- a/XICRA_pip/XICRA/modules/miRNA.py +++ b/XICRA_pip/XICRA/modules/miRNA.py @@ -22,10 +22,16 @@ ## import my modules from HCGB import sampleParser from HCGB import functions + from XICRA.config import set_config from XICRA.modules import help_XICRA from XICRA.scripts import generate_DE -from HCGB.functions import fasta_functions + +from XICRA.scripts import mirtop_caller +from XICRA.scripts import sRNAbench_caller +from XICRA.scripts import optimir_caller +from XICRA.scripts import miraligner_caller + ############################################## def run_miRNA(options): @@ -295,7 +301,7 @@ def miRNA_analysis(reads, folder, name, threads, miRNA_gff, soft_list, if (soft == "sRNAbench"): ## create sRNAbench sRNAbench_folder = functions.files_functions.create_subfolder('sRNAbench', folder) - code_success = sRNAbench_caller(reads, sRNAbench_folder, name, threads, species, Debug) ## Any additional sRNAbench parameter? + code_success = sRNAbench_caller.sRNAbench_caller(reads, sRNAbench_folder, name, threads, species, Debug) ## Any additional sRNAbench parameter? if not code_success: print ('** miRTop would not be executed for sample %s...' %name) @@ -303,7 +309,7 @@ def miRNA_analysis(reads, folder, name, threads, miRNA_gff, soft_list, ## create folder for sRNAbench results miRTop_folder = functions.files_functions.create_subfolder("sRNAbench_miRTop", folder) - miRTop_caller(sRNAbench_folder, miRTop_folder, name, threads, miRNA_gff, hairpinFasta, 'sRNAbench', species, Debug) + mirtop_caller.miRTop_caller(sRNAbench_folder, miRTop_folder, name, threads, miRNA_gff, hairpinFasta, 'sRNAbench', species, Debug) ## save results in dataframe filename = os.path.join(miRTop_folder, 'counts', 'mirtop.tsv') @@ -313,11 +319,11 @@ def miRNA_analysis(reads, folder, name, threads, miRNA_gff, soft_list, if (soft == "optimir"): ## create OptimiR analysis optimir_folder = functions.files_functions.create_subfolder('OptimiR', folder) - code_success = optimir_caller(reads, optimir_folder, name, threads, matureFasta, hairpinFasta, miRNA_gff, species, Debug) ## Any additional sRNAbench parameter? + code_success = optimir_caller.optimir_caller(reads, optimir_folder, name, threads, matureFasta, hairpinFasta, miRNA_gff, species, Debug) ## Any additional sRNAbench parameter? ## create folder for Optimir results miRTop_folder = functions.files_functions.create_subfolder("OptimiR_miRTop", folder) - miRTop_caller(optimir_folder, miRTop_folder, name, threads, miRNA_gff, hairpinFasta, 'optimir', species, Debug) + mirtop_caller.miRTop_caller(optimir_folder, miRTop_folder, name, threads, miRNA_gff, hairpinFasta, 'optimir', species, Debug) ## save results in dataframe filename = os.path.join(miRTop_folder, 'counts', 'mirtop.tsv') @@ -325,400 +331,16 @@ def miRNA_analysis(reads, folder, name, threads, miRNA_gff, soft_list, ### if (soft == "miraligner"): + ## create OptimiR analysis miraligner_folder = functions.files_functions.create_subfolder('miraligner', folder) - code_success = miraligner_caller(reads, miraligner_folder, name, threads, database, species, Debug) + code_success = miraligner_caller.miraligner_caller(reads, miraligner_folder, name, threads, database, species, Debug) ## create folder for Optimir results miRTop_folder = functions.files_functions.create_subfolder("miraligner_miRTop", folder) - miRTop_caller(miraligner_folder, miRTop_folder, name, threads, miRNA_gff, hairpinFasta, 'seqbuster', species, Debug) + mirtop_caller.miRTop_caller(miraligner_folder, miRTop_folder, name, threads, miRNA_gff, hairpinFasta, 'seqbuster', species, Debug) ## save results in dataframe filename = os.path.join(miRTop_folder, 'counts', 'mirtop.tsv') results_df.loc[len(results_df)] = name, soft, filename -############### -def sRNAbench_caller(reads, sample_folder, name, threads, species, Debug): - """Passes the sample information to be analyzed by sRNAbench() - - Checks if the computation has been performed before. If not, it calls - sRNAbench(). - - :param reads: file with sample reads - :param sample_folder: output folder - :param name: sample name - :param threads: selected threads (by defoult 2) - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: True/False - """ - # check if previously generated and succeeded - filename_stamp = sample_folder + '/.success' - if os.path.isfile(filename_stamp): - stamp = functions.time_functions.read_time_stamp(filename_stamp) - print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'sRNAbench'), 'yellow')) - else: - # Call sRNAbench - code_returned = sRNAbench(reads, sample_folder, name, threads, species, Debug) - if code_returned: - functions.time_functions.print_time_stamp(filename_stamp) - else: - print ('** Sample %s failed...' %name) - return(False) - - return(True) - -############### -def sRNAbench (reads, outpath, file_name, num_threads, species, Debug): - """Executes the sRNAbench analyis of the sample. - - Checks if the reads are joined and builds the sRNAbench command to execute it. - - :param reads: file with sample reads - :param outpath: output folder - :param file_name: sample name - :param num_threads: selected threads (by defoult 2) - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: The sRNAbench output - """ - sRNAbench_exe = set_config.get_exe("sRNAbench", Debug=Debug) - - ## set as option - ## sRNAbench.jar in exec/ folder within sRNAtoolboxDB - ## sRNAbench_db = os.path.abspath(os.path.join(os.path.dirname(sRNAbench_exe), '..')) ## sRNAtoolboxDB - - ## sRNAbench.jar linked in bin/. sRNAtoolboxDB in same folder - sRNAbench_db = os.path.abspath(os.path.join(os.path.dirname(sRNAbench_exe), 'sRNAtoolboxDB')) ## sRNAtoolboxDB - - logfile = os.path.join(outpath, 'sRNAbench.log') - - if (len(reads) > 1): - print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) - exit() - - ## create command - java_exe = set_config.get_exe('java', Debug=Debug) - cmd = '%s -jar %s dbPath=%s input=%s output=%s' %(java_exe, sRNAbench_exe, sRNAbench_db, reads[0], outpath) - cmd = cmd + ' microRNA=%s isoMiR=true plotLibs=true graphics=true' %species - cmd = cmd + ' plotMiR=true bedGraphMode=true writeGenomeDist=true' - cmd = cmd + ' chromosomeLevel=true chrMappingByLength=true > ' + logfile - - return(functions.system_call_functions.system_call(cmd)) - -############### -def optimir_caller(reads, sample_folder, name, threads, matureFasta, hairpinFasta, miRNA_gff, species, Debug): - """Passes the sample information to be analyzed by optimiR(). - - Checks if the computation has been performed before. If not, it calls - optimir(). - - :param reads: file with sample reads - :param sample_folder: output folder - :param name: sample name - :param threads: selected threads (by defoult 2) - :param matureFasta: mature fasta file - :param hairpinFasta: hairpin fasta file - :param miRNA_gff: miRNA gff3 annotation file - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: True/False - """ - # check if previously generated and succeeded - filename_stamp = sample_folder + '/.success' - if os.path.isfile(filename_stamp): - stamp = functions.time_functions.read_time_stamp(filename_stamp) - print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'OptimiR'), 'yellow')) - else: - # Call sRNAbench - ## no species option for OptimiR - code_returned = optimir(reads, sample_folder, name, threads, matureFasta, hairpinFasta, miRNA_gff, Debug) - if code_returned: - functions.time_functions.print_time_stamp(filename_stamp) - else: - print ('** Sample %s failed...' %name) - return(False) - - return(True) - -############### -def optimir (reads, outpath, file_name, num_threads, matureFasta, hairpinFasta, miRNA_gff, Debug): - """Executes the optimiR analyis of the sample. - - Checks if the reads are joined and builds the optimiR command to execute it. - - :param reads: file with sample reads - :param outpath: output folder - :param file_name: sample name - :param num_threads: selected threads (by defoult 2) - :param matureFasta: mature fasta file - :param hairpinFasta: hairpin fasta file - :param miRNA_gff: miRNA gff3 annotation file - :param Debug: display complete log. - - - :returns: The optimiR output - """ - optimir_exe = set_config.get_exe("optimir", Debug=Debug) - sRNAbench_db = os.path.abspath(os.path.join(os.path.dirname(optimir_exe), '..')) ## optimir - logfile = os.path.join(outpath, 'optimir.log') - errfile = os.path.join(outpath, 'optimir.err') - - if (len(reads) > 1): - print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) - exit() - - ## create command - cmd = "%s process --fq %s --gff_out -o %s --maturesFasta %s --hairpinsFasta %s --gff3 %s > %s 2> %s" %( - optimir_exe, reads[0], outpath, matureFasta, hairpinFasta, miRNA_gff, logfile, errfile) - return(functions.system_call_functions.system_call(cmd)) - -############### -def miraligner_caller(reads, sample_folder, name, threads, database, species, Debug): - """Passes the sample information to be analyzed by miraligner(). - - Checks if the computation has been performed before. If not, it calls - miraligner(). - - :param reads: file with sample reads - :param sample_folder: output folder - :param name: sample name - :param threads: selected threads (by defoult 2) - :param database: path to store miRNA annotation files downloaded - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: True/False - """ - # check if previously generated and succeeded - filename_stamp = sample_folder + '/.success' - if os.path.isfile(filename_stamp): - stamp = functions.time_functions.read_time_stamp(filename_stamp) - print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'sRNAbench'), 'yellow')) - else: - # Call miralinger - code_returned = miraligner(reads, sample_folder, name, database, species, Debug) - if code_returned: - functions.time_functions.print_time_stamp(filename_stamp) - else: - print ('** Sample %s failed...' %name) - return(False) - - return(True) - -############### -def miraligner (reads, outpath, file_name, database, species, Debug): - """Executes the miraligner analyis of the sample - - Checks if the reads are joined and builds the miraligner command to execute it. - - :param reads: file with sample reads - :param outpath: output folder - :param file_name: sample name - :param database: path to store miRNA annotation files downloaded - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: The miraligner output - """ - miraligner_exe = set_config.get_exe("miraligner", Debug=Debug) - logfile = os.path.join(outpath, 'miraligner.log') - - ## output - outpath_file = os.path.join(outpath, file_name) - - if (len(reads) > 1): - print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) - exit() - - ## create tabular information of reads - tabular_info = os.path.join(outpath, file_name + '-tab.freq.txt') - fasta_functions.reads2tabular(reads[0], tabular_info) - - ## create command - java_exe = set_config.get_exe('java', Debug=Debug) - cmd = '%s -jar %s -db %s -sub 1 -add 3 -trim 3 -s %s -i %s -o %s 2> %s' %( - java_exe, miraligner_exe, database, species, tabular_info, outpath_file, logfile) - - return(functions.system_call_functions.system_call(cmd)) - - -############### -def miRTop_caller(results_folder, mirtop_folder, name, threads, miRNA_gff, hairpinFasta, format, species, Debug): - """Checks if the computation has already been performed. If not, it calls - miRTop() - - :param results_folder: file with the output of the sample from each software - :param folder: output miRTop folder - :param name: sample name - :param threads: selected threads (by defoult 2) - :param miRNA_gff: miRNA gff3 annotation file - :param hairpinFasta: hairpin fasta file - :param format: 'sRNAbench', 'optimir' or 'seqbuster' - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: True/False - """ - # check if previously generated and succeeded - mirtop_folder_gff = functions.files_functions.create_subfolder('gff', mirtop_folder) - mirtop_folder_stats = functions.files_functions.create_subfolder('stats', mirtop_folder) - mirtop_folder_counts = functions.files_functions.create_subfolder('counts', mirtop_folder) - mirtop_folder_counts = functions.files_functions.create_subfolder('export', mirtop_folder) - - filename_stamp = mirtop_folder_counts + '/.success' - if os.path.isfile(filename_stamp): - stamp = functions.time_functions.read_time_stamp(filename_stamp) - print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'miRTop'), 'yellow')) - else: - # Call miRTop - code_returned = miRTop(results_folder, mirtop_folder, name, threads, format.lower(), miRNA_gff, hairpinFasta, species, Debug) - if code_returned: - functions.time_functions.print_time_stamp(filename_stamp) - else: - print ('** Sample %s failed...' %name) - return(False) - - return(True) - -############### -def miRTop(results_folder, sample_folder, name, threads, format, miRNA_gff, hairpinFasta, species,Debug): - """Checks if the computation has already been performed. If not, it calls - miRTop() - - :param results_folder: file with the output of the sample from each software - :param folder: output miRTop folder - :param name: sample name - :param threads: selected threads (by defoult 2) - :param miRNA_gff: miRNA gff3 annotation file - :param hairpinFasta: hairpin fasta file - :param format: 'sRNAbench', 'optimir' or 'seqbuster' - :param species: species tag ID. Default: hsa (Homo sapiens) - :param Debug: display complete log. - - - :returns: Folder with the results for the sample in miRTop format - """ - - miRTop_exe = set_config.get_exe('miRTop', Debug=Debug) - logfile = os.path.join(sample_folder, name + '.log') - - ## folders - mirtop_folder_gff = os.path.join(sample_folder, 'gff') - mirtop_folder_stats = os.path.join(sample_folder, 'stats') - mirtop_folder_counts = os.path.join(sample_folder, 'counts') - mirtop_folder_export = os.path.join(sample_folder, 'export') - - ## get info according to software - if format == "sRNAbench": - ## get sRNAbench info - reads_annot = os.path.join(results_folder, "reads.annotation") - - ## check non zero - if not functions.files_functions.is_non_zero_file(reads_annot): - print (colored("\tNo isomiRs detected for sample [%s -- %s]" %(name, 'sRNAbench'), 'yellow')) - return (False) - - elif format == "optimir": - ## get optimir info - gff3_file = functions.main_functions.retrieve_matching_files(os.path.join(results_folder, "OptimiR_Results"), "gff3", Debug)[0] - results_folder = gff3_file - - ## check non zero - if not functions.files_functions.is_non_zero_file(gff3_file): - print (colored("\tNo isomiRs detected for sample [%s -- %s]" %(name, 'optimir'), 'yellow')) - return (False) - - elif format == "seqbuster": - ## get miraligner info - mirna_file = functions.main_functions.retrieve_matching_files(results_folder, ".mirna", Debug)[0] - results_folder = mirna_file - - ## check non zero - if not functions.files_functions.is_non_zero_file(mirna_file): - print (colored("\tNo isomiRs detected for sample [%s -- %s]" %(name, 'miraligner'), 'yellow')) - return (False) - - - ## miRTop analysis gff - filename_stamp_gff = mirtop_folder_gff + '/.success' - if os.path.isfile(filename_stamp_gff): - stamp = functions.time_functions.read_time_stamp(filename_stamp_gff) - print (colored("\tA previous command generated results on: %s [%s -- %s - gff]" %(stamp, name, 'miRTop'), 'yellow')) - else: - print ('Creating isomiRs gtf file for sample %s' %name) - cmd = miRTop_exe + ' gff --sps %s --hairpin %s --gtf %s --format %s -o %s %s 2> %s' %( - species, hairpinFasta, miRNA_gff, format, - mirtop_folder_gff, results_folder, logfile) - - ## execute - code_miRTop = functions.system_call_functions.system_call(cmd) - if code_miRTop: - functions.time_functions.print_time_stamp(filename_stamp_gff) - else: - return(False) - - ## miRTop stats - mirtop_folder_gff_file = os.path.join(mirtop_folder_gff, 'mirtop.gff') - - #filename_stamp_stats = mirtop_folder_stats + '/.success' - #if os.path.isfile(filename_stamp_stats): - # stamp = functions.time_functions.read_time_stamp(filename_stamp_stats) - # print (colored("\tA previous command generated results on: %s [%s -- %s - stats]" %(stamp, name, 'miRTop'), 'yellow')) - #else: - # print ('Creating isomiRs stats for sample %s' %name) - # cmd_stats = miRTop_exe + ' stats -o %s %s 2>> %s' %(mirtop_folder_stats, mirtop_folder_gff_file, logfile) - # code_miRTop_stats = functions.system_call_functions.system_call(cmd_stats) - # if code_miRTop_stats: - # functions.time_functions.print_time_stamp(filename_stamp_stats) - # else: - # return(False) - - ## miRTop counts - filename_stamp_counts = mirtop_folder_counts + '/.success' - if os.path.isfile(filename_stamp_counts): - stamp = functions.time_functions.read_time_stamp(filename_stamp_counts) - print (colored("\tA previous command generated results on: %s [%s -- %s - counts]" %(stamp, name, 'miRTop'), 'yellow')) - else: - print ('Creating isomiRs counts for sample %s' %name) - ## if both succeeded - cmd_stats = miRTop_exe + ' counts -o %s --gff %s --hairpin %s --gtf %s --sps %s 2>> %s' %(mirtop_folder_counts, mirtop_folder_gff_file, hairpinFasta, miRNA_gff, species, logfile) - code_miRTop_counts = functions.system_call_functions.system_call(cmd_stats) - - if code_miRTop_counts: - functions.time_functions.print_time_stamp(filename_stamp_counts) - else: - return(False) - - ## miRTop export - filename_stamp_export = mirtop_folder_export + '/.success' - if os.path.isfile(filename_stamp_export): - stamp = functions.time_functions.read_time_stamp(filename_stamp_export) - print (colored("\tA previous command generated results on: %s [%s -- %s - export]" %(stamp, name, 'miRTop'), 'yellow')) - else: - print ('Creating isomiRs export information for sample %s' %name) - ## if both succeeded - cmd_export = miRTop_exe + ' export -o %s --hairpin %s --gtf %s --sps %s --format isomir %s 2> %s' %(mirtop_folder_export, hairpinFasta, miRNA_gff, species, mirtop_folder_gff_file, logfile) - code_miRTop_export = functions.system_call_functions.system_call(cmd_export) - - if code_miRTop_export: - functions.time_functions.print_time_stamp(filename_stamp_export) - else: - return(False) - - ## return all success - outdir_tsv = os.path.join(mirtop_folder_counts, 'mirtop.tsv') - return (outdir_tsv) - - ## if any command failed - #return(False) - diff --git a/XICRA_pip/XICRA/modules/tRNA.py b/XICRA_pip/XICRA/modules/tRNA.py new file mode 100644 index 0000000..aff771a --- /dev/null +++ b/XICRA_pip/XICRA/modules/tRNA.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +############################################################ +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## +############################################################ +""" +This module performs tRNA analysis with several possible tools: +MINTMap, ... + +It shows results unifiying formats by using the miRTop nomenclature. +""" +## import useful modules +import os +import sys +import re +import time +from io import open +import shutil +import concurrent.futures +import pandas as pd +from termcolor import colored + +## import my modules +from HCGB import sampleParser +from HCGB import functions +from XICRA.config import set_config +from XICRA.modules import help_XICRA +from XICRA.scripts import generate_DE +from XICRA.scripts import MINTMap_caller + +############################################## +def run_tRNA(options): + ''' + ''' + + ## init time + start_time_total = time.time() + + ################################## + ### show help messages if desired + ################################## + if (options.help_format): + ## help_format option + help_XICRA.help_fastq_format() + elif (options.help_project): + ## information for project + help_XICRA.project_help() + exit() + elif (options.help_tRNA): + ## information for join reads + help_XICRA.help_tRNA() + exit() + + ## debugging messages + global Debug + if (options.debug): + Debug = True + else: + Debug = False + + ### set as default paired_end mode + if (options.single_end): + options.pair = False + else: + options.pair = True + + functions.aesthetics_functions.pipeline_header('XICRA') + functions.aesthetics_functions.boxymcboxface("tRNA analysis") + print ("--------- Starting Process ---------") + functions.time_functions.print_time() + + ## absolute path for in & out + input_dir = os.path.abspath(options.input) + outdir="" + + ## set mode: project/detached + if (options.detached): + outdir = os.path.abspath(options.output_folder) + options.project = False + else: + options.project = True + outdir = input_dir + + ## user software selection + print ("+ Software for tRNA analysis selected:") + print (options.soft_name) + + ## get files + print ('+ Getting files from input folder... ') + if options.pair: + options.pair = False ## set paired-end to false for further prepocessing + if options.noTrim: + print ('+ Mode: fastq.\n+ Extension: ') + print ("[ fastq, fq, fastq.gz, fq.gz ]\n") + pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug) + else: + print ('+ Mode: join.\n+ Extension: ') + print ("[_joined.fastq]\n") + pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "join", ['_joined.fastq'], options.debug) + else: + if options.noTrim: + print ('+ Mode: fastq.\n+ Extension: ') + print ("[ fastq, fq, fastq.gz, fq.gz ]\n") + pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug) + else: + print ('+ Mode: join.\n+ Extension: ') + print ("[_joined.fastq]\n") + pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug) + + ## debug message + if (Debug): + print (colored("**DEBUG: pd_samples_retrieve **", 'yellow')) + print (pd_samples_retrieved) + + ## species + print ("+ Species provided:", options.species) + + ## set database path if necessary + if not (options.database): + install_path = os.path.dirname(os.path.realpath(__file__)) + options.database = os.path.join(install_path, "db_files") + else: + options.database = os.path.abspath(options.database) + + ## generate output folder, if necessary + if not options.project: + print ("\n+ Create output folder(s):") + functions.files_functions.create_folder(outdir) + + ## for samples + outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, "tRNA", options.debug) + + ## optimize threads + name_list = set(pd_samples_retrieved["new_name"].tolist()) + threads_job = functions.main_functions.optimize_threads(options.threads, len(name_list)) ## threads optimization + max_workers_int = int(options.threads/threads_job) + + ## to FIX: MINTmap requires to chdir to folder to create results + max_workers_int = 1 + + ## debug message + if (Debug): + print (colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow')) + print (colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow')) + print (colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow')) + + print ("+ Create a tRNA analysis for each sample retrieved...") + + ## call tRNA_analysis: + ## Get user software selection: mintmap, ... + + ## dictionary results + global results_df + results_df = pd.DataFrame(columns=("name", "soft", "type", "filename")) + + # Group dataframe by sample name + sample_frame = pd_samples_retrieved.groupby(["new_name"]) + + ## send for each sample + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor: + commandsSent = { executor.submit(tRNA_analysis, + sorted(cluster["sample"].tolist()), + outdir_dict[name], name, threads_job, + options.soft_name, options.species, + options.database, Debug): name for name, cluster in sample_frame } + + for cmd2 in concurrent.futures.as_completed(commandsSent): + details = commandsSent[cmd2] + try: + data = cmd2.result() + except Exception as exc: + print ('***ERROR:') + print (cmd2) + print('%r generated an exception: %s' % (details, exc)) + + print ("\n\n+ tRNA analysis is finished...") + print ("+ Let's summarize all results...") + + ## outdir + outdir_report = functions.files_functions.create_subfolder("report", outdir) + expression_folder = functions.files_functions.create_subfolder("tRNA", outdir_report) + + ## debugging messages + if options.debug: + print (results_df) + + ## merge all parse gtf files created + print ("+ Summarize tRNA analysis for all samples...") + + if 'mintmap' in options.soft_name: + results_df = results_df.set_index('type') + + ## exclusive tRFs + print ("\n\n+ Parsing exclusive tRNA analysis for all samples...") + generate_DE.generate_DE(results_df.filter(like="amb", axis=0).set_index('name'), + options.debug, expression_folder, type_analysis="tRF-amb") + + ## amb tRFs + print ("\n\n+ Parsing ambiguous tRNA analysis for all samples...") + generate_DE.generate_DE(results_df.filter(like="exc", axis=0).set_index('name'), + options.debug, expression_folder, type_analysis="tRF-exc") + else: + generate_DE.generate_DE(results_df, options.debug, expression_folder, type_analysis="tRNA") + + print ("\n*************** Finish *******************") + start_time_partial = functions.time_functions.timestamp(start_time_total) + print ("\n+ Exiting tRNA module.") + return() + + +######################################### +def tRNA_analysis(reads, folder, name, threads, soft_list, species, database, Debug): + + ## + for soft in soft_list: + if (soft == "mintmap"): + ## create mintmap + MINTmap_folder = functions.files_functions.create_subfolder('mintmap', folder) + code_success = MINTMap_caller.MINTmap_caller(MINTmap_folder, reads, name, threads, species, Debug) + + if not code_success: + print ('** Some error ocurred during MINTmap analysis for sample %s...' %name) + return () + + ## save results in dataframe + filename_amb = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_amb.tsv') + filename_exc = os.path.join(MINTmap_folder, 'mintmap_parse', name + '_exc.tsv') + + results_df.loc[len(results_df)] = name, soft, "amb", filename_amb + results_df.loc[len(results_df)] = name, soft, "exc", filename_exc + \ No newline at end of file diff --git a/XICRA_pip/XICRA/scripts/MINTMap_caller.py b/XICRA_pip/XICRA/scripts/MINTMap_caller.py new file mode 100644 index 0000000..6c77594 --- /dev/null +++ b/XICRA_pip/XICRA/scripts/MINTMap_caller.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +########################################################## +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## +########################################################## +''' +Calls MINTMap to create tRNA profile +''' +## useful imports +import time +import io +import os +import re +import sys +from sys import argv +from io import open +from termcolor import colored + +## import my modules +from HCGB import functions +from XICRA.config import set_config +import HCGB.functions.aesthetics_functions as HCGB_aes + +############################ +def MINTmap_caller(MINTmap_folder, reads, name, num_threads, species, Debug): + # check if previously generated and succeeded + filename_stamp = MINTmap_folder + '/.success_all' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'MINTmap'), 'yellow')) + return True + + else: + # Call MINTMap_analysis + code_returned = MINTMap_analysis(MINTmap_folder, reads, name, num_threads, species, Debug) + if code_returned: + functions.time_functions.print_time_stamp(filename_stamp) + return True + else: + print ('** Sample %s failed...' %name) + return(False) + +############################ +def MINTMap_analysis(path_folder, reads, name, num_threads, species, Debug): + + ## check species + species_code="" + if species=="hsa": + species_code="default" + else: + print (colored("** ERROR: Not available yet. No mapping bundle available for species " + species, 'red')) + exit() + + ## debug messages + if Debug: + HCGB_aes.debug_message("species: " + species, "yellow") + HCGB_aes.debug_message("species_code: " + species_code, "yellow") + + ## ATTENTION: MINTmap needs to chdir to output folder + path_here = os.getcwd() + + ## debug messages + if Debug: + HCGB_aes.debug_message("path_here: " + path_here, "yellow") + + filename_stamp = path_folder + '/.success_mintmap' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'MINTmap call'), 'yellow')) + else: + # Call MINTMap_analysis + codeReturn = MINTmap(reads, path_folder, name, num_threads, species_code, Debug) + os.chdir(path_here) + + if not codeReturn: + print ('** Sample %s failed...' %name) + return False + + ## create time stamp + functions.time_functions.print_time_stamp(filename_stamp) + + ## Get MINTmap matrix + MINTmap_matrix_folder = functions.files_functions.create_subfolder("mintmap_parse", path_folder) + + files = os.listdir(path_folder) + for item in files: + abs_path_file = os.path.abspath(os.path.join(path_folder, item)) + + ## debug messages + if Debug: + HCGB_aes.debug_message("abs_path_file: " + abs_path_file, "yellow") + + ## parse them + if 'countsmeta' in item: + continue + if item.endswith('html'): + continue + if 'ambigu' in item: + amb_file = parse_tRF(abs_path_file, name, MINTmap_matrix_folder, 'amb', Debug) + elif 'exclu' in item: + exc_file = parse_tRF(abs_path_file, name, MINTmap_matrix_folder, 'exc', Debug) + + ## debug messages + if Debug: + HCGB_aes.debug_message("exc_file: " + exc_file, "yellow") + HCGB_aes.debug_message("amb_file: " + amb_file, "yellow") + + + if functions.files_functions.is_non_zero_file(amb_file) and functions.files_functions.is_non_zero_file(exc_file): + filename_stamp = path_folder + '/.success_all' + functions.time_functions.print_time_stamp(filename_stamp) + + return(True) + +############## +def parse_tRF(pathFile, sample_name, matrix_folder, ident, Debug): + + ## tsv file name + tsv_file = os.path.join(matrix_folder, sample_name + "_" + ident + '.tsv') + + ## time stamp + filename_stamp = matrix_folder + '/.success_parse' + if os.path.isfile(filename_stamp) and functions.files_functions.is_non_zero_file(tsv_file): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, sample_name, 'MINTmap - parse'), 'yellow')) + else: + ## Open file + fil = open(tsv_file, 'w') + string2write = 'UID\tRead\ttRNA\tvariant\tident\texpression\tsoft\n' + fil.write(string2write) + ## Read file + expression_file = open(pathFile) + expression_text = expression_file.read() + expression_lines = expression_text.splitlines() + + ## debug messages + if Debug: + HCGB_aes.debug_message("MINTmap file: " + pathFile, "yellow") + + for line in expression_lines: + # ------------------------------ # + # Example line: + # tRF-31-87R8WP9N1EWJ0 TCCCTGGTGGTCTAGTGGTTAGGATTCGGCG 5'-tRF 921 7026.67 452.60 na trna77_GluCTC_6_+_28949976_28950047@1.31.31, trna80_GluCTC_1_-_161417018_161417089@1.31.31 + # ------------------------------ # + + if not line.startswith('#'): + if not line.startswith('License Plate'): + UID = line.split('\t')[0] ## License Plate + seq = line.split('\t')[1] ## tRF sequence + variant = line.split('\t')[2] ## tRF type + expression = line.split('\t')[3] ## unnormalized counts + ### there are other RPM counts taking into account several things such as total base pairs, reads, etc. We would use raw counts + + ## Get tRNA name + tRNA_name = line.split('\t')[-1].split(',')[0] + tRNA_search = re.search(r"trna.{1,3}\_(.{6})\_(.{1,2})\_.*", tRNA_name) + tRNA_family = 'na' + if tRNA_search: + tRNA_family = tRNA_search.group(1) + if (tRNA_search.group(2) == 'MT'): + tRNA_family = tRNA_family + '_MT' + + + string2write = UID + '\t' + seq + '\t' + tRNA_family +'\t' + variant +'\t' + ident + '\t' + expression + '\tmintmap\n' + + ## debug messages + if Debug: + HCGB_aes.debug_message(string2write, "yellow") + + fil.write(string2write) + + fil.close() + + return(tsv_file) + +############## +def MINTmap(reads, outpath, name, num_threads, species_code, Debug): + + outpath = os.path.abspath(outpath) + functions.files_functions.create_folder(outpath) + + ## change path where the results are required + os.chdir(outpath) + + mintmap_exe = set_config.get_exe("MINTmap", Debug=Debug) + logfile = os.path.join(outpath, 'MINTmap.log') + + ## output + outpath_file = os.path.join(outpath, name) + + if (len(reads) > 1): + print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) + exit() + + ## species bundle + if species_code == "default": + ## create command: use default mapping bundle provided with MINTmap + cmd = '%s -p %s %s 2> %s' %(mintmap_exe, name, reads[0], logfile) + else: + ## create command: use specific mapping bundle path + cmd = '%s -p %s -m %s %s 2> %s' %(mintmap_exe, name, species_code, reads[0], logfile) + + return(functions.system_call_functions.system_call(cmd)) + \ No newline at end of file diff --git a/XICRA_pip/XICRA/scripts/__init__.py b/XICRA_pip/XICRA/scripts/__init__.py index 47c9ab5..23bb928 100644 --- a/XICRA_pip/XICRA/scripts/__init__.py +++ b/XICRA_pip/XICRA/scripts/__init__.py @@ -4,7 +4,11 @@ 'generate_DE', 'RNAbiotype', 'mapReads', - 'cutadapt_caller' + 'cutadapt_caller', + 'mirtop_caller', + 'optimir_caller', + 'miraligner_caller', + 'MINTMap_caller' ] diff --git a/XICRA_pip/XICRA/scripts/cutadapt_caller.py b/XICRA_pip/XICRA/scripts/cutadapt_caller.py index bd3d066..4913bc6 100644 --- a/XICRA_pip/XICRA/scripts/cutadapt_caller.py +++ b/XICRA_pip/XICRA/scripts/cutadapt_caller.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 ########################################################## -## Jose F. Sanchez ## -## Copyright (C) 2019 Lauro Sumoy Lab, IGTP, Spain ## +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## ########################################################## ''' Calls cutadapt to trim raw reads diff --git a/XICRA_pip/XICRA/scripts/generate_DE.py b/XICRA_pip/XICRA/scripts/generate_DE.py index 3cf8769..ccb1aaf 100644 --- a/XICRA_pip/XICRA/scripts/generate_DE.py +++ b/XICRA_pip/XICRA/scripts/generate_DE.py @@ -12,17 +12,21 @@ from sys import argv import pandas as pd import csv +from termcolor import colored from HCGB import functions +import HCGB.functions.aesthetics_functions as HCGB_aes #################### -def generate_DE(dataframe_results, Debug, outfolder): +def generate_DE(dataframe_results, Debug, outfolder, type_analysis='miRNA'): """Builds final expression matrices comparing all samples. Generates three .csv for each software used: miRNA_expression-soft_name_dup.csv: counts with duplicated reads for each sample miRNA_expression-soft_name.csv: final matrix, counts of each isomiR (with miRNA and variant info) without duplicated reads - miRNA_expression-soft_name_seq.csv: table with the miRTop identifier and the corresponding DNA sequence + miRNA_expression-soft_name_seq.csv: table with the miRTop identifier and the corresponding DNA sequence + + miRNA is the default analysis but other can be provided such as tRNA, piRNA, etc :param dataframe_results: dataframe with the paths of the outputs of each sample and software :param Debug: display complete log @@ -30,8 +34,12 @@ def generate_DE(dataframe_results, Debug, outfolder): :returns: None """ + ## get results dictionary for each software employed soft_list = dataframe_results.soft.unique() + + dataframe_results = dataframe_results.reset_index() + ## debugging messages if Debug: print ("## Debug:") @@ -54,32 +62,33 @@ def generate_DE(dataframe_results, Debug, outfolder): print ("dict_files") print (dict_files) - ## get data - (all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug) - - ## discard duplicate UIDs if any - all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data) + ## get data and discard duplicate UIDs if any + (all_data, all_seqs) = generate_matrix(dict_files, soft_name.lower(), Debug, type_analysis=type_analysis) + all_data_filtered, all_data_duplicated = discard_UID_duplicated(all_data, type_res=type_analysis) + ## dump data in folder provided - csv_outfile = os.path.join(outfolder, 'miRNA_expression-' + soft_name) + csv_outfile = os.path.join(outfolder, type_analysis + '_expression-' + soft_name) all_data_filtered.to_csv(csv_outfile + ".csv", quoting=csv.QUOTE_NONNUMERIC) all_data_duplicated.to_csv(csv_outfile + '_dup.csv', quoting=csv.QUOTE_NONNUMERIC) all_seqs.to_csv(csv_outfile + '_seq.csv', quoting=csv.QUOTE_NONNUMERIC) #################### -def discard_UID_duplicated(df_data): +def discard_UID_duplicated(df_data, type_res="miRNA"): """ """ + ## get data index df_data['ID'] = df_data.index new_data = df_data.filter(['ID'], axis=1) # split ID (hsa-let-7a-2-3p&NA&qNkjr6Ov2) into miRNA, variant and UID tmp = new_data['ID'].str.split('&', expand = True) - new_data['miRNA'] = tmp[0] + + new_data[type_res] = tmp[0] new_data['variant'] = tmp[1] new_data['UID'] = tmp[2] - + ## count count_groups = new_data.groupby('UID').count() ## print to file? @@ -115,7 +124,7 @@ def discard_UID_duplicated(df_data): return (clean_data_expression, duplicates_expression) #################### -def generate_matrix(dict_files, soft_name, Debug): +def generate_matrix(dict_files, soft_name, Debug, type_analysis="miRNA"): """ """ ###################################################### @@ -131,6 +140,9 @@ def generate_matrix(dict_files, soft_name, Debug): all_data = pd.DataFrame() seq_all_data = pd.DataFrame() for sample, this_file in dict_files.items(): + + new_data=pd.DataFrame() + print ('+ Reading information from sample: ', sample) ## @@ -144,33 +156,71 @@ def generate_matrix(dict_files, soft_name, Debug): print ('\t - Information not available for sample: ', sample) continue - ## get info, generate unique name and merge for samples - ## header of tsv files: - ## UID Read miRNA Variant iso_5p iso_3p iso_add3p iso_snp sRNAbench - - data['Variant'].fillna('NA', inplace=True) - data['unique_id'] = data.apply(lambda data: data['miRNA'] + '&' + data['Variant'] + '&' + data['UID'], axis=1) - - ## parse according to software - if (soft_name == 'srnabench'): - ## sRNAbench mirtop creates a column id with sRNAbench instead of sample name - new_data = data.filter(['unique_id', 'sRNAbench'], axis=1) - new_data = new_data.set_index('unique_id') - new_data = new_data.rename(columns={'sRNAbench': sample}) - - if (soft_name == 'optimir'): - ## OptimiR mirtop creates a column containing sample name and other tags (trim, joined, fastq...) - regex=re.compile(sample + '.*') - search_list = list(filter(regex.match, data.columns.values.tolist())) - new_data = data.filter(['unique_id', search_list[0]], axis=1) - new_data = new_data.set_index('unique_id') - new_data = new_data.rename(columns={search_list[0]: sample}) - - if (soft_name == 'miraligner'): - ## miraligner mirtop creates a column containing sample name and other tags (trim, joined, fastq...) - new_data = data.filter(['unique_id', sample], axis=1) - new_data = new_data.set_index('unique_id') + #### + if type_analysis=="miRNA": + + ## ------------------------------------------ ## + ## Create matrix for miRNA results + ## ------------------------------------------ ## + + ## get info, generate unique name and merge for samples + ## header of tsv files: + ## UID Read miRNA Variant iso_5p iso_3p iso_add3p iso_snp sRNAbench + + data['Variant'].fillna('NA', inplace=True) + data['unique_id'] = data.apply(lambda data: data['miRNA'] + '&' + data['Variant'] + '&' + data['UID'], axis=1) + + ## parse according to software + if (soft_name == 'srnabench'): + ## sRNAbench mirtop creates a column id with sRNAbench instead of sample name + new_data = data.filter(['unique_id', 'sRNAbench'], axis=1) + new_data = new_data.set_index('unique_id') + new_data = new_data.rename(columns={'sRNAbench': sample}) + + if (soft_name == 'optimir'): + ## OptimiR mirtop creates a column containing sample name and other tags (trim, joined, fastq...) + regex=re.compile(sample + '.*') + search_list = list(filter(regex.match, data.columns.values.tolist())) + new_data = data.filter(['unique_id', search_list[0]], axis=1) + new_data = new_data.set_index('unique_id') + new_data = new_data.rename(columns={search_list[0]: sample}) + + if (soft_name == 'miraligner'): + ## miraligner mirtop creates a column containing sample name and other tags (trim, joined, fastq...) + new_data = data.filter(['unique_id', sample], axis=1) + new_data = new_data.set_index('unique_id') + + #### + elif "tRF" in type_analysis: ## tRF-amb; tRF-exc, tRF + + if Debug: + HCGB_aes.debug_message(type_analysis + " analysis: ", color) + ## ------------------------------------------ ## + ## Create matrix for tRNA results + ## ------------------------------------------ ## + ## UID Read tRNA variant ident expression soft\n' + + data['variant'].fillna('NA', inplace=True) + + ## parse according to software + if (soft_name == 'mintmap'): + data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1) + new_data = data.filter(['unique_id', 'expression'], axis=1) + new_data = new_data.set_index('unique_id') + new_data = new_data.rename(columns={'expression': sample}) + + ## TODO + #else: + # data['unique_id'] = data.apply(lambda data: data['tRNA'] + '&' + data['variant'] + '&' + data['UID'], axis=1) + # new_data = data.filter(['unique_id', 'XXXX'], axis=1) ## TODO + # new_data = new_data.set_index('unique_id') ## TODO + # new_data = new_data.rename(columns={'XXXX': sample}) ## TODO + + else: + print() + ## add new + ## sequence information seq_data = data.filter(['UID', 'Read'], axis=1) seq_data = seq_data.set_index('UID') @@ -190,8 +240,7 @@ def generate_matrix(dict_files, soft_name, Debug): print (all_data) print ("*** DEBUG: data for sequences all samples ***") print (seq_all_data) - - + return (all_data, seq_all_data) ###### diff --git a/XICRA_pip/XICRA/scripts/miraligner_caller.py b/XICRA_pip/XICRA/scripts/miraligner_caller.py new file mode 100644 index 0000000..063b279 --- /dev/null +++ b/XICRA_pip/XICRA/scripts/miraligner_caller.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +########################################################## +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## +########################################################## +''' +Calls miraligner to create miRNA profile +''' +## useful imports +import time +import io +import os +import re +import sys +from sys import argv +from io import open +from termcolor import colored + +## import my modules +from HCGB.functions import fasta_functions +from HCGB import functions +from XICRA.config import set_config + +############### +def miraligner_caller(reads, sample_folder, name, threads, database, species, Debug): + """Passes the sample information to be analyzed by miraligner(). + + Checks if the computation has been performed before. If not, it calls + miraligner(). + + :param reads: file with sample reads + :param sample_folder: output folder + :param name: sample name + :param threads: selected threads (by defoult 2) + :param database: path to store miRNA annotation files downloaded + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + + + :returns: True/False + """ + # check if previously generated and succeeded + filename_stamp = sample_folder + '/.success' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'miraligner'), 'yellow')) + else: + # Call miralinger + code_returned = miraligner(reads, sample_folder, name, database, species, Debug) + if code_returned: + functions.time_functions.print_time_stamp(filename_stamp) + else: + print ('** Sample %s failed...' %name) + return(False) + + return(True) + +############### +def miraligner (reads, outpath, file_name, database, species, Debug): + """Executes the miraligner analyis of the sample + + Checks if the reads are joined and builds the miraligner command to execute it. + + :param reads: file with sample reads + :param outpath: output folder + :param file_name: sample name + :param database: path to store miRNA annotation files downloaded + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + + + :returns: The miraligner output + """ + miraligner_exe = set_config.get_exe("miraligner", Debug=Debug) + logfile = os.path.join(outpath, 'miraligner.log') + + ## output + outpath_file = os.path.join(outpath, file_name) + + if (len(reads) > 1): + print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) + exit() + + ## create tabular information of reads + tabular_info = os.path.join(outpath, file_name + '-tab.freq.txt') + fasta_functions.reads2tabular(reads[0], tabular_info) + + ## create command + java_exe = set_config.get_exe('java', Debug=Debug) + cmd = '%s -jar %s -db %s -sub 1 -add 3 -trim 3 -s %s -i %s -o %s 2> %s' %( + java_exe, miraligner_exe, database, species, tabular_info, outpath_file, logfile) + + return(functions.system_call_functions.system_call(cmd)) diff --git a/XICRA_pip/XICRA/scripts/mirtop_caller.py b/XICRA_pip/XICRA/scripts/mirtop_caller.py new file mode 100644 index 0000000..9aa177d --- /dev/null +++ b/XICRA_pip/XICRA/scripts/mirtop_caller.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +########################################################## +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## +########################################################## +''' +Calls mirtop to accommodate results +''' +## useful imports +import time +import io +import os +import re +import sys +from sys import argv +from io import open +from termcolor import colored + +## import my modules +from HCGB import functions +from XICRA.config import set_config + +############### +def miRTop_caller(results_folder, mirtop_folder, name, threads, miRNA_gff, hairpinFasta, format, species, Debug): + """Checks if the computation has already been performed. If not, it calls + miRTop() + :param results_folder: file with the output of the sample from each software + :param folder: output miRTop folder + :param name: sample name + :param threads: selected threads (by defoult 2) + :param miRNA_gff: miRNA gff3 annotation file + :param hairpinFasta: hairpin fasta file + :param format: 'sRNAbench', 'optimir' or 'seqbuster' + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + :returns: True/False + """ + # check if previously generated and succeeded + mirtop_folder_gff = functions.files_functions.create_subfolder('gff', mirtop_folder) + mirtop_folder_stats = functions.files_functions.create_subfolder('stats', mirtop_folder) + mirtop_folder_counts = functions.files_functions.create_subfolder('counts', mirtop_folder) + mirtop_folder_counts = functions.files_functions.create_subfolder('export', mirtop_folder) + + filename_stamp = mirtop_folder_counts + '/.success' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'miRTop'), 'yellow')) + else: + # Call miRTop + code_returned = miRTop(results_folder, mirtop_folder, name, threads, format.lower(), miRNA_gff, hairpinFasta, species, Debug) + if code_returned: + functions.time_functions.print_time_stamp(filename_stamp) + else: + print ('** Sample %s failed...' %name) + return(False) + + return(True) + +############### +def miRTop(results_folder, sample_folder, name, threads, format, miRNA_gff, hairpinFasta, species,Debug): + """Checks if the computation has already been performed. If not, it calls + miRTop() + + :param results_folder: file with the output of the sample from each software + :param folder: output miRTop folder + :param name: sample name + :param threads: selected threads (by defoult 2) + :param miRNA_gff: miRNA gff3 annotation file + :param hairpinFasta: hairpin fasta file + :param format: 'sRNAbench', 'optimir' or 'seqbuster' + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + + + :returns: Folder with the results for the sample in miRTop format + """ + + miRTop_exe = set_config.get_exe('miRTop', Debug=Debug) + logfile = os.path.join(sample_folder, name + '.log') + + ## folders + mirtop_folder_gff = os.path.join(sample_folder, 'gff') + mirtop_folder_stats = os.path.join(sample_folder, 'stats') + mirtop_folder_counts = os.path.join(sample_folder, 'counts') + mirtop_folder_export = os.path.join(sample_folder, 'export') + + ## get info according to software + if format == "sRNAbench": + ## get sRNAbench info + reads_annot = os.path.join(results_folder, "reads.annotation") + + ## check non zero + if not functions.files_functions.is_non_zero_file(reads_annot): + print (colored("\tNo isomiRs detected for sample [%s -- %s]" %(name, 'sRNAbench'), 'yellow')) + return (False) + + elif format == "optimir": + ## get optimir info + gff3_file = functions.main_functions.retrieve_matching_files(os.path.join(results_folder, "OptimiR_Results"), "gff3", Debug)[0] + results_folder = gff3_file + + ## check non zero + if not functions.files_functions.is_non_zero_file(gff3_file): + print (colored("\tNo isomiRs detected for sample [%s -- %s]" %(name, 'optimir'), 'yellow')) + return (False) + + elif format == "seqbuster": + ## get miraligner info + mirna_file = functions.main_functions.retrieve_matching_files(results_folder, ".mirna", Debug)[0] + results_folder = mirna_file + + ## check non zero + if not functions.files_functions.is_non_zero_file(mirna_file): + print (colored("\tNo isomiRs detected for sample [%s -- %s]" %(name, 'miraligner'), 'yellow')) + return (False) + + + ## miRTop analysis gff + filename_stamp_gff = mirtop_folder_gff + '/.success' + if os.path.isfile(filename_stamp_gff): + stamp = functions.time_functions.read_time_stamp(filename_stamp_gff) + print (colored("\tA previous command generated results on: %s [%s -- %s - gff]" %(stamp, name, 'miRTop'), 'yellow')) + else: + print ('Creating isomiRs gtf file for sample %s' %name) + cmd = miRTop_exe + ' gff --sps %s --hairpin %s --gtf %s --format %s -o %s %s 2> %s' %( + species, hairpinFasta, miRNA_gff, format, + mirtop_folder_gff, results_folder, logfile) + + ## execute + code_miRTop = functions.system_call_functions.system_call(cmd) + if code_miRTop: + functions.time_functions.print_time_stamp(filename_stamp_gff) + else: + return(False) + + ## miRTop stats + mirtop_folder_gff_file = os.path.join(mirtop_folder_gff, 'mirtop.gff') + + #filename_stamp_stats = mirtop_folder_stats + '/.success' + #if os.path.isfile(filename_stamp_stats): + # stamp = functions.time_functions.read_time_stamp(filename_stamp_stats) + # print (colored("\tA previous command generated results on: %s [%s -- %s - stats]" %(stamp, name, 'miRTop'), 'yellow')) + #else: + # print ('Creating isomiRs stats for sample %s' %name) + # cmd_stats = miRTop_exe + ' stats -o %s %s 2>> %s' %(mirtop_folder_stats, mirtop_folder_gff_file, logfile) + # code_miRTop_stats = functions.system_call_functions.system_call(cmd_stats) + # if code_miRTop_stats: + # functions.time_functions.print_time_stamp(filename_stamp_stats) + # else: + # return(False) + + ## miRTop counts + filename_stamp_counts = mirtop_folder_counts + '/.success' + if os.path.isfile(filename_stamp_counts): + stamp = functions.time_functions.read_time_stamp(filename_stamp_counts) + print (colored("\tA previous command generated results on: %s [%s -- %s - counts]" %(stamp, name, 'miRTop'), 'yellow')) + else: + print ('Creating isomiRs counts for sample %s' %name) + ## if both succeeded + cmd_stats = miRTop_exe + ' counts -o %s --gff %s --hairpin %s --gtf %s --sps %s 2>> %s' %(mirtop_folder_counts, mirtop_folder_gff_file, hairpinFasta, miRNA_gff, species, logfile) + code_miRTop_counts = functions.system_call_functions.system_call(cmd_stats) + + if code_miRTop_counts: + functions.time_functions.print_time_stamp(filename_stamp_counts) + else: + return(False) + + ## miRTop export + filename_stamp_export = mirtop_folder_export + '/.success' + if os.path.isfile(filename_stamp_export): + stamp = functions.time_functions.read_time_stamp(filename_stamp_export) + print (colored("\tA previous command generated results on: %s [%s -- %s - export]" %(stamp, name, 'miRTop'), 'yellow')) + else: + print ('Creating isomiRs export information for sample %s' %name) + ## if both succeeded + cmd_export = miRTop_exe + ' export -o %s --hairpin %s --gtf %s --sps %s --format isomir %s 2> %s' %(mirtop_folder_export, hairpinFasta, miRNA_gff, species, mirtop_folder_gff_file, logfile) + code_miRTop_export = functions.system_call_functions.system_call(cmd_export) + + if code_miRTop_export: + functions.time_functions.print_time_stamp(filename_stamp_export) + else: + return(False) + + ## return all success + outdir_tsv = os.path.join(mirtop_folder_counts, 'mirtop.tsv') + return (outdir_tsv) + + ## if any command failed + #return(False) + diff --git a/XICRA_pip/XICRA/scripts/optimir_caller.py b/XICRA_pip/XICRA/scripts/optimir_caller.py new file mode 100644 index 0000000..818a55d --- /dev/null +++ b/XICRA_pip/XICRA/scripts/optimir_caller.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +########################################################## +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## +########################################################## +''' +Calls optimir to create miRNA profile +''' +## useful imports +import time +import io +import os +import re +import sys +from sys import argv +from io import open +from termcolor import colored + +## import my modules +from HCGB import functions +from XICRA.config import set_config + +############### +def optimir (reads, outpath, file_name, num_threads, matureFasta, hairpinFasta, miRNA_gff, Debug): + """Executes the optimiR analyis of the sample. + + Checks if the reads are joined and builds the optimiR command to execute it. + + :param reads: file with sample reads + :param outpath: output folder + :param file_name: sample name + :param num_threads: selected threads (by defoult 2) + :param matureFasta: mature fasta file + :param hairpinFasta: hairpin fasta file + :param miRNA_gff: miRNA gff3 annotation file + :param Debug: display complete log. + + + :returns: The optimiR output + """ + optimir_exe = set_config.get_exe("optimir", Debug=Debug) + sRNAbench_db = os.path.abspath(os.path.join(os.path.dirname(optimir_exe), '..')) ## optimir + logfile = os.path.join(outpath, 'optimir.log') + errfile = os.path.join(outpath, 'optimir.err') + + if (len(reads) > 1): + print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) + exit() + + ## create command + cmd = "%s process --fq %s --gff_out -o %s --maturesFasta %s --hairpinsFasta %s --gff3 %s > %s 2> %s" %( + optimir_exe, reads[0], outpath, matureFasta, hairpinFasta, miRNA_gff, logfile, errfile) + return(functions.system_call_functions.system_call(cmd)) + + +############### +def optimir_caller(reads, sample_folder, name, threads, matureFasta, hairpinFasta, miRNA_gff, species, Debug): + """Passes the sample information to be analyzed by optimiR(). + + Checks if the computation has been performed before. If not, it calls + optimir(). + + :param reads: file with sample reads + :param sample_folder: output folder + :param name: sample name + :param threads: selected threads (by defoult 2) + :param matureFasta: mature fasta file + :param hairpinFasta: hairpin fasta file + :param miRNA_gff: miRNA gff3 annotation file + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + + + :returns: True/False + """ + # check if previously generated and succeeded + filename_stamp = sample_folder + '/.success' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'OptimiR'), 'yellow')) + else: + # Call sRNAbench + ## no species option for OptimiR + code_returned = optimir(reads, sample_folder, name, threads, matureFasta, hairpinFasta, miRNA_gff, Debug) + if code_returned: + functions.time_functions.print_time_stamp(filename_stamp) + else: + print ('** Sample %s failed...' %name) + return(False) + + return(True) diff --git a/XICRA_pip/XICRA/scripts/sRNAbench_caller.py b/XICRA_pip/XICRA/scripts/sRNAbench_caller.py new file mode 100644 index 0000000..4ed3531 --- /dev/null +++ b/XICRA_pip/XICRA/scripts/sRNAbench_caller.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +########################################################## +## Jose F. Sanchez, Marta Lopez & Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## +########################################################## +''' +Calls sRNAbench to create miRNA profile +''' +## useful imports +import time +import io +import os +import re +import sys +from sys import argv +from io import open +from termcolor import colored + +## import my modules +from HCGB import functions +from XICRA.config import set_config + + +############### +def sRNAbench_caller(reads, sample_folder, name, threads, species, Debug): + """Passes the sample information to be analyzed by sRNAbench() + + Checks if the computation has been performed before. If not, it calls + sRNAbench(). + + :param reads: file with sample reads + :param sample_folder: output folder + :param name: sample name + :param threads: selected threads (by defoult 2) + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + + + :returns: True/False + """ + # check if previously generated and succeeded + filename_stamp = sample_folder + '/.success' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'sRNAbench'), 'yellow')) + else: + # Call sRNAbench + code_returned = sRNAbench(reads, sample_folder, name, threads, species, Debug) + if code_returned: + functions.time_functions.print_time_stamp(filename_stamp) + else: + print ('** Sample %s failed...' %name) + return(False) + + return(True) + +############### +def sRNAbench (reads, outpath, file_name, num_threads, species, Debug): + """Executes the sRNAbench analyis of the sample. + + Checks if the reads are joined and builds the sRNAbench command to execute it. + + :param reads: file with sample reads + :param outpath: output folder + :param file_name: sample name + :param num_threads: selected threads (by defoult 2) + :param species: species tag ID. Default: hsa (Homo sapiens) + :param Debug: display complete log. + + + :returns: The sRNAbench output + """ + sRNAbench_exe = set_config.get_exe("sRNAbench", Debug=Debug) + + ## set as option + ## sRNAbench.jar in exec/ folder within sRNAtoolboxDB + ## sRNAbench_db = os.path.abspath(os.path.join(os.path.dirname(sRNAbench_exe), '..')) ## sRNAtoolboxDB + + ## sRNAbench.jar linked in bin/. sRNAtoolboxDB in same folder + sRNAbench_db = os.path.abspath(os.path.join(os.path.dirname(sRNAbench_exe), 'sRNAtoolboxDB')) ## sRNAtoolboxDB + + logfile = os.path.join(outpath, 'sRNAbench.log') + + if (len(reads) > 1): + print (colored("** ERROR: Only 1 fastq file is allowed please joined reads before...", 'red')) + exit() + + ## create command + java_exe = set_config.get_exe('java', Debug=Debug) + cmd = '%s -jar %s dbPath=%s input=%s output=%s' %(java_exe, sRNAbench_exe, sRNAbench_db, reads[0], outpath) + cmd = cmd + ' microRNA=%s isoMiR=true plotLibs=true graphics=true' %species + cmd = cmd + ' plotMiR=true bedGraphMode=true writeGenomeDist=true' + cmd = cmd + ' chromosomeLevel=true chrMappingByLength=true > ' + logfile + + return(functions.system_call_functions.system_call(cmd)) diff --git a/XICRA_pip/main/XICRA b/XICRA_pip/main/XICRA index fdac800..3018ef8 100644 --- a/XICRA_pip/main/XICRA +++ b/XICRA_pip/main/XICRA @@ -22,6 +22,7 @@ help_options = ('--help_format', '--help_trimm_adapters', '--help_join_reads', '--help_miRNA', + '--help_tRNA', '--help_RNAbiotype', '--help_multiqc') @@ -259,12 +260,14 @@ options_group_miRNA.add_argument("--matureFasta", help="miRNA mature fasta file. options_group_miRNA.add_argument("--miRBase_str", help="miRBase str information.") ## TODO: Enhancement - ##options_group_miRNA.add_argument("--sRNAbench_options", type=int, help="Additional sRNAbench options.") ##options_group_miRNA.add_argument("--miRTop_options", type=int, help="Additional miRTop options.") software_group_miRNA = subparser_miRNA.add_argument_group("Software") -software_group_miRNA.add_argument("--software", dest='soft_name', nargs='*', help="Software to analyze miRNAs. Provide several input if desired", choices=['sRNAbench','optimir', 'miraligner'], required= not any(elem in help_options for elem in sys.argv)) +software_group_miRNA.add_argument("--software", dest='soft_name', nargs='*', + help="Software to analyze miRNAs. Provide several input if desired", + choices=['sRNAbench','optimir', 'miraligner'], + required= not any(elem in help_options for elem in sys.argv)) info_group_miRNA = subparser_miRNA.add_argument_group("Additional information") info_group_miRNA.add_argument("--help_format", action="store_true", help="Show additional help on name format for files.") @@ -275,12 +278,42 @@ info_group_miRNA.add_argument("--debug", action="store_true", help="Show additio subparser_miRNA.set_defaults(func=XICRA.modules.miRNA.run_miRNA) ##-------------------------------------------------------------## -##------------------------------ tRF ----------------------- ## -##subparser_tRF = subparsers.add_parser( -## 'tRF', -## help='tRF analysis.', -## description='This module generates a tRF analysis', -##) +##------------------------------ tRNA ----------------------- ## +subparser_tRNA = subparsers.add_parser( + 'tRNA', + help='tRNA analysis.', + description='This module generates a tRNA analysis', +) +in_out_group_tRNA = subparser_tRNA.add_argument_group("Input/Output") +in_out_group_tRNA.add_argument("-i", "--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_tRNA.add_argument("-o", "--output_folder", help="Output folder.", required = '--detached' in sys.argv) +in_out_group_tRNA.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end.") +in_out_group_tRNA.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_tRNA.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") +in_out_group_tRNA.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") +in_out_group_tRNA.add_argument("--detached", action="store_true", help="Isolated mode. --input is a folder containing fastq reads. Provide a unique path o several using --batch option") +in_out_group_tRNA.add_argument("--include_lane", action="store_true", help="Include the lane tag (*L00X*) in the sample name. See --help_format for additional details [Default OFF]") +in_out_group_tRNA.add_argument("--include_all", action="store_true", help="Include all characters as tag name before read pair, if any. See --help_format for additional details [Default OFF]") +in_out_group_tRNA.add_argument("--noTrim", action='store_true', help="Use non-trimmed reads [or not containing '_trim' in the name].") + +options_group_tRNA = subparser_tRNA.add_argument_group("Options") +options_group_tRNA.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_tRNA.add_argument("--species", help="Species tag ID [Default: hsa (Homo sapiens)].", default='hsa') +options_group_tRNA.add_argument("--database", help="Path to store tRNA annotation files downloaded: GtRNAdb, etc") + +software_group_tRNA = subparser_tRNA.add_argument_group("Software") +software_group_tRNA.add_argument("--software", dest='soft_name', nargs='*', + help="Software to analyze tRNAs. Provide several input if desired", + choices=['mintmap'], + required= not any(elem in help_options for elem in sys.argv)) + +info_group_tRNA = subparser_tRNA.add_argument_group("Additional information") +info_group_tRNA.add_argument("--help_format", action="store_true", help="Show additional help on name format for files.") +info_group_tRNA.add_argument("--help_project", action="store_true", help="Show additional help on the project scheme.") +info_group_tRNA.add_argument("--help_tRNA", action="store_true", help="Show additional help on the miRNA paired-end reads process.") +info_group_tRNA.add_argument("--debug", action="store_true", help="Show additional message for debugging purposes.") + +subparser_tRNA.set_defaults(func=XICRA.modules.tRNA.run_tRNA) ##-------------------------------------------------------------## ##------------------------------ piRNA ----------------------- ##