From e5ed55509ce349c9df17b2887490f5c41cbc8af6 Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 18:03:13 +0200 Subject: [PATCH 1/7] fixes and updates --- XICRA_pip/XICRA/modules/qc.py | 97 +++----------- XICRA_pip/XICRA/modules/test.py | 10 ++ XICRA_pip/XICRA/modules/trimm.py | 118 +---------------- XICRA_pip/XICRA/scripts/cutadapt_caller.py | 141 +++++++++++++++++++++ XICRA_pip/main/XICRA | 10 +- 5 files changed, 182 insertions(+), 194 deletions(-) create mode 100644 XICRA_pip/XICRA/modules/test.py create mode 100644 XICRA_pip/XICRA/scripts/cutadapt_caller.py diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py index 546bc02..a0cfdd7 100644 --- a/XICRA_pip/XICRA/modules/qc.py +++ b/XICRA_pip/XICRA/modules/qc.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 ########################################################## -## Jose F. Sanchez ## -## Copyright (C) 2019 Lauro Sumoy Lab, IGTP, Spain ## +## Jose F. Sanchez, Marta Lopez and Lauro Sumoy ## +## Copyright (C) 2019-2021 Lauro Sumoy Lab, IGTP, Spain ## ########################################################## """ Creates Quality check sequence adapters within fastq reads. @@ -77,7 +77,6 @@ def run_QC(options): options.project = True outdir = input_dir - #fastqc(input_dir, outdir, options, start_time_total) functions.aesthetics_functions.boxymcboxface("FASTQC Quality check for samples") ## get files @@ -86,80 +85,10 @@ def run_QC(options): print ("[ fastq, fq, fastq.gz, fq.gz ]\n") pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug) - ## debug message - if (Debug): - print (colored("\n**DEBUG: pd_samples_retrieve **", 'yellow')) - print (pd_samples_retrieved) - print ("\n") - - ## generate output folder, if necessary - print ("\n+ Create output folder(s):") - - ## if not project, outdir contains the dir to put output - ## in this case, in some other cases might not occur - if not options.project: - functions.files_functions.create_folder(outdir) - outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, "fastqc", options.debug) - - print ("+ Checking quality for each sample retrieved...") start_time_partial = start_time_total - # Group dataframe by sample name - sample_frame = pd_samples_retrieved.groupby(["name"]) - - ## optimize threads - name_list = set(pd_samples_retrieved["name"].tolist()) - threads_job = functions.main_functions.optimize_threads(options.threads, len(name_list)) ## threads optimization - max_workers_int = int(options.threads/threads_job) - - ## debug message - if (Debug): - print (colored("**DEBUG: options.threads " + str(options.threads) + " **", 'yellow')) - print (colored("**DEBUG: max_workers " + str(max_workers_int) + " **", 'yellow')) - print (colored("**DEBUG: cpu_here " + str(threads_job) + " **", 'yellow')) - - ## send for each sample - print ("+ Calling fastqc for samples...") - with concurrent.futures.ThreadPoolExecutor(max_workers=int(max_workers_int)) as executor: - commandsSent = { executor.submit(fastqc_caller.run_module_fastqc, outdir_dict[name], sorted( cluster["sample"].tolist() ), name, threads_job): name for name, cluster in sample_frame } - - for cmd2 in concurrent.futures.as_completed(commandsSent): - details = commandsSent[cmd2] - try: - data = cmd2.result() - except Exception as exc: - print ('***ERROR:') - print (cmd2) - print('%r generated an exception: %s' % (details, exc)) - - print ("+ FASTQC for samples has finished...") - - ## functions.time_functions.timestamp - start_time_partial = functions.time_functions.timestamp(start_time_partial) - - if (options.skip_report): - print ("+ No report generation...") - else: - print ("\n+ Generating a report using MultiQC module.") - outdir_report = functions.files_functions.create_subfolder("report", outdir) - - ## get subdirs generated and call multiQC report module - givenList = [] - print ("+ Detail information for each sample could be identified in separate folders:") - - ## call multiQC report module - givenList = [ v for v in outdir_dict.values() ] - my_outdir_list = set(givenList) - - ## debug message - if (Debug): - print (colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow')) - print (my_outdir_list) - print ("\n") - - fastqc_report = functions.files_functions.create_subfolder("FASTQC", outdir_report) - multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC", fastqc_report,"") - print ('\n+ A summary HTML report of each sample is generated in folder: %s' %fastqc_report) + ## create FASTQC call + fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug) print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) @@ -167,11 +96,9 @@ def run_QC(options): print ("+ Exiting qc module.") exit() +####################### def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug): - print("+ FASTQC Quality check for trimmed samples") - - ## debug message if (Debug): print (colored("\n**DEBUG: pd_samples_retrieve **", 'yellow')) @@ -185,7 +112,15 @@ def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysi ## in this case, in some other cases might not occur if not options.project: functions.create_folder(outdir) - outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, "fastqc_" + name_analysis, options.debug) + + ## folder name + if (name_analysis): + fold_name = "fastqc_" + name_analysis + else: + fold_name = "fastqc" + + ## create output dirs for each sample + outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, fold_name, options.debug) print ("+ Checking quality for each sample retrieved...") start_time_partial = start_time_total @@ -207,7 +142,9 @@ def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysi ## send for each sample print ("+ Calling fastqc for samples...") with concurrent.futures.ThreadPoolExecutor(max_workers=int(max_workers_int)) as executor: - commandsSent = { executor.submit(fastqc_caller.run_module_fastqc, outdir_dict[name], sorted( cluster["sample"].tolist() ), name, threads_job): name for name, cluster in sample_frame } + commandsSent = { executor.submit(fastqc_caller.run_module_fastqc, + outdir_dict[name], sorted( cluster["sample"].tolist() ), + name, threads_job): name for name, cluster in sample_frame } for cmd2 in concurrent.futures.as_completed(commandsSent): details = commandsSent[cmd2] diff --git a/XICRA_pip/XICRA/modules/test.py b/XICRA_pip/XICRA/modules/test.py new file mode 100644 index 0000000..465ad7a --- /dev/null +++ b/XICRA_pip/XICRA/modules/test.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 + +""" +test example file +""" + +import os + + +import HCGB diff --git a/XICRA_pip/XICRA/modules/trimm.py b/XICRA_pip/XICRA/modules/trimm.py index efb3ef3..d0f803b 100644 --- a/XICRA_pip/XICRA/modules/trimm.py +++ b/XICRA_pip/XICRA/modules/trimm.py @@ -18,6 +18,7 @@ ## import my modules from XICRA.scripts import multiQC_report +from XICRA.scripts import cutadapt_caller from XICRA.config import set_config from XICRA.modules import help_XICRA from XICRA.modules import qc @@ -100,6 +101,10 @@ def run_trimm(options): if (options.adapters_a): adapters_dict['adapter_A'] = options.adapters_A + ## set default + #if not options.min_len_read: + # options.min_len_read=15 + ## get files print ('+ Getting files from input folder... ') print ('+ Mode: fastq.\n+ Extension: ') @@ -139,9 +144,9 @@ def run_trimm(options): ## send for each sample with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor: - commandsSent = { executor.submit(cutadapt_caller, sorted(cluster["sample"].tolist()), + commandsSent = { executor.submit(cutadapt_caller.caller, sorted(cluster["sample"].tolist()), outdir_dict[name], name, threads_job, - Debug, adapters_dict, options.extra): name for name, cluster in sample_frame } + min_len_read, Debug, adapters_dict, options.extra): name for name, cluster in sample_frame } for cmd2 in concurrent.futures.as_completed(commandsSent): details = commandsSent[cmd2] @@ -212,112 +217,3 @@ def run_trimm(options): exit() -############################################# -def cutadapt_caller(list_reads, sample_folder, name, threads, Debug, adapters, extra): - ## check if previously trimmed and succeeded - filename_stamp = sample_folder + '/.success' - if os.path.isfile(filename_stamp): - stamp = functions.time_functions.read_time_stamp(filename_stamp) - print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'cutadapt'), 'yellow')) - else: - # Call cutadapt - cutadapt_exe = set_config.get_exe('cutadapt') - code_returned = cutadapt(cutadapt_exe, list_reads, sample_folder, name, threads, Debug, adapters, extra) - if code_returned: - functions.time_functions.print_time_stamp(filename_stamp) - else: - print ('** Sample %s failed...' %name) - - -############################################# -def cutadapt (cutadapt_exe, reads, path, sample_name, num_threads, Debug, adapters, extra): - """ - - :param cutadapt_exe: - :param reads: - :param path: - :param sample_name: - :param num_threads: - :param Debug: - :param adapters - :param extra: - - :type cutadapt_exe: - :type reads: - :type path: - :type sample_name: - :type num_threads: - :type Debug: - :type adapters: dictionary - :type extra: string - - """ - logfile = os.path.join(path, sample_name + '.cutadapt.log') - - if (len(reads) == 2): - if not adapters['adapter_a'] or not adapters['adapter_A']: - print ("** ERROR: Missing adapter information") - exit() - - if extra: - o_param = os.path.join(path, sample_name + '_temp1_trim_R1.fastq') - p_param = os.path.join(path, sample_name + '_temp1_trim_R2.fastq') - else: - p_param = os.path.join(path, sample_name + '_trim_R2.fastq') - o_param = os.path.join(path, sample_name + '_trim_R1.fastq') - - ## paired-end mode, 15 bps as the min length cutoff - cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s -m 15' %(cutadapt_exe, - num_threads, adapters['adapter_a'], - adapters['adapter_A'], o_param, - p_param, reads[0], reads[1], logfile) - - elif (len(reads) == 1): - if not adapters['adapter_a']: - print ("** ERROR: Missing adapter information") - exit() - - if extra: - o_param = os.path.join(path, sample_name + '_temp1_trim.fastq') - else: - o_param = os.path.join(path, sample_name + '_trim.fastq') - - ## single-end mode: - cmd = '%s -j %s -a %s -o %s %s > %s' %(cutadapt_exe, num_threads, - adapters['adapter_a'], o_param, reads[0], logfile) - else: - print ('** Wrong number of files provided for sample: %s...' %sample_name) - return(False) - - ## - code = functions.system_call_functions.system_call(cmd) - - ## if additional options, run a second cutadapt command - ## to ensure this options take effect. - if (extra): - if (len(reads) == 2): - o_param2 = os.path.join(path, sample_name + '_trim_R1.fastq') - p_param2 = os.path.join(path, sample_name + '_trim_R2.fastq') - - ## paired-end mode - extra_cmd = '%s %s -j %s -a %s -A %s -o %s -p %s %s %s >> %s' %(cutadapt_exe, extra, num_threads, - adapters['adapter_a'], adapters['adapter_A'], - o_param2, p_param2, o_param, p_param, logfile) - - elif (len(reads) == 1): - o_param2 = os.path.join(path, sample_name + '_trim.fastq') - ## single-end mode: - extra_cmd = '%s %s -j %s -a %s -o %s %s >> %s' %(cutadapt_exe, extra, num_threads, adapters['adapter_a'], - o_param2, o_param, logfile) - - code2 = functions.system_call_functions.system_call(extra_cmd) - - ## remove: o_param p_param - if (len(reads) == 2): - os.remove(p_param) - - os.remove(o_param) - return (code2) - - else: - return (code) diff --git a/XICRA_pip/XICRA/scripts/cutadapt_caller.py b/XICRA_pip/XICRA/scripts/cutadapt_caller.py new file mode 100644 index 0000000..7dc3963 --- /dev/null +++ b/XICRA_pip/XICRA/scripts/cutadapt_caller.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +########################################################## +## Jose F. Sanchez ## +## Copyright (C) 2019 Lauro Sumoy Lab, IGTP, Spain ## +########################################################## +''' +Calls cutadapt to trim raw reads +''' +## useful imports +import time +import io +import os +import re +import sys +from sys import argv +from io import open + +## import my modules +from HCGB import functions +from XICRA.config import set_config + +############################################# +def caller(list_reads, sample_folder, name, threads, Debug, adapters, extra): + ## check if previously trimmed and succeeded + filename_stamp = sample_folder + '/.success' + if os.path.isfile(filename_stamp): + stamp = functions.time_functions.read_time_stamp(filename_stamp) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'cutadapt'), 'yellow')) + else: + # Call cutadapt + cutadapt_exe = set_config.get_exe('cutadapt') + code_returned = cutadapt(cutadapt_exe, list_reads, sample_folder, name, threads, Debug, adapters, extra) + if code_returned: + functions.time_functions.print_time_stamp(filename_stamp) + else: + print ('** Sample %s failed...' %name) + + +############################################# +def cutadapt (cutadapt_exe, reads, path, sample_name, num_threads, min_len_given, Debug, adapters, extra): + """ + + :param cutadapt_exe: + :param reads: + :param path: + :param sample_name: + :param num_threads: + :param Debug: + :param adapters + :param extra: + + :type cutadapt_exe: + :type reads: + :type path: + :type sample_name: + :type num_threads: + :type Debug: + :type adapters: dictionary + :type extra: string + + """ + logfile = os.path.join(path, sample_name + '.cutadapt.log') + + if (len(reads) == 2): + if not adapters['adapter_a'] or not adapters['adapter_A']: + print ("** ERROR: Missing adapter information") + exit() + + if extra: + o_param = os.path.join(path, sample_name + '_temp1_trim_R1.fastq') + p_param = os.path.join(path, sample_name + '_temp1_trim_R2.fastq') + else: + p_param = os.path.join(path, sample_name + '_trim_R2.fastq') + o_param = os.path.join(path, sample_name + '_trim_R1.fastq') + + ## paired-end mode, 15 bps as the min length cutoff + cmd = '%s -j %s -m %s -a %s -A %s -o %s -p %s %s %s > %s' %(cutadapt_exe, + num_threads, min_len_given, + adapters['adapter_a'], + adapters['adapter_A'], o_param, + p_param, reads[0], reads[1], logfile) + elif (len(reads) == 1): + if not adapters['adapter_a']: + print ("** ERROR: Missing adapter information") + exit() + + if extra: + o_param = os.path.join(path, sample_name + '_temp1_trim.fastq') + else: + o_param = os.path.join(path, sample_name + '_trim.fastq') + + ## single-end mode: + cmd = '%s -j %s -m %s -a %s -o %s %s > %s' %(cutadapt_exe, num_threads, + min_len_given, + adapters['adapter_a'], + o_param, reads[0], logfile) + else: + print ('** Wrong number of files provided for sample: %s...' %sample_name) + return(False) + + ## + code = functions.system_call_functions.system_call(cmd) + + ## if additional options, run a second cutadapt command + ## to ensure this options take effect. + if (extra): + if (len(reads) == 2): + o_param2 = os.path.join(path, sample_name + '_trim_R1.fastq') + p_param2 = os.path.join(path, sample_name + '_trim_R2.fastq') + + ## paired-end mode + extra_cmd = '%s %s -j %s -m %s -a %s -A %s -o %s -p %s %s %s >> %s' %(cutadapt_exe, + extra, + num_threads, + min_len_given, + adapters['adapter_a'], + adapters['adapter_A'], + o_param2, p_param2, + o_param, p_param, logfile) + + elif (len(reads) == 1): + o_param2 = os.path.join(path, sample_name + '_trim.fastq') + ## single-end mode: + extra_cmd = '%s %s -j %s -m %s -a %s -o %s %s >> %s' %(cutadapt_exe, + extra, + num_threads, + min_len_given, + adapters['adapter_a'], + o_param2, o_param, logfile) + + code2 = functions.system_call_functions.system_call(extra_cmd) + + ## remove: o_param p_param + if (len(reads) == 2): + os.remove(p_param) + + os.remove(o_param) + return (code2) + + else: + return (code) diff --git a/XICRA_pip/main/XICRA b/XICRA_pip/main/XICRA index 3335e0c..3d7af60 100644 --- a/XICRA_pip/main/XICRA +++ b/XICRA_pip/main/XICRA @@ -8,6 +8,7 @@ import argparse import os import sys import XICRA.modules +from email.policy import default ## initiate parser parser = argparse.ArgumentParser(prog='XICRA', description='Paired-end small RNA sequence analysis pipeline.' @@ -131,10 +132,13 @@ in_out_group_trimm.add_argument("--detached", action="store_true", help="Isolate in_out_group_trimm.add_argument("--include_lane", action="store_true", help="Include the lane tag (*L00X*) in the sample name. See --help_format for additional details [Default OFF]") in_out_group_trimm.add_argument("--include_all", action="store_true", help="Include all characters as tag name before read pair, if any. See --help_format for additional details [Default OFF]") +parameters_group_trimm = subparser_trimm.add_argument_group("Parameters") +parameters_group_trimm.add_argument("--adapters_a", help="Sequence of an adapter ligated to the 3' end. See --help_trimm_adapters for further information.") +parameters_group_trimm.add_argument("--adapters_A", help="Sequence of an adapter ligated to the 3' read in pair. See --help_trimm_adapters for further information.") +parameters_group_trimm.add_argument("--min_read_len", type=int, help="Minimum length of read to maintain.", default=15) +parameters_group_trimm.add_argument("--extra", help="Provide extra options for cutadapt trimming process. See --help_trimm_adapters for further information.") + options_group_trimm = subparser_trimm.add_argument_group("Options") -options_group_trimm.add_argument("--adapters_a", help="Sequence of an adapter ligated to the 3' end. See --help_trimm_adapters for further information.") -options_group_trimm.add_argument("--adapters_A", help="Sequence of an adapter ligated to the 3' read in pair. See --help_trimm_adapters for further information.") -options_group_trimm.add_argument("--extra", help="Provide extra options for cutadapt trimming process. See --help_trimm_adapters for further information.") options_group_trimm.add_argument("--skip_report", action="store_true", help="Do not report statistics using MultiQC report module [Default OFF]. See details in --help_multiqc") options_group_trimm.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) From 11804ad687a7175ede8b847bea50706baf7a81bf Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 18:03:37 +0200 Subject: [PATCH 2/7] missing file in push --- XICRA_pip/XICRA/scripts/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/XICRA_pip/XICRA/scripts/__init__.py b/XICRA_pip/XICRA/scripts/__init__.py index 4a6df04..47c9ab5 100644 --- a/XICRA_pip/XICRA/scripts/__init__.py +++ b/XICRA_pip/XICRA/scripts/__init__.py @@ -3,7 +3,8 @@ 'multiQC_report', 'generate_DE', 'RNAbiotype', - 'mapReads' + 'mapReads', + 'cutadapt_caller' ] From 0a650bf8e3fe1fad745206f9bc0841afd188aaf2 Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 18:16:03 +0200 Subject: [PATCH 3/7] fixes and updates --- XICRA_pip/XICRA/modules/qc.py | 2 +- XICRA_pip/main/XICRA | 48 +++++++++++++++++------------------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py index a0cfdd7..d1fe1c9 100644 --- a/XICRA_pip/XICRA/modules/qc.py +++ b/XICRA_pip/XICRA/modules/qc.py @@ -88,7 +88,7 @@ def run_QC(options): start_time_partial = start_time_total ## create FASTQC call - fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug) + fastqc(pd_samples_retrieved, outdir, options, start_time_total, "", Debug) print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) diff --git a/XICRA_pip/main/XICRA b/XICRA_pip/main/XICRA index 3d7af60..fdac800 100644 --- a/XICRA_pip/main/XICRA +++ b/XICRA_pip/main/XICRA @@ -58,10 +58,10 @@ subparser_prep = subparsers.add_parser( ) in_out_group_prep = subparser_prep.add_argument_group("Input/Output") -in_out_group_prep.add_argument("--input", help="Folder containing fastq files. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. All files would be retrieved.", required= not any(elem in help_options for elem in sys.argv)) -in_out_group_prep.add_argument("--output_folder", help="Output folder. Name for the project folder.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_prep.add_argument("-i", "--input", help="Folder containing fastq files. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. All files would be retrieved.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_prep.add_argument("-o", "--output_folder", help="Output folder. Name for the project folder.", required= not any(elem in help_options for elem in sys.argv)) in_out_group_prep.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end.") -in_out_group_prep.add_argument("--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_prep.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") in_out_group_prep.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") in_out_group_prep.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") in_out_group_prep.add_argument("--detached", help="Isolated mode. No project folder initiated for further steps [Default OFF].") @@ -69,7 +69,7 @@ in_out_group_prep.add_argument("--include_lane", action="store_true", help="Iden in_out_group_prep.add_argument("--include_all", action="store_true", help="Include all characters as tag name before read pair, if any. See --help_format for additional details [Default OFF]") options_group_prep = subparser_prep.add_argument_group("Options") -options_group_prep.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_prep.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) options_group_prep.add_argument("--copy_reads", action="store_true", help="Instead of generating symbolic links, copy files into output folder. [Default OFF].") options_group_prep.add_argument("--merge_Reads", action="store_true", help="Merges FASTQ files for the same sample [Default OFF].") #options_group_prep.add_argument("--merge_Reads_by_lane", action="store_true", help="Merges FASTQ files for the same sample by lane (Technical replicates) [Default OFF].") @@ -91,9 +91,9 @@ subparser_qc = subparsers.add_parser( description='This module calls different quality check programs attending the input provided.', ) in_out_group_qc = subparser_qc.add_argument_group("Input/Output") -in_out_group_qc.add_argument("--input", help="Folder containing input. Project or raw reads, assembly or annotation fasta files according to mode option provided.", required= not any(elem in help_options for elem in sys.argv)) -in_out_group_qc.add_argument("--output_folder", help="Output folder. Required if '--detached' mode. Under '--project' mode, information will be stored following a designed scheme. See instructions for further details", required = '--detached' in sys.argv) -in_out_group_qc.add_argument("--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_qc.add_argument("-i", "--input", help="Folder containing input. Project or raw reads, assembly or annotation fasta files according to mode option provided.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_qc.add_argument("-o", "--output_folder", help="Output folder. Required if '--detached' mode. Under '--project' mode, information will be stored following a designed scheme. See instructions for further details", required = '--detached' in sys.argv) +in_out_group_qc.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") in_out_group_qc.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") in_out_group_qc.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") in_out_group_qc.add_argument("--detached", action="store_true", help="Isolated mode. --input is a folder containing samples, contigs or protein sequences. Provide a unique path o several using --batch option") @@ -105,7 +105,7 @@ in_out_group_qc.add_argument("--include_all", action="store_true", help="Include options_group_qc = subparser_qc.add_argument_group("Configuration") options_group_qc.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end. Only applicable if --raw_reads option.") options_group_qc.add_argument("--skip_report", action="store_true", help="Do not report statistics using MultiQC report module [Default OFF]") -options_group_qc.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_qc.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) info_group_qc = subparser_qc.add_argument_group("Additional information") info_group_qc.add_argument("--help_format", action="store_true", help="Show additional help on name format for files.") @@ -122,10 +122,10 @@ subparser_trimm = subparsers.add_parser( description='This module trimms sequencing adapters that could be present in next generation sequencing files', ) in_out_group_trimm = subparser_trimm.add_argument_group("Input/Output") -in_out_group_trimm.add_argument("--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) -in_out_group_trimm.add_argument("--output_folder", help="Output folder.", required = '--detached' in sys.argv) +in_out_group_trimm.add_argument("-i", "--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_trimm.add_argument("-o", "--output_folder", help="Output folder.", required = '--detached' in sys.argv) in_out_group_trimm.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end.") -in_out_group_trimm.add_argument("--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_trimm.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") in_out_group_trimm.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") in_out_group_trimm.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") in_out_group_trimm.add_argument("--detached", action="store_true", help="Isolated mode. --input is a folder containing fastq reads. Provide a unique path o several using --batch option") @@ -140,7 +140,7 @@ parameters_group_trimm.add_argument("--extra", help="Provide extra options for c options_group_trimm = subparser_trimm.add_argument_group("Options") options_group_trimm.add_argument("--skip_report", action="store_true", help="Do not report statistics using MultiQC report module [Default OFF]. See details in --help_multiqc") -options_group_trimm.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_trimm.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) info_group_trimm = subparser_trimm.add_argument_group("Additional information") info_group_trimm.add_argument("--help_format", action="store_true", help="Show additional help on name format for files.") @@ -159,10 +159,10 @@ subparser_join = subparsers.add_parser( description='This module joins sequencing reads (paired-end)', ) in_out_group_join = subparser_join.add_argument_group("Input/Output") -in_out_group_join.add_argument("--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) -in_out_group_join.add_argument("--output_folder", help="Output folder.", required = '--detached' in sys.argv) +in_out_group_join.add_argument("-i", "--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_join.add_argument("-o", "--output_folder", help="Output folder.", required = '--detached' in sys.argv) in_out_group_join.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end.") -in_out_group_join.add_argument("--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_join.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") in_out_group_join.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") in_out_group_join.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") in_out_group_join.add_argument("--detached", action="store_true", help="Isolated mode. --input is a folder containing fastq reads. Provide a unique path o several using --batch option") @@ -170,7 +170,7 @@ in_out_group_join.add_argument("--include_lane", action="store_true", help="Incl in_out_group_join.add_argument("--include_all", action="store_true", help="Include all characters as tag name before read pair, if any. See --help_format for additional details [Default OFF]") options_group_join = subparser_join.add_argument_group("Options") -options_group_join.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_join.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) options_group_join.add_argument("--perc_diff", type=int, help="Percentage difference for fastqjoin [Default: 0].") options_group_join.add_argument("--noTrim", action='store_true', help="Use non-trimmed reads [or not containing '_trim' in the name].") @@ -193,10 +193,10 @@ subparser_RNAbiotype = subparsers.add_parser( description='This module generates a RNA biotype analysis', ) in_out_group_RNAbiotype = subparser_RNAbiotype.add_argument_group("Input/Output") -in_out_group_RNAbiotype.add_argument("--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) -in_out_group_RNAbiotype.add_argument("--output_folder", help="Output folder.", required = '--detached' in sys.argv) +in_out_group_RNAbiotype.add_argument("-i", "--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_RNAbiotype.add_argument("-o", "--output_folder", help="Output folder.", required = '--detached' in sys.argv) in_out_group_RNAbiotype.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end.") -in_out_group_RNAbiotype.add_argument("--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_RNAbiotype.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") in_out_group_RNAbiotype.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") in_out_group_RNAbiotype.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") in_out_group_RNAbiotype.add_argument("--detached", action="store_true", help="Isolated mode. --input is a folder containing fastq reads. Provide a unique path o several using --batch option") @@ -204,7 +204,7 @@ in_out_group_RNAbiotype.add_argument("--include_lane", action="store_true", help in_out_group_RNAbiotype.add_argument("--include_all", action="store_true", help="Include all characters as tag name before read pair, if any. See --help_format for additional details [Default OFF]") options_group_RNAbiotype = subparser_RNAbiotype.add_argument_group("Options") -options_group_RNAbiotype.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_RNAbiotype.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) options_group_RNAbiotype.add_argument("--annotation", help="Reference genome annotation in GTF format.", required=True) options_group_RNAbiotype.add_argument("--limitRAM", type=int, help="limitRAM parameter for STAR mapping. Default 20 Gbytes.", default=20000000000) options_group_RNAbiotype.add_argument("--noTrim", action='store_true', help="Use non-trimmed reads [or not containing '_trim' in the name].") @@ -238,10 +238,10 @@ subparser_miRNA = subparsers.add_parser( description='This module generates a miRNA analysis', ) in_out_group_miRNA = subparser_miRNA.add_argument_group("Input/Output") -in_out_group_miRNA.add_argument("--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) -in_out_group_miRNA.add_argument("--output_folder", help="Output folder.", required = '--detached' in sys.argv) +in_out_group_miRNA.add_argument("-i", "--input", help="Folder containing a project or reads, according to the mode selected. Files could be .fastq/.fq/ or fastq.gz/.fq.gz. See --help_format for additional details.", required= not any(elem in help_options for elem in sys.argv)) +in_out_group_miRNA.add_argument("-o", "--output_folder", help="Output folder.", required = '--detached' in sys.argv) in_out_group_miRNA.add_argument("--single_end", action="store_true", help="Single end files [Default OFF]. Default mode is paired-end.") -in_out_group_miRNA.add_argument("--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") +in_out_group_miRNA.add_argument("-b", "--batch", action="store_true", help="Provide this option if input is a file containing multiple paths instead a path.") in_out_group_miRNA.add_argument("--in_sample", help="File containing a list of samples to include (one per line) from input folder(s) [Default OFF].") in_out_group_miRNA.add_argument("--ex_sample", help="File containing a list of samples to exclude (one per line) from input folder(s) [Default OFF].") in_out_group_miRNA.add_argument("--detached", action="store_true", help="Isolated mode. --input is a folder containing fastq reads. Provide a unique path o several using --batch option") @@ -250,7 +250,7 @@ in_out_group_miRNA.add_argument("--include_all", action="store_true", help="Incl in_out_group_miRNA.add_argument("--noTrim", action='store_true', help="Use non-trimmed reads [or not containing '_trim' in the name].") options_group_miRNA = subparser_miRNA.add_argument_group("Options") -options_group_miRNA.add_argument("--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) +options_group_miRNA.add_argument("-t", "--threads", type=int, help="Number of CPUs to use [Default: 2].", default=2) options_group_miRNA.add_argument("--species", help="Species tag ID [Default: hsa (Homo sapiens)].", default='hsa') options_group_miRNA.add_argument("--database", help="Path to store miRNA annotation files downloaded: miRBase, miRCarta, etc") options_group_miRNA.add_argument("--miRNA_gff", help="miRBase hsa GFF file containing miRNA information.") From ddc40248dc378304c2bd0fc8081b572cb21d0260 Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 18:22:48 +0200 Subject: [PATCH 4/7] fixes and updates --- XICRA_pip/XICRA/modules/qc.py | 17 +++++++++++++---- XICRA_pip/XICRA/modules/trimm.py | 4 ++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py index d1fe1c9..6e15ad6 100644 --- a/XICRA_pip/XICRA/modules/qc.py +++ b/XICRA_pip/XICRA/modules/qc.py @@ -85,10 +85,8 @@ def run_QC(options): print ("[ fastq, fq, fastq.gz, fq.gz ]\n") pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug) - start_time_partial = start_time_total - ## create FASTQC call - fastqc(pd_samples_retrieved, outdir, options, start_time_total, "", Debug) + fastqc(pd_samples_retrieved, outdir, options, "", Debug) print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) @@ -97,7 +95,18 @@ def run_QC(options): exit() ####################### -def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug): +def fastqc(pd_samples_retrieved, outdir, options, name_analysis, Debug): + ''' + This is a main function to prepare data to call FASTQC. + + :param pd_samples_retrieved + :param outdir + :param options + :param name_analysis + :param Debug + + ''' + ## debug message if (Debug): diff --git a/XICRA_pip/XICRA/modules/trimm.py b/XICRA_pip/XICRA/modules/trimm.py index d0f803b..aa6542b 100644 --- a/XICRA_pip/XICRA/modules/trimm.py +++ b/XICRA_pip/XICRA/modules/trimm.py @@ -146,7 +146,7 @@ def run_trimm(options): with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor: commandsSent = { executor.submit(cutadapt_caller.caller, sorted(cluster["sample"].tolist()), outdir_dict[name], name, threads_job, - min_len_read, Debug, adapters_dict, options.extra): name for name, cluster in sample_frame } + options.min_read_len, Debug, adapters_dict, options.extra): name for name, cluster in sample_frame } for cmd2 in concurrent.futures.as_completed(commandsSent): details = commandsSent[cmd2] @@ -209,7 +209,7 @@ def run_trimm(options): print (colored("** Beginning FAStQC analysis **", 'red')) pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug) - qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, start_time_partial, "trimmed", Debug) + qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, "trimmed", Debug) print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) From 141f4b0de47c2af2e18202c6e7cd72523fabdcb9 Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 18:33:54 +0200 Subject: [PATCH 5/7] add subset test --- .gitignore | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 0fb127d..b612e84 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,6 @@ XICRA_pip/docs/build/ XICRA_pip/docs/source/_static XICRA_pip/docs/source/_templates - ## eclipse .project .pydevproject @@ -23,7 +22,9 @@ XICRA_pip/dist XICRA_pip/build XICRA_pip/.pypirc - ## big file to avoid BMC_bioinformatics_paper/simulation/data/simulations_results_percDiff-8_XICRA.simulations.csv XICRA_pip/XICRA/modules/db_files/ + +## test subset +XICRA_pip/test_subset From 1e7129acd2923b6931caa4439e40837c262c6e2a Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 18:52:16 +0200 Subject: [PATCH 6/7] fixes and updates --- XICRA_pip/XICRA/modules/qc.py | 17 +++++++++++------ XICRA_pip/XICRA/modules/trimm.py | 12 ++++++++---- XICRA_pip/XICRA/scripts/cutadapt_caller.py | 7 ++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py index 6e15ad6..b7d12c8 100644 --- a/XICRA_pip/XICRA/modules/qc.py +++ b/XICRA_pip/XICRA/modules/qc.py @@ -15,7 +15,6 @@ import shutil import concurrent.futures from termcolor import colored -import cutadapt ## import my modules from XICRA.scripts import multiQC_report @@ -27,7 +26,8 @@ ############################################## def run_QC(options): - ## init time + + ## init time start_time_total = time.time() ################################## @@ -86,7 +86,7 @@ def run_QC(options): pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "fastq", ("fastq", "fq", "fastq.gz", "fq.gz"), options.debug) ## create FASTQC call - fastqc(pd_samples_retrieved, outdir, options, "", Debug) + fastqc(pd_samples_retrieved, outdir, options, "", start_time_total, Debug) print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) @@ -95,7 +95,7 @@ def run_QC(options): exit() ####################### -def fastqc(pd_samples_retrieved, outdir, options, name_analysis, Debug): +def fastqc(pd_samples_retrieved, outdir, options, name_analysis, time_stamp, Debug): ''' This is a main function to prepare data to call FASTQC. @@ -105,8 +105,13 @@ def fastqc(pd_samples_retrieved, outdir, options, name_analysis, Debug): :param name_analysis :param Debug - ''' + :type pd_samples_retrieved + :type outdir + :type options + :type name_analysis + :type Debug + ''' ## debug message if (Debug): @@ -132,7 +137,7 @@ def fastqc(pd_samples_retrieved, outdir, options, name_analysis, Debug): outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, fold_name, options.debug) print ("+ Checking quality for each sample retrieved...") - start_time_partial = start_time_total + start_time_partial = time_stamp # Group dataframe by sample name sample_frame = pd_samples_retrieved.groupby(["name"]) diff --git a/XICRA_pip/XICRA/modules/trimm.py b/XICRA_pip/XICRA/modules/trimm.py index aa6542b..4b6baaf 100644 --- a/XICRA_pip/XICRA/modules/trimm.py +++ b/XICRA_pip/XICRA/modules/trimm.py @@ -158,6 +158,7 @@ def run_trimm(options): print('%r generated an exception: %s' % (details, exc)) print ("\n\n+ Trimming samples has finished...") + ## functions.time_functions.timestamp start_time_partial = functions.time_functions.timestamp(start_time_total) @@ -208,12 +209,15 @@ def run_trimm(options): if (Debug): print (colored("** Beginning FAStQC analysis **", 'red')) - pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug) - qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, "trimmed", Debug) + ## functions.time_functions.timestamp + start_time_partial = functions.time_functions.timestamp(start_time_partial) + + ## create FASTQC calling for trimmed reads + pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug) + qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, "trimmed", start_time_partial, Debug) print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) print ("\n+ Exiting trimm module.") exit() - - + \ No newline at end of file diff --git a/XICRA_pip/XICRA/scripts/cutadapt_caller.py b/XICRA_pip/XICRA/scripts/cutadapt_caller.py index 7dc3963..bd3d066 100644 --- a/XICRA_pip/XICRA/scripts/cutadapt_caller.py +++ b/XICRA_pip/XICRA/scripts/cutadapt_caller.py @@ -14,13 +14,14 @@ import sys from sys import argv from io import open +from termcolor import colored ## import my modules from HCGB import functions from XICRA.config import set_config ############################################# -def caller(list_reads, sample_folder, name, threads, Debug, adapters, extra): +def caller(list_reads, sample_folder, name, threads, min_read_len, Debug, adapters, extra): ## check if previously trimmed and succeeded filename_stamp = sample_folder + '/.success' if os.path.isfile(filename_stamp): @@ -29,7 +30,7 @@ def caller(list_reads, sample_folder, name, threads, Debug, adapters, extra): else: # Call cutadapt cutadapt_exe = set_config.get_exe('cutadapt') - code_returned = cutadapt(cutadapt_exe, list_reads, sample_folder, name, threads, Debug, adapters, extra) + code_returned = cutadapt(cutadapt_exe, list_reads, sample_folder, name, threads, min_read_len, Debug, adapters, extra) if code_returned: functions.time_functions.print_time_stamp(filename_stamp) else: @@ -37,7 +38,7 @@ def caller(list_reads, sample_folder, name, threads, Debug, adapters, extra): ############################################# -def cutadapt (cutadapt_exe, reads, path, sample_name, num_threads, min_len_given, Debug, adapters, extra): +def cutadapt(cutadapt_exe, reads, path, sample_name, num_threads, min_len_given, Debug, adapters, extra): """ :param cutadapt_exe: From 3151632ee9f62bebde47325212a87d7b3b23254a Mon Sep 17 00:00:00 2001 From: jfsanchezherrero Date: Thu, 14 Oct 2021 19:04:44 +0200 Subject: [PATCH 7/7] fixes and updates --- XICRA_pip/XICRA/modules/qc.py | 2 +- XICRA_pip/XICRA/scripts/fastqc_caller.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py index b7d12c8..b1c6b3a 100644 --- a/XICRA_pip/XICRA/modules/qc.py +++ b/XICRA_pip/XICRA/modules/qc.py @@ -195,7 +195,7 @@ def fastqc(pd_samples_retrieved, outdir, options, name_analysis, time_stamp, Deb print ("\n") fastqc_report = functions.files_functions.create_subfolder("FASTQC", outdir_report) - fastqc_final_report = functions.files_functions.create_subfolder(name_analysis, fastqc_report) + fastqc_final_report = functions.files_functions.create_subfolder(fold_name, fastqc_report) multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC", fastqc_final_report,"") print ('\n+ A summary HTML report of each sample is generated in folder: %s' %fastqc_final_report) diff --git a/XICRA_pip/XICRA/scripts/fastqc_caller.py b/XICRA_pip/XICRA/scripts/fastqc_caller.py index 4379aa0..d222ac9 100644 --- a/XICRA_pip/XICRA/scripts/fastqc_caller.py +++ b/XICRA_pip/XICRA/scripts/fastqc_caller.py @@ -14,6 +14,7 @@ import sys from sys import argv from io import open +from termcolor import colored ## import my modules from HCGB import functions @@ -47,7 +48,7 @@ def run_module_fastqc(path, files, sample, threads): filename_stamp = path + '/.success' if os.path.isfile(filename_stamp): stamp = functions.time_functions.read_time_stamp(filename_stamp) - print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, name, 'fastqc'), 'yellow')) + print (colored("\tA previous command generated results on: %s [%s -- %s]" %(stamp, sample, 'fastqc'), 'yellow')) else: ## call fastqc fastqc_bin = set_config.get_exe('fastqc')