From bb1e88abb4eeda981cc2eeeaffa92f519bebc272 Mon Sep 17 00:00:00 2001 From: Marta Lopez Balastegui Date: Thu, 14 Oct 2021 10:04:20 +0200 Subject: [PATCH] QC after trimm --- XICRA_pip/XICRA/modules/help_XICRA.py | 47 ++++++++------- XICRA_pip/XICRA/modules/qc.py | 83 +++++++++++++++++++++++++++ XICRA_pip/XICRA/modules/trimm.py | 12 +++- 3 files changed, 118 insertions(+), 24 deletions(-) diff --git a/XICRA_pip/XICRA/modules/help_XICRA.py b/XICRA_pip/XICRA/modules/help_XICRA.py index 48a500f..e42f6e2 100644 --- a/XICRA_pip/XICRA/modules/help_XICRA.py +++ b/XICRA_pip/XICRA/modules/help_XICRA.py @@ -24,8 +24,8 @@ def help_fastq_format(): print ("name_1.fastq.gz, adding '1' or '2' to specify the read") print ("name_R2.fastq.gz, adding 'R1' or 'R2' to specify the read") print ("name_L001_R1.fastq.gz, adding the lane information as L00x after the name") - print ("name_L001_R1_001.fastq.gz, adding 00X at the end. This naming is useful when the fastq" + - "files of the sample sample had been cut in different files).") + print ("name_L001_R1_001.fastq.gz, adding 00X at the end. This naming is useful when the fastq") + print ("files of the sample sample had been cut in different files).") print ("name_L001_XYZ_R1_001.fastq.gz, there can be extra info for each file.") print ("\nThere are many options and here we provide some guidelines on the name format.") print ("\n") @@ -55,13 +55,17 @@ def help_fastq_format(): print (colored('** See additional details for Lane information **', 'yellow')) print ("\n") - print ("XICRA will store the names of all the input files. After that, it will identify the samples." + - "It can be the case that more than one file belong to the same sample. In order to pass this information"+ - "to XICRA a combination of the following parameters may be needed:") + functions.aesthetics_functions.print_sepLine("*",55,"red") + print ("[4] Sample identification:") + functions.aesthetics_functions.print_sepLine("*",55,"red") + + print ("XICRA will store the names of all the input files. After that, it will identify the samples.") + print ("It can be the case that more than one file belong to the same sample. In order to pass this information") + print ("to XICRA a combination of the following parameters may be needed:") print ("\n") functions.aesthetics_functions.print_sepLine("*",55,"red") - print ("[4] Lane information:") + print ("[4.1] Lane information:") functions.aesthetics_functions.print_sepLine("*",55,"red") print ("In some cases, files might contain lane information (*L00x* and/or *00x*).") print ("XICRA supports these names as long as follow these examples:") @@ -71,14 +75,13 @@ def help_fastq_format(): print ("name_L00x_R1_00x.fastq.gz\tname_L00x_R2_00x.fastq.gz") print ("\n") - print ("If you want to include lane tags (*L00X*) into each each sample name (differentiate samples considering the lane):") + print ("- If you want to include lane tags (*L00X*) into each each sample name (differentiate samples considering the lane):") print (colored("** Use option --include-lane within each module and the lane tag will also be used to identify samples", 'yellow')) - print (colored("\n** However, if you want to consider as a single sample the different lanes, you need to merge the fastq files from " + - "the different lanes, use option --merge_Reads"+ - "within module prep**", 'yellow')) + print ("\n- However, if you want to consider as a single sample the different lanes, you need to merge") + print ("the fastq files from the different lanes, use option --merge_Reads within module prep.") print("As an example:") - print (colored("\n** Options --include_lane --merge_Reads within module prep **", 'yellow')) + print (colored("** Options --merge_Reads within module prep **", 'yellow')) print ("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz") print ("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz") print ("sample1_L003_R1.fastq.gz\tsample1_L003_R2.fastq.gz") @@ -88,8 +91,8 @@ def help_fastq_format(): print ("sample1_R1.fastq.gz\tsample1_R2.fastq.gz") print ("\n") - print (colored("\n** If you need to merge fastq files of the same lane that differ in the last group of numbers" + - "use option --mergeReads together with --include-lane within module prep**", 'yellow')) + print ("\n- If you need to merge fastq files of the same lane that differ in the last group of numbers") + print ("use option --mergeReads together with --include-lane within module prep.") print (colored("\n** Option --include_lane --merge-by-lane within module prep **", 'yellow')) print ("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz") print ("sample1_L001_R1_002.fastq.gz\tsample1_L001_R2_002.fastq.gz") @@ -103,8 +106,8 @@ def help_fastq_format(): print ("\n") ### if you want to merge lane and extension --mergeReads - print (colored("\n** If you need to merge fastq files with different lanes and final extension " + - "(_001, _002, ...), use only option --merge_Reads within module prep**", 'yellow')) + print ("- If you need to merge fastq files with different lanes and final extension ") + print ("(_001, _002, ...), use only option --merge_Reads within module prep.") print("As an example:") print (colored("\n** Options --merge_Reads within module prep **", 'yellow')) print ("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz") @@ -118,18 +121,18 @@ def help_fastq_format(): functions.aesthetics_functions.print_sepLine("*",55,"red") - print ("[5] Include all information:") + print ("[4.2] Include all information:") functions.aesthetics_functions.print_sepLine("*",55,"red") - print ("In some cases, files might contain other extra information and it is necessary to " + - "include it all as a tag name, in that case use --include-all. In the following example" + - "XYZ is the extra information and it is also used to identify each sample:") + print ("In some cases, files might contain other extra information and it is necessary to ") + print ("include it all as a tag name, in that case use --include-all. In the following example") + print ("XYZ is the extra information and it is also used to identify each sample:") print ("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz") print (colored("** Remember to use option --include_all within each module", 'yellow')) print (colored("** It might be appropriate to change samples names using --rename option under prep module", 'yellow')) - print (colored("\n** If you need to merge fastq files that only differ in the last group of numbers " + - "(_001, _002, ...), use option --merge_Reads within module prep together with --include-all**", 'yellow')) + print ("\n- If you need to merge fastq files that only differ in the last group of numbers ") + print ("(_001, _002, ...), use option --merge_Reads within module prep together with --include-all.") print("As an example:") print (colored("\n** Options --include_all --merge_Reads within module prep **", 'yellow')) print ("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz") @@ -145,7 +148,7 @@ def help_fastq_format(): print ("\n") functions.aesthetics_functions.print_sepLine("*",15,"red") - print ("[6] Extensions:") + print ("[4.3] Extensions:") functions.aesthetics_functions.print_sepLine("*",15,"red") print ("name_L00x_R2.fastq\tname_L00x_R2.fq\nname_L00x_R2.fastq.gz\tname_L00x_R2.fq.gz") print ("\n") diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py index be2968a..546bc02 100644 --- a/XICRA_pip/XICRA/modules/qc.py +++ b/XICRA_pip/XICRA/modules/qc.py @@ -166,3 +166,86 @@ def run_QC(options): print ("+ Exiting qc module.") exit() + +def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug): + + print("+ FASTQC Quality check for trimmed samples") + + + ## debug message + if (Debug): + print (colored("\n**DEBUG: pd_samples_retrieve **", 'yellow')) + print (pd_samples_retrieved) + print ("\n") + + ## generate output folder, if necessary + print ("\n+ Create output folder(s):") + + ## if not project, outdir contains the dir to put output + ## in this case, in some other cases might not occur + if not options.project: + functions.create_folder(outdir) + outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, "fastqc_" + name_analysis, options.debug) + + print ("+ Checking quality for each sample retrieved...") + start_time_partial = start_time_total + + # Group dataframe by sample name + sample_frame = pd_samples_retrieved.groupby(["name"]) + + ## optimize threads + name_list = set(pd_samples_retrieved["name"].tolist()) + threads_job = functions.main_functions.optimize_threads(options.threads, len(name_list)) ## threads optimization + max_workers_int = int(options.threads/threads_job) + + ## debug message + if (Debug): + functions.aesthetics_functions.debug_message("options.threads: " + str(options.threads), "yellow") + functions.aesthetics_functions.debug_message("max_workers: " + str(max_workers_int), "yellow") + functions.aesthetics_functions.debug_message("threads_job: " + str(threads_job), "yellow") + + ## send for each sample + print ("+ Calling fastqc for samples...") + with concurrent.futures.ThreadPoolExecutor(max_workers=int(max_workers_int)) as executor: + commandsSent = { executor.submit(fastqc_caller.run_module_fastqc, outdir_dict[name], sorted( cluster["sample"].tolist() ), name, threads_job): name for name, cluster in sample_frame } + + for cmd2 in concurrent.futures.as_completed(commandsSent): + details = commandsSent[cmd2] + try: + data = cmd2.result() + except Exception as exc: + print ('***ERROR:') + print (cmd2) + print('%r generated an exception: %s' % (details, exc)) + + print ("+ FASTQC for samples has finished...") + + ## functions.timestamp + start_time_partial = functions.time_functions.timestamp(start_time_partial) + + if (options.skip_report): + print ("+ No report generation...") + else: + print ("\n+ Generating a report using MultiQC module.") + outdir_report = functions.files_functions.create_subfolder("report", outdir) + + ## get subdirs generated and call multiQC report module + givenList = [] + print ("+ Detail information for each sample could be identified in separate folders:") + + ## call multiQC report module + givenList = [ v for v in outdir_dict.values() ] + my_outdir_list = set(givenList) + + ## debug message + if (Debug): + print (colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow')) + print (my_outdir_list) + print ("\n") + + fastqc_report = functions.files_functions.create_subfolder("FASTQC", outdir_report) + fastqc_final_report = functions.files_functions.create_subfolder(name_analysis, fastqc_report) + multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC", fastqc_final_report,"") + print ('\n+ A summary HTML report of each sample is generated in folder: %s' %fastqc_final_report) + + return() diff --git a/XICRA_pip/XICRA/modules/trimm.py b/XICRA_pip/XICRA/modules/trimm.py index a805b31..efb3ef3 100644 --- a/XICRA_pip/XICRA/modules/trimm.py +++ b/XICRA_pip/XICRA/modules/trimm.py @@ -20,6 +20,7 @@ from XICRA.scripts import multiQC_report from XICRA.config import set_config from XICRA.modules import help_XICRA +from XICRA.modules import qc from HCGB import functions from HCGB import sampleParser @@ -198,6 +199,13 @@ def run_trimm(options): multiQC_report.multiQC_module_call(my_outdir_list, "Cutadapt", trimm_report,"") print ('\n+ A summary HTML report of each sample is generated in folder: %s' %trimm_report) + ## QC analysis for trimmed reads + if (Debug): + print (colored("** Beginning FAStQC analysis **", 'red')) + + pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug) + qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, start_time_partial, "trimmed", Debug) + print ("\n*************** Finish *******************") start_time_partial = functions.time_functions.timestamp(start_time_total) print ("\n+ Exiting trimm module.") @@ -258,8 +266,8 @@ def cutadapt (cutadapt_exe, reads, path, sample_name, num_threads, Debug, adapte p_param = os.path.join(path, sample_name + '_trim_R2.fastq') o_param = os.path.join(path, sample_name + '_trim_R1.fastq') - ## paired-end mode - cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s' %(cutadapt_exe, + ## paired-end mode, 15 bps as the min length cutoff + cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s -m 15' %(cutadapt_exe, num_threads, adapters['adapter_a'], adapters['adapter_A'], o_param, p_param, reads[0], reads[1], logfile)