From bb1e88abb4eeda981cc2eeeaffa92f519bebc272 Mon Sep 17 00:00:00 2001
From: Marta Lopez Balastegui <mlopezb@ws-ls-merkator.igtpscp.local>
Date: Thu, 14 Oct 2021 10:04:20 +0200
Subject: [PATCH] Run FastQC on trimmed reads after trimm

---
 XICRA_pip/XICRA/modules/help_XICRA.py | 47 ++++++++-------
 XICRA_pip/XICRA/modules/qc.py         | 83 +++++++++++++++++++++++++++
 XICRA_pip/XICRA/modules/trimm.py      | 12 +++-
 3 files changed, 118 insertions(+), 24 deletions(-)

diff --git a/XICRA_pip/XICRA/modules/help_XICRA.py b/XICRA_pip/XICRA/modules/help_XICRA.py
index 48a500f..e42f6e2 100644
--- a/XICRA_pip/XICRA/modules/help_XICRA.py
+++ b/XICRA_pip/XICRA/modules/help_XICRA.py
@@ -24,8 +24,8 @@ def help_fastq_format():
     print ("name_1.fastq.gz, adding '1' or '2' to specify the read")
     print ("name_R2.fastq.gz, adding 'R1' or 'R2' to specify the read")
     print ("name_L001_R1.fastq.gz, adding the lane information as L00x after the name")
-    print ("name_L001_R1_001.fastq.gz, adding 00X at the end. This naming is useful when the fastq" + 
-           "files of the sample sample had been cut in different files).")
+    print ("name_L001_R1_001.fastq.gz, adding 00X at the end. This naming is useful when the fastq")
+    print ("files of the same sample had been cut in different files.")
     print ("name_L001_XYZ_R1_001.fastq.gz, there can be extra info for each file.")
     print ("\nThere are many options and here we provide some guidelines on the name format.")
     print ("\n")
@@ -55,13 +55,17 @@ def help_fastq_format():
     print (colored('** See additional details for Lane information **', 'yellow'))
     print ("\n")
     
-    print ("XICRA will store the names of all the input files. After that, it will identify the samples." +
-           "It can be the case that more than one file belong to the same sample. In order to pass this information"+
-           "to XICRA a combination of the following parameters may be needed:")
+    functions.aesthetics_functions.print_sepLine("*",55,"red")
+    print ("[4] Sample identification:")
+    functions.aesthetics_functions.print_sepLine("*",55,"red")
+    
+    print ("XICRA will store the names of all the input files. After that, it will identify the samples.")
+    print ("It can be the case that more than one file belong to the same sample. In order to pass this information")
+    print ("to XICRA a combination of the following parameters may be needed:")
     print ("\n")
 
     functions.aesthetics_functions.print_sepLine("*",55,"red")
-    print ("[4] Lane information:")
+    print ("[4.1] Lane information:")
     functions.aesthetics_functions.print_sepLine("*",55,"red")
     print ("In some cases, files might contain lane information (*L00x* and/or *00x*).")
     print ("XICRA supports these names as long as follow these examples:")
@@ -71,14 +75,13 @@ def help_fastq_format():
     print ("name_L00x_R1_00x.fastq.gz\tname_L00x_R2_00x.fastq.gz")
     print ("\n")
     
-    print ("If you want to include lane tags (*L00X*) into each  each sample name (differentiate samples considering the lane):")
+    print ("- If you want to include lane tags (*L00X*) into each sample name (differentiate samples considering the lane):")
     print (colored("** Use option --include-lane within each module and the lane tag will also be used to identify samples", 'yellow'))
     
-    print (colored("\n** However, if you want to consider as a single sample the different lanes, you need to merge the fastq files from " +
-                   "the different lanes, use option --merge_Reads"+
-                   "within module prep**", 'yellow'))
+    print ("\n- However, if you want to consider as a single sample the different lanes, you need to merge")
+    print ("the fastq files from the different lanes, use option --merge_Reads within module prep.")
     print("As an example:")
-    print (colored("\n** Options --include_lane --merge_Reads within module prep **", 'yellow'))
+    print (colored("** Options --merge_Reads within module prep **", 'yellow'))
     print ("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz")
     print ("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz")
     print ("sample1_L003_R1.fastq.gz\tsample1_L003_R2.fastq.gz")
@@ -88,8 +91,8 @@ def help_fastq_format():
     print ("sample1_R1.fastq.gz\tsample1_R2.fastq.gz")
     print ("\n")
     
-    print (colored("\n** If you need to merge fastq files of the same lane that differ in the last group of numbers" +
-                   "use option --mergeReads together with --include-lane within module prep**", 'yellow'))
+    print ("\n- If you need to merge fastq files of the same lane that differ in the last group of numbers")
+    print ("use option --mergeReads together with --include-lane within module prep.")
     print (colored("\n** Option --include_lane --merge-by-lane within module prep **", 'yellow'))
     print ("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz")
     print ("sample1_L001_R1_002.fastq.gz\tsample1_L001_R2_002.fastq.gz")
@@ -103,8 +106,8 @@ def help_fastq_format():
     print ("\n")
     
     ### if you want to merge lane and extension --mergeReads
-    print (colored("\n** If you need to merge fastq files with different lanes and final extension " +
-                   "(_001, _002, ...), use only option --merge_Reads within module prep**", 'yellow'))
+    print ("- If you need to merge fastq files with different lanes and final extension ")
+    print ("(_001, _002, ...), use only option --merge_Reads within module prep.")
     print("As an example:")
     print (colored("\n** Options --merge_Reads within module prep **", 'yellow'))
     print ("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz")
@@ -118,18 +121,18 @@ def help_fastq_format():
     
     
     functions.aesthetics_functions.print_sepLine("*",55,"red")
-    print ("[5] Include all information:")
+    print ("[4.2] Include all information:")
     functions.aesthetics_functions.print_sepLine("*",55,"red")
-    print ("In some cases, files might contain other extra information and it is necessary to " +
-           "include it all as a tag name, in that case use --include-all. In the following example" + 
-           "XYZ is the extra information and it is also used to identify each sample:")
+    print ("In some cases, files might contain other extra information and it is necessary to ")
+    print ("include it all as a tag name, in that case use --include-all. In the following example")
+    print ("XYZ is the extra information and it is also used to identify each sample:")
     print ("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz")
     print (colored("** Remember to use option --include_all within each module", 'yellow'))
     
     print (colored("** It might be appropriate to change samples names using --rename option under prep module", 'yellow'))
     
-    print (colored("\n** If you need to merge fastq files that only differ in the last group of numbers " +
-                   "(_001, _002, ...), use option --merge_Reads within module prep together with --include-all**", 'yellow'))
+    print ("\n- If you need to merge fastq files that only differ in the last group of numbers ")
+    print ("(_001, _002, ...), use option --merge_Reads within module prep together with --include-all.")
     print("As an example:")
     print (colored("\n** Options --include_all --merge_Reads within module prep **", 'yellow'))
     print ("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz")
@@ -145,7 +148,7 @@ def help_fastq_format():
     
     print ("\n")
     functions.aesthetics_functions.print_sepLine("*",15,"red")
-    print ("[6] Extensions:")
+    print ("[4.3] Extensions:")
     functions.aesthetics_functions.print_sepLine("*",15,"red")
     print ("name_L00x_R2.fastq\tname_L00x_R2.fq\nname_L00x_R2.fastq.gz\tname_L00x_R2.fq.gz")
     print ("\n")
diff --git a/XICRA_pip/XICRA/modules/qc.py b/XICRA_pip/XICRA/modules/qc.py
index be2968a..546bc02 100644
--- a/XICRA_pip/XICRA/modules/qc.py
+++ b/XICRA_pip/XICRA/modules/qc.py
@@ -166,3 +166,86 @@ def run_QC(options):
 
     print ("+ Exiting qc module.")
     exit()
+
+def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug):
+    """Run FastQC per sample and, unless skipped, build a MultiQC report.
+
+    pd_samples_retrieved: dataframe; its "name" column groups the files and
+        its "sample" column holds the values handed to the FastQC caller.
+    outdir: output folder; created here when options.project is not set.
+    options: parsed arguments (project, threads, debug, skip_report are read).
+    start_time_total: reference time passed to the timestamp helper.
+    name_analysis: tag appended to "fastqc_" and used as the report subfolder name.
+    Debug: when true, print intermediate values.
+
+    Returns an empty tuple.
+    """
+    print("+ FASTQC Quality check for trimmed samples")
+    ## debug message
+    if (Debug):
+        print (colored("\n**DEBUG: pd_samples_retrieve **", 'yellow'))
+        print (pd_samples_retrieved)
+        print ("\n")
+
+    ## generate output folder, if necessary
+    print ("\n+ Create output folder(s):")
+    ## NOTE(review): sibling helpers are called via functions.files_functions - confirm create_folder is exposed here
+    if not options.project:
+        functions.create_folder(outdir)
+    outdir_dict = functions.files_functions.outdir_project(outdir, options.project, pd_samples_retrieved, "fastqc_" + name_analysis, options.debug)
+    print ("+ Checking quality for each sample retrieved...")
+
+    ## group by sample name; a plain string key keeps group names scalar (a one-item list yields tuples on pandas >= 2)
+    sample_frame = pd_samples_retrieved.groupby("name")
+
+    ## optimize threads
+    name_list = set(pd_samples_retrieved["name"].tolist())
+    threads_job = functions.main_functions.optimize_threads(options.threads, len(name_list)) ## threads optimization
+    ## ThreadPoolExecutor raises ValueError when max_workers <= 0, so keep at least one worker
+    max_workers_int = max(1, int(options.threads/threads_job))
+
+    ## debug message
+    if (Debug):
+        functions.aesthetics_functions.debug_message("options.threads: " + str(options.threads), "yellow")
+        functions.aesthetics_functions.debug_message("max_workers: " + str(max_workers_int), "yellow")
+        functions.aesthetics_functions.debug_message("threads_job: " + str(threads_job), "yellow")
+
+    ## send for each sample
+    print ("+ Calling fastqc for samples...")
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
+        commandsSent = { executor.submit(fastqc_caller.run_module_fastqc, outdir_dict[name], sorted( cluster["sample"].tolist() ), name, threads_job): name for name, cluster in sample_frame }
+
+        for cmd2 in concurrent.futures.as_completed(commandsSent):
+            details = commandsSent[cmd2]
+            try:
+                cmd2.result()
+            except Exception as exc:
+                ## best effort: report the failing sample and carry on with the others
+                print ('***ERROR:')
+                print (cmd2)
+                print('%r generated an exception: %s' % (details, exc))
+
+    print ("+ FASTQC for samples has finished...")
+
+    functions.time_functions.timestamp(start_time_total)
+
+    if (options.skip_report):
+        print ("+ No report generation...")
+    else:
+        print ("\n+ Generating a report using MultiQC module.")
+        outdir_report = functions.files_functions.create_subfolder("report", outdir)
+
+        ## get subdirs generated (duplicates removed) and call multiQC report module
+        print ("+ Detail information for each sample could be identified in separate folders:")
+        my_outdir_list = set(outdir_dict.values())
+        ## debug message
+        if (Debug):
+            print (colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow'))
+            print (my_outdir_list)
+            print ("\n")
+
+        fastqc_report = functions.files_functions.create_subfolder("FASTQC", outdir_report)
+        fastqc_final_report = functions.files_functions.create_subfolder(name_analysis, fastqc_report)
+        multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC", fastqc_final_report,"")
+        print ('\n+ A summary HTML report of each sample is generated in folder: %s' %fastqc_final_report)
+    return()
diff --git a/XICRA_pip/XICRA/modules/trimm.py b/XICRA_pip/XICRA/modules/trimm.py
index a805b31..efb3ef3 100644
--- a/XICRA_pip/XICRA/modules/trimm.py
+++ b/XICRA_pip/XICRA/modules/trimm.py
@@ -20,6 +20,7 @@
 from XICRA.scripts import multiQC_report
 from XICRA.config import set_config
 from XICRA.modules import help_XICRA
+from XICRA.modules import qc
 from HCGB import functions
 from HCGB import sampleParser
 
@@ -198,6 +199,13 @@ def run_trimm(options):
         multiQC_report.multiQC_module_call(my_outdir_list, "Cutadapt", trimm_report,"")
         print ('\n+ A summary HTML report of each sample is generated in folder: %s' %trimm_report)
         
+        ## QC analysis for trimmed reads
+        if (Debug):
+            print (colored("** Beginning FASTQC analysis **", 'red'))
+
+        pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug)
+        qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, start_time_partial, "trimmed", Debug)
+        
     print ("\n*************** Finish *******************")
     start_time_partial = functions.time_functions.timestamp(start_time_total)
     print ("\n+ Exiting trimm module.")
@@ -258,8 +266,8 @@ def cutadapt (cutadapt_exe, reads, path, sample_name, num_threads, Debug, adapte
             p_param = os.path.join(path, sample_name + '_trim_R2.fastq')
             o_param = os.path.join(path, sample_name + '_trim_R1.fastq')
         
-        ## paired-end mode
-        cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s' %(cutadapt_exe,  
+        ## paired-end mode, 15 bps as the min length cutoff
+        cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s -m 15' %(cutadapt_exe,  
                                                                        num_threads, adapters['adapter_a'], 
                                                                        adapters['adapter_A'], o_param, 
                                                                        p_param, reads[0], reads[1], logfile)