Skip to content

Commit

Permalink
QC after trimm
Browse files Browse the repository at this point in the history
  • Loading branch information
Marta Lopez Balastegui committed Oct 14, 2021
1 parent 5c455c8 commit bb1e88a
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 24 deletions.
47 changes: 25 additions & 22 deletions XICRA_pip/XICRA/modules/help_XICRA.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def help_fastq_format():
print ("name_1.fastq.gz, adding '1' or '2' to specify the read")
print ("name_R2.fastq.gz, adding 'R1' or 'R2' to specify the read")
print ("name_L001_R1.fastq.gz, adding the lane information as L00x after the name")
print ("name_L001_R1_001.fastq.gz, adding 00X at the end. This naming is useful when the fastq" +
"files of the sample sample had been cut in different files).")
print ("name_L001_R1_001.fastq.gz, adding 00X at the end. This naming is useful when the fastq")
print ("files of the sample sample had been cut in different files).")
print ("name_L001_XYZ_R1_001.fastq.gz, there can be extra info for each file.")
print ("\nThere are many options and here we provide some guidelines on the name format.")
print ("\n")
Expand Down Expand Up @@ -55,13 +55,17 @@ def help_fastq_format():
print (colored('** See additional details for Lane information **', 'yellow'))
print ("\n")

print ("XICRA will store the names of all the input files. After that, it will identify the samples." +
"It can be the case that more than one file belong to the same sample. In order to pass this information"+
"to XICRA a combination of the following parameters may be needed:")
functions.aesthetics_functions.print_sepLine("*",55,"red")
print ("[4] Sample identification:")
functions.aesthetics_functions.print_sepLine("*",55,"red")

print ("XICRA will store the names of all the input files. After that, it will identify the samples.")
print ("It can be the case that more than one file belong to the same sample. In order to pass this information")
print ("to XICRA a combination of the following parameters may be needed:")
print ("\n")

functions.aesthetics_functions.print_sepLine("*",55,"red")
print ("[4] Lane information:")
print ("[4.1] Lane information:")
functions.aesthetics_functions.print_sepLine("*",55,"red")
print ("In some cases, files might contain lane information (*L00x* and/or *00x*).")
print ("XICRA supports these names as long as follow these examples:")
Expand All @@ -71,14 +75,13 @@ def help_fastq_format():
print ("name_L00x_R1_00x.fastq.gz\tname_L00x_R2_00x.fastq.gz")
print ("\n")

print ("If you want to include lane tags (*L00X*) into each each sample name (differentiate samples considering the lane):")
print ("- If you want to include lane tags (*L00X*) into each each sample name (differentiate samples considering the lane):")
print (colored("** Use option --include-lane within each module and the lane tag will also be used to identify samples", 'yellow'))

print (colored("\n** However, if you want to consider as a single sample the different lanes, you need to merge the fastq files from " +
"the different lanes, use option --merge_Reads"+
"within module prep**", 'yellow'))
print ("\n- However, if you want to consider as a single sample the different lanes, you need to merge")
print ("the fastq files from the different lanes, use option --merge_Reads within module prep.")
print("As an example:")
print (colored("\n** Options --include_lane --merge_Reads within module prep **", 'yellow'))
print (colored("** Options --merge_Reads within module prep **", 'yellow'))
print ("sample1_L001_R1.fastq.gz\tsample1_L001_R2.fastq.gz")
print ("sample1_L002_R1.fastq.gz\tsample1_L002_R2.fastq.gz")
print ("sample1_L003_R1.fastq.gz\tsample1_L003_R2.fastq.gz")
Expand All @@ -88,8 +91,8 @@ def help_fastq_format():
print ("sample1_R1.fastq.gz\tsample1_R2.fastq.gz")
print ("\n")

print (colored("\n** If you need to merge fastq files of the same lane that differ in the last group of numbers" +
"use option --mergeReads together with --include-lane within module prep**", 'yellow'))
print ("\n- If you need to merge fastq files of the same lane that differ in the last group of numbers")
print ("use option --mergeReads together with --include-lane within module prep.")
print (colored("\n** Option --include_lane --merge-by-lane within module prep **", 'yellow'))
print ("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz")
print ("sample1_L001_R1_002.fastq.gz\tsample1_L001_R2_002.fastq.gz")
Expand All @@ -103,8 +106,8 @@ def help_fastq_format():
print ("\n")

### if you want to merge lane and extension --mergeReads
print (colored("\n** If you need to merge fastq files with different lanes and final extension " +
"(_001, _002, ...), use only option --merge_Reads within module prep**", 'yellow'))
print ("- If you need to merge fastq files with different lanes and final extension ")
print ("(_001, _002, ...), use only option --merge_Reads within module prep.")
print("As an example:")
print (colored("\n** Options --merge_Reads within module prep **", 'yellow'))
print ("sample1_L001_R1_001.fastq.gz\tsample1_L001_R2_001.fastq.gz")
Expand All @@ -118,18 +121,18 @@ def help_fastq_format():


functions.aesthetics_functions.print_sepLine("*",55,"red")
print ("[5] Include all information:")
print ("[4.2] Include all information:")
functions.aesthetics_functions.print_sepLine("*",55,"red")
print ("In some cases, files might contain other extra information and it is necessary to " +
"include it all as a tag name, in that case use --include-all. In the following example" +
"XYZ is the extra information and it is also used to identify each sample:")
print ("In some cases, files might contain other extra information and it is necessary to ")
print ("include it all as a tag name, in that case use --include-all. In the following example")
print ("XYZ is the extra information and it is also used to identify each sample:")
print ("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz")
print (colored("** Remember to use option --include_all within each module", 'yellow'))

print (colored("** It might be appropriate to change samples names using --rename option under prep module", 'yellow'))

print (colored("\n** If you need to merge fastq files that only differ in the last group of numbers " +
"(_001, _002, ...), use option --merge_Reads within module prep together with --include-all**", 'yellow'))
print ("\n- If you need to merge fastq files that only differ in the last group of numbers ")
print ("(_001, _002, ...), use option --merge_Reads within module prep together with --include-all.")
print("As an example:")
print (colored("\n** Options --include_all --merge_Reads within module prep **", 'yellow'))
print ("sample1_L001_XYZ_R1_001.fastq.gz\tsample1_L001_XYZ_R2_001.fastq.gz")
Expand All @@ -145,7 +148,7 @@ def help_fastq_format():

print ("\n")
functions.aesthetics_functions.print_sepLine("*",15,"red")
print ("[6] Extensions:")
print ("[4.3] Extensions:")
functions.aesthetics_functions.print_sepLine("*",15,"red")
print ("name_L00x_R2.fastq\tname_L00x_R2.fq\nname_L00x_R2.fastq.gz\tname_L00x_R2.fq.gz")
print ("\n")
Expand Down
83 changes: 83 additions & 0 deletions XICRA_pip/XICRA/modules/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,86 @@ def run_QC(options):

print ("+ Exiting qc module.")
exit()

def fastqc(pd_samples_retrieved, outdir, options, start_time_total, name_analysis, Debug):
    """Run FastQC for every sample in a dataframe and optionally build a MultiQC report.

    Rows of ``pd_samples_retrieved`` are grouped by their ``name`` column; for
    each group the sorted values of the ``sample`` column are submitted to
    ``fastqc_caller.run_module_fastqc`` through a thread pool. Unless
    ``options.skip_report`` is set, ``multiQC_report.multiQC_module_call`` is
    then invoked on the per-sample output folders.

    Parameters
    ----------
    pd_samples_retrieved : pandas.DataFrame
        Needs at least the columns ``name`` and ``sample``.
        NOTE(review): ``sample`` presumably holds the fastq file paths --
        confirm against the caller.
    outdir : str
        Output folder; it is created here only when ``options.project`` is
        falsy, and the ``report`` subfolder is created inside it.
    options : object
        Reads the attributes ``project``, ``debug``, ``threads`` and
        ``skip_report``.
    start_time_total :
        Time reference forwarded to ``functions.time_functions.timestamp``.
    name_analysis : str
        Tag appended to ``"fastqc_"`` for the per-sample folders and used as
        the name of the final report subfolder.
    Debug : bool
        When true, extra debugging information is printed.

    Returns
    -------
    tuple
        Always the empty tuple (kept as in the original ``return()``).
    """
    print("+ FASTQC Quality check for trimmed samples")

    ## debug message
    if Debug:
        print(colored("\n**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)
        print("\n")

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")

    ## if not project, outdir contains the dir to put output
    ## in this case, in some other cases might not occur
    if not options.project:
        # NOTE(review): every other folder helper in this function lives under
        # functions.files_functions -- confirm functions.create_folder exists.
        functions.create_folder(outdir)
    outdir_dict = functions.files_functions.outdir_project(
        outdir, options.project, pd_samples_retrieved,
        "fastqc_" + name_analysis, options.debug)

    print("+ Checking quality for each sample retrieved...")

    ## Group dataframe by sample name.
    ## Use the scalar label, not ["name"]: with a one-element list pandas >= 2.0
    ## yields 1-tuples as group keys, which would break the outdir_dict lookup.
    sample_frame = pd_samples_retrieved.groupby("name")

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = functions.main_functions.optimize_threads(options.threads, len(name_list))
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if Debug:
        functions.aesthetics_functions.debug_message("options.threads: " + str(options.threads), "yellow")
        functions.aesthetics_functions.debug_message("max_workers: " + str(max_workers_int), "yellow")
        functions.aesthetics_functions.debug_message("threads_job: " + str(threads_job), "yellow")

    ## send for each sample
    print("+ Calling fastqc for samples...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        ## map each submitted future to the sample name it belongs to
        commandsSent = {}
        for name, cluster in sample_frame:
            future = executor.submit(
                fastqc_caller.run_module_fastqc,
                outdir_dict[name],
                sorted(cluster["sample"].tolist()),
                name,
                threads_job)
            commandsSent[future] = name

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                cmd2.result()
            except Exception as exc:
                ## best effort: report the failing sample and keep going
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    print("+ FASTQC for samples has finished...")

    ## functions.timestamp
    functions.time_functions.timestamp(start_time_total)

    if options.skip_report:
        print("+ No report generation...")
    else:
        print("\n+ Generating a report using MultiQC module.")
        outdir_report = functions.files_functions.create_subfolder("report", outdir)

        ## get subdirs generated and call multiQC report module
        print("+ Detail information for each sample could be identified in separate folders:")
        my_outdir_list = set(outdir_dict.values())

        ## debug message
        if Debug:
            print(colored("\n**DEBUG: my_outdir_list for multiqc report **", 'yellow'))
            print(my_outdir_list)
            print("\n")

        fastqc_report = functions.files_functions.create_subfolder("FASTQC", outdir_report)
        fastqc_final_report = functions.files_functions.create_subfolder(name_analysis, fastqc_report)
        multiQC_report.multiQC_module_call(my_outdir_list, "FASTQC", fastqc_final_report, "")
        print('\n+ A summary HTML report of each sample is generated in folder: %s' % fastqc_final_report)

    return ()
12 changes: 10 additions & 2 deletions XICRA_pip/XICRA/modules/trimm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from XICRA.scripts import multiQC_report
from XICRA.config import set_config
from XICRA.modules import help_XICRA
from XICRA.modules import qc
from HCGB import functions
from HCGB import sampleParser

Expand Down Expand Up @@ -198,6 +199,13 @@ def run_trimm(options):
multiQC_report.multiQC_module_call(my_outdir_list, "Cutadapt", trimm_report,"")
print ('\n+ A summary HTML report of each sample is generated in folder: %s' %trimm_report)

## QC analysis for trimmed reads
if (Debug):
print (colored("** Beginning FAStQC analysis **", 'red'))

pd_samples_retrieved_trimmed = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug)
qc.fastqc(pd_samples_retrieved_trimmed, outdir, options, start_time_partial, "trimmed", Debug)

print ("\n*************** Finish *******************")
start_time_partial = functions.time_functions.timestamp(start_time_total)
print ("\n+ Exiting trimm module.")
Expand Down Expand Up @@ -258,8 +266,8 @@ def cutadapt (cutadapt_exe, reads, path, sample_name, num_threads, Debug, adapte
p_param = os.path.join(path, sample_name + '_trim_R2.fastq')
o_param = os.path.join(path, sample_name + '_trim_R1.fastq')

## paired-end mode
cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s' %(cutadapt_exe,
## paired-end mode, 15 bps as the min length cutoff
cmd = '%s -j %s -a %s -A %s -o %s -p %s %s %s > %s -m 15' %(cutadapt_exe,
num_threads, adapters['adapter_a'],
adapters['adapter_A'], o_param,
p_param, reads[0], reads[1], logfile)
Expand Down

0 comments on commit bb1e88a

Please sign in to comment.