diff --git a/bin/samplesheet2yaml.py b/bin/samplesheet2yaml.py index 4320463..115bccb 100644 --- a/bin/samplesheet2yaml.py +++ b/bin/samplesheet2yaml.py @@ -3,7 +3,7 @@ import sys # Function to convert CSV to the exact YAML structure -def csv_to_exact_yaml(csv_file, yaml_file, path_prefix=None): +def csv_to_exact_yaml(csv_file, yaml_file): data = {} # Reading the CSV file and grouping data by 'condition' @@ -14,8 +14,8 @@ def csv_to_exact_yaml(csv_file, yaml_file, path_prefix=None): if condition not in data: data[condition] = {"long read files": [], "labels": []} - # Append the full path if path_prefix is provided - bam_file = f"{path_prefix}/{row['fastq']}.bam" if path_prefix else row['fastq'] + # Append .bam to the filename + bam_file = f"{row['fastq']}.bam" label = f"Sample{row['sample']}" data[condition]["long read files"].append(bam_file) @@ -48,14 +48,13 @@ def csv_to_exact_yaml(csv_file, yaml_file, path_prefix=None): # Main function to handle command-line arguments if __name__ == "__main__": # Argument parsing - parser = argparse.ArgumentParser(description="Convert CSV to YAML and update bam file paths") + parser = argparse.ArgumentParser(description="Convert CSV to YAML and append .bam to file names") parser.add_argument('--input', required=True, help="Input CSV file") parser.add_argument('--output', required=True, help="Output YAML file") - parser.add_argument('--path', help="Optional path to prepend to 'bam' column values") args = parser.parse_args() - # Convert the CSV to the YAML structure, appending the full path to 'bam' if provided - csv_to_exact_yaml(args.input, args.output, args.path) + # Convert the CSV to the YAML structure + csv_to_exact_yaml(args.input, args.output) print(f"YAML file has been created: {args.output}") \ No newline at end of file diff --git a/main.nf b/main.nf index e1a48be..aa82202 100644 --- a/main.nf +++ b/main.nf @@ -8,7 +8,9 @@ */ nextflow.enable.dsl=2 -if ( params.help ) { +params.help = false + +if ( params.help) { help = 
""" Usage: nextflow run main.nf --reads --samplesheet [options] @@ -39,18 +41,6 @@ if ( params.help ) { exit(0) } -// Display pipeline details -println """\ - T R A N S C R I P T - A N N O T A T I O N - N F P I P E L I N E - =================================== - orientation : ${params.oriented} - fastq : ${params.reads} - sam : ${params.sam} - genome : ${params.genome} - annotation : ${params.annotation} - outdir : ${params.outdir} - """ - .stripIndent() /* ======================================================================================== @@ -67,7 +57,6 @@ include { NONORIENTED_WORKFLOW } from './subworkflows/nonoriented_annotati */ workflow{ - genome_ch = file( params.genome ) annot_ch = file( params.annotation ) config_ch = file( params.config, checkIfExists:true ) shortread_ch = params.optional_shortread != null ? file(params.optional_shortread, type: "file") : file("no_shortread", type: "file") @@ -77,8 +66,7 @@ workflow{ if (params.oriented == false) { - NONORIENTED_WORKFLOW(genome_ch, - annot_ch, + NONORIENTED_WORKFLOW(annot_ch, config_ch, shortread_ch, junc_bed_ch, @@ -87,8 +75,7 @@ workflow{ } else if (params.oriented == true) { sam_ch = Channel.fromPath( params.sam, checkIfExists:true ) - ORIENTED_WORKFLOW(genome_ch, - annot_ch, + ORIENTED_WORKFLOW(annot_ch, config_ch, shortread_ch, junc_bed_ch, @@ -129,9 +116,11 @@ log.info """\ junction bed files minimap2 : ${params.junc_bed} IsoQuant model strategy : ${params.model_strategy} RNABloom short read polishing data : ${params.optional_shortread} + gffread parameters : ${params.gffread_parameters} outdir : ${params.outdir} """ .stripIndent() /* -======================================================================================== \ No newline at end of file +======================================================================================== +*/ \ No newline at end of file diff --git a/modules/gffread.nf b/modules/gffread.nf index 3fe2399..33227d5 100644 --- a/modules/gffread.nf +++ 
b/modules/gffread.nf @@ -19,6 +19,7 @@ process GFFREAD { input: path genome tuple val(condition), path(polished_gtf) + val gffread_parameters output: tuple val(condition), path("${condition}.transcripts_polished_clustersMKZ.gff3"), emit: gffread_gff3 @@ -27,6 +28,6 @@ process GFFREAD { """ gffread -g ${genome} \ -o ${condition}.transcripts_polished_clustersMKZ.gff3 \ - -M -K -Z ${polished_gtf} \ + ${gffread_parameters} ${polished_gtf} """ } diff --git a/modules/isoquant.nf b/modules/isoquant.nf index 87f7ee2..54d1f1a 100644 --- a/modules/isoquant.nf +++ b/modules/isoquant.nf @@ -11,7 +11,9 @@ process ISOQUANT { // where to store the results and in which way - cpus 24 + cpus 16 + maxForks 1 + publishDir( "${params.outdir}", mode: 'copy' ) // show in the log which input file is analysed @@ -20,6 +22,7 @@ process ISOQUANT { input: val ready + path bams path genome path samplesheet val model_strategy diff --git a/modules/merge_fastq.nf b/modules/merge_fastq.nf index 4efe9c9..77d6e7b 100644 --- a/modules/merge_fastq.nf +++ b/modules/merge_fastq.nf @@ -10,6 +10,8 @@ process MERGE_FASTQ_RESTRANDER { // where to store the results and in which way debug true publishDir( "${params.outdir}/rnabloom", mode: 'copy' ) + + tag( "${reads}" ) input: path samplesheet @@ -29,6 +31,8 @@ process MERGE_FASTQ_EOULSAN { // where to store the results and in which way debug true publishDir( "${params.outdir}/rnabloom", mode: 'copy' ) + + tag( "${reads}" ) input: path samplesheet diff --git a/modules/rnabloom.nf b/modules/rnabloom.nf index d7e4af7..157f138 100644 --- a/modules/rnabloom.nf +++ b/modules/rnabloom.nf @@ -10,7 +10,7 @@ process RNA_BLOOM { // where to store the results and in which way debug true - cpus 24 + cpus 16 maxForks 1 maxRetries 2 diff --git a/modules/rnabloom_minimap2.nf b/modules/rnabloom_minimap2.nf index 58963df..ddd6d25 100644 --- a/modules/rnabloom_minimap2.nf +++ b/modules/rnabloom_minimap2.nf @@ -21,14 +21,17 @@ process RNABLOOM_MINIMAP2 { input: path genome 
tuple val(condition), path(bloomfasta) + val intron_length + path junc_bed output: tuple val(condition), path( "${bloomfasta.SimpleName}.sam" ), emit: rnabloom_sam script: + def junc_bed_arg = junc_bed.name != 'no_junc_bed' ? "--junc-bed $junc_bed" : "" """ - minimap2 -ax splice -uf -k14 \ - ${genome} ${bloomfasta} > ${bloomfasta.SimpleName}.sam + minimap2 -G ${intron_length} -ax splice -uf -k14 \ + ${junc_bed_arg} ${genome} ${bloomfasta} > ${bloomfasta.SimpleName}.sam """ } diff --git a/modules/samplesheet2yaml.nf b/modules/samplesheet2yaml.nf index cbf9bb6..8278f6f 100644 --- a/modules/samplesheet2yaml.nf +++ b/modules/samplesheet2yaml.nf @@ -25,6 +25,6 @@ process SAMPLESHEET2YAML { script: """ - python3 $projectDir/bin/samplesheet2yaml.py --input ${samplesheet} --output dataset.yaml --path ${params.outdir}/bam + python3 $projectDir/bin/samplesheet2yaml.py --input ${samplesheet} --output dataset.yaml """ } \ No newline at end of file diff --git a/modules/samtools.nf b/modules/samtools.nf index c1898b8..85bb337 100644 --- a/modules/samtools.nf +++ b/modules/samtools.nf @@ -20,8 +20,7 @@ process SAMTOOLS { path(sam) output: - path("${sam.SimpleName}.bam"), emit: samtools_bam - path("${sam.SimpleName}.bam.bai") + tuple path("${sam.SimpleName}.bam"), path("${sam.SimpleName}.bam.bai"), emit: samtools_bam val("process_complete"), emit: process_control script: diff --git a/modules/uncompress_files.nf b/modules/uncompress_files.nf index 5b8ed80..74ea998 100644 --- a/modules/uncompress_files.nf +++ b/modules/uncompress_files.nf @@ -14,27 +14,12 @@ process UNCOMPRESS_GENOME { path genome output: - path( "*" ), emit: genome_isoquant - path( "*" ), emit: genome_gffread + path( "${genome.BaseName}" ), emit: genome_isoquant + path( "${genome.BaseName}" ), emit: genome_minimap2 + path( "${genome.BaseName}" ), emit: genome_gffread script: """ bzip2 -dc ${genome} > ${genome.BaseName} """ -} - -process UNCOMPRESS_ANNOTATION { - debug true - publishDir( 
"${params.outdir}/ressources", mode: 'copy' ) - - input: - path annotation - - output: - path( "*" ), emit: annotation_merge - - script: - """ - bzip2 -dk ${annotation} - """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 54b840f..b4f651f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,34 +8,37 @@ params { // Input options - reads = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/achilles/*.fastq.gz" - samplesheet = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/samplesheet.csv" + reads = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/*.fastq" + samplesheet = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/samplesheet.csv" // References //genome = "${launchDir}/data/hdujardini_HiC" //annotation = "${launchDir}/data/hdujardini_HiC" - genome = "/import/rhodos10/ressources/sequencages/genomes/morphoach1.fa.bz2" - annotation = "/home/brunon/shares-net/sequencages/ressources/annotations/morphoach1.gff.bz2" + genome = "/import/rhodos10/ressources/sequencages/genomes/morphoach1.fa.bz2" + annotation = "/import/rhodos10/ressources/sequencages/annotations/morphoach1.gff.bz2" // Orientation of FASTQ files - oriented = true // if reads already oriented, replace with oriented = true - sam = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/*.sam" // if oriented = true, provide sam files from eoulsan + oriented = true // if reads already oriented, replace with oriented = true + sam = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/*.sam" // if oriented = true, provide sam files from eoulsan // Restrander configuration file (TSO and RTP sequences) - config = "${launchDir}/assets/PCB111.json" + config = "${launchDir}/assets/PCB111.json" + + // GFFRead input parameters + gffread_parameters = "-M" // Minimap2 intron length - intron_length = "20000" // 200k by default - junc_bed = null // if no junk bed, replace with optional_shortread = null + intron_length = "20000" 
// 200k by default + junc_bed = null // if no junction BED file, set junc_bed = null // IsoQuant module input parameters - model_strategy = "default_ont" + model_strategy = "default_ont" // RNABloom input options - optional_shortread = null // if no short reads, replace with optional_shortread = null + optional_shortread = null // if no short reads, replace with optional_shortread = null // Output directory - outdir = "${launchDir}/result/achilles" + outdir = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/result" } docker { diff --git a/subworkflows/oriented_annotation.nf b/subworkflows/oriented_annotation.nf index cb2437e..fba9a78 100644 --- a/subworkflows/oriented_annotation.nf +++ b/subworkflows/oriented_annotation.nf @@ -15,11 +15,10 @@ include { RNABLOOM_PAFTOOLS } include { RNABLOOM_AGAT_BED2GFF; RNABLOOM_AGAT_GFF2GTF; AGAT_COMPLEMENT; MERGE_AGAT_GFF2GTF } from '../modules/agat.nf' include { SAMPLESHEET2YAML } from '../modules/samplesheet2yaml.nf' include { SAMTOOLS } from '../modules/samtools.nf' -include { UNCOMPRESS_GENOME; UNCOMPRESS_ANNOTATION } from '../modules/uncompress_files.nf' +include { UNCOMPRESS_GENOME } from '../modules/uncompress_files.nf' workflow ORIENTED_WORKFLOW { take: - genome annot config shortread @@ -29,23 +28,37 @@ workflow ORIENTED_WORKFLOW { reads main: + // Prepare genome for different steps + ch_isoquant_genome = Channel.empty() + if (params.genome.endsWith('.gz')|| params.genome.endsWith(".bz2")){ + genome_ch = file( params.genome ) + UNCOMPRESS_GENOME(genome_ch) + ch_isoquant_genome = UNCOMPRESS_GENOME.out.genome_isoquant + ch_minimap2_genome = UNCOMPRESS_GENOME.out.genome_minimap2 + ch_gffread_genome = UNCOMPRESS_GENOME.out.genome_gffread + } else { + ch_isoquant_genome = file( params.genome ) + ch_minimap2_genome = file( params.genome ) + ch_gffread_genome = file( params.genome ) + } + + // Transcript annotation modules: IsoQuant SAMTOOLS(sam) SAMPLESHEET2YAML(samplesheet) - UNCOMPRESS_GENOME(genome) 
- ISOQUANT(SAMTOOLS.out.process_control.collect(), UNCOMPRESS_GENOME.out.genome_isoquant, SAMPLESHEET2YAML.out.dataset_yaml, params.model_strategy) + ISOQUANT(SAMTOOLS.out.process_control.collect(), SAMTOOLS.out.samtools_bam.collect(), ch_isoquant_genome, SAMPLESHEET2YAML.out.dataset_yaml, params.model_strategy) ISOQUANT_CONDITION(ISOQUANT.out.isoquant_gtf.flatten()) // Transcript annotation modules: RNABloom - MERGE_FASTQ_EOULSAN(samplesheet, reads) + MERGE_FASTQ_EOULSAN(samplesheet, reads.collect()) RNA_BLOOM(MERGE_FASTQ_EOULSAN.out.merged_fastq.flatten(), shortread) - RNABLOOM_MINIMAP2(genome, RNA_BLOOM.out.rnabloom_fasta) + RNABLOOM_MINIMAP2(ch_minimap2_genome, RNA_BLOOM.out.rnabloom_fasta, params.intron_length, junc_bed) RNABLOOM_PAFTOOLS(RNABLOOM_MINIMAP2.out.rnabloom_sam) RNABLOOM_AGAT_BED2GFF(RNABLOOM_PAFTOOLS.out.rnabloom_bed) RNABLOOM_AGAT_GFF2GTF(RNABLOOM_AGAT_BED2GFF.out.agat_gff) // Merging of transcript annotations AGAT_COMPLEMENT(ISOQUANT_CONDITION.out.isoquant_condition_gtf.join(RNABLOOM_AGAT_GFF2GTF.out.agat_gtf)) - GFFREAD(UNCOMPRESS_GENOME.out.genome_gffread, AGAT_COMPLEMENT.out.polished_gtf) + GFFREAD(ch_gffread_genome, AGAT_COMPLEMENT.out.polished_gtf, params.gffread_parameters) MERGE_AGAT_GFF2GTF(GFFREAD.out.gffread_gff3) MERGE_ANNOTATION(annot, MERGE_AGAT_GFF2GTF.out.merged_agat_gtf) }