Merge pull request #2 from GenomiqueENS/dev

GenomiqueENS · Nov 18, 2024 · 1e12af7
2 parents 66b70ee + 0c1d7a6
commit 1e12af7
Show file tree

Hide file tree

Showing 12 changed files with 70 additions and 71 deletions.
diff --git a/bin/samplesheet2yaml.py b/bin/samplesheet2yaml.py
@@ -3,7 +3,7 @@
 import sys
 
 # Function to convert CSV to the exact YAML structure
-def csv_to_exact_yaml(csv_file, yaml_file, path_prefix=None):
+def csv_to_exact_yaml(csv_file, yaml_file):
     data = {}
 
     # Reading the CSV file and grouping data by 'condition'
@@ -14,8 +14,8 @@ def csv_to_exact_yaml(csv_file, yaml_file, path_prefix=None):
             if condition not in data:
                 data[condition] = {"long read files": [], "labels": []}
 
-            # Append the full path if path_prefix is provided
-            bam_file = f"{path_prefix}/{row['fastq']}.bam" if path_prefix else row['fastq']
+            # Append .bam to the filename
+            bam_file = f"{row['fastq']}.bam"
             label = f"Sample{row['sample']}"
 
             data[condition]["long read files"].append(bam_file)
@@ -48,14 +48,13 @@ def csv_to_exact_yaml(csv_file, yaml_file, path_prefix=None):
 # Main function to handle command-line arguments
 if __name__ == "__main__":
     # Argument parsing
-    parser = argparse.ArgumentParser(description="Convert CSV to YAML and update bam file paths")
+    parser = argparse.ArgumentParser(description="Convert CSV to YAML and append .bam to file names")
     parser.add_argument('--input', required=True, help="Input CSV file")
     parser.add_argument('--output', required=True, help="Output YAML file")
-    parser.add_argument('--path', help="Optional path to prepend to 'bam' column values")
 
     args = parser.parse_args()
 
-    # Convert the CSV to the YAML structure, appending the full path to 'bam' if provided
-    csv_to_exact_yaml(args.input, args.output, args.path)
+    # Convert the CSV to the YAML structure
+    csv_to_exact_yaml(args.input, args.output)
 
     print(f"YAML file has been created: {args.output}")
diff --git a/main.nf b/main.nf
@@ -8,7 +8,9 @@
 */
 
 nextflow.enable.dsl=2
-if ( params.help ) {
+params.help = false
+
+if ( params.help) {
    help = """
    Usage:
       nextflow run main.nf --reads <path> --samplesheet <path> [options]
@@ -39,18 +41,6 @@ if ( params.help ) {
    exit(0)
 }
 
-// Display pipeline details
-println """\
-      T R A N S C R I P T - A N N O T A T I O N - N F   P I P E L I N E
-      ===================================
-      orientation : ${params.oriented}
-      fastq       : ${params.reads}
-      sam         : ${params.sam}
-      genome      : ${params.genome}
-      annotation  : ${params.annotation}
-      outdir      : ${params.outdir}
-      """
-      .stripIndent()
 
 /*
 ========================================================================================
@@ -67,7 +57,6 @@ include { NONORIENTED_WORKFLOW       } from './subworkflows/nonoriented_annotati
 */
 
 workflow{
-   genome_ch = file( params.genome )
    annot_ch = file( params.annotation )
    config_ch = file( params.config, checkIfExists:true )
    shortread_ch = params.optional_shortread != null ? file(params.optional_shortread, type: "file") : file("no_shortread", type: "file")
@@ -77,8 +66,7 @@ workflow{
 
    if (params.oriented == false) {
 
-      NONORIENTED_WORKFLOW(genome_ch,
-                        annot_ch,
+      NONORIENTED_WORKFLOW(annot_ch,
                         config_ch,
                         shortread_ch,
                         junc_bed_ch,
@@ -87,8 +75,7 @@ workflow{
    } else if (params.oriented == true) {
       sam_ch = Channel.fromPath( params.sam, checkIfExists:true )
 
-      ORIENTED_WORKFLOW(genome_ch,
-                        annot_ch,
+      ORIENTED_WORKFLOW(annot_ch,
                         config_ch,
                         shortread_ch,
                         junc_bed_ch,
@@ -129,9 +116,11 @@ log.info """\
    junction bed files minimap2           : ${params.junc_bed}
    IsoQuant model strategy               : ${params.model_strategy}
    RNABloom short read polishing data    : ${params.optional_shortread}
+   gffread parameters                    : ${params.gffread_parameters}
    outdir                                : ${params.outdir}
    """
    .stripIndent()
 
 /*
-========================================================================================
+========================================================================================
+*/
diff --git a/modules/gffread.nf b/modules/gffread.nf
@@ -19,6 +19,7 @@ process GFFREAD {
     input:
     path genome
     tuple val(condition), path(polished_gtf)
+    val gffread_parameters
 
     output:
     tuple val(condition), path("${condition}.transcripts_polished_clustersMKZ.gff3"), emit: gffread_gff3
@@ -27,6 +28,6 @@ process GFFREAD {
     """
     gffread  -g ${genome} \
     -o ${condition}.transcripts_polished_clustersMKZ.gff3 \
-    -M -K -Z ${polished_gtf} \
+    ${gffread_parameters} ${polished_gtf} 
     """
 }
diff --git a/modules/isoquant.nf b/modules/isoquant.nf
@@ -11,7 +11,9 @@
 process ISOQUANT {
 
    // where to store the results and in which way
-   cpus 24
+   cpus 16
+   maxForks 1
+
    publishDir( "${params.outdir}", mode: 'copy' )
 
    // show in the log which input file is analysed
@@ -20,6 +22,7 @@ process ISOQUANT {
 
    input:
    val ready
+   path bams
    path genome 
    path samplesheet
    val model_strategy

diff --git a/modules/merge_fastq.nf b/modules/merge_fastq.nf
@@ -10,6 +10,8 @@ process MERGE_FASTQ_RESTRANDER {
    // where to store the results and in which way
    debug true
    publishDir( "${params.outdir}/rnabloom", mode: 'copy' )
+
+   tag( "${reads}" )
 
    input:
    path samplesheet
@@ -29,6 +31,8 @@ process MERGE_FASTQ_EOULSAN {
    // where to store the results and in which way
    debug true
    publishDir( "${params.outdir}/rnabloom", mode: 'copy' )
+
+   tag( "${reads}" )
 
    input:
    path samplesheet

diff --git a/modules/rnabloom.nf b/modules/rnabloom.nf
@@ -10,7 +10,7 @@
 process RNA_BLOOM {
    // where to store the results and in which way
    debug true
-   cpus 24
+   cpus 16
    maxForks 1
    maxRetries 2
 

diff --git a/modules/rnabloom_minimap2.nf b/modules/rnabloom_minimap2.nf
@@ -21,14 +21,17 @@ process RNABLOOM_MINIMAP2 {
    input:
    path genome
    tuple val(condition), path(bloomfasta)
+   val intron_length
+   path junc_bed
 
    output:
    tuple val(condition), path( "${bloomfasta.SimpleName}.sam" ), emit: rnabloom_sam
 
    script:
+   def junc_bed_arg = junc_bed.name != 'no_junc_bed' ? "--junc-bed $junc_bed" : ""
    """
-   minimap2 -ax splice -uf -k14 \
-   ${genome} ${bloomfasta} > ${bloomfasta.SimpleName}.sam
+   minimap2 -G ${intron_length} -ax splice -uf -k14 \
+   ${junc_bed_arg} ${genome} ${bloomfasta} > ${bloomfasta.SimpleName}.sam
    """
 
 }
diff --git a/modules/samplesheet2yaml.nf b/modules/samplesheet2yaml.nf
@@ -25,6 +25,6 @@ process SAMPLESHEET2YAML {
 
    script:
    """
-   python3 $projectDir/bin/samplesheet2yaml.py --input ${samplesheet} --output dataset.yaml --path ${params.outdir}/bam
+   python3 $projectDir/bin/samplesheet2yaml.py --input ${samplesheet} --output dataset.yaml
    """
 }  
diff --git a/modules/samtools.nf b/modules/samtools.nf
@@ -20,8 +20,7 @@ process SAMTOOLS {
    path(sam)
 
    output:
-   path("${sam.SimpleName}.bam"), emit: samtools_bam
-   path("${sam.SimpleName}.bam.bai")
+   tuple path("${sam.SimpleName}.bam"), path("${sam.SimpleName}.bam.bai"), emit: samtools_bam
    val("process_complete"), emit: process_control
 
    script:

diff --git a/modules/uncompress_files.nf b/modules/uncompress_files.nf
@@ -14,27 +14,12 @@ process UNCOMPRESS_GENOME {
    path genome
 
    output:
-   path( "*" ), emit: genome_isoquant
-   path( "*" ), emit: genome_gffread
+   path( "${genome.BaseName}" ), emit: genome_isoquant
+   path( "${genome.BaseName}" ), emit: genome_minimap2
+   path( "${genome.BaseName}" ), emit: genome_gffread
 
    script:
    """
    bzip2 -dc ${genome} > ${genome.BaseName}
    """
-}
-
-process UNCOMPRESS_ANNOTATION {
-   debug true
-   publishDir( "${params.outdir}/ressources", mode: 'copy' )
-
-   input:
-   path annotation
-
-   output:
-   path( "*" ), emit: annotation_merge
-
-   script:
-   """
-   bzip2 -dk ${annotation}
-   """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -8,34 +8,37 @@
 
 params {
  	// Input options
-    reads = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/achilles/*.fastq.gz"
-    samplesheet = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/samplesheet.csv"
+    reads               = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/*.fastq"
+    samplesheet         = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/samplesheet.csv"
 
  	// References
     //genome = "${launchDir}/data/hdujardini_HiC"
     //annotation = "${launchDir}/data/hdujardini_HiC"
-    genome = "/import/rhodos10/ressources/sequencages/genomes/morphoach1.fa.bz2"
-    annotation = "/home/brunon/shares-net/sequencages/ressources/annotations/morphoach1.gff.bz2"
+    genome              = "/import/rhodos10/ressources/sequencages/genomes/morphoach1.fa.bz2"
+    annotation          = "/import/rhodos10/ressources/sequencages/annotations/morphoach1.gff.bz2"
 
     // Orientation of FASTQ files
-    oriented = true  // if reads already oriented, replace with oriented = true
-    sam = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/*.sam"  // if oriented = true, provide sam files from eoulsan
+    oriented            = true  // if reads already oriented, replace with oriented = true
+    sam                 = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/*.sam"  // if oriented = true, provide sam files from eoulsan
 
     // Restrander configuration file (TSO and RTP sequences)
-    config = "${launchDir}/assets/PCB111.json"
+    config              = "${launchDir}/assets/PCB111.json"
+
+    // GFFRead input parameters
+    gffread_parameters = "-M"
 
     // Minimap2 intron length
-    intron_length = "20000" // 200k by default
-    junc_bed = null 	// if no junk bed, replace with optional_shortread = null
+    intron_length       = "20000" // 200k by default
+    junc_bed            = null 	// if no junk bed, replace with optional_shortread = null
 
     // IsoQuant module input parameters
-    model_strategy = "default_ont"
+    model_strategy      = "default_ont"
 
     // RNABloom input options
-    optional_shortread = null  	// if no short reads, replace with optional_shortread = null
+    optional_shortread  = null  	// if no short reads, replace with optional_shortread = null
 
  	// Output directory
-    outdir = "${launchDir}/result/achilles"
+    outdir              = "/import/pontos01/analyses/OUTOFTHEBLUE_C2024/egzotek/achilles/result"
 }
 
 docker {

diff --git a/subworkflows/oriented_annotation.nf b/subworkflows/oriented_annotation.nf
@@ -15,11 +15,10 @@ include { RNABLOOM_PAFTOOLS }
 include { RNABLOOM_AGAT_BED2GFF; RNABLOOM_AGAT_GFF2GTF; AGAT_COMPLEMENT; MERGE_AGAT_GFF2GTF }   from '../modules/agat.nf'
 include { SAMPLESHEET2YAML }                                                                    from '../modules/samplesheet2yaml.nf'
 include { SAMTOOLS }                                                                            from '../modules/samtools.nf'
-include { UNCOMPRESS_GENOME; UNCOMPRESS_ANNOTATION }                                            from '../modules/uncompress_files.nf'
+include { UNCOMPRESS_GENOME }                                                                   from '../modules/uncompress_files.nf'
 
 workflow ORIENTED_WORKFLOW {
    take:
-      genome
       annot
       config
       shortread
@@ -29,23 +28,37 @@ workflow ORIENTED_WORKFLOW {
       reads
 
    main:
+      // Prepare genome for different steps
+      ch_isoquant_genome = Channel.empty()      
+      if (params.genome.endsWith('.gz')|| params.genome.endsWith(".bz2")){
+            genome_ch = file( params.genome )
+            UNCOMPRESS_GENOME(genome_ch)
+            ch_isoquant_genome = UNCOMPRESS_GENOME.out.genome_isoquant
+            ch_minimap2_genome = UNCOMPRESS_GENOME.out.genome_minimap2
+            ch_gffread_genome  = UNCOMPRESS_GENOME.out.genome_gffread
+      } else {
+            ch_isoquant_genome = file( params.genome )
+            ch_minimap2_genome = file( params.genome )
+            ch_gffread_genome  = file( params.genome )
+      }
+
+      // Transcript annotation modules: IsoQuant
       SAMTOOLS(sam)
       SAMPLESHEET2YAML(samplesheet)
-      UNCOMPRESS_GENOME(genome)
-      ISOQUANT(SAMTOOLS.out.process_control.collect(), UNCOMPRESS_GENOME.out.genome_isoquant, SAMPLESHEET2YAML.out.dataset_yaml, params.model_strategy)
+      ISOQUANT(SAMTOOLS.out.process_control.collect(), SAMTOOLS.out.samtools_bam.collect(), ch_isoquant_genome, SAMPLESHEET2YAML.out.dataset_yaml, params.model_strategy)
       ISOQUANT_CONDITION(ISOQUANT.out.isoquant_gtf.flatten())
 
       // Transcript annotation modules: RNABloom
-      MERGE_FASTQ_EOULSAN(samplesheet, reads)
+      MERGE_FASTQ_EOULSAN(samplesheet, reads.collect())
       RNA_BLOOM(MERGE_FASTQ_EOULSAN.out.merged_fastq.flatten(), shortread)
-      RNABLOOM_MINIMAP2(genome, RNA_BLOOM.out.rnabloom_fasta)
+      RNABLOOM_MINIMAP2(ch_minimap2_genome, RNA_BLOOM.out.rnabloom_fasta, params.intron_length, junc_bed)
       RNABLOOM_PAFTOOLS(RNABLOOM_MINIMAP2.out.rnabloom_sam)
       RNABLOOM_AGAT_BED2GFF(RNABLOOM_PAFTOOLS.out.rnabloom_bed)
       RNABLOOM_AGAT_GFF2GTF(RNABLOOM_AGAT_BED2GFF.out.agat_gff)
 
       // Merging of transcript annotations
       AGAT_COMPLEMENT(ISOQUANT_CONDITION.out.isoquant_condition_gtf.join(RNABLOOM_AGAT_GFF2GTF.out.agat_gtf))
-      GFFREAD(UNCOMPRESS_GENOME.out.genome_gffread, AGAT_COMPLEMENT.out.polished_gtf)
+      GFFREAD(ch_gffread_genome, AGAT_COMPLEMENT.out.polished_gtf, params.gffread_parameters)
       MERGE_AGAT_GFF2GTF(GFFREAD.out.gffread_gff3)
       MERGE_ANNOTATION(annot, MERGE_AGAT_GFF2GTF.out.merged_agat_gtf)
 }
diff --git a/modules/samplesheet2yaml.nf b/modules/samplesheet2yaml.nf
@@ -25,6 +25,6 @@ process SAMPLESHEET2YAML {
        script:
        """
-       python3 $projectDir/bin/samplesheet2yaml.py --input ${samplesheet} --output dataset.yaml --path ${params.outdir}/bam
+       python3 $projectDir/bin/samplesheet2yaml.py --input ${samplesheet} --output dataset.yaml
        """
     }