Use splitCsv to parse the samplesheet

Use splitCsv to parse the sample_bams samplesheet. This means we no longer need to strip the newline characters from the file paths, and we can use relative paths.
nextflow-io · Nov 7, 2024 · f46674c · f46674c
1 parent 08428f0
commit f46674c
Show file tree

Hide file tree

Showing 12 changed files with 84 additions and 17 deletions.
diff --git a/docs/hello_nextflow/04_hello_genomics.md b/docs/hello_nextflow/04_hello_genomics.md
@@ -757,7 +757,7 @@ This way we can continue to be lazy, but the list of files no longer lives in th
 Currently, our input channel factory treats any files we give it as the data inputs we want to feed to the indexing process.
 Since we're now giving it a file that lists input file paths, we need to change its behavior to parse the file and treat the file paths it contains as the data inputs.
 
-Fortunately we can do that very simply, just by adding the [`.splitText()` operator](https://www.nextflow.io/docs/latest/reference/operator.html#operator-splittext) to the channel construction step.
+We are going to use the [`.splitCsv()` operator](https://www.nextflow.io/docs/latest/reference/operator.html#operator-splitcsv) to parse the file into rows, where each row is a list of the values found on one line, and then use `.map()` to take the first element of each row and convert it into a file path object. This introduces some advanced concepts that we'll explain in more detail later in this training series, but for now it's enough to understand that we can manipulate the contents of the samplesheet after we read it in but before we use it.
 
 _Before:_
 
@@ -771,7 +771,8 @@ _After:_
 ````groovy title="hello-genomics.nf" linenums="68"
 // Create input channel from a text file listing input file paths
 reads_ch = Channel.fromPath(params.reads_bam)
-                .splitText() { bamPath -> file(bamPath.trim()) }
+                .splitCsv()
+                .map { bamPath -> file(bamPath[0]) }
 ```
 
 !!! tip

diff --git a/hello-nextflow/hello-config/main.nf b/hello-nextflow/hello-config/main.nf
@@ -110,7 +110,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/hello-genomics.nf b/hello-nextflow/hello-genomics.nf
@@ -4,32 +4,88 @@
  * Pipeline parameters
  */
 
-// Primary input
+// Primary input (file of input files, one per line)
+params.reads_bam = "${projectDir}/data/sample_bams.txt"
+
+// Accessory files
+params.reference        = "${projectDir}/data/ref/ref.fasta"
+params.reference_index  = "${projectDir}/data/ref/ref.fasta.fai"
+params.reference_dict   = "${projectDir}/data/ref/ref.dict"
+params.intervals        = "${projectDir}/data/ref/intervals.bed"
 
 /*
  * Generate BAM index file
  */
 process SAMTOOLS_INDEX {
 
-    container
+    container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'
 
-    publishDir
+    publishDir 'results_genomics', mode: 'symlink'
 
     input:
+        path input_bam
 
     output:
+        tuple path(input_bam), path("${input_bam}.bai")
 
     script:
     """
-
+    samtools index '$input_bam'
     """
+}
+
+/*
+ * Call variants with GATK HaplotypeCaller
+ */
+process GATK_HAPLOTYPECALLER {
+
+    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
+
+    publishDir 'results_genomics', mode: 'symlink'
 
+    input:
+        tuple path(input_bam), path(input_bam_index)
+        path ref_fasta
+        path ref_index
+        path ref_dict
+        path interval_list
+
+    output:
+        path "${input_bam}.vcf"     , emit: vcf
+        path "${input_bam}.vcf.idx" , emit: idx
+
+    script:
+    """
+    gatk HaplotypeCaller \
+        -R ${ref_fasta} \
+        -I ${input_bam} \
+        -O ${input_bam}.vcf \
+        -L ${interval_list}
+    """
 }
 
 workflow {
 
-    // Create input channel
+    // Create input channel from a text file listing input file paths
+    reads_ch = Channel.fromPath(params.reads_bam)
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
+
+    // Load the file paths for the accessory files (reference and intervals)
+    ref_file        = file(params.reference)
+    ref_index_file  = file(params.reference_index)
+    ref_dict_file   = file(params.reference_dict)
+    intervals_file  = file(params.intervals)
 
     // Create index file for input BAM file
+    SAMTOOLS_INDEX(reads_ch)
 
+    // Call variants from the indexed BAM file
+    GATK_HAPLOTYPECALLER(
+        SAMTOOLS_INDEX.out,
+        ref_file,
+        ref_index_file,
+        ref_dict_file,
+        intervals_file
+    )
 }
diff --git a/hello-nextflow/hello-modules/main.nf b/hello-nextflow/hello-modules/main.nf
@@ -97,7 +97,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/hello-nf-test/main.nf b/hello-nextflow/hello-nf-test/main.nf
@@ -9,7 +9,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/hello-operators.nf b/hello-nextflow/hello-operators.nf
@@ -68,7 +68,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/solutions/hello-config/final-main.nf b/hello-nextflow/solutions/hello-config/final-main.nf
@@ -97,7 +97,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/solutions/hello-genomics/hello-genomics-4.nf b/hello-nextflow/solutions/hello-genomics/hello-genomics-4.nf
@@ -68,7 +68,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/solutions/hello-modules/final-main.nf b/hello-nextflow/solutions/hello-modules/final-main.nf
@@ -9,7 +9,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/solutions/hello-operators/hello-operators-1.nf b/hello-nextflow/solutions/hello-operators/hello-operators-1.nf
@@ -69,7 +69,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/solutions/hello-operators/hello-operators-2.nf b/hello-nextflow/solutions/hello-operators/hello-operators-2.nf
@@ -100,7 +100,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)

diff --git a/hello-nextflow/solutions/hello-operators/hello-operators-3.nf b/hello-nextflow/solutions/hello-operators/hello-operators-3.nf
@@ -110,7 +110,8 @@ workflow {
 
     // Create input channel from a text file listing input file paths
     reads_ch = Channel.fromPath(params.reads_bam)
-                    .splitText() { bamPath -> file(bamPath.trim()) }
+                    .splitCsv()
+                    .map { bamPath -> file(bamPath[0]) }
 
     // Load the file paths for the accessory files (reference and intervals)
     ref_file        = file(params.reference)