New option to build bismark index. New UPPMAX test script. Docker tes…

…ts with and without bismark index.
Gregor-Mendel-Institute · Jun 25, 2017 · e2bfd12 · e2bfd12
1 parent 73072b5
commit e2bfd12
Show file tree

Hide file tree

Showing 5 changed files with 122 additions and 40 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -18,4 +18,6 @@ env:
   - s=docker_test
   - s=docker_test_bwameth
 
-script: "./${s}.sh"
+script:
+  - "./${s}.sh"
+  - "./${s}.sh true" # Run again, building reference genome
diff --git a/bismark.nf b/bismark.nf
@@ -22,10 +22,12 @@ vim: syntax=groovy
 version = 0.1
 
 // Configurable variables
+params.name = false
 params.project = false
 params.email = false
 params.genome = false
 params.bismark_index = params.genome ? params.genomes[ params.genome ].bismark ?: false : false
+params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
 params.saveReference = false
 params.saveTrimmed = false
 params.saveAlignedIntermediates = false
@@ -46,13 +48,29 @@ params.numMismatches = 0.6
 
 // Validate inputs
 if( params.bismark_index ){
-    bismark_index = file(params.bismark_index)
-    if( !bismark_index.exists() ) exit 1, "Bismark index not found: ${params.bismark_index}"
-} else {
-    exit 1, "No reference genome specified! Please use --genome or --bismark_index"
+    bismark_index = Channel
+        .fromPath(params.bismark_index)
+        .ifEmpty { exit 1, "Bismark index not found: ${params.bismark_index}" }
+}
+else if ( params.fasta ){
+    fasta = file(params.fasta)
+    if( !fasta.exists() ) exit 1, "Fasta file not found: ${params.fasta}"
+}
+else {
+    exit 1, "No reference genome specified! Please use --genome, --bismark_index or --fasta"
 }
 multiqc_config = file(params.multiqc_config)
 
+// Validate inputs
+if( workflow.profile == 'standard' && !params.project ) exit 1, "No UPPMAX project ID found! Use --project"
+
+// Has the run name been specified by the user?
+//  this has the bonus effect of catching both -name and --name
+custom_runName = params.name
+if( !(workflow.runName ==~ /[a-z]+_[a-z]+/) ){
+  custom_runName = workflow.runName
+}
+
 params.rrbs = false
 params.pbat = false
 params.single_cell = false
@@ -105,22 +123,12 @@ log.info "=================================================="
 log.info " NGI-MethylSeq : Bisulfite-Seq Best Practice v${version}"
 log.info "=================================================="
 def summary = [:]
+summary['Run Name']       = custom_runName ?: workflow.runName
 summary['Reads']          = params.reads
 summary['Data Type']      = params.singleEnd ? 'Single-End' : 'Paired-End'
 summary['Genome']         = params.genome
-summary['Bismark Index']  = params.bismark_index
-summary['Current home']   = "$HOME"
-summary['Current user']   = "$USER"
-summary['Current path']   = "$PWD"
-summary['Working dir']    = workflow.workDir
-summary['Output dir']     = params.outdir
-summary['Script dir']     = workflow.projectDir
-summary['Deduplication']  = params.nodedup ? 'No' : 'Yes'
-summary['Save Trimmed']   = params.saveTrimmed
-summary['Save Unmapped']  = params.unmapped ? 'Yes' : 'No'
-summary['Save Intermeds'] = params.saveAlignedIntermediates
-summary['Directional Mode'] = params.non_directional ? 'No' : 'Yes'
-summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No'
+if(params.bismark_index) summary['Bismark Index'] = params.bismark_index
+else if(params.fasta)    summary['Fasta Ref'] = params.fasta
 if(params.rrbs) summary['RRBS Mode'] = 'On'
 if(params.relaxMismatches) summary['Mismatch Func'] = "L,0,-${params.numMismatches} (Bismark default = L,0,-0.2)"
 if(params.notrim)       summary['Trimming Step'] = 'Skipped'
@@ -129,18 +137,51 @@ if(params.single_cell)  summary['Trim Profile'] = 'Single Cell'
 if(params.epignome)     summary['Trim Profile'] = 'Epignome'
 if(params.accel)        summary['Trim Profile'] = 'Accel'
 if(params.cegx)         summary['Trim Profile'] = 'CEGX'
-if(params.clip_r1 > 0)  summary['Trim R1'] = params.clip_r1
-if(params.clip_r2 > 0)  summary['Trim R2'] = params.clip_r2
-if(params.three_prime_clip_r1 > 0) summary["Trim 3' R1"] = params.three_prime_clip_r1
-if(params.three_prime_clip_r2 > 0) summary["Trim 3' R2"] = params.three_prime_clip_r2
+summary['Trim R1'] = params.clip_r1
+summary['Trim R2'] = params.clip_r2
+summary["Trim 3' R1"] = params.three_prime_clip_r1
+summary["Trim 3' R2"] = params.three_prime_clip_r2
+summary['Deduplication']  = params.nodedup ? 'No' : 'Yes'
+summary['Save Reference'] = params.saveReference ? 'Yes' : 'No'
+summary['Save Trimmed']   = params.saveTrimmed ? 'Yes' : 'No'
+summary['Save Unmapped']  = params.unmapped ? 'Yes' : 'No'
+summary['Save Intermeds'] = params.saveAlignedIntermediates ? 'Yes' : 'No'
+summary['Directional Mode'] = params.non_directional ? 'No' : 'Yes'
+summary['All C Contexts'] = params.comprehensive ? 'Yes' : 'No'
+summary['Current home']   = "$HOME"
+summary['Current user']   = "$USER"
+summary['Current path']   = "$PWD"
+summary['Working dir']    = workflow.workDir
+summary['Output dir']     = params.outdir
+summary['Script dir']     = workflow.projectDir
 summary['Config Profile'] = (workflow.profile == 'standard' ? 'UPPMAX' : workflow.profile)
 if(params.project) summary['UPPMAX Project'] = params.project
 if(params.email) summary['E-mail Address'] = params.email
 log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
 log.info "========================================="
 
-// Validate inputs
-if( workflow.profile == 'standard' && !params.project ) exit 1, "No UPPMAX project ID found! Use --project"
+
+/*
+ * PREPROCESSING - Build Bismark index
+ *
+ * Only defined when no pre-built index was supplied (params.bismark_index is
+ * falsy) and a FASTA reference was loaded into `fasta` during input validation.
+ * Runs bismark_genome_preparation on the task directory and emits the
+ * resulting "Bisulfite_Genome" directory on the `bismark_index` channel.
+ */
+if(!params.bismark_index && fasta){
+    process makeBismarkIndex {
+        tag fasta
+        // Publish the generated reference only when --saveReference is set:
+        // a saveAs closure that returns null tells Nextflow to skip the file.
+        publishDir path: { params.saveReference ? "${params.outdir}/reference_genome" : params.outdir },
+                   saveAs: { params.saveReference ? it : null }, mode: 'copy'
+
+        input:
+        file fasta from fasta
+
+        output:
+        file "Bisulfite_Genome" into bismark_index
+
+        script:
+        // The FASTA input is staged into the task directory, so "./" is passed as the genome folder.
+        """
+        bismark_genome_preparation ./
+        """
+    }
+}
 
 
 /*

diff --git a/docs/installation.md b/docs/installation.md
@@ -82,20 +82,18 @@ process {
 ```
 
 ### Reference Genomes
-The NGI-MethylSeq pipeline needs a reference genome for alignment and annotation. If not already available, start by downloading the relevant reference, for example from [illumina iGenomes](https://support.illumina.com/sequencing/sequencing_software/igenome.html).
+The NGI-MethylSeq pipeline needs a reference genome for read alignment. Support for many common genomes is built in when running on UPPMAX or AWS, through [Illumina iGenomes](https://support.illumina.com/sequencing/sequencing_software/igenome.html).
 
-> NB: The below paragraph is a lie. You currently need a Bismark reference. Integrated builds from Fasta files coming soon...
+If you don't want to use Illumina iGenomes, you can supply either a Bismark reference or a FASTA file. If a Bismark reference is specified, the pipeline won't have to generate it and will finish quite a bit faster. If a FASTA file is supplied, the Bismark reference will be built when the pipeline starts. Use the command line option `--saveReference` to keep the generated reference so that it can be added to your config and used again in the future. Use `--bismark_index` or `--fasta` to specify the path to the reference.
 
-The minimal requirements are a FASTA file. If a Bismark reference is specified, the pipeline won't have to generate it and will be finished quite a bit faster. Use the command line option `--saveReference` to keep the generated references so that they can be added to your config and used again in the future.
-
-A reference genome path can be specified on the command line each time you run with `--bismark_index` or `--fasta`. Alternatively, add the paths to the config under a relevant id and just specify this id with `--genome ID` when you run the pipeline _(this can also be set as a default in your config)_:
+Alternatively, you can add the paths to your Nextflow config under a relevant ID and just specify this ID with `--genome ID` when you run the pipeline:
 
 ```groovy
 params {
   genomes {
     'YOUR-ID' {
       bismark  = '<PATH TO BISMARK REF>/BismarkIndex'
-      fasta  = '<PATH TO FASTA FILE>/genome.fa'
+      fasta  = '<PATH TO FASTA FILE>/genome.fa' // used if above is not specified
     }
     'OTHER-GENOME' {
       // [..]
@@ -106,7 +104,6 @@ params {
 }
 ```
 
-
 ### Software Requirements
 To run the pipeline, several software packages are required. How you satisfy these requirements is essentially up to you and depends on your system.
 

diff --git a/tests/docker_test.sh b/tests/docker_test.sh
@@ -1,13 +1,5 @@
 #!/usr/bin/env bash
 
-script_path="../bismark.nf"
-if [ -z $1]
-then
-    echo "No argument given, going to try to run ../bismark.nf"
-else
-    script_path=$1
-fi
-
 data_path="/tmp"
 if [ -d "./test_data" ]
 then
@@ -32,7 +24,16 @@ else
     echo "Done"
 fi
 
-cmd="nextflow run $script_path -resume -profile testing --bismark_index ${data_dir}/references/BismarkIndex/ --singleEnd --reads \"${data_dir}/*.fastq.gz\""
+# Choose the reference source for this test run:
+#   no argument  -> build the Bismark index from the FASTA file
+#   any argument -> use the pre-built Bismark index
+# NOTE(review): .travis.yml describes the "true" run as "building reference
+# genome", which is the opposite of this mapping - confirm which is intended.
+# "$1" is quoted and separated from "]" so the test is well-formed both with
+# and without an argument.
+if [ -z "$1" ]
+then
+    buildrefs="--fasta ${data_dir}/references/WholeGenomeFasta/genome.fa"
+else
+    buildrefs="--bismark_index ${data_dir}/references/BismarkIndex/"
+fi
+
+run_name="Test MethylSeq Run: "$(date +%s)
+
+cmd="nextflow run ../bismark.nf -resume -name \"$run_name\" -profile testing $buildrefs --singleEnd --reads \"${data_dir}/*.fastq.gz\""
 echo "Starting nextflow... Command:"
 echo $cmd
 echo "-----"

diff --git a/tests/uppmax_test.sh b/tests/uppmax_test.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Run the pipeline against the NGI bisulfite test set with the `devel` profile.
+#
+# Usage: uppmax_test.sh [PIPELINE_SCRIPT]
+#   PIPELINE_SCRIPT  path to the Nextflow script to run (default: ../bismark.nf)
+#
+# The test data is looked for in ./test_data, falling back to $SNIC_NOBACKUP.
+# If the test set is not there yet, it is downloaded and unpacked first.
+
+script_path="../bismark.nf"
+# "$1" is quoted and separated from "]" so the test is well-formed both with
+# and without an argument.
+if [ -z "$1" ]
+then
+    echo "No argument given, going to try to run ../bismark.nf"
+else
+    script_path=$1
+fi
+
+data_path=$SNIC_NOBACKUP
+if [ -d "./test_data" ]
+then
+    data_path="./test_data"
+    echo "Found data directory in current working directory, using ./test_data/"
+fi
+
+# Without a data directory, the paths below would resolve against the filesystem root.
+if [ -z "$data_path" ]
+then
+    echo >&2 "No ./test_data directory found and \$SNIC_NOBACKUP is not set. Aborting."
+    exit 1
+fi
+
+curl --version >/dev/null 2>&1 || { echo >&2 "I require curl, but it's not installed. Aborting."; exit 1; }
+tar --version >/dev/null 2>&1 || { echo >&2 "I require tar, but it's not installed. Aborting."; exit 1; }
+nextflow -v >/dev/null 2>&1 || { echo >&2 "I require nextflow, but it's not installed. If you have Java, run 'curl -fsSL get.nextflow.io | bash'. If not, install Java."; exit 1; }
+
+data_dir="${data_path}/ngi-bisulfite_test_set"
+if [ -d "$data_dir" ]
+then
+    echo "Found existing test set, using $data_dir"
+else
+    echo "Downloading test set..."
+    # --fail makes curl return non-zero on HTTP errors instead of saving the error page.
+    curl --fail https://export.uppmax.uu.se/b2013064/test-data/ngi-bisulfite_test_set.tar.bz2 > "${data_path}/ngi-bisulfite_test_set.tar.bz2" \
+        || { echo >&2 "Could not download the test set. Aborting."; exit 1; }
+    echo "Unpacking test set..."
+    tar xvjf "${data_path}/ngi-bisulfite_test_set.tar.bz2" -C "${data_path}" \
+        || { echo >&2 "Could not unpack the test set. Aborting."; exit 1; }
+    echo "Done"
+fi
+
+run_name="Test MethylSeq Run: $(date +%s)"
+
+# The command is kept as a string so it can be echoed before it is run; the
+# escaped quotes protect the run name, the paths and the unexpanded glob when
+# the string is passed to eval.
+cmd="nextflow run \"$script_path\" -resume -name \"$run_name\" -profile devel --bismark_index \"${data_dir}/references/BismarkIndex/\" --singleEnd --reads \"${data_dir}/*.fastq.gz\""
+echo "Starting nextflow... Command:"
+echo "$cmd"
+echo "-----"
+eval "$cmd"
+