From 923d7ef2667489ea61509a9840b879ccb57ca0a0 Mon Sep 17 00:00:00 2001
From: PhilPalmer
Date: Thu, 25 Jul 2019 10:49:14 -0400
Subject: [PATCH 1/3] Added hisat2 component

Adds a `Hisat2` mapping component (fastq -> bam) and its Nextflow
template. When `--reference` is given the template builds a HISAT2 index
from it; otherwise it uses a pre-built index supplied through
`--hisat2_index` and `--hisat2_index_name`. Reads are aligned with hisat2
and the SAM output is converted, sorted and indexed with samtools.

The index-building process is guarded by `params.reference`, the same
condition that creates its input channel, so supplying both a reference
and an index no longer leaves the index channels undefined.
---
 flowcraft/generator/components/mapping.py | 64 +++++++++++++++++
 flowcraft/generator/templates/hisat2.nf   | 84 +++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 flowcraft/generator/templates/hisat2.nf

diff --git a/flowcraft/generator/components/mapping.py b/flowcraft/generator/components/mapping.py
index a310b198..660ed397 100644
--- a/flowcraft/generator/components/mapping.py
+++ b/flowcraft/generator/components/mapping.py
@@ -245,4 +245,68 @@ def __init__(self, **kwargs):
         self.status_channels = [
             "base_recalibrator",
             "apply_bqsr"
+        ]
+
+
+class Hisat2(Process):
+    """HISAT2 is a fast and sensitive alignment program for mapping
+    next-generation sequencing reads (both DNA and RNA) to a population of
+    human genomes (as well as to a single reference genome).
+
+    This process is set with:
+
+        - ``input_type``: fastq
+        - ``output_type``: bam
+        - ``ptype``: mapping
+    """
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+        self.input_type = "fastq"
+        self.output_type = "bam"
+
+        self.params = {
+            "reference": {
+                "default": "null",
+                "description": "Specifies the reference genome to be provided "
+                               "to HISAT2."
+            },
+            "hisat2_index": {
+                "default": "null",
+                "description": "Specifies the reference indexes to be provided "
+                               "to HISAT2."
+            },
+            "hisat2_index_name": {
+                "default": "null",
+                "description": "Specifies the reference indexes folder & basename to be provided "
+                               "to HISAT2, eg hisat2_index_folder/basename."
+            }
+        }
+
+        self.directives = {
+            "make_hisat2_index": {
+                "container": "makaho/hisat2-zstd",
+                "version": "latest",
+                "memory": "{5.Gb*task.attempt}",
+                "cpus": 1
+            },
+            "hisat2": {
+                "container": "makaho/hisat2-zstd",
+                "version": "latest",
+                "memory": "{5.Gb*task.attempt}",
+                "cpus": 4
+            },
+            "samtools_sort": {
+                "container": "lifebitai/samtools",
+                "version": "latest",
+                "memory": "{5.Gb*task.attempt}",
+                "cpus": 4
+            }
+        }
+
+        self.status_channels = [
+            "hisat2",
+            "samtools_sort"
         ]
\ No newline at end of file
diff --git a/flowcraft/generator/templates/hisat2.nf b/flowcraft/generator/templates/hisat2.nf
new file mode 100644
index 00000000..621a6718
--- /dev/null
+++ b/flowcraft/generator/templates/hisat2.nf
@@ -0,0 +1,84 @@
+if (params.reference{{ param_id }}) {
+    Channel
+        .fromPath(params.reference{{ param_id }})
+        .ifEmpty { exit 1, "FASTA annotation file not found: ${params.reference{{ param_id }}}" }
+        .set { hisat2Fasta_{{pid}} }
+} else if (params.hisat2_index{{ param_id }}) {
+    Channel
+        .fromPath(params.hisat2_index{{ param_id }})
+        .ifEmpty { exit 1, "Folder containing Hisat2 indexes for reference genome not found: ${params.hisat2_index{{ param_id }}}" }
+        .set { hisat2Index_{{pid}} }
+    hisat2IndexName_{{pid}} = Channel.value( "${params.hisat2_index_name{{ param_id }}}" )
+} else {
+    exit 1, "Please specify either `--reference /path/to/file.fasta` OR `--hisat2_index /path/to/hisat2_index_folder` AND `--hisat2_index_name hisat2_index_folder/basename`"
+}
+
+if (params.reference{{ param_id }}) {
+    process make_hisat2_index_{{ pid }} {
+
+        {% include "post.txt" ignore missing %}
+        tag "$fasta"
+
+        input:
+        each file(fasta) from hisat2Fasta_{{pid}}
+
+        output:
+        val "hisat2_index/${fasta.baseName}.hisat2_index" into hisat2IndexName_{{pid}}
+        file "hisat2_index" into hisat2Index_{{pid}}
+
+        """
+        mkdir hisat2_index
+        hisat2-build -p ${task.cpus} $fasta hisat2_index/${fasta.baseName}.hisat2_index
+        """
+    }
+}
+
+process hisat2_{{ pid }} {
+
+    {% include "post.txt" ignore missing %}
+    tag "$sample_id"
+
+    input:
+    set sample_id, file(fastq_pair) from {{ input_channel }}
+    each index_name from hisat2IndexName_{{pid}}
+    each file(index) from hisat2Index_{{pid}}
+
+    output:
+    set sample_id, file("${sample_id}.sam") into hisat2Sam_{{pid}}
+    {% with task_name="hisat2" %}
+    {%- include "compiler_channels.txt" ignore missing -%}
+    {% endwith %}
+
+    """
+    hisat2 \
+        -p ${task.cpus} \
+        -x $index_name \
+        -1 ${fastq_pair[0]} \
+        -2 ${fastq_pair[1]} \
+        -S ${sample_id}.sam
+    """
+}
+
+process samtools_sort_{{ pid }} {
+
+    {% include "post.txt" ignore missing %}
+    publishDir "results/mapping/hisat2_{{ pid }}"
+    tag "$sample_id"
+
+    input:
+    set sample_id, file(sam) from hisat2Sam_{{pid}}
+
+    output:
+    set sample_id, file("${sample_id}.sorted.bam"), file("${sample_id}.sorted.bam.bai") into {{ output_channel }}
+    {% with task_name="samtools_sort" %}
+    {%- include "compiler_channels.txt" ignore missing -%}
+    {% endwith %}
+
+    """
+    samtools view -@ ${task.cpus} -Sb $sam > ${sample_id}.bam
+    samtools sort -@ ${task.cpus} -T ${sample_id}.bam.tmp ${sample_id}.bam -o ${sample_id}.sorted.bam
+    samtools index ${sample_id}.sorted.bam
+    """
+}
+
+{{ forks }}
\ No newline at end of file

From a3dd95fc5f429f8c30776da95adc987b1ba9f4b6 Mon Sep 17 00:00:00 2001
From: PhilPalmer
Date: Thu, 25 Jul 2019 13:34:29 -0400
Subject: [PATCH 2/3] Updated changelog with hisat2 component

---
 changelog.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/changelog.md b/changelog.md
index 645f2d1a..82fbd88a 100644
--- a/changelog.md
+++ b/changelog.md
@@ -4,10 +4,11 @@
 
 ### New components
 
-- `Bwa`: align short paired-end sequencing reads to long reference sequences
+- `Bwa`: Align short paired-end sequencing reads to long reference sequences
 - `MarkDuplicates`: Identifies duplicate reads
 - `BaseRecalibrator`: Detects systematic errors in base quality scores
 - `Haplotypecaller`: Call germline SNPs and indels via local re-assembly of haplotypes
+- `Hisat2`: Alignment program for mapping next-generation sequencing reads to a reference genome
 - `Seroba`: Serotyping of *Streptococcus pneumoniae* sequencing data (FastQ)
 - `Concoct`: Clustering metagenomic assembled comtigs with coverage and composition
 

From 7392efd2f27795eb793a2838f6031c4b8114173f Mon Sep 17 00:00:00 2001
From: PhilPalmer
Date: Thu, 25 Jul 2019 15:15:42 -0400
Subject: [PATCH 3/3] Updated reference parameter

`--reference` is now given as a path without its extension; the template
appends `.fasta` when resolving the file, and the not-found message
reports the path that was actually looked up.
---
 flowcraft/generator/templates/hisat2.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flowcraft/generator/templates/hisat2.nf b/flowcraft/generator/templates/hisat2.nf
index 621a6718..f3fe6d10 100644
--- a/flowcraft/generator/templates/hisat2.nf
+++ b/flowcraft/generator/templates/hisat2.nf
@@ -1,6 +1,6 @@
 if (params.reference{{ param_id }}) {
     Channel
-        .fromPath(params.reference{{ param_id }})
-        .ifEmpty { exit 1, "FASTA annotation file not found: ${params.reference{{ param_id }}}" }
+        .fromPath("${params.reference{{ param_id }}}.fasta")
+        .ifEmpty { exit 1, "Reference FASTA file not found: ${params.reference{{ param_id }}}.fasta" }
         .set { hisat2Fasta_{{pid}} }
 } else if (params.hisat2_index{{ param_id }}) {
@@ -10,7 +10,7 @@ if (params.reference{{ param_id }}) {
         .set { hisat2Index_{{pid}} }
     hisat2IndexName_{{pid}} = Channel.value( "${params.hisat2_index_name{{ param_id }}}" )
 } else {
-    exit 1, "Please specify either `--reference /path/to/file.fasta` OR `--hisat2_index /path/to/hisat2_index_folder` AND `--hisat2_index_name hisat2_index_folder/basename`"
+    exit 1, "Please specify either `--reference /path/to/file_basename` OR `--hisat2_index /path/to/hisat2_index_folder` AND `--hisat2_index_name hisat2_index_folder/basename`"
 }
 
 if (params.reference{{ param_id }}) {