Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added hisat2 component #213

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

### New components

- `Bwa`: align short paired-end sequencing reads to long reference sequences
- `Bwa`: Align short paired-end sequencing reads to long reference sequences
- `MarkDuplicates`: Identifies duplicate reads
- `BaseRecalibrator`: Detects systematic errors in base quality scores
- `Haplotypecaller`: Call germline SNPs and indels via local re-assembly of haplotypes
- `HiSAT2`: Alignment program for mapping next-generation sequencing reads to a reference genome

- `Seroba`: Serotyping of *Streptococcus pneumoniae* sequencing data (FastQ)
- `Concoct`: Clustering metagenomic assembled comtigs with coverage and composition
Expand Down
60 changes: 60 additions & 0 deletions flowcraft/generator/components/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,4 +245,64 @@ def __init__(self, **kwargs):
self.status_channels = [
"base_recalibrator",
"apply_bqsr"
]


class Hisat2(Process):
"""HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes (as well as to a single reference genome)
This process is set with:
- ``input_type``: fastq
- ``output_type``: bam
- ``ptype``: mapping
"""

def __init__(self, **kwargs):

super().__init__(**kwargs)

self.input_type = "fastq"
self.output_type = "bam"

self.params = {
"reference": {
"default": "null",
"description": "Specifies the reference genome to be provided "
"to HISAT2."
},
"hisat2_index": {
"default": "null",
"description": "Specifies the reference indexes to be provided "
"to HISAT2."
},
"hisat2_index_name": {
"default": "null",
"description": "Specifies the reference indexes folder & basename to be provided "
"to HISAT2, eg hisat2_index_folder/basename."
}
}

self.directives = {
"make_hisat2_index": {
"container": "makaho/hisat2-zstd",
"version": "latest",
"memory": "{5.Gb*task.attempt}",
"cpus": 1
},
"hisat2": {
"container": "makaho/hisat2-zstd",
"version": "latest",
"memory": "{5.Gb*task.attempt}",
"cpus": 4
},
"samtools_sort": {
"container": "lifebitai/samtools",
"version": "latest",
"memory": "{5.Gb*task.attempt}",
"cpus": 4
}
}

self.status_channels = [
"hisat2",
"samtools_sort"
]
84 changes: 84 additions & 0 deletions flowcraft/generator/templates/hisat2.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
if (params.reference{{ param_id }}) {
Channel
.fromPath("${params.reference{{ param_id }}}.fasta")
.ifEmpty { exit 1, "FASTA annotation file not found: ${params.reference{{ param_id }}}" }
.set { hisat2Fasta_{{pid}} }
} else if (params.hisat2_index{{ param_id }}) {
Channel
.fromPath(params.hisat2_index{{ param_id }})
.ifEmpty { exit 1, "Folder containing Hisat2 indexes for reference genome not found: ${params.hisat2_index{{ param_id }}}" }
.set { hisat2Index_{{pid}} }
hisat2IndexName_{{pid}} = Channel.value( "${params.hisat2_index_name{{ param_id }}}" )
} else {
exit 1, "Please specify either `--reference /path/to/file_basename` OR `--hisat2_index /path/to/hisat2_index_folder` AND `--hisat2_index_name hisat2_index_folder/basename`"
}

if (!params.hisat2_index{{ param_id }}) {
process make_hisat2_index_{{ pid }} {

{% include "post.txt" ignore missing %}
tag "$fasta"

input:
each file(fasta) from hisat2Fasta_{{pid}}

output:
val "hisat2_index/${fasta.baseName}.hisat2_index" into hisat2IndexName_{{pid}}
file "hisat2_index" into hisat2Index_{{pid}}

"""
mkdir hisat2_index
hisat2-build -p ${task.cpus} $fasta hisat2_index/${fasta.baseName}.hisat2_index
"""
}
}

process hisat2_{{ pid }} {

{% include "post.txt" ignore missing %}
tag "$sample_id"

input:
set sample_id, file(fastq_pair) from {{ input_channel }}
each index_name from hisat2IndexName_{{pid}}
each file(index) from hisat2Index_{{pid}}

output:
set sample_id, file("${sample_id}.sam") into hisat2Sam_{{pid}}
{% with task_name="hisat2" %}
{%- include "compiler_channels.txt" ignore missing -%}
{% endwith %}

"""
hisat2 \
-p ${task.cpus} \
-x $index_name \
-1 ${fastq_pair[0]} \
-2 ${fastq_pair[1]} \
-S ${sample_id}.sam
"""
}

process samtools_sort_{{ pid }} {

{% include "post.txt" ignore missing %}
publishDir "results/mapping/hisat2_{{ pid }}"
tag "$sample_id"

input:
set sample_id, file(sam) from hisat2Sam_{{pid}}

output:
set sample_id, file("${sample_id}.sorted.bam"), file("${sample_id}.sorted.bam.bai") into {{ output_channel }}
{% with task_name="samtools_sort" %}
{%- include "compiler_channels.txt" ignore missing -%}
{% endwith %}

"""
samtools view -Sb $sam > ${sample_id}.bam
samtools sort -T ${sample_id}.bam.tmp ${sample_id}.bam -o ${sample_id}.sorted.bam
Copy link
Collaborator

@tiagofilipe12 tiagofilipe12 Jul 26, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can improve I/O here by piping some of these commands: https://github.com/assemblerflow/flowcraft/blob/master/flowcraft/generator/templates/mapping_patlas.nf#L40 (example).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additionally, in your class you have defined the output as bam, not as bam, bai. This makes this component theoretically compatible with modules that receive bam files (retrieve_mapped, for example) but in fact they're not being compatible. Why don't you pass the .bai file as a side channel?

samtools index ${sample_id}.sorted.bam
"""
}

{{ forks }}