forked from viash-hub/biobox
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5ea8c78
commit 0deebc5
Showing
4 changed files
with
406 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,279 @@ | ||
name: umi_tool_dedup | ||
namespace: umi_tools | ||
description: | | ||
Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. | ||
keywords: [umi_tools, deduplication, dedup] | ||
links: | ||
homepage: https://umi-tools.readthedocs.io/en/latest/ | ||
documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, | ||
https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] | ||
repository: https://github.com/CGATOxford/UMI-tools | ||
references: | ||
doi: 10.1101/gr.209601.116 | ||
license: MIT | ||
|
||
argument_groups: | ||
- name: Inputs | ||
arguments: | ||
- name: --input | ||
alternatives: -I | ||
type: file | ||
description: Input BAM or SAM file. Use --in_sam to specify SAM format. | ||
required: true | ||
- name: --in_sam | ||
type: boolean_true | ||
description: | | ||
By default, inputs are assumed to be in BAM format. Use this options | ||
to specify the use of SAM format for input. | ||
- name: --bai | ||
type: file | ||
description: BAM index | ||
- name: --get_output_stats | ||
type: boolean | ||
description: Whether or not to generate output stats. | ||
- name: --random_seed | ||
type: integer | ||
description: | | ||
Random seed to initialize number generator with. | ||
default: none | ||
|
||
- name: Outputs | ||
arguments: | ||
- name: --output | ||
alternatives: -S | ||
type: file | ||
description: Deduplicated BAM file | ||
required: true | ||
direction: output | ||
- name: --out_sam | ||
type: boolean_true | ||
description: | | ||
By default, outputa are written in BAM format. Use this options to | ||
specify the use of SAM format for output. | ||
- name: --paired | ||
type: boolean_true | ||
description: | | ||
BAM is paired end - output both read pairs. This will also force the | ||
use of the template length to determine reads with the same mapping | ||
coordinates. | ||
- name: --output_stats | ||
type: file | ||
description: Directory containing UMI based deduplication statistics files | ||
direction: output | ||
- name: --extract_umi_method | ||
type: string | ||
description: | | ||
Specify the method by which the barcodes were encoded in the read. | ||
The options are: [read_id, tag, umis]. | ||
default: read_id | ||
- name: --umi_tag | ||
type: string | ||
description: | | ||
The tag containing the UMI sequence. | ||
This is only required if the extract_umi_method is set to tag. | ||
- name: --umi_separator | ||
type: string | ||
description: | | ||
The separator used to separate the UMI from the read sequence. | ||
This is only required if the extract_umi_method is set to id_read. | ||
default: '_' | ||
- name: --umi_tag_split | ||
type: string | ||
description: | | ||
Separate the UMI in tag by <SPLIT> and take the first element. | ||
- name: --umi_tag_delimiter | ||
type: string | ||
description: | | ||
Separate the UMI in by <DELIMITER> and concatenate the elements | ||
- name: --cell_tag | ||
type: string | ||
description: | | ||
The tag containing the cell barcode sequence. | ||
This is only required if the extract_umi_method is set to tag. | ||
- name: --cell_tag_split | ||
type: string | ||
description: | | ||
Separate the cell barcode in tag by <SPLIT> and take the first element. | ||
- name: --cell_tag_delimiter | ||
type: string | ||
description: | | ||
Separate the cell barcode in by <DELIMITER> and concatenate the elements | ||
- name: Grouping Options | ||
arguments: | ||
- name: --method | ||
type: string | ||
description: | | ||
The method to use for grouping reads. The options are: | ||
[unique, percentile, cluster, adjacency, directional]. | ||
default: directional | ||
- name: --edit_distance_threshold | ||
type: integer | ||
description: | | ||
For the adjacency and cluster methods the threshold for the edit | ||
distance to connect two UMIs in the network can be increased. The | ||
default value of 1 works best unless the UMI is very long (>14bp). | ||
default: 1 | ||
- name: --spliced_is_unique | ||
type: boolean_true | ||
description: | | ||
Causes two reads that start in the same position on the same strand | ||
and having the same UMI to be considered unique if one is spliced | ||
and the other is not. (Uses the ‘N’ cigar operation to test for splicing). | ||
- name: --soft_clip_threshold | ||
type: integer | ||
description: | | ||
Mappers that soft clip will sometimes do so rather than mapping a | ||
spliced read if there is only a small overhang over the exon junction. | ||
By setting this option, you can treat reads with at least this many | ||
bases soft-clipped at the 3’ end as spliced. | ||
default: 4 | ||
- name: --multimapping_detection_method | ||
type: string | ||
description: | | ||
If the sam/bam contains tags to identify multimapping reads, you can | ||
specify for use when selecting the best read at a given loci. Supported | ||
tags are “NH”, “X0” and “XT”. If not specified, the read with the highest | ||
mapping quality will be selected. | ||
- name: --read_length | ||
type: integer | ||
description: | | ||
Use the read length as a criteria when deduping, for e.g sRNA-Seq. | ||
- name: Single-cell RNA-Seq Options | ||
arguments: | ||
- name: --per_gene | ||
type: boolean_true | ||
description: | | ||
Reads will be grouped together if they have the same gene. This is useful | ||
if your library prep generates PCR duplicates with non identical alignment | ||
positions such as CEL-Seq. Note this option is hardcoded to be on with the | ||
count command. I.e counting is always performed per-gene. Must be combined | ||
with either --gene_tag or --per_contig option. | ||
- name: --gene_tag | ||
type: string | ||
description: | | ||
Deduplicate per gene. The gene information is encoded in the bam read tag | ||
specified. | ||
- name: --assigned_status_tag | ||
type: string | ||
description: | | ||
BAM tag which describes whether a read is assigned to a gene. Defaults to | ||
the same value as given for --gene_tag. | ||
- name: --skip_tags_regex | ||
type: string | ||
description: | | ||
Use in conjunction with the --assigned_status_tag option to skip any reads | ||
where the tag matches this regex. Default ("^[__|Unassigned]") matches | ||
anything which starts with “__” or “Unassigned”. | ||
- name: --per_contig | ||
type: boolean_true | ||
description: | | ||
Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same | ||
contig will be considered to have the same alignment position. This is | ||
useful if you have aligned to a reference transcriptome with one | ||
transcript per gene. If you have aligned to a transcriptome with more | ||
than one transcript per gene, you can supply a map between transcripts | ||
and gene using the --gene_transcript_map option. | ||
- name: --gene_transcript_map | ||
type: file | ||
description: | | ||
A file containing a mapping between gene names and transcript names. | ||
The file should be tab separated with the gene name in the first column | ||
and the transcript name in the second column. | ||
- name: --per_cell | ||
type: boolean_true | ||
description: | | ||
Reads will only be grouped together if they have the same cell barcode. | ||
Can be combined with --per_gene. | ||
- name: SAM/BAM Options | ||
arguments: | ||
- name: --mapping_quality | ||
type: integer | ||
description: | | ||
Minimium mapping quality (MAPQ) for a read to be retained. | ||
default: 0 | ||
- name: --unmapped_reads | ||
type: string | ||
description: | | ||
How unmapped reads should be handled. | ||
The options are: | ||
"discard": Discard all unmapped reads. | ||
"use": If read2 is unmapped, deduplicate using read1 only. | ||
Requires --paired. | ||
"output": Output unmapped reads/read pairs without UMI | ||
grouping/deduplication. Only available in umi_tools group. | ||
default: discard | ||
- name: --chimeric_pairs | ||
type: string | ||
description: | | ||
How chimeric pairs should be handled. | ||
The options are: | ||
"discard": Discard all chimeric read pairs. | ||
"use": Deduplicate using read1 only. | ||
"output": Output chimeric pairs without UMI grouping/deduplication. | ||
Only available in umi_tools group. | ||
default: use | ||
- name: --unapired_reads | ||
type: string | ||
description: | | ||
How unpaired reads should be handled. | ||
The options are: | ||
"discard": Discard all unpaired reads. | ||
"use": Deduplicate using read1 only. | ||
"output": Output unpaired reads without UMI grouping/deduplication. | ||
Only available in umi_tools group. | ||
default: use | ||
- name: --ignore_umi | ||
type: boolean_true | ||
description: | | ||
Ignore the UMI and group reads using mapping coordinates only. | ||
- name: --subset | ||
type: boolean_true | ||
description: | | ||
Only consider a fraction of the reads, chosen at random. This is useful | ||
for doing saturation analyses. | ||
- name: --chrom | ||
type: string | ||
description: | | ||
Only consider a single chromosome. This is useful for debugging/testing | ||
purposes. | ||
- name: Group/Dedup Options | ||
arguments: | ||
- name: --no_sort_output | ||
type: boolean_true | ||
description: | | ||
By default, output is sorted. This involves the use of a temporary unsorted | ||
file (saved in --temp-dir). Use this option to turn off sorting. | ||
- name: --buffer_whole_contig | ||
type: boolean_true | ||
description: | | ||
Forces dedup to parse an entire contig before yielding any reads for | ||
deduplication. This is the only way to absolutely guarantee that all reads | ||
with the same start position are grouped together for deduplication since | ||
dedup uses the start position of the read, not the alignment coordinate on | ||
which the reads are sorted. However, by default, dedup reads for another | ||
1000bp before outputting read groups which will avoid any reads being missed | ||
with short read sequencing (<1000bp). | ||
resources: | ||
- type: bash_script | ||
path: script.sh | ||
test_resources: | ||
- type: bash_script | ||
path: test.sh | ||
- type: file | ||
path: test_data | ||
engines: | ||
- type: docker | ||
image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 | ||
setup: | ||
- type: docker | ||
run: | | ||
umi_tools -v | sed 's/ version//g' > /var/software_versions.txt | ||
runners: | ||
- type: executable | ||
- type: nextflow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
``` | ||
umi_tools dedup | ||
``` | ||
|
||
dedup - Deduplicate reads using UMI and mapping coordinates | ||
|
||
Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] | ||
|
||
note: If --stdout is ommited, standard out is output. To | ||
generate a valid BAM file on standard out, please | ||
redirect log with --log=LOGFILE or --log2stderr | ||
|
||
For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/bin/bash | ||
|
||
## VIASH START | ||
## VIASH END | ||
|
||
set -e | ||
|
||
test_dir="${metal_executable}/test_data" | ||
|
||
[[ "$par_paired" == "false" ]] && unset par_paired | ||
[[ "$par_in_sam" == "false" ]] && unset par_in_sam | ||
[[ "$par_out_sam" == "false" ]] && unset par_out_sam | ||
[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique | ||
[[ "$par_per_gene" == "false" ]] && unset par_per_gene | ||
[[ "$par_per_contig" == "false" ]] && unset par_per_contig | ||
[[ "$par_per_cell" == "false" ]] && unset par_per_cell | ||
[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output | ||
[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig | ||
[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi | ||
[[ "$par_subset" == "false" ]] && unset par_subset | ||
|
||
|
||
$(which umi_tools) dedup \ | ||
-I "$par_input" \ | ||
${par_in_sam:+--in-sam} \ | ||
${par_bai:+--bai "$par_bai"} \ | ||
${par_get_output_stats:+--get-output-stats} \ | ||
${par_random_seed:+--random-seed "$par_random_seed"} \ | ||
-S "$par_output" \ | ||
${par_out_sam:+--out-sam} \ | ||
${par_paired:+--paired} \ | ||
${par_output_stats:+--output-stats "$par_output_stats"} \ | ||
${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ | ||
${par_umi_tag:+--umi-tag "$par_umi_tag"} \ | ||
${par_umi_separator:+--umi-separator "$par_umi_separator"} \ | ||
${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ | ||
${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ | ||
${par_cell_tag:+--cell-tag "$par_cell_tag"} \ | ||
${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ | ||
${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ | ||
${par_method:+--method "$par_method"} \ | ||
${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ | ||
${par_spliced_is_unique:+--spliced-is-unique} \ | ||
${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ | ||
${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ | ||
${par_read_length:+--read-length "$par_read_length"} \ | ||
${par_per_gene:+--per-gene} \ | ||
${par_gene_tag:+--gene-tag "$par_gene_tag"} \ | ||
${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ | ||
${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ | ||
${par_per_contig:+--per-contig} | ||
${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ | ||
${par_per_cell:+--per-cell} \ | ||
${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ | ||
${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ | ||
${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ | ||
${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ | ||
${par_ignore_umi:+--ignore-umi} \ | ||
${par_subset:+--subset} \ | ||
${par_chrom:+--chrom "$par_chrom"} \ | ||
${par_no_sort_output:+--no-sort-output} \ | ||
${par_buffer_whole_contig:+--buffer-whole-contig} | ||
|
||
|
||
exit 0 |
Oops, something went wrong.