From 4cac1278b9c4a96aebcc8185d8df834fee9a7a87 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:50:45 +0100 Subject: [PATCH] Initial commit for umi-tools dedup --- src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 4 files changed, 406 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. + This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. + By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. + - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file