diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. - required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. - - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. - - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. - - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" -exit 0 \ No newline at end of file