From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:04:14 +0100 Subject: [PATCH 1/5] initial commit dedup --- CHANGELOG.md | 3 + src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 5 files changed, 409 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fd7f001..1bef9345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). +* `umi_tools`: + - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. 
+keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. 
+ This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. 
+ By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. 
If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. 
+ - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. 
To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + 
${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" 
&& exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:38:57 +0100 Subject: [PATCH 2/5] Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. 
--- CHANGELOG.md | 3 - src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------ src/umi_tools/umi_tools_dedup/help.txt | 13 - src/umi_tools/umi_tools_dedup/script.sh | 65 ---- src/umi_tools/umi_tools_dedup/test.sh | 49 --- 5 files changed, 409 deletions(-) delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. 
- required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. 
- - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. 
- - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. 
- - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. 
This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. 
To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - 
${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" 
&& exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" 
-exit 0 \ No newline at end of file From 4b11f7f2332517695c6c3d247e25e585c3dd8522 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 1 Sep 2024 20:08:59 +0200 Subject: [PATCH 3/5] Functional component with tests --- src/sortmerna/config.vsh.yaml | 292 ++++++++++++++++++++ src/sortmerna/help.txt | 319 ++++++++++++++++++++++ src/sortmerna/script.sh | 108 ++++++++ src/sortmerna/test.sh | 52 ++++ src/sortmerna/test_data/rRNA/database1.fa | 24 ++ src/sortmerna/test_data/rRNA/database2.fa | 16 ++ src/sortmerna/test_data/reads_1.fq.gz | Bin 0 -> 189 bytes src/sortmerna/test_data/reads_2.fq.gz | Bin 0 -> 147 bytes src/sortmerna/test_data/script.sh | 8 + 9 files changed, 819 insertions(+) create mode 100644 src/sortmerna/config.vsh.yaml create mode 100644 src/sortmerna/help.txt create mode 100755 src/sortmerna/script.sh create mode 100644 src/sortmerna/test.sh create mode 100644 src/sortmerna/test_data/rRNA/database1.fa create mode 100644 src/sortmerna/test_data/rRNA/database2.fa create mode 100644 src/sortmerna/test_data/reads_1.fq.gz create mode 100644 src/sortmerna/test_data/reads_2.fq.gz create mode 100755 src/sortmerna/test_data/script.sh diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml new file mode 100644 index 00000000..23925132 --- /dev/null +++ b/src/sortmerna/config.vsh.yaml @@ -0,0 +1,292 @@ +name: sortmerna +description: | + Local sequence alignment tool for filtering, mapping and clustering. The main + application of SortMeRNA is filtering rRNA from metatranscriptomic data. SortMeRNA + takes as input files of reads (fasta, fastq, fasta.gz, fastq.gz) and one or multiple + rRNA database file(s), and sorts apart aligned and rejected reads into two files. 
+keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering]
+links:
+  homepage: https://sortmerna.readthedocs.io/en/latest/
+  documentation: https://sortmerna.readthedocs.io/en/latest/manual4.0.html
+  repository: https://github.com/sortmerna/sortmerna
+references:
+  doi: 10.1093/bioinformatics/bts611
+license: GPL-3.0
+
+argument_groups:
+- name: "Input"
+  arguments:
+  - name: "--paired"
+    type: boolean_true
+    description: |
+      Reads are paired-end. If a single reads file is provided, use this option
+      to indicate the file contains interleaved paired reads when neither
+      'paired_in' | 'paired_out' | 'out2' | 'sout' are specified.
+  - name: "--input"
+    type: file
+    multiple: true
+    description: Input fastq
+  - name: "--ref"
+    type: file
+    multiple: true
+    description: Reference fasta file(s) for rRNA database.
+  - name: "--ribo_database_manifest"
+    type: file
+    description: Text file containing paths to fasta files (one per line) that will be used to create the database for SortMeRNA.
+
+- name: "Output"
+  arguments:
+  - name: "--log"
+    type: file
+    direction: output
+    must_exist: false
+    example: $id.sortmerna.log
+    description: Sortmerna log file.
+  - name: "--output"
+    alternatives: ["--aligned"]
+    type: string
+    description: |
+      Directory and file prefix for aligned output. The appropriate extension:
+      (fasta|fastq|blast|sam|etc) is automatically added.
+      If 'dir' is not specified, the output is created in the WORKDIR/out/.
+      If 'pfx' is not specified, the prefix 'aligned' is used.
+  - name: "--other"
+    type: string
+    description: Create Non-aligned reads output file with this path/prefix. Must be used with fastx.
+
+- name: "Options"
+  arguments:
+  - name: "--kvdb"
+    type: string
+    description: Path to directory of the key-value database file, used for storing the alignment results.
+  - name: "--idx_dir"
+    type: string
+    description: Path to the directory for storing the reference index files.
+  - name: "--readb"
+    type: string
+    description: Path to the directory for storing pre-processed reads.
+  - name: "--fastx"
+    type: boolean_true
+    description: Output aligned reads into FASTA/FASTQ file
+  - name: "--sam"
+    type: boolean_true
+    description: Output SAM alignment for aligned reads.
+  - name: "--sq"
+    type: boolean_true
+    description: Add SQ tags to the SAM file
+  - name: "--blast"
+    type: string
+    description: |
+      Blast options:
+      * '0' - pairwise
+      * '1' - tabular(Blast - m 8 format)
+      * '1 cigar' - tabular + column for CIGAR
+      * '1 cigar qcov' - tabular + columns for CIGAR and query coverage
+      * '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage and strand
+    choices: ['0', '1', '1 cigar', '1 cigar qcov', '1 cigar qcov qstrand']
+  - name: "--num_alignments"
+    type: integer
+    description: |
+      Report first INT alignments per read reaching E-value. If INT = 0, all alignments will be output. Default: '0'
+    example: 0
+  - name: "--min_lis"
+    type: integer
+    description: |
+      Search all alignments having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is
+      computed using seeds’ positions to expand hits into longer matches prior to Smith-Waterman alignment. Default: '2'.
+    example: 2
+  - name: "--print_all_reads"
+    type: boolean_true
+    description: Output null alignment strings for non-aligned reads to SAM and/or BLAST tabular files.
+  - name: "--paired_in"
+    type: boolean_true
+    description: |
+      In the case where a pair of reads is aligned with a score above the threshold, the output of the reads is controlled
+      by the following options:
+      * --paired_in and --paired_out are both false: Only one read per pair is output to the aligned fasta file.
+      * --paired_in is true and --paired_out is false: Both reads of the pair are output to the aligned fasta file.
+      * --paired_in is false and --paired_out is true: Both reads are output to the other fasta file (if it is specified).
+  - name: "--paired_out"
+    type: boolean_true
+    description: See description of --paired_in.
+  - name: "--out2"
+    type: boolean_true
+    description: |
+      Output paired reads into separate files. Must be used with '--fastx'. If a single reads file is provided, this option
+      implies interleaved paired reads. When used with 'sout', four (4) output files for aligned reads will be generated:
+      'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. If 'other' option is also used,
+      eight (8) output files will be generated.
+  - name: "--sout"
+    type: boolean_true
+    description: |
+      Separate paired and singleton aligned reads. Must be used with '--fastx'. If a single reads file is provided,
+      this option implies interleaved paired reads. Cannot be used with '--paired_in' or '--paired_out'.
+  - name: "--zip_out"
+    type: string
+    description: |
+      Compress the output files. The possible values are:
+      * '1/true/t/yes/y'
+      * '0/false/f/no/n'
+      * '-1' (the same format as input - default)
+      The values are not case sensitive.
+    choices: ['1', 'true', 't', 'yes', 'y', '0', 'false', 'f', 'no', 'n', '-1']
+    example: "-1"
+  - name: "--match"
+    type: integer
+    description: |
+      Smith-Waterman score for a match (positive integer). Default: '2'.
+    example: 2
+  - name: "--mismatch"
+    type: integer
+    description: |
+      Smith-Waterman penalty for a mismatch (negative integer). Default: '-3'.
+    example: -3
+  - name: "--gap_open"
+    type: integer
+    description: |
+      Smith-Waterman penalty for introducing a gap (positive integer). Default: '5'.
+    example: 5
+  - name: "--gap_ext"
+    type: integer
+    description: |
+      Smith-Waterman penalty for extending a gap (positive integer). Default: '2'.
+    example: 2
+  - name: "--N"
+    type: integer  # NOTE(review): `sortmerna -h` (help.txt) lists -N as BOOL - confirm the intended type
+    description: |
+      Smith-Waterman penalty for ambiguous letters (N’s) scored as --mismatch. Default: '-1'.
+    example: -1
+  - name: "--a"
+    type: integer
+    description: |
+      Number of threads to use. Default: '1'.
+    example: 1
+  - name: "--e"
+    type: double
+    description: |
+      E-value threshold. Default: '1'.
+    example: 1
+  - name: "--F"
+    type: boolean_true
+    description: Search only the forward strand.
+  - name: "--R"
+    type: boolean_true
+    description: Search only the reverse-complementary strand.
+  - name: "--num_alignment"  # NOTE(review): not listed in `sortmerna -h` (help.txt); overlaps --num_alignments - confirm
+    type: integer
+    description: |
+      Report first INT alignments per read reaching E-value (--num_alignments 0 signifies all alignments will be output).
+      Default: '-1'
+    example: -1
+  - name: "--best"  # NOTE(review): not listed in `sortmerna -h` (help.txt), which only has --no-best - confirm
+    type: integer
+    description: |
+      Report INT best alignments per read reaching E-value by searching --min_lis INT candidate alignments (--best 0
+      signifies all candidate alignments will be searched) Default: '1'.
+    example: 1
+  - name: "--verbose"
+    alternatives: ["-v"]
+    type: boolean_true
+    description: Verbose output.
+
+- name: "OTU picking options"
+  arguments:
+  - name: "--id"
+    type: double
+    description: |
+      %id similarity threshold (the alignment must still pass the E-value threshold). Default: '0.97'.
+    example: 0.97
+  - name: "--coverage"
+    type: double
+    description: |
+      %query coverage threshold (the alignment must still pass the E-value threshold). Default: '0.97'.
+    example: 0.97
+  - name: "--de_novo"
+    type: boolean_true
+    description: |
+      FASTA/FASTQ file for reads matching database < %id off (set using --id) and < %cov (set using --coverage)
+      (alignment must still pass the E-value threshold).
+  - name: "--otu_map"
+    type: boolean_true
+    description: |
+      Output OTU map (input to QIIME’s make_otu_table.py).
+
+- name: "Advanced options"
+  arguments:
+  - name: "--num_seed"
+    type: integer
+    description: |
+      Number of seeds matched before searching for candidate LIS. Default: '2'.
+    example: 2
+  - name: "--passes"
+    type: integer
+    multiple: true
+    description: |
+      Three intervals at which to place the seed on the read L,L/2,3 (L is the seed length set in ./indexdb_rna).
+  - name: "--edge"
+    type: string
+    description: |
+      The number (or percentage if followed by %) of nucleotides to add to each edge of the alignment region on the
+      reference sequence before performing Smith-Waterman alignment. Default: '4'.
+    example: "4"  # quoted because the argument type is string
+  - name: "--full_search"
+    type: boolean_true
+    description: |
+      Search for all 0-error and 1-error seed off matches in the index rather than stopping after finding a 0-error match
+      (<1% gain in sensitivity with up four-fold decrease in speed).
+
+- name: "Indexing Options"
+  arguments:
+  - name: "--index"
+    type: integer
+    description: |
+      Create index files for the reference database. By default when this option is not used, the program checks the
+      reference index and builds it if not already existing.
+      This can be changed by using '-index' as follows:
+      * '-index 0' - skip indexing. If the index does not exist, the program will terminate
+        and warn to build the index prior performing the alignment
+      * '-index 1' - only perform the indexing and terminate
+      * '-index 2' - the default behaviour, the same as when not using this option at all
+    example: 2
+    choices: [0, 1, 2]
+  - name: "-L"
+    type: double
+    description: |
+      Indexing seed length. Default: '18'
+    example: 18
+  - name: "--interval"
+    type: integer
+    description: |
+      Index every Nth L-mer in the reference database. Default: '1'
+    example: 1
+  - name: "--max_pos"
+    type: integer
+    description: |
+      Maximum number of positions to store for each unique L-mer. Set to 0 to store all positions.
Default: '1000' + example: 1000 + + + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends gzip cmake g++ wget && \ + apt-get clean && \ + wget --no-check-certificate https://github.com/sortmerna/sortmerna/releases/download/v4.3.6/sortmerna-4.3.6-Linux.sh && \ + bash sortmerna-4.3.6-Linux.sh --skip-license +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/sortmerna/help.txt b/src/sortmerna/help.txt new file mode 100644 index 00000000..f0842707 --- /dev/null +++ b/src/sortmerna/help.txt @@ -0,0 +1,319 @@ +``` +sortmerna -h +``` + + + Program: SortMeRNA version 4.3.6 + Copyright: 2016-2020 Clarity Genomics BVBA: + Turnhoutseweg 30, 2340 Beerse, Belgium + 2014-2016 Knight Lab: + Department of Pediatrics, UCSD, La Jolla + 2012-2014 Bonsai Bioinformatics Research Group: + LIFL, University Lille 1, CNRS UMR 8022, INRIA Nord-Europe + Disclaimer: SortMeRNA comes with ABSOLUTELY NO WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU Lesser General Public License for more details. 
+ Contributors: Jenya Kopylova jenya.kopylov@gmail.com + Laurent Noé laurent.noe@lifl.fr + Pierre Pericard pierre.pericard@lifl.fr + Daniel McDonald wasade@gmail.com + Mikaël Salson mikael.salson@lifl.fr + Hélène Touzet helene.touzet@lifl.fr + Rob Knight robknight@ucsd.edu + + Usage: sortmerna -ref FILE [-ref FILE] -reads FWD_READS [-reads REV_READS] [OPTIONS]: + ------------------------------------------------------------------------------------------------------------- + | option type-format description default | + ------------------------------------------------------------------------------------------------------------- + + [REQUIRED] + --ref PATH Required Reference file (FASTA) absolute or relative path. + + Use mutliple times, once per a reference file + + + --reads PATH Required Raw reads file (FASTA/FASTQ/FASTA.GZ/FASTQ.GZ). + + Use twice for files with paired reads. + The file extensions are Not important. The program automatically + recognizes the file format as flat/compressed, fasta/fastq + + + + [COMMON] + --workdir PATH Optional Workspace directory USRDIR/sortmerna/run/ + + Default structure: WORKDIR/ + idx/ (References index) + kvdb/ (Key-value storage for alignments) + out/ (processing output) + readb/ (pre-processed reads/index) + + + --kvdb PATH Optional Directory for Key-value database WORKDIR/kvdb + + KVDB is used for storing the alignment results. + + + --idx-dir PATH Optional Directory for storing Reference index. WORKDIR/idx + + + --readb PATH Optional Storage for pre-processed reads WORKDIR/readb/ + + Directory storing the split reads, or the random access index of compressed reads + + + --fastx BOOL Optional Output aligned reads into FASTA/FASTQ file + --sam BOOL Optional Output SAM alignment for aligned reads. 
+ + + --SQ BOOL Optional Add SQ tags to the SAM file + + + --blast STR Optional output alignments in various Blast-like formats + + Sample values: '0' - pairwise + '1' - tabular (Blast - m 8 format) + '1 cigar' - tabular + column for CIGAR + '1 cigar qcov' - tabular + columns for CIGAR and query coverage + '1 cigar qcov qstrand' - tabular + columns for CIGAR, query coverage, + and strand + + + --aligned STR/BOOL Optional Aligned reads file prefix [dir/][pfx] WORKDIR/out/aligned + + Directory and file prefix for aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'aligned' is used + Examples: + '-aligned $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-aligned dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-aligned dir_1/' -> $PWD/aligned.fasta + '-aligned apfx' -> $PWD/apfx.fasta + '-aligned (no argument)' -> WORKDIR/out/aligned.fasta + + + --other STR/BOOL Optional Non-aligned reads file prefix [dir/][pfx] WORKDIR/out/other + + Directory and file prefix for non-aligned output i.e. each + output file goes into the specified directory with the given prefix. + The appropriate extension: (fasta|fastq|blast|sam|etc) is automatically added. + Must be used with 'fastx'. + Both 'dir' and 'pfx' are optional. + The 'dir' can be a relative or an absolute path. + If 'dir' is not specified, the output is created in the WORKDIR/out/ + If 'pfx' is not specified, the prefix 'other' is used + Examples: + '-other $MYDIR/dir_1/dir_2/1' -> $MYDIR/dir_1/dir_2/1.fasta + '-other dir_1/apfx' -> $PWD/dir_1/apfx.fasta + '-other dir_1/' -> $PWD/dir_1/other.fasta + '-other apfx' -> $PWD/apfx.fasta + '-other (no argument)' -> aligned_out/other.fasta + i.e. 
the same output directory + as used for aligned output + + + --num_alignments INT Optional Positive integer (INT >=0). + + If used with '-no-best' reports first INT alignments per read reaching + E-value threshold, which allows to lower the CPU time and memory use. + Otherwise outputs INT best alignments. + If INT = 0, all alignments are output + + + --no-best BOOL Optional Disable best alignments search False + + The 'best' alignment is the highest scoring alignment out of All alignments of a read, + and the read can potentially be aligned (reaching E-value threshold) to multiple reference + sequences. + By default the program searches for best alignments i.e. performs an exhaustive search + over all references. Using '-no-best' will make the program to search just + the first N alignments, where N is set using '-num_alignments' i.e. 1 by default. + + + --min_lis INT Optional Search only alignments that have the LIS 2 + of at least N seeds long + + LIS stands for Longest Increasing Subsequence. It is computed using seeds, which + are k-mers common to the read and the reference sequence. Sorted sequences of such seeds + are used to filter the candidate references prior performing the Smith-Waterman alignment. + + + --print_all_reads BOOL Optional Output null alignment strings for non-aligned reads False + to SAM and/or BLAST tabular files + + --paired BOOL Optional Flags paired reads False + + If a single reads file is provided, use this option to indicate + the file contains interleaved paired reads when neither + 'paired_in' | 'paired_out' | 'out2' | 'sout' are specified. + + + --paired_in BOOL Optional Flags the paired-end reads as Aligned, False + when either of them is Aligned. + + With this option both reads are output into Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_out'. + + + --paired_out BOOL Optional Flags the paired-end reads as Non-aligned, False + when either of them is non-aligned. 
+ + With this option both reads are output into Non-Aligned FASTA/Q file + Must be used with 'fastx'. + Mutually exclusive with 'paired_in'. + + + --out2 BOOL Optional Output paired reads into separate files. False + + Must be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + When used with 'sout', four (4) output files for aligned reads will be generated: + 'aligned-paired-fwd, aligned-paired-rev, aligned-singleton-fwd, aligned-singleton-rev'. + If 'other' option is also used, eight (8) output files will be generated. + + + --sout BOOL Optional Separate paired and singleton aligned reads. False + + To be used with 'fastx'. + If a single reads file is provided, this options implies interleaved paired reads + Cannot be used with 'paired_in' | 'paired_out' + + + --zip-out STR/BOOL Optional Controls the output compression '-1' + + By default the report files are produced in the same format as the input i.e. + if the reads files are compressed (gz), the output is also compressed. + The default behaviour can be overriden by using '-zip-out'. + The possible values: '1/true/t/yes/y' + '0/false/f/no/n' + '-1' (the same format as input - default) + The values are Not case sensitive i.e. 'Yes, YES, yEs, Y, y' are all OK + Examples: + '-reads freads.gz -zip-out n' : generate flat output when the input is compressed + '-reads freads.flat -zip-out' : compress the output when the input files are flat + + + --match INT Optional SW score (positive integer) for a match. 2 + + --mismatch INT Optional SW penalty (negative integer) for a mismatch. -3 + + --gap_open INT Optional SW penalty (positive integer) for introducing a gap. 5 + + --gap_ext INT Optional SW penalty (positive integer) for extending a gap. 2 + + -e DOUBLE Optional E-value threshold. 1 + + Defines the 'statistical significance' of a local alignment. + Exponentially correllates with the Minimal Alignment score. + Higher E-values (100, 1000, ...) 
cause More reads to Pass the alignment threshold + + + -F BOOL Optional Search only the forward strand. False + + -N BOOL Optional SW penalty for ambiguous letters (N's) scored + as --mismatch + + -R BOOL Optional Search only the reverse-complementary strand. False + + + [OTU_PICKING] + --id INT Optional %%id similarity threshold (the alignment 0.97 + must still pass the E-value threshold). + + --coverage INT Optional %%query coverage threshold (the alignment must 0.97 + still pass the E-value threshold) + + --de_novo_otu BOOL Optional Output FASTA file with 'de novo' reads False + + Read is 'de novo' if its alignment score passes E-value threshold, but both the identity + '-id', and the '-coverage' are below their corresponding thresholds + i.e. ID < %%id and COV < %%cov + + + --otu_map BOOL Optional Output OTU map (input to QIIME's make_otu_table.py). False + Cannot be used with 'no-best because + the grouping is done around the best alignment' + + + [ADVANCED] + --passes INT,INT,INT Optional Three intervals at which to place the seed on L,L/2,3 + the read (L is the seed length) + + --edges INT Optional Number (or percent if INT followed by %% sign) of 4 + nucleotides to add to each edge of the read + prior to SW local alignment + + --num_seeds BOOL Optional Number of seeds matched before searching 2 + for candidate LIS + + --full_search INT Optional Search for all 0-error and 1-error seed False + matches in the index rather than stopping + after finding a 0-error match (<1%% gain in + sensitivity with up four-fold decrease in speed) + + --pid BOOL Optional Add pid to output file names. False + + -a INT Optional DEPRECATED in favour of '-threads'. Number of numCores + processing threads to use. 
+ Automatically redirects to '-threads' + + --threads INT Optional Number of Processing threads to use 2 + + + [INDEXING] + --index INT Optional Build reference database index 2 + + By default when this option is not used, the program checks the reference index and + builds it if not already existing. + This can be changed by using '-index' as follows: + '-index 0' - skip indexing. If the index does not exist, the program will terminate + and warn to build the index prior performing the alignment + '-index 1' - only perform the indexing and terminate + '-index 2' - the default behaviour, the same as when not using this option at all + + + -L DOUBLE Optional Indexing: seed length. 18 + + -m DOUBLE Optional Indexing: the amount of memory (in Mbytes) for 3072 + building the index. + + -v BOOL Optional Produce verbose output when building the index True + + --interval INT Optional Indexing: Positive integer: index every Nth L-mer in 1 + the reference database e.g. '-interval 2'. + + --max_pos INT Optional Indexing: maximum (integer) number of positions to 1000 + store for each unique L-mer. + If 0 - all positions are stored. + + + [HELP] + -h BOOL Optional Print help information + + --version BOOL Optional Print SortMeRNA version number + + + [DEVELOPER] + --dbg_put_db BOOL Optional + --cmd BOOL Optional Launch an interactive session (command prompt) False + + --task INT Optional Processing Task 4 + + Possible values: 0 - align. Only perform alignment + 1 - post-processing (log writing) + 2 - generate reports + 3 - align and post-process + 4 - all + + + --dbg-level INT Optional Debug level 0 + + Controls verbosity of the execution trace. Default value of 0 corresponds to + the least verbose output. + The highest value currently is 2. 
diff --git a/src/sortmerna/script.sh b/src/sortmerna/script.sh
new file mode 100755
index 00000000..8dda3d60
--- /dev/null
+++ b/src/sortmerna/script.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+set -eo pipefail
+
+# Flags that arrive as the literal string "false" are unset, so that the
+# ${par_x:+--flag} expansions further down only emit flags that were enabled.
+unset_if_false=(
+  par_fastx par_sq par_print_all_reads par_paired_in par_paired_out
+  par_F par_R par_verbose par_de_novo par_otu_map par_full_search par_out2
+  par_sout par_sam par_paired
+)
+
+for var in "${unset_if_false[@]}"; do
+  if [ "${!var}" == "false" ]; then
+    unset "$var"
+  fi
+done
+
+# Multiple input files arrive as one ';'-separated string. Arguments are
+# collected in arrays so that paths containing spaces stay intact.
+IFS=";" read -ra input <<< "$par_input"
+reads=()
+case "${#input[@]}" in
+  1)
+    reads=(--reads "${input[0]}")
+    # A single file can hold interleaved pairs; honour an explicit --paired.
+    if [ -n "$par_paired" ]; then
+      reads+=(--paired)
+    fi
+    ;;
+  2)
+    # Two files are passed as two --reads options (forward and reverse).
+    reads=(--reads "${input[0]}" --reads "${input[1]}")
+    ;;
+  *)
+    echo "Expected one or two input read files, got ${#input[@]}" >&2
+    exit 1
+    ;;
+esac
+
+# References come from the manifest (one fasta path per line) when it is
+# given, otherwise from --ref. Any number of reference files is accepted;
+# sortmerna takes one --ref option per file, independent of read pairing.
+refs=()
+if [[ -n "$par_ribo_database_manifest" ]]; then
+  while IFS= read -r path || [[ -n "$path" ]]; do
+    # skip blank lines so that no empty --ref is emitted
+    if [[ -n "$path" ]]; then
+      refs+=(--ref "$path")
+    fi
+  done < "$par_ribo_database_manifest"
+elif [[ -n "$par_ref" ]]; then
+  IFS=";" read -ra ref <<< "$par_ref"
+  for path in "${ref[@]}"; do
+    refs+=(--ref "$path")
+  done
+fi
+
+if [[ "${#refs[@]}" -eq 0 ]]; then
+  echo "No reference fasta file(s) provided" >&2
+  exit 1
+fi
+
+# Option names follow `sortmerna -h` for 4.3.6 (see help.txt): --SQ,
+# --de_novo_otu, --num_seeds, --edges and -v. --passes expects INT,INT,INT,
+# so the ';' separator of the multi-value argument is turned into ','.
+# NOTE(review): --num_alignment and --best are not listed in help.txt; they
+# are passed through unchanged - confirm that sortmerna 4.3.6 accepts them.
+sortmerna \
+  "${refs[@]}" \
+  "${reads[@]}" \
+  --workdir . \
+  ${par_output:+--aligned "${par_output}"} \
+  ${par_fastx:+--fastx} \
+  ${par_other:+--other "${par_other}"} \
+  ${par_kvdb:+--kvdb "${par_kvdb}"} \
+  ${par_idx_dir:+--idx-dir "${par_idx_dir}"} \
+  ${par_readb:+--readb "${par_readb}"} \
+  ${par_sam:+--sam} \
+  ${par_sq:+--SQ} \
+  ${par_blast:+--blast "${par_blast}"} \
+  ${par_num_alignments:+--num_alignments "${par_num_alignments}"} \
+  ${par_min_lis:+--min_lis "${par_min_lis}"} \
+  ${par_print_all_reads:+--print_all_reads} \
+  ${par_paired_in:+--paired_in} \
+  ${par_paired_out:+--paired_out} \
+  ${par_out2:+--out2} \
+  ${par_sout:+--sout} \
+  ${par_zip_out:+--zip-out "${par_zip_out}"} \
+  ${par_match:+--match "${par_match}"} \
+  ${par_mismatch:+--mismatch "${par_mismatch}"} \
+  ${par_gap_open:+--gap_open "${par_gap_open}"} \
+  ${par_gap_ext:+--gap_ext "${par_gap_ext}"} \
+  ${par_N:+-N "${par_N}"} \
+  ${par_a:+-a "${par_a}"} \
+  ${par_e:+-e "${par_e}"} \
+  ${par_F:+-F} \
+  ${par_R:+-R} \
+  ${par_num_alignment:+--num_alignment "${par_num_alignment}"} \
+  ${par_best:+--best "${par_best}"} \
+  ${par_verbose:+-v} \
+  ${par_id:+--id "${par_id}"} \
+  ${par_coverage:+--coverage "${par_coverage}"} \
+  ${par_de_novo:+--de_novo_otu} \
+  ${par_otu_map:+--otu_map} \
+  ${par_num_seed:+--num_seeds "${par_num_seed}"} \
+  ${par_passes:+--passes "${par_passes//;/,}"} \
+  ${par_edge:+--edges "${par_edge}"} \
+  ${par_full_search:+--full_search} \
+  ${par_index:+--index "${par_index}"} \
+  ${par_L:+-L "${par_L}"} \
+  ${par_interval:+--interval "${par_interval}"} \
+  ${par_max_pos:+--max_pos "${par_max_pos}"}
+
+# The log is picked up from "<aligned prefix>.log"; without --output the
+# prefix falls back to sortmerna's default WORKDIR/out/aligned (WORKDIR is '.').
+# NOTE(review): default log location inferred from help.txt - confirm.
+if [ -n "$par_log" ]; then
+  mv "${par_output:-out/aligned}.log" "$par_log"
+fi
+
+exit 0
+
diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh
new file mode 100644
index 00000000..4d49c5ed
--- /dev/null
+++ b/src/sortmerna/test.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+echo ">>> Testing $meta_functionality_name"
+
+find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt
+
+echo ">>> Testing for paired-end reads"
+# out2 separates the read pairs into two files (one fwd and one rev)
+# paired_in outputs both reads of a pair
+# other is the output file for non-rRNA reads
+"$meta_executable" \
+  --output "rRNA_reads" \
+  --other "non_rRNA_reads" \
+  --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \
+  --ribo_database_manifest test_data/rrna-db.txt \
+  --log test_log.log \
+  --paired_in \
+  --fastx \
+  --out2
+
+
+echo ">> Checking if the correct files are present"
+[[ -f "rRNA_reads_fwd.fq.gz" ]] && [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; }
+[[ -s "rRNA_reads_fwd.fq.gz" ]] && [[ -s "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is empty!"; exit 1; }
+[[ -f "non_rRNA_reads_fwd.fq.gz" ]] && [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1;}
+gzip -dk non_rRNA_reads_fwd.fq.gz
+gzip -dk non_rRNA_reads_rev.fq.gz
+[[ ! -s "non_rRNA_reads_fwd.fq" ]] && [[ ! -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is not empty!"; exit 1;}
+
+rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log
+rm -rf kvdb/
+
+
+echo ">>> Testing for single-end reads"
+"$meta_executable" \
+  --aligned "rRNA_reads" \
+  --other "non_rRNA_reads" \
+  --input $meta_resources_dir/test_data/reads_1.fq.gz \
+  --ref $meta_resources_dir/test_data/rRNA/database1.fa \
+  --log test_log.log \
+  --fastx
+
+echo ">> Checking if the correct files are present"
+[[ !
-f "rRNA_reads.fq.gz" ]] && echo "rRNA output fastq file is missing!" && exit 1 +gzip -dk rRNA_reads.fq.gz +[[ -s "rRNA_reads.fq" ]] && echo "rRNA output fastq file is not empty!" && exit 1 +[[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1 +[[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1 + + +echo ">>> All tests passed" +exit 0 \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database1.fa b/src/sortmerna/test_data/rRNA/database1.fa new file mode 100644 index 00000000..bae23aba --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database1.fa @@ -0,0 +1,24 @@ +>AY846379.1.1791 Eukaryota;Archaeplastida;Chloroplastida;Chlorophyta;Chlorophyceae;Sphaeropleales;Monoraphidium;Monoraphidium sp. Itas 9/21 14-6w +CCUGGUUGAUCCUGCCAGUAGUCAUAUGCUUGUCUCAAAGAUUAAGCCAUGCAUGUCUAAGUAUAAACUGCUUAUACUGU +GAAACUGCGAAUGGCUCAUUAAAUCAGUUAUAGUUUAUUUGAUGGUACCUCUACACGGAUAACCGUAGUAAUUCUAGAGC +UAAUACGUGCGUAAAUCCCGACUUCUGGAAGGGACGUAUUUAUUAGAUAAAAGGCCGACCGAGCUUUGCUCGACCCGCGG +UGAAUCAUGAUAACUUCACGAAUCGCAUAGCCUUGUGCUGGCGAUGUUUCAUUCAAAUUUCUGCCCUAUCAACUUUCGAU +GGUAGGAUAGAGGCCUACCAUGGUGGUAACGGGUGACGGAGGAUUAGGGUUCGAUUCCGGAGAGGGAGCCUGAGAAACGG +CUACCACAUCCAAGGAAGGCAGCAGGCGCGCAAAUUACCCAAUCCUGAUACGGGGAGGUAGUGACAAUAAAUAACAAUGC +CGGGCAUUUCAUGUCUGGCAAUUGGAAUGAGUACAAUCUAAAUCCCUUAACGAGGAUCAAUUGGAGGGCAAGUCUGGUGC +CAGCAGCCGCGGUAAUUCCAGCUCCAAUAGCGUAUAUUUAAGUUGUUGCAGUUAAAAAGCUCGUAGUUGGAUUUCGGGUG +GGUUCCAGCGGUCCGCCUAUGGUGAGUACUGCUGUGGCCCUCCUUUUUGUCGGGGACGGGCUCCUGGGCUUCAUUGUCCG +GGACUCGGAGUCGACGAUGAUACUUUGAGUAAAUUAGAGUGUUCAAAGCAAGCCUACGCUCUGAAUACUUUAGCAUGGAA +UAUCGCGAUAGGACUCUGGCCUAUCUCGUUGGUCUGUAGGACCGGAGUAAUGAUUAAGAGGGACAGUCGGGGGCAUUCGU +AUUUCAUUGUCAGAGGUGAAAUUCUUGGAUUUAUGAAAGACGAACUACUGCGAAAGCAUUUGCCAAGGAUGUUUUCAUUA +AUCAAGAACGAAAGUUGGGGGCUCGAAGACGAUUAGAUACCGUCGUAGUCUCAACCAUAAACGAUGCCGACUAGGGAUUG +GAGGAUGUUCUUUUGAUGACUUCUCCAGCACCUUAUGAGAAAUCAAAGUUUUUGGGUUCCGGGGGGAGUAUGGUCGCAAG 
+GCUGAAACUUAAAGGAAUUGACGGAAGGGCACCACCAGGCGUGGAGCCUGCGGCUUAAUUUGACUCAACACGGGAAAACU +UACCAGGUCCAGACAUAGUGAGGAUUGACAGAUUGAGAGCUCUUUCUUGAUUCUAUGGGUGGUGGUGCAUGGCCGUUCUU +AGUUGGUGGGUUGCCUUGUCAGGUUGAUUCCGGUAACGAACGAGACCUCAGCCUGCUAAAUAUGUCACAUUCGCUUUUUG +CGGAUGGCCGACUUCUUAGAGGGACUAUUGGCGUUUAGUCAAUGGAAGUAUGAGGCAAUAACAGGUCUGUGAUGCCCUUA +GAUGUUCUGGGCCGCACGCGCGCUACACUGACGCAUUCAGCAAGCCUAUCCUUGACCGAGAGGUCUGGGUAAUCUUUGAA +ACUGCGUCGUGAUGGGGAUAGAUUAUUGCAAUUAUUAGUCUUCAACGAGGAAUGCCUAGUAAGCGCAAGUCAUCAGCUUG +CGUUGAUUACGUCCCUGCCCUUUGUACACACCGCCCGUCGCUCCUACCGAUUGGGUGUGCUGGUGAAGUGUUCGGAUUGG +CAGAGCGGGUGGCAACACUUGCUUUUGCCGAGAAGUUCAUUAAACCCUCCCACCUAGAGGAAGGAGAAGUCGUAACAAGG +UUUCCGUAGGUGAACCUGCAGAAG \ No newline at end of file diff --git a/src/sortmerna/test_data/rRNA/database2.fa b/src/sortmerna/test_data/rRNA/database2.fa new file mode 100644 index 00000000..87b5bc99 --- /dev/null +++ b/src/sortmerna/test_data/rRNA/database2.fa @@ -0,0 +1,16 @@ +>AB001445.1.1538 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;Pseudomonas amygdali pv. 
morsprunorum +AGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGCAGCACGGGUACUUGUAC +CUGGUGGCGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUA +AUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGCCUAGGUCGGAUUAGCUAG +UUGGUGAGGUAAUGGCUCACCAAGGCGACGAUCCGUAACUGGUCUGAGAGGAUGAUCAGUCACACUGGAACUGAGACACG +GUCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGCGAAAGCCUGAUCCAGCCAUGCCGCGUGUGUGA +AGAAGGUCUUCGGAUUGUAAAGCACUUUAAGUUGGGAGGAAGGGCAGUUACCUAAUACGUAUCUGUUUUGACGUUACCGA +CAGAAUAAGCACCGGCUAACUCUGUGCCAGCAGCCGCGGUAAUACAGAGGGUGCAAGCGUUAAUCGGAAUUACUGGGCGU +AAAGCGCGCGUAGGUGGUUUGUUAAGUUGAAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCCAAAACUGGCAAGC +UAGAGUAUGGUAGAGGGUGGUGGAAUUUCCUGUGUAGCGGUGAAAUGCGUAGAUAUAGGAAGGAACACCAGUGGCGAAGG +CGACCACCUGGACUGAUACUGACACUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCC +GUAAACGAUGUCAACUAGCCGUUGGGAGCCUUGAGCUCUUAGUGGCGCAGCUAACGCAUUAAGUUGACCGCCUGGGGAGU +ACGGCCGCAAGGUUAAAACUCAAAUGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUCGAAGCAACG +CGAAGAACCUUACCAGGCCUUGACAUCCAAUGAAUCCUUUAGAGAUAGAGGAGUGCCUUCGGGAGCAUUGAGACAGGUGC +UGCAUGGCUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGUAACGAGCGCAACCCUUGUCCUUAGUUACCAG +CACGUCAUGGUGGGCACUCUAAGGAGACUGCCGGUGACAAACCGGAGGAAGGUGGGGAUGACGUCAAGUCAUCAUGGCCC diff --git a/src/sortmerna/test_data/reads_1.fq.gz b/src/sortmerna/test_data/reads_1.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..41c02a22dbbae13db84acf1e79bc4fc3fa8589e6 GIT binary patch literal 189 zcmV;u07CyCiwFo$iqvKR19D|yWOH9JE@p86wU0dx!Y~Yl_nZQWu>(o}P^}JqwIX+b zPO-%OPl6LsC<}stmpJjW<4E7M&Ycg1S#Apj3L$tq`=RbA_@+MuTFFyl z78alq=EMofi84dLb}3jyAYujED%nAymVqrZpNFl>Dky^%D&v_(cRIcb rrwC*NyuH|rwZ}M?)J Date: Sun, 1 Sep 2024 21:21:55 +0200 Subject: [PATCH 4/5] Update changelog, shorten description --- CHANGELOG.md | 3 +++ src/sortmerna/config.vsh.yaml | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e9f40fc..16d9120f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ 
-131,6 +131,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `sortmerna`: Local sequence alignment tool for mapping, clustering, and filtering rRNA from metatranscriptomic + data. (PR #146) + ## MINOR CHANGES * Uniformize component metadata (PR #23). diff --git a/src/sortmerna/config.vsh.yaml b/src/sortmerna/config.vsh.yaml index 23925132..6477660f 100644 --- a/src/sortmerna/config.vsh.yaml +++ b/src/sortmerna/config.vsh.yaml @@ -1,9 +1,7 @@ name: sortmerna description: | Local sequence alignment tool for filtering, mapping and clustering. The main - application of SortMeRNA is filtering rRNA from metatranscriptomic data. SortMeRNA - takes as input files of reads (fasta, fastq, fasta.gz, fastq.gz) and one or multiple - rRNA database file(s), and sorts apart aligned and rejected reads into two files. + application of SortMeRNA is filtering rRNA from metatranscriptomic data. keywords: [sort, mRNA, rRNA, alignment, filtering, mapping, clustering] links: homepage: https://sortmerna.readthedocs.io/en/latest/ From c6a747f16dbe23819e76a3b32ceb23fb7e932288 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 8 Sep 2024 17:28:14 +0200 Subject: [PATCH 5/5] Add more test scenarios --- src/sortmerna/test.sh | 53 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/src/sortmerna/test.sh b/src/sortmerna/test.sh index 4d49c5ed..390b9307 100644 --- a/src/sortmerna/test.sh +++ b/src/sortmerna/test.sh @@ -4,7 +4,7 @@ echo ">>> Testing $meta_functionality_name" find $meta_resources_dir/test_data/rRNA -type f > test_data/rrna-db.txt -echo ">>> Testing for paired-end reads" +echo ">>> Testing for paired-end reads and database manifest" # out2 separates the read pairs into two files (one fwd and one rev) # paired_in outputs both reads of a pair # other is the output file for non-rRNA reads @@ -30,8 +30,33 @@ gzip -dk non_rRNA_reads_rev.fq.gz rm -f 
rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log
 rm -rf kvdb/

+################################################################################
+echo ">>> Testing for paired-end reads and --ref and --paired_out arguments"
+"$meta_executable" \
+  --output "rRNA_reads" \
+  --other "non_rRNA_reads" \
+  --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \
+  --ref "$meta_resources_dir/test_data/rRNA/database1.fa;$meta_resources_dir/test_data/rRNA/database2.fa" \
+  --log test_log.log \
+  --paired_out \
+  --fastx \
+  --out2
+
+echo ">> Checking if the correct files are present"
+# Both mates of each output must exist; `gzip -dkf` overwrites *.fq copies left by the previous scenario.
+[[ -f "rRNA_reads_fwd.fq.gz" ]] && [[ -f "rRNA_reads_rev.fq.gz" ]] || { echo "rRNA output fastq file is missing!"; exit 1; }
+gzip -dkf rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz
+[[ ! -s "rRNA_reads_fwd.fq" ]] && [[ ! -s "rRNA_reads_rev.fq" ]] || { echo "rRNA output fastq file is not empty!"; exit 1; }
+[[ -f "non_rRNA_reads_fwd.fq.gz" ]] && [[ -f "non_rRNA_reads_rev.fq.gz" ]] || { echo "Non-rRNA output fastq file is missing!"; exit 1; }
+gzip -dkf non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz
+[[ -s "non_rRNA_reads_fwd.fq" ]] && [[ -s "non_rRNA_reads_rev.fq" ]] || { echo "Non-rRNA output fastq file is empty!"; exit 1; }
+
+rm -f rRNA_reads_fwd.fq.gz rRNA_reads_rev.fq.gz non_rRNA_reads_fwd.fq.gz non_rRNA_reads_rev.fq.gz test_log.log
+rm -rf kvdb/
+
+################################################################################

-echo ">>> Testing for single-end reads"
+echo ">>> Testing for single-end reads and --ref argument"
 "$meta_executable" \
   --aligned "rRNA_reads" \
   --other "non_rRNA_reads" \
@@ -47,6 +72,30 @@ gzip -dk rRNA_reads.fq.gz
 [[ ! -f "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is missing!" && exit 1
 [[ ! -s "non_rRNA_reads.fq.gz" ]] && echo "Non-rRNA output fastq file is empty!" && exit 1


+# Also delete the *.fq copies left by earlier `gzip -dk` calls so the checks below only see new output.
+rm -f rRNA_reads*.fq* non_rRNA_reads*.fq* test_log.log
+rm -rf kvdb/
+
+################################################################################
+
+echo ">>> Testing for paired-end reads with singleton output files"
+"$meta_executable" \
+  --aligned "rRNA_reads" \
+  --other "non_rRNA_reads" \
+  --input "$meta_resources_dir/test_data/reads_1.fq.gz;$meta_resources_dir/test_data/reads_2.fq.gz" \
+  --ribo_database_manifest test_data/rrna-db.txt \
+  --log test_log.log \
+  --fastx \
+  --sout
+
+echo ">> Checking if the correct files are present"
+# Only the *_paired / *_singleton outputs are asserted: this run does not pass --out2 (the fwd/rev split).
+[[ ! -f "rRNA_reads_paired.fq.gz" ]] && echo "Aligned paired fwd output fastq file is missing!" && exit 1
+[[ ! -f "rRNA_reads_singleton.fq.gz" ]] && echo "Aligned singleton fwd output fastq file is missing!" && exit 1
+[[ ! -f "non_rRNA_reads_singleton.fq.gz" ]] && echo "Non-rRNA singleton output fastq file is missing!" && exit 1
+[[ ! -f "non_rRNA_reads_paired.fq.gz" ]] && echo "Non-rRNA paired output fastq file is missing!" && exit 1
+
+
 echo ">>> All tests passed"
 exit 0
\ No newline at end of file