From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:04:14 +0100 Subject: [PATCH 1/6] initial commit dedup --- CHANGELOG.md | 3 + src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 5 files changed, 409 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fd7f001..1bef9345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). +* `umi_tools`: + - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. + This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. + By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. + - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:38:57 +0100 Subject: [PATCH 2/6] Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. --- CHANGELOG.md | 3 - src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------ src/umi_tools/umi_tools_dedup/help.txt | 13 - src/umi_tools/umi_tools_dedup/script.sh | 65 ---- src/umi_tools/umi_tools_dedup/test.sh | 49 --- 5 files changed, 409 deletions(-) delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. - required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. - - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. - - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. - - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" -exit 0 \ No newline at end of file From 1177dc1a82ad21513e0367ee4af7852059c0fd3f Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sat, 7 Sep 2024 17:04:27 +0200 Subject: [PATCH 3/6] test data, complete config, help, changelog update --- CHANGELOG.md | 3 + src/kallisto/kallisto_index/config.vsh.yaml | 89 ++++++++++++++++++ src/kallisto/kallisto_index/help.txt | 21 +++++ src/kallisto/kallisto_index/k_index | Bin 0 -> 4036 bytes src/kallisto/kallisto_index/script.sh | 25 +++++ src/kallisto/kallisto_index/test.sh | 23 +++++ .../kallisto_index/test_data/d_list.fasta | 5 + .../test_data/transcriptome.fasta | 23 +++++ 8 files changed, 189 insertions(+) create mode 100644 src/kallisto/kallisto_index/config.vsh.yaml create mode 100644 src/kallisto/kallisto_index/help.txt create mode 100644 src/kallisto/kallisto_index/k_index create mode 100644 src/kallisto/kallisto_index/script.sh create mode 100644 src/kallisto/kallisto_index/test.sh create mode 100644 src/kallisto/kallisto_index/test_data/d_list.fasta create mode 100644 src/kallisto/kallisto_index/test_data/transcriptome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index 98e78c17..1bec0a04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -140,6 +140,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `kallisto`: + - `kallisto_index`: Create a kallisto index (PR #149). + ## MINOR CHANGES * Uniformize component metadata (PR #23). diff --git a/src/kallisto/kallisto_index/config.vsh.yaml b/src/kallisto/kallisto_index/config.vsh.yaml new file mode 100644 index 00000000..3ae6241f --- /dev/null +++ b/src/kallisto/kallisto_index/config.vsh.yaml @@ -0,0 +1,89 @@ +name: kallisto_index +namespace: kallisto +description: | + Build a Kallisto index for the transcriptome to use Kallisto in the mapping-based mode. +keywords: [kallisto, index] +links: + homepage: https://pachterlab.github.io/kallisto/about + documentation: https://pachterlab.github.io/kallisto/manual + repository: https://github.com/pachterlab/kallisto + issue_tracker: https://github.com/pachterlab/kallisto/issues +references: + doi: https://doi.org/10.1038/nbt.3519 +license: BSD 2-Clause License + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + description: | + Path to a FASTA-file containing the transcriptome sequences, either in plain text or + compressed (.gz) format. + required: true + - name: "--d_list" + type: file + description: | + Path to a FASTA-file containing sequences to mask from quantification. + +- name: "Output" + arguments: + - name: "--kallisto_index" + type: file + direction: output + must_exist: false + example: Kallisto_index + +- name: "Options" + arguments: + - name: "--kmer_size" + type: integer + description: | + Kmer length passed to indexing step of pseudoaligners (default: '31'). + example: 31 + - name: "--make_unique" + type: boolean_true + description: | + Replace repeated target names with unique names. + - name: "--aa" + type: boolean_true + description: | + Generate index from a FASTA-file containing amino acid sequences. + - name: "--distiguish" + type: boolean_true + description: | + Generate index where sequences are distinguished by the sequence names. + - name: "--min_size" + alternatives: ["-m"] + type: integer + description: | + Length of minimizers (default: automatically chosen). + - name: "--ec_max_size" + alternatives: ["-e"] + type: integer + description: | + Maximum number of targets in an equivalence class (default: no maximum). + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: docker + run: | + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + wget --no-check-certificate https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ + tar -xzf kallisto_linux-v0.50.1.tar.gz && \ + mv kallisto/kallisto /usr/local/bin/ +runners: + - type: executable + - type: nextflow diff --git a/src/kallisto/kallisto_index/help.txt b/src/kallisto/kallisto_index/help.txt new file mode 100644 index 00000000..28778ac0 --- /dev/null +++ b/src/kallisto/kallisto_index/help.txt @@ -0,0 +1,21 @@ +``` +kallisto index +``` +kallisto 0.50.1 +Builds a kallisto index + +Usage: kallisto index [arguments] FASTA-files + +Required argument: +-i, --index=STRING Filename for the kallisto index to be constructed + +Optional argument: +-k, --kmer-size=INT k-mer (odd) length (default: 31, max value: 31) +-t, --threads=INT Number of threads to use (default: 1) +-d, --d-list=STRING Path to a FASTA-file containing sequences to mask from quantification + --make-unique Replace repeated target names with unique names + --aa Generate index from a FASTA-file containing amino acid sequences + --distinguish Generate index where sequences are distinguished by the sequence name +-T, --tmp=STRING Temporary directory (default: tmp) +-m, --min-size=INT Length of minimizers (default: automatically chosen) +-e, --ec-max-size=INT Maximum number of targets in an equivalence class (default: no maximum) diff --git a/src/kallisto/kallisto_index/k_index b/src/kallisto/kallisto_index/k_index new file mode 100644 index 0000000000000000000000000000000000000000..cbb91fd2dafbbdbb7bc1a7a942bf09d8ce3c4ad6 GIT binary patch literal 4036 zcmeHK-;W#B6}}#io$Iv;vt}E1)3m+*k!V{Fwn1qKLM}62lMF1Xyo+f-qRmn*C{n1D z_6b$E@=UzcS<;whvz5(Nfh)mX|(E)$fjH zDBk=~{(y{T=H5B?obP_;JI7=DJ~BGa1xBL}!xP72zxWo$2XXBhnaL~esI}v^Ii$vb znk;Ox?q+kom*|> zAV}ES#!`;Um9RH6)Jl{g!YTt!7t$Gg0u6HpG)z!~<*aQ80-UfR!x^MOR24_`wDx7& zU?AMFus#6)zSTV09(G%UK6EhCb_+#7$uf3}K_cW~TZz7fZ8P?&oHAI-90IruoK_rp z^f#Mu=x%0gtBFLzW*p7{??4$|&jbovZp+k6bFEAmQB@!#V^lJeG8*M}8kWA<7*Wod zkU98AjRuUJO;{L;nBvU>=b=;IA(zTbcUGY&(D-R{%Mg5Wm>7+e zgTp(>N+45e$T?GJxOLOoY4C;6{T4ElQz%tSMi)R(6sZh7Hk-WotGupVYa5n6ieWHR z0`T2c2Kv|>!tf48e;ExV4FuFIXEYE?A-Or6Z>Q1I)NJ&v?XH>buQnhmYQ~eWGUYZz z&753=VCX^UH01fUE<-A4fXbBD8t%u494v9Q2hMi*I<8o~?>?0GntpLlzsdWjkINI?w7r71BU+*_S| zQ<@YW_5!6@yB#`A+PzLl4{idwvwkptj2VmeMfqDIYymY@_Oy{isp`9&Af3c zIOd?t;L)qEHxk!}ukqvgWp+;5I;@avi5{7YTp)p<->5$?oM0cTTc^9bHv;)fe?4>R z;OW}WDhGX{;NQ4>L8(Z}gA-L!*u8}bopnvlPUr+(yBf)IBb;_a(?2G_;?0wwH2?YGy? zOdpc#!qUN-n92U+i-1CuXEzq~FE071?j__QA9ryo?tJDBj{#EG2CqQ`&fa`HfWZ219kX`R$IPjx9bdkc!VwDq~r$ z6H?*DdLN{05aoZ*_&*WvC;JQPGucCr9rI$G$HT`SQpeU0&yB4=iuL<}$&bgg4p+ZJ H*iZfk8k`iG literal 0 HcmV?d00001 diff --git a/src/kallisto/kallisto_index/script.sh b/src/kallisto/kallisto_index/script.sh new file mode 100644 index 00000000..b16e2781 --- /dev/null +++ b/src/kallisto/kallisto_index/script.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +if [ -n "$par_kmer_size" ]; then + if [[ "$par_kmer_size" -lt 1 || "$par_kmer_size" -gt 31 || $(( par_kmer_size % 2 )) -eq 0 ]]; then + echo "Error: Kmer size must be an odd number between 1 and 31." + exit 1 + fi +fi + +kallisto index \ + -i "${par_kallisto_index}" \ + ${par_kmer_size:+--kmer-size $par_kmer_size} \ + ${par_make_unique:+--make-unique} \ + ${par_aa:+--aa} \ + ${par_distinguish:+--distinguish} \ + ${par_min_size:+--min-size $par_min_size} \ + ${par_ec_max_size:+--ec-max-size $par_ec_max_size} \ + ${par_d_list:+--d-list "${par_d_list}"} \ + "${par_input}" + diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh new file mode 100644 index 00000000..67869b99 --- /dev/null +++ b/src/kallisto/kallisto_index/test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +echo ">>>Test1: Testing $meta_functionality_name" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --kallisto_index Kallisto \ + --kmer_size 21 \ + --make_unique + +echo ">>> Checking whether output exists" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +echo ">>>Test2: Testing $meta_functionality_name" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/transcriptome.fasta" \ + --kallisto_index Kallisto \ + --d_list "$meta_resources_dir/test_data/d_list.fasta" + +echo "All tests succeeded!" +exit 0 diff --git a/src/kallisto/kallisto_index/test_data/d_list.fasta b/src/kallisto/kallisto_index/test_data/d_list.fasta new file mode 100644 index 00000000..ad5e05bf --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/d_list.fasta @@ -0,0 +1,5 @@ +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG diff --git a/src/kallisto/kallisto_index/test_data/transcriptome.fasta b/src/kallisto/kallisto_index/test_data/transcriptome.fasta new file mode 100644 index 00000000..94c06163 --- /dev/null +++ b/src/kallisto/kallisto_index/test_data/transcriptome.fasta @@ -0,0 +1,23 @@ +>YAL069W CDS=1-315 +ATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTC +ACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTC +AGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACG +GCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATAT +CTATATCTCATTCGGCGGTCCCAAATATTGTATAA +>YAL068W-A CDS=1-255 +ATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATT +TTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACT +TTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAA +TCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAA +>YAL068C CDS=1-363 +ATGGTCAAATTAACTTCAATCGCCGCTGGTGTCGCTGCCATCGCTGCTACTGCTTCTGCAACCACCACTC +TAGCTCAATCTGACGAAAGAGTCAACTTGGTGGAATTGGGTGTCTACGTCTCTGATATCAGAGCTCACTT +AGCCCAATACTACATGTTCCAAGCCGCCCACCCAACTGAAACCTACCCAGTCGAAGTTGCTGAAGCCGTT +TTCAACTACGGTGACTTCACCACCATGTTGACCGGTATTGCTCCAGACCAAGTGACCAGAATGATCACCG +GTGTTCCATGGTACTCCAGCAGATTAAAGCCAGCCATCTCCAGTGCTCTATCCAAGGACGGTATCTACAC +TATCGCAAACTAG +>YAL067W-A CDS=1-228 +ATGCCAATTATAGGGGTGCCGAGGTGCCTTATAAAACCCTTTTCTGTGCCTGTGACATTTCCTTTTTCGG +TCAAAAAGAATATCCGAATTTTAGATTTGGACCCTCGTACAGAAGCTTATTGTCTAAGCCTGAATTCAGT +CTGCTTTAAACGGCTTCCGCGGAGGAAATATTTCCATCTCTTGAATTCGTACAACATTAAACGTGTGTTG +GGAGTCGTATACTGTTAG \ No newline at end of file From 41960e3b4eba025794e2097b2eefa70f14edbf56 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 8 Sep 2024 16:17:27 +0200 Subject: [PATCH 4/6] check test output contents --- src/kallisto/kallisto_index/test.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh index 67869b99..40ed0aab 100644 --- a/src/kallisto/kallisto_index/test.sh +++ b/src/kallisto/kallisto_index/test.sh @@ -1,6 +1,6 @@ #!/bin/bash -echo ">>>Test1: Testing $meta_functionality_name" +echo ">>>Test1: Testing $meta_functionality_name with make_unique argument" "$meta_executable" \ --input "$meta_resources_dir/test_data/transcriptome.fasta" \ @@ -8,16 +8,27 @@ echo ">>>Test1: Testing $meta_functionality_name" --kmer_size 21 \ --make_unique + echo ">>> Checking whether output exists" [ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 [ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 -echo ">>>Test2: Testing $meta_functionality_name" +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 2,978" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + +echo ">>>Test2: Testing $meta_functionality_name with d_list argument" "$meta_executable" \ --input "$meta_resources_dir/test_data/transcriptome.fasta" \ --kallisto_index Kallisto \ --d_list "$meta_resources_dir/test_data/d_list.fasta" +echo ">>> Checking whether output exists" +[ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 +[ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 + +kallisto inspect Kallisto 2> test.txt +grep "number of k-mers: 3,056" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } + echo "All tests succeeded!" exit 0 From f75d6db97e78792d3d9e7fd3299654abf6327050 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 8 Sep 2024 17:38:34 +0200 Subject: [PATCH 5/6] remove extra files and clean up test script --- src/kallisto/kallisto_index/k_index | Bin 4036 -> 0 bytes src/kallisto/kallisto_index/test.sh | 11 ++++++----- 2 files changed, 6 insertions(+), 5 deletions(-) delete mode 100644 src/kallisto/kallisto_index/k_index diff --git a/src/kallisto/kallisto_index/k_index b/src/kallisto/kallisto_index/k_index deleted file mode 100644 index cbb91fd2dafbbdbb7bc1a7a942bf09d8ce3c4ad6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4036 zcmeHK-;W#B6}}#io$Iv;vt}E1)3m+*k!V{Fwn1qKLM}62lMF1Xyo+f-qRmn*C{n1D z_6b$E@=UzcS<;whvz5(Nfh)mX|(E)$fjH zDBk=~{(y{T=H5B?obP_;JI7=DJ~BGa1xBL}!xP72zxWo$2XXBhnaL~esI}v^Ii$vb znk;Ox?q+kom*|> zAV}ES#!`;Um9RH6)Jl{g!YTt!7t$Gg0u6HpG)z!~<*aQ80-UfR!x^MOR24_`wDx7& zU?AMFus#6)zSTV09(G%UK6EhCb_+#7$uf3}K_cW~TZz7fZ8P?&oHAI-90IruoK_rp z^f#Mu=x%0gtBFLzW*p7{??4$|&jbovZp+k6bFEAmQB@!#V^lJeG8*M}8kWA<7*Wod zkU98AjRuUJO;{L;nBvU>=b=;IA(zTbcUGY&(D-R{%Mg5Wm>7+e zgTp(>N+45e$T?GJxOLOoY4C;6{T4ElQz%tSMi)R(6sZh7Hk-WotGupVYa5n6ieWHR z0`T2c2Kv|>!tf48e;ExV4FuFIXEYE?A-Or6Z>Q1I)NJ&v?XH>buQnhmYQ~eWGUYZz z&753=VCX^UH01fUE<-A4fXbBD8t%u494v9Q2hMi*I<8o~?>?0GntpLlzsdWjkINI?w7r71BU+*_S| zQ<@YW_5!6@yB#`A+PzLl4{idwvwkptj2VmeMfqDIYymY@_Oy{isp`9&Af3c zIOd?t;L)qEHxk!}ukqvgWp+;5I;@avi5{7YTp)p<->5$?oM0cTTc^9bHv;)fe?4>R z;OW}WDhGX{;NQ4>L8(Z}gA-L!*u8}bopnvlPUr+(yBf)IBb;_a(?2G_;?0wwH2?YGy? zOdpc#!qUN-n92U+i-1CuXEzq~FE071?j__QA9ryo?tJDBj{#EG2CqQ`&fa`HfWZ219kX`R$IPjx9bdkc!VwDq~r$ z6H?*DdLN{05aoZ*_&*WvC;JQPGucCr9rI$G$HT`SQpeU0&yB4=iuL<}$&bgg4p+ZJ H*iZfk8k`iG diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh index 40ed0aab..bd8ace10 100644 --- a/src/kallisto/kallisto_index/test.sh +++ b/src/kallisto/kallisto_index/test.sh @@ -1,21 +1,22 @@ #!/bin/bash -echo ">>>Test1: Testing $meta_functionality_name with make_unique argument" +echo ">>>Test1: Testing $meta_functionality_name with non-default k-mer size" "$meta_executable" \ --input "$meta_resources_dir/test_data/transcriptome.fasta" \ --kallisto_index Kallisto \ - --kmer_size 21 \ - --make_unique + --kmer_size 21 -echo ">>> Checking whether output exists" +echo ">>> Checking whether output exists and is correct" [ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 [ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 kallisto inspect Kallisto 2> test.txt grep "number of k-mers: 2,978" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } +################################################################################ + echo ">>>Test2: Testing $meta_functionality_name with d_list argument" "$meta_executable" \ @@ -23,7 +24,7 @@ echo ">>>Test2: Testing $meta_functionality_name with d_list argument" --kallisto_index Kallisto \ --d_list "$meta_resources_dir/test_data/d_list.fasta" -echo ">>> Checking whether output exists" +echo ">>> Checking whether output exists and is correct" [ ! -f "Kallisto" ] && echo "Kallisto index does not exist!" && exit 1 [ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 From ece9dbba3f4faa1b3bee4e58ce877eb759f111f6 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 11 Sep 2024 20:26:45 +0200 Subject: [PATCH 6/6] unset bool arguments, add missing arguments to script --- src/kallisto/kallisto_index/Kallisto | Bin 0 -> 2439 bytes src/kallisto/kallisto_index/config.vsh.yaml | 9 +++++++-- src/kallisto/kallisto_index/script.sh | 17 +++++++++++++---- src/kallisto/kallisto_index/test.sh | 12 ++++++------ 4 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 src/kallisto/kallisto_index/Kallisto diff --git a/src/kallisto/kallisto_index/Kallisto b/src/kallisto/kallisto_index/Kallisto new file mode 100644 index 0000000000000000000000000000000000000000..3c7b5b2bff962965d99ca3f9a4a6b6af6da1f3f0 GIT binary patch literal 2439 zcmeHJTTBx{6rFCj(iW7(R4PH@f{iwcTCEBOksY_VO)N&jMkQb@MkU0z5b%pZJItq>6y3i5QJC02jb;7mKSNlWm{Po|xmnK|d)x%cj5cE^Hf zo5Ms=gP>q-=Dx`Y&8Tam%b*(*sAYc)aw*sH`%Qmb#irI!wJd#g^%N{jJ942Y*B;7v zR8myiWp1c$UHLL&=A`AC^o$*g(yPKxE9&k2TL(+#tPc&Aci!%O@N%%KUe}u^mgT!# zi%Y`WO!oXjbAkQw#f0xZP+kZo&F8eXH{r?boldZcY~7r&DBNCpW#D;g<%>%V>88S* zn(fP&_poKg&FeY4ul8ovgjcuEYhCLuEry9tsbgo; zfcbNEMVB@;wf0@jF3-`vqIs3Vln>G5z~*-Q*|O7R4?+L_8<&^;{;|Iv=yw^0x?1Fq zKKH{t-pk+KZ7B%++|bB;H`mVIyPQk@FmSG}@6usnuL2HvP5$1gzh1RfdlA=oM8O*p9pU%UxMoXtZa4MiI;e6lN9)Zj9SsczoL4F-nxW7gjS;lL#ITXLZTe7Tm{6y<^@7*BFcmyOu`fe-N>GMM_MNm8n3s15~-A-du zs>S(M$mE>7gVQsBMnsW@M&}f>b(D#qkcNO}MG=t0Wgt?+7*11VW2Q|{QbnyveDt-4O(8?_>}HQ+|nsHK}JD> zEI}_&#snCR!wU`=ImWmY2!IuUp@YzBuGIbjA?Q;}NCbS6Ej{38{Q(Ed(9~7CHlh~@ z(zu?LfHTHk>o~Hk>ib8~11>`F@%qmr>8X$)4UE=ZAnP=qIJp|ns6M_j(fMdSqjeZP zKmX@E#Gf*HzUVyzWeLinBtr;M7o$H(mPA>GqA1d9hM{`wuLpH{uWG16io*!3&Oq!~ zY>JwOK3Z%+$HT0KEnpYTsK*f41@5@T5O@KlCdndB3}+_eA<9%le@sX@NTS*o#e2qq z{mZk6y~(I%5^AViqJ%~m(IWP&+TTT!n9y(~NA!$eA9;u!LWn;?@K-_t>bR9cmu5}*J8 literal 0 HcmV?d00001 diff --git a/src/kallisto/kallisto_index/config.vsh.yaml b/src/kallisto/kallisto_index/config.vsh.yaml index 3ae6241f..2c4f65c7 100644 --- a/src/kallisto/kallisto_index/config.vsh.yaml +++ b/src/kallisto/kallisto_index/config.vsh.yaml @@ -28,10 +28,9 @@ argument_groups: - name: "Output" arguments: - - name: "--kallisto_index" + - name: "--index" type: file direction: output - must_exist: false example: Kallisto_index - name: "Options" @@ -63,6 +62,12 @@ argument_groups: type: integer description: | Maximum number of targets in an equivalence class (default: no maximum). + - name: "--tmp" + alternatives: ["-T"] + type: string + description: | + Path to a directory for temporary files. + example: "tmp" resources: - type: bash_script diff --git a/src/kallisto/kallisto_index/script.sh b/src/kallisto/kallisto_index/script.sh index b16e2781..59a5d3de 100644 --- a/src/kallisto/kallisto_index/script.sh +++ b/src/kallisto/kallisto_index/script.sh @@ -5,6 +5,13 @@ set -eo pipefail +unset_if_false=( par_make_unique par_aa par_distinguish ) + +for var in "${unset_if_false[@]}"; do + temp_var="${!var}" + [[ "$temp_var" == "false" ]] && unset $var +done + if [ -n "$par_kmer_size" ]; then if [[ "$par_kmer_size" -lt 1 || "$par_kmer_size" -gt 31 || $(( par_kmer_size % 2 )) -eq 0 ]]; then echo "Error: Kmer size must be an odd number between 1 and 31." @@ -13,13 +20,15 @@ if [ -n "$par_kmer_size" ]; then fi kallisto index \ - -i "${par_kallisto_index}" \ - ${par_kmer_size:+--kmer-size $par_kmer_size} \ + -i "${par_index}" \ + ${par_kmer_size:+--kmer-size "${par_kmer_size}"} \ ${par_make_unique:+--make-unique} \ ${par_aa:+--aa} \ ${par_distinguish:+--distinguish} \ - ${par_min_size:+--min-size $par_min_size} \ - ${par_ec_max_size:+--ec-max-size $par_ec_max_size} \ + ${par_min_size:+--min-size "${par_min_size}"} \ + ${par_ec_max_size:+--ec-max-size "${par_ec_max_size}"} \ ${par_d_list:+--d-list "${par_d_list}"} \ + ${meta_cpus:+--cpu "${meta_cpus}"} \ + ${par_tmp:+--tmp "${par_tmp}"} \ "${par_input}" diff --git a/src/kallisto/kallisto_index/test.sh b/src/kallisto/kallisto_index/test.sh index bd8ace10..2646dcd8 100644 --- a/src/kallisto/kallisto_index/test.sh +++ b/src/kallisto/kallisto_index/test.sh @@ -1,10 +1,10 @@ #!/bin/bash -echo ">>>Test1: Testing $meta_functionality_name with non-default k-mer size" +echo ">>>Test 1: Testing $meta_functionality_name with non-default k-mer size" "$meta_executable" \ --input "$meta_resources_dir/test_data/transcriptome.fasta" \ - --kallisto_index Kallisto \ + --index Kallisto \ --kmer_size 21 @@ -13,15 +13,15 @@ echo ">>> Checking whether output exists and is correct" [ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 kallisto inspect Kallisto 2> test.txt -grep "number of k-mers: 2,978" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } +grep "number of k-mers: 989" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } ################################################################################ -echo ">>>Test2: Testing $meta_functionality_name with d_list argument" +echo ">>>Test 2: Testing $meta_functionality_name with d_list argument" "$meta_executable" \ --input "$meta_resources_dir/test_data/transcriptome.fasta" \ - --kallisto_index Kallisto \ + --index Kallisto \ --d_list "$meta_resources_dir/test_data/d_list.fasta" echo ">>> Checking whether output exists and is correct" @@ -29,7 +29,7 @@ echo ">>> Checking whether output exists and is correct" [ ! -s "Kallisto" ] && echo "Kallisto index is empty!" && exit 1 kallisto inspect Kallisto 2> test.txt -grep "number of k-mers: 3,056" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } +grep "number of k-mers: 959" test.txt || { echo "The content of the index seems to be incorrect." && exit 1; } echo "All tests succeeded!" exit 0