From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:04:14 +0100 Subject: [PATCH 1/5] initial commit dedup --- CHANGELOG.md | 3 + src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 5 files changed, 409 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fd7f001..1bef9345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). +* `umi_tools`: + - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. + This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. + By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. + - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:38:57 +0100 Subject: [PATCH 2/5] Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. --- CHANGELOG.md | 3 - src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------ src/umi_tools/umi_tools_dedup/help.txt | 13 - src/umi_tools/umi_tools_dedup/script.sh | 65 ---- src/umi_tools/umi_tools_dedup/test.sh | 49 --- 5 files changed, 409 deletions(-) delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. - required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. - - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. - - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. - - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" -exit 0 \ No newline at end of file From 8ad212d9315e8124b97bdecdac4924d4c4fd033a Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 29 Sep 2024 16:02:04 +0200 Subject: [PATCH 3/5] full component with two tests --- CHANGELOG.md | 3 + .../rseqc_inner_distance/config.vsh.yaml | 109 ++++++++++++++++++ src/rseqc/rseqc_inner_distance/help.txt | 43 +++++++ src/rseqc/rseqc_inner_distance/script.sh | 25 ++++ src/rseqc/rseqc_inner_distance/test.sh | 77 +++++++++++++ .../rseqc_inner_distance/test_data/test.bed12 | 4 + .../test_data/test.paired_end.sorted.bam | Bin 0 -> 10205 bytes .../test_data/test1.inner_distance.txt | 49 ++++++++ .../test_data/test1.inner_distance_freq.txt | 100 ++++++++++++++++ .../test_data/test2.inner_distance.txt | 4 + .../test_data/test2.inner_distance_freq.txt | 100 ++++++++++++++++ 11 files changed, 514 insertions(+) create mode 100644 src/rseqc/rseqc_inner_distance/config.vsh.yaml create mode 100644 src/rseqc/rseqc_inner_distance/help.txt create mode 100644 src/rseqc/rseqc_inner_distance/script.sh create mode 100644 src/rseqc/rseqc_inner_distance/test.sh create mode 100644 src/rseqc/rseqc_inner_distance/test_data/test.bed12 create mode 100644 src/rseqc/rseqc_inner_distance/test_data/test.paired_end.sorted.bam create mode 100644 src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt create mode 100644 src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt create mode 100644 src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt create mode 100644 src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 0613fa25..204bec22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). +* `rseqc`: + - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/rseqc/rseqc_inner_distance/config.vsh.yaml b/src/rseqc/rseqc_inner_distance/config.vsh.yaml new file mode 100644 index 00000000..1b698517 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/config.vsh.yaml @@ -0,0 +1,109 @@ +name: "rseqc_inner_distance" +namespace: "rseqc" +description: | + Calculate inner distance between read pairs. +links: + homepage: https://rseqc.sourceforge.net/ + documentation: https://rseqc.sourceforge.net/#inner-distance-py + issue_tracker: https://github.com/MonashBioinformaticsPlatform/RSeQC/issues + repository: https://github.com/MonashBioinformaticsPlatform/RSeQC +references: + doi: 10.1093/bioinformatics/bts356 +license: GPL-3.0 +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + required: true + description: input alignment file in BAM or SAM format + + - name: "--refgene" + type: file + required: true + description: Reference gene model in bed format + + - name: "--sample_size" + type: integer + example: 200000 + description: Numer of reads sampled from SAM/BAM file, default = 200000. + + - name: "--map_qual" + type: integer + example: 30 + description: Minimum mapping quality (phred scaled) to determine uniquely mapped reads, default=30. + + - name: "--lower_bound_size" + type: integer + example: -250 + description: Lower bound of inner distance (bp). This option is used for ploting histograme, default=-250. + + - name: "--upper_bound_size" + type: integer + example: 250 + description: Upper bound of inner distance (bp). This option is used for ploting histograme, default=250. + + - name: "--step_size" + type: integer + example: 5 + description: Step size (bp) of histograme. This option is used for plotting histogram, default=5. + +- name: "Output" + arguments: + - name: "--output_prefix" + alternatives: ["-o"] + type: string + required: true + description: Rrefix of output files. + + - name: "--output_stats" + type: file + direction: output + description: output file (txt) with summary statistics of inner distances of paired reads + + - name: "--output_dist" + type: file + direction: output + description: output file (txt) with inner distances of all paired reads + + - name: "--output_freq" + type: file + direction: output + description: output file (txt) with frequencies of inner distances of all paired reads + + - name: "--output_plot" + type: file + direction: output + description: output file (pdf) with histogram plot of of inner distances of all paired reads + + - name: "--output_plot_r" + type: file + direction: output + description: output file (R) with script of histogram plot of of inner distances of all paired reads + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: [python3-pip, r-base] + - type: python + packages: [ RSeQC ] + - type: docker + run: | + echo "RSeQC - inner_distance.py: $(inner_distance.py --version | cut -d' ' -f2)" > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/help.txt b/src/rseqc/rseqc_inner_distance/help.txt new file mode 100644 index 00000000..18f97bb6 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/help.txt @@ -0,0 +1,43 @@ +``` +inner_distance.py --help +``` + +Usage: inner_distance.py [options] + +Calculate the inner distance (insert size) of RNA-seq fragments. + + RNA fragment + _________________||_________________ +| | +| | +||||||||||------------------|||||||||| + read_1 insert_size read_2 + +fragment size = read_1 + insert_size + read_2 + + + +Options: + --version show program's version number and exit + -h, --help show this help message and exit + -i INPUT_FILE, --input-file=INPUT_FILE + Alignment file in BAM or SAM format. + -o OUTPUT_PREFIX, --out-prefix=OUTPUT_PREFIX + Prefix of output files(s) + -r REF_GENE, --refgene=REF_GENE + Reference gene model in BED format. + -k SAMPLESIZE, --sample-size=SAMPLESIZE + Number of read-pairs used to estimate inner distance. + default=1000000 + -l LOWER_BOUND_SIZE, --lower-bound=LOWER_BOUND_SIZE + Lower bound of inner distance (bp). This option is + used for ploting histograme. default=-250 + -u UPPER_BOUND_SIZE, --upper-bound=UPPER_BOUND_SIZE + Upper bound of inner distance (bp). This option is + used for plotting histogram. default=250 + -s STEP_SIZE, --step=STEP_SIZE + Step size (bp) of histograme. This option is used for + plotting histogram. default=5 + -q MAP_QUAL, --mapq=MAP_QUAL + Minimum mapping quality (phred scaled) for an + alignment to be called "uniquely mapped". default=30 \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/script.sh b/src/rseqc/rseqc_inner_distance/script.sh new file mode 100644 index 00000000..655f5b63 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/script.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -exo pipefail + + +inner_distance.py \ + -i $par_input \ + -r $par_refgene \ + -o $par_output_prefix \ + ${par_sample_size:+-k "${par_sample_size}"} \ + ${par_lower_bound_size:+-l "${par_lower_bound_size}"} \ + ${par_upper_bound_size:+-u "${par_upper_bound_size}"} \ + ${par_step_size:+-s "${par_step_size}"} \ + ${par_map_qual:+-q "${par_map_qual}"} \ +> stdout.txt + +if [[ -n $par_output_stats ]]; then head -n 2 stdout.txt > $par_output_stats; fi + + +[[ -n "$par_output_dist" && -f "$par_output_prefix.inner_distance.txt" ]] && mv $par_output_prefix.inner_distance.txt $par_output_dist +[[ -n "$par_output_plot" && -f "$par_output_prefix.inner_distance_plot.pdf" ]] && mv $par_output_prefix.inner_distance_plot.pdf $par_output_plot +[[ -n "$par_output_plot_r" && -f "$par_output_prefix.inner_distance_plot.r" ]] && mv $par_output_prefix.inner_distance_plot.r $par_output_plot_r +[[ -n "$par_output_freq" && -f "$par_output_prefix.inner_distance_freq.txt" ]] && mv $par_output_prefix.inner_distance_freq.txt $par_output_freq + +exit 0 \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/test.sh b/src/rseqc/rseqc_inner_distance/test.sh new file mode 100644 index 00000000..49405696 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test.sh @@ -0,0 +1,77 @@ +#!/bin/bash + + +# define input and output for script +input_bam="$meta_resources_dir/test_data/test.paired_end.sorted.bam" +input_bed="$meta_resources_dir/test_data/test.bed12" + +output_stats="inner_distance_stats.txt" +output_dist="inner_distance.txt" +output_plot="inner_distance_plot.pdf" +output_plot_r="inner_distance_plot.r" +output_freq="inner_distance_freq.txt" + +# Run executable +echo "> Running $meta_functionality_name" + +"$meta_executable" \ + --input $input_bam \ + --refgene $input_bed \ + --output_prefix "test" \ + --output_stats $output_stats \ + --output_dist $output_dist \ + --output_plot $output_plot \ + --output_plot_r $output_plot_r \ + --output_freq $output_freq + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Check whether output is present and not empty" + +[[ -f "$output_stats" ]] || { echo "$output_stats was not created"; exit 1; } +[[ -s "$output_stats" ]] || { echo "$output_stats is empty"; exit 1; } +[[ -f "$output_dist" ]] || { echo "$output_dist was not created"; exit 1; } +[[ -s "$output_dist" ]] || { echo "$output_dist is empty"; exit 1; } +[[ -f "$output_plot" ]] || { echo "$output_plot was not created"; exit 1; } +[[ -s "$output_plot" ]] || { echo "$output_plot is empty"; exit 1; } +[[ -f "$output_plot_r" ]] || { echo "$output_plot_r was not created"; exit 1; } +[[ -s "$output_plot_r" ]] || { echo "$output_plot_r is empty"; exit 1; } +[[ -f "$output_freq" ]] || { echo "$output_freq was created"; exit 1; } +[[ -s "$output_freq" ]] || { echo "$output_freq is empty"; exit 1; } + +echo ">> Check whether output is correct" +diff "$output_freq" "$meta_resources_dir/test_data/test1.inner_distance_freq.txt" || { echo "Output is not correct"; exit 1; } +diff "$output_dist" "$meta_resources_dir/test_data/test1.inner_distance.txt" || { echo "Output is not correct"; exit 1; } + +# clean up +rm "$output_stats" "$output_dist" "$output_plot" "$output_plot_r" "$output_freq" +################################################################################ + +echo "> Running $meta_functionality_name with non-default parameters and default output file names" +"$meta_executable" \ + --input $input_bam \ + --refgene $input_bed \ + --output_prefix "test" \ + --sample_size 4 \ + --map_qual 10 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Check whether output is present and not empty" + +[[ -f "test.inner_distance.txt" ]] || { echo "test.inner_distance.txt was not created"; exit 1; } +[[ -s "test.inner_distance.txt" ]] || { echo "test.inner_distance.txt is empty"; exit 1; } +[[ -f "test.inner_distance_plot.pdf" ]] || { echo "test.inner_distance_plot.pdf was not created"; exit 1; } +[[ -s "test.inner_distance_plot.pdf" ]] || { echo "test.inner_distance_plot.pdf is empty"; exit 1; } +[[ -f "test.inner_distance_plot.r" ]] || { echo "test.inner_distance_plot.r was not created"; exit 1; } +[[ -s "test.inner_distance_plot.r" ]] || { echo "test.inner_distance_plot.r is empty"; exit 1; } +[[ -f "test.inner_distance_freq.txt" ]] || { echo "test.inner_distance_freq.txt was created"; exit 1; } +[[ -s "test.inner_distance_freq.txt" ]] || { echo "test.inner_distance_freq.txt is empty"; exit 1; } + +echo ">> Check whether output is correct" +diff "test.inner_distance_freq.txt" "$meta_resources_dir/test_data/test2.inner_distance_freq.txt" || { echo "Output is not correct"; exit 1; } +diff "test.inner_distance.txt" "$meta_resources_dir/test_data/test2.inner_distance.txt" || { echo "Output is not correct"; exit 1; } + +exit 0 \ No newline at end of file diff --git a/src/rseqc/rseqc_inner_distance/test_data/test.bed12 b/src/rseqc/rseqc_inner_distance/test_data/test.bed12 new file mode 100644 index 00000000..33a46951 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test.bed12 @@ -0,0 +1,4 @@ +MT192765.1 1242 1264 nCoV-2019_5_LEFT 1 + 1242 1264 0 2 10,12, 0,10, +MT192765.1 1573 1595 nCoV-2019_6_LEFT 2 + 1573 1595 0 2 7,15, 0,7, +MT192765.1 1623 1651 nCoV-2019_5_RIGHT 1 - 1623 1651 0 2 14,14, 0,14, +MT192765.1 1942 1964 nCoV-2019_6_RIGHT 2 - 1942 1964 0 2 11,11 0,11, diff --git a/src/rseqc/rseqc_inner_distance/test_data/test.paired_end.sorted.bam b/src/rseqc/rseqc_inner_distance/test_data/test.paired_end.sorted.bam new file mode 100644 index 0000000000000000000000000000000000000000..8b215e12d1a932f1619cf7ded7e172141f45d479 GIT binary patch literal 10205 zcmV<3CnDG%iwFb&00000{{{d;LjnMD0fmy^PQox0hpTtRm)Hxe<5ULHD+VM;vd!s) z+ok&hE@3OK4I1x#Sl_~Iz@|9l?zE@j$SUWjh*u-d8%zMC+9d0b3kPX^@EAz)ObM}( zWuT_^euV=9Rjy-S+oj2yru(5*gZU;Wl4qw>0;k-%ZsST(C`}g)cFWTuiT89-s3ayK z&sy7Ii=3X56WHr%w89S7C{uiWCPTDz#3XP$C+s z&xA^#5*26?p+YG#h&GCa0JXTSQ0t}~%fLyPigm5zveTlGwPB6vOwqPl)Fl^PDnzFmW7?u=+p4INMy0$-)57F=9+z<$ zj=X;qXEcs88GXz1(p?wF^uS&k)5Y#FouXlSM%TxgWP8Ja_}gFtl=xiw;0cI48;D`;qu&$*o!P{OAn(J>})}&a>wPIMv0kU znTujcaDK7*Y7oV{M{t%~ft%0vaDK4oCh>J^DvD!no(m~vB={>UU8_P$ooZo{l*vxj ztOd1`RN|FHl{_PkIBBHE_K1lr`0pl8Gk27t=a~b@tD-!B+>%p#$?;^oD5Q>u*0l1=Df!87MZROa<}*Jgt59hbtQz0Hxqa4UYr!q2mj(Jvz z*;AR5H~#{h=iAn9q+m6u=~ae|RR41DHDD0HzTz1TYbhu|zi& zZwr8^mFXBT?S$eYWy%O9(grv~?ZJ%jMjGCk2V@N9$-&{lo;4OT@QNvT$vW^-Q@0W?tazQ^M|;I1&4jf|lG?x~7^NFk@viE0 zr#jK(|ef+b7OeZl9RUdB+*Z@tl8SCXbh$^Vxs)bs-T1C&!#$ zzh=%ikLUacBM}KsK^+aE^d~br=ENqatJ@mhRI8F&NL`Y?CNHZZuM6VR=)(Fj5d4ML zqt>$~E52}=!g@UZ?_OA?v7pD!Jd^XiL|bTqtMA5S-g?I`2G-s_n#{Xj?sdTS3kK`- zem|K6WAC2@ZimMwd%Mx6Z;zr{a+VtSHZZ_dsuG==ve265RB71?1)tGC##05;Y?xt8 z$fo9+o0L~=lPWkpLYupISYld@*f%M<#H2_RBm`w@+Kx zVCPn*!}ZSkTQlWvo;6Q#7Q3i2cH@Ba8rzza@qrnh_l)p-VCzS>2JGg+3Ss=Ysb2DU zl5b?Vfv_qG*%+nw>YzGdR$8S|yR>b%W5IIV}=8Nvgdr6mOM{JW>Yvk{uLMTO-MW zA;o3Qs?zEJ!QHUf&Ny=cf{V^<27u=5 z95^pW4vfr#5d1{)|0;=$YkjpfR!E82$Gsz% zf=nh_rX^fyQ^BX!kV8l*UJGlA6jsxvS8l97m~TEc{e|;Pm_6ql=w~jbz;5+$KHS6k zRP>f;7<{{uPa=A~IJ2cJ*# zeJ07HSE6otqFdF$n}^~D*F=H*FoWhtykBsMs0Q^1sT8?q9sDm1T_qV6BwCIX0F2?4 z8zyzZSBTGkjk`^GpSsM+#JWeByC7s5Afnf9-MUzEIQUy%W6R_ujK4MYw>+Li&K`{l z{u3wCagMBTs(7Iam{n20%Wq*xInsNYC3K|#;vLwGXrXH0yCPf!N!bicn0^WdMzxG} zVD_OvS+2|uUgq%RoT0gjop(>z8PNP2cQ2<&Z%U(RcgD`+)03TJmnKEUOSCDz8MygE zlmgCH!+0xrB@`%#FjXU0B^Byk8YLrmFyO9Am_*L1LnfIVTwx43V>dpvG91}hYlirK z0M2Dj$+7J(bh$mC$v2{zu0)fNo99Uh;V;a%IiLxz6|hOk5R9g^F;&Y{K_XmrGBrjb zm)8=uHNu3B>S+nW1kMVUAUJ5?#2AS)h*c{UJajLB&M}C6u_5jwgK`4F^0N!ykTcCu z;O1sDU$ipo{Oxp^n-46a=;^6ra=3#!(2|-3!A`X z6ELrC4ZsCz{=;Yh`2ML)_h9DKD{x+@5O0wkm)Z(HB?Azy!o^N;6CmZxN2D~p#Yif% zFyme<<1ln_c+Y*Sf;C@)vD`W{@rrY;hwb32;N^6_y0jT9_6x->5{Sl0iLos)b5z(vE?(2=H{5ih{|yOYuUfQlZ=Jvp5;w z2utEZIZHPe4tmo~@onDM1{@>v(FMTrGjlBOUt@dj@8$dV7R&N|?>$lUzh_t;>>Tbn zH}i%;$@mSjJs}*+CuCef%*@x}dTp6cHK+S)0Or>(X2?TO{9F&_2YWY@$a*l7Z%0gW zGfRzggX8*Q^UGp~C`op3;bE;>)SOjSUdcm6&2^`QR;Y!qeUqGTuaxsE_bj&>eHdf^b0+7< z`^P)Zj<@Vw!0{Py$c$0$K52_CP4JU~BKx;5${TgWtA{sOqNO$ufcf;wa_5iy_t!nN zLeB5)iTUAs200JJB!JTyF&&shOc-w}8`nH#jh0jzO43e>isNTnD8m}wnhtPP30!yh zo~VPsTNf>`q#8blmPHm3g%cQ{Y>?JopHYj<$lneb=n=KnqjP!V6^QxtCStyDLd>su z+iY zoR3VWH)=Gv`jN@(_M8eE&+cE(nR3zDg$B-SawoFYcc}kP8fNOKP>gWH`z@=WNz9fpC zpKQv;~sMrEO5_K1ZNJ*v3G(~)bOfaFAGQkt1mt<9>NZl#XwWytHV2Mi+ zw-+4wJII;l-l=znIPMFY-f}x2MwPcRL`$*Hr`ead@?9Qk%AwYr^_Sb4C#mSnXFRM3 z=e9kEzOqHWh<)=J>2)Ca$Mf`DJ}cmjjeWjVZ4I~)mN$WM{~#Lhy=lfchh>AaPu`AK zbTeE~ypv6b0F#`3s(sPHXxR+gnpYo*eC*`Wk(g)?i|b)Uj!ZY9;cSb&5i>2FYr}kS=1-;ao6dl2W0n zTp?r^sEn#s3x*1GDl4iarsFX4f`Yi@2K58+stfJ+aawLEK{dLL;QX^q&U`j;=7R(J zMJU`{6gVP3@6GRfNJE9$j!s`^O6$TMy-8IyRvC$rDWaZ)m$IRtB9j>~0?&v^uH{@JDq{oRuaeaLbC$6`5ZfA3$8qMx2} z^B9HfBbR@#IW%BFrD&6C(IV;;9aJGh!H4=FZ~}E9+keAFZBUve8v__oLV_wqEu_Os zYSFS;|5ARw$*-wcj6$Y7Jea+C?&=u6w&A`9{TkjGInM11W{g91h>#1^43bU=|JIb6 z9!-+;1PS=M(0~uICW1^P*=s7CbtrL2l!%ZHL53Xwrzs)L3VN+Sm-BPC$SAsJ%FdJH zozp!R9tnoXQn983?M$Uv3;Qimzo>-P)L|hd^o8to`!boj zaod=;GWj^+7WZddf-3e6ER;#1$F^6d^!-7U)uPpB)Zo62eKS{P9YnZKnR5$i0f8fB9-N)rx=PuY(s+SS=f&1~x#mY=wYzLA z6Xb5qll=aM_|pCy)|&zAT?Fel-}LLN8o}?{`XdkF{~L+^Px`SEqW^bhY&|(VJvxfM zgLrq2pJ_W()tfnfkLSJ<*e;>!|64yhc0;)f-rcD<^wL=x`o=Nm;oFNb#h;obf3Cg{ zGyFf*7s3p09fc|H3$}vv*gVBcy!rOjR(Rd!n#cw@Vja8{juH(jhN#@Y&QgjEg&I>S zfszhL!4Sx`90sQCYU&kFD^W>f1Q=C6?!pvm7Q_7}lJq43*9U9kirD>gv59wzX0_O_K%{X`G?T%i|YS+{+icbYonGRg7bzxQ^A>=NPh>T2M((xs-!l zDdie7U0Z(^5>(6G5_xs)>Yq#%_lriTem(dWUzf|C?=3*}-%SF(*#@vilJQ3usP z2em1b_o?NO!_sMnaH2Da9AE+A)9Q*N9Ft87fF%lC2zjYR0%3-(sG^`D!pV8s=kmOq zx0OtzsnO3-)Vdg6KJtg%sorSP+T>7#9@hMjD*MDH$Ct0HR}pY?$Cm(pQQ0Rp>9RW}x-8g+ zH@tM&Hjozh^wc(Zi6=PMBZPDT_*z5QTZU+?MI}I2m1w|XDR>DdSiquy-pttL+}2z0 zerV=Gq(vT%Ebo`=SuE``TJNJxSm};94iY|i&;$EP4J_6G83=n;U(|Zr5PLj7e*Ylr z@9=nj{(h(Z$9Nv^pYHEO(f5(8!_h_k4`~8anqe}fDH4izX>Fe1$Q39-%Su{sx1-Ru zk{xtqx)d~A?ugGA&j)AC{T6WS&GS_9GiO#yu0HcequGaYLU6ALr4P&m)*(Xf6*C=b zadk>*JeG8V;u6Epu2Q+yfi; zi+(2CPYgQ9J(}%Iv20BP&whWC?Yn1Xo8W%}n0L<#;xzW5*^if$(L!t+4SfX0}p;f?&hEHn1{4J2aQxt+O0L6I{=3<>~6Et_G z)TFVu%of9r&}?oXntqcSJ42gK!_@3-?P42e$a0VEBKG_Wg`0fbr@F}U!19sd>njRl zVPATb_cthTQvgjfPEN>KdiUlLnsVzU9?dWI+hYtGHRo#W%<=t-!->jLv) zK|+4P(<3>aFC!$3-aa>k&z;%PfAzTuLf#OL5qi&@Z#M`b`^Q>mYC(}CJqlru@4(1> ziZ--T#rd>;I9^~mIu_$ofpiNu99TpMy>tyiV@vwnD?LK(%Le`SL66W^Fm~V8j6+99 zdxxjdj*XPYLRD$RpbZk(2(zgGs5{z`q%`nCTLbuz(2ypDPgQV~4BMve3{N^LDww=C ztgFeeGBgT61Q&C<^u}9}I3F8?Jodtw^?Akqdp@oTy49u2`Rx1F)CcbC{rZ3UvOye! zU;inL{j0e+9_^usZyoz}=*UDea!ym)wyG5x9!f1L%6w5jKJ}o(WTTpDBobq?c|5;p z41d`p{GM!7-f_=eOl>{Qy2y>r33$h0fdt9Ijahx!%nG4Y-bWA)~rD(ERu& z)4RR0diALu%^`><#{R@y6%OpYZVY9YYP=lU^zl9y5d>|j@kfMN5ufL?%kpt!Pw=(# z@?zuw?hWs?MY}EgM#D5%VY@w;Wr9 zO%C$BuQhr9#%9>Yojs1Q@fsaFxdHjvr>1zGqa2v76!))=2pYtlRKP2i2-=JLMRjDt z8KV?ArM?sqN9C(VL+ z@1Do;u$hFv4O2@$v=@1Letc8u|F%i#KLmk4{lH+?Lvi)J82j8@d7hjcpB%dCYMXY3 zbzO?^H=!Ctswyr}W}(tbgHX9e%uG$EaEUrFk(wxI88*CZozPwg2(6JvK+f4m%x?Eb zS69irW%fPLJrU!wH(C@I0=&#M_{4rIqwr-z6NhEiztml3TiI>Y92UFxZPplF#4fc= zVE)d^a@FsB>^B}-u`cwko|^A{V374dO#=Akb80#;-D0uzprsBp?Lh-`Ycg7>Xw}rQ zXBSAI{+}jxmpg}!FXmZljJCYW@{uaFQ+AkT6Y*T1lJUx7sfB<3ib>e=JDV&~ywVbh z2bcY%`!V)U=G$5B@9iJ%J4?jUMlh)W3Qr3#6DgdofX9TivZyItBkPW~9!Ik5AIP3*jNB|9Ho!Ol-TxRhGl z-96kr-gl+j3-PakW`NE#Re_utijZ2c7MLRQ%o|yqU5J|xp3TBDBJ04*ETl3<{zJ(8 z4{tE|W_B}@RVzv!ShF&8sCInn!DYXbFur5X&H>Lg_`tbGL{7Woaa2NTc>A=gQ3~w< zjG3ZxDD46Ql^`U(r`sXMbS23?kL#o~8*Sh?UH(2#C$7E|SvjiWja{qx)$!yeoUuaZ zfi)Xy4SuKDS?1@XyHWJkS-`$`cyhSoHn43;S5`>pssaW+5wuYeqbhMLRFN{J7_DrA zbJPNIT8~Fdb11Jt8y!_i2kqO!C;?e@ku6oGj{MmUq&u^s=iKd#Ipp5GxeM=``LpyK ze&<~q{m#25fPUA5_t_rM$D*Gdjs5Ig;7?BX4)>y*_7dTV5DL0bm0HuvCx#-ML9SfX zWc3qXBm(HBP^dgr@C2?iG@m21;xq{L+IfcTGiPs4_?gM<51eZ=p8Y?bi|2LB{t`O< z{6`Qcntirr_BV}ZAH3w9-DNNNZ5aFL?A{|JKG|{02~Z<6DeawrDw#}BcjXPFifhGS zJd>stFqN_u37#hkCe)-I>;as-N>s>=!M!UQL@g{~R@eLB!aA*PGp~@dcd3NM!VSf4 z3%1y~$9}IgdjWNF`9AqLZ1U;%c5=I9QLB$#@0t0kjFlamU$7ft+PD?w>?gXuTT&$=3sE)&Qw2kXmYSLz7>t-YK8d z{Nm~5tE<9*@?&$IB-iL9*&EGAc9)Y21n*DIq|!rajpp^BCo7Ezr9^mwu_W(vUwucb zo8hQ4bME2(g&d>c#Y_3eHtG1=RvOJmcbAGXV4&lnusdZg$4&2S>yYzIb^r5(QUU+ zA@fZNW60h;r{Il+Yzjy|nasLl^mz<%c^+H5ijaY2_X4tat%L0531p!|^h1<&Qwb)xfWO$biq`OH*o!lKzsor#OU+vD$OZ|Id*|F` zTz&B~;OLbCS=X(NE6rX!$w38A`oE3f44(8i_LeiG`v*IR2+GKlzA;GBz8&j5=@lZE zuXwZ?+Hu-%bZs`fY_!drKfugixtu5cjTQEU;C<6vm>rbXp4<=^Wz(hgcG$?xmU%6# z@pz)kk49Uh(|=Sh=P&+#PbpRLagl5Y3NB>iR*>bRw@Qq(o^Uo-+~ z1o%COhM_Ha!;y3Dq-k_p^AeXzTvfC%;ygtW3eh%{XZ83f{nLxw@+7^T*#zNg^RZ;4 z)*r!>-rBlc_Wn)#%c`@(*uR;{`oZqr$^L2h^qcEK5%tarnfw%a)%@H$?gT^@YpVd?qNi*2p-SGES^0zE@J6CwQLGjaE5?%0h8S&4F7 zf>Z=j_*Ben8zetbJnl5o`lW%hjXX25TAUnoZnS1d={E|=3m9`%Ak0HNt@sreA%_qy zvpc=JB4lXxDtq3>Wx2Z8HbFES0ePNdECE5#Kf0-#>CTniOxp*8;}`_}VT}F3oR_pG zjdR42XDDRbw1yUxCP)$z{mM?4Ey^CCo6^vs(2yL(<+4l)p=FnLDUuID@RAmgmyN;I z;HAqm$LE`@f0nPiY^+k_ksgji%`^6O>8wxn_`5Yp@Bq(GUMyk`HvQoq&ky!CoiIM0 zGt=QoHeI9Y(HN9H>>l}WD2uvc1Nzu}$IJ9*mpc-VV zfNj@u=4O%EQjVveoA|=T!SS?{)tBwg*d{3USGbPZ3$KT;pN6=w)coWon||j?oBplC z<(T_VG4`?f=Y{Q`oF2JsjwpE}s)9Z}lae|$q}Xn53tWYOG*=12#*T6OsTkDM$d&05 z+H4+LmXpRz6A-pi%Jeb!Xpyq}%6yl0o|SVK%J0^=3QothGs5*=t-6>fb9$wZjuxZV z6yW)hd9TQQvqjEqBI>~e5q2)%84AdR@oVQF8Rqb`?G4urBdfv78miN+ptSqj#eTI> zwg~%ClX;34zE0P#x3}Lym>maB4%_W`470(MpQ#;2OAdP9)GN-H0?m)CGeLK(>~a5v zqvaI?e~7VfFMXQb-tMVGlXF#;N|m$|b3>c7(!-i|&_R(?Q7RYGE_9h7hNNaO2)nF= zN$qFZbscKW2|P~CO*!4C`Nr*~x9Z5|u$~zR5Srh-*rpqyLi5q1LDoZsCOU?{IY;wo z7e1$}(2(t+9Z4k0nNXMXIT6$nrr~~S##Kch$Hr;tk8-T+8bBvt%L@<^3AQFRNK7JP zzsR!KMJh_kd;(W*@%#$evxE6zVWS}rRPUGG`?Y(Ps* z+o60(=unGLM%a(Vk)St|nz;{Pu>zj^&o{|@kGxwFWak?crITjo7}{1Q`!IGN9GKLNBIi literal 0 HcmV?d00001 diff --git a/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt new file mode 100644 index 00000000..e5f09f8f --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance.txt @@ -0,0 +1,49 @@ +ERR5069949.29668 -4 sameTranscript=No,dist=genomic +ERR5069949.114870 -45 sameTranscript=No,dist=genomic +ERR5069949.147998 94 sameTranscript=No,dist=genomic +ERR5069949.155944 -105 sameTranscript=No,dist=genomic +ERR5069949.184542 49 sameTranscript=No,dist=genomic +ERR5069949.169513 -92 sameTranscript=No,dist=genomic +ERR5069949.257821 -139 sameTranscript=No,dist=genomic +ERR5069949.309410 13 sameTranscript=No,dist=genomic +ERR5069949.376959 -66 sameTranscript=No,dist=genomic +ERR5069949.366975 -106 sameTranscript=No,dist=genomic +ERR5069949.465452 -19 sameTranscript=No,dist=genomic +ERR5069949.479807 5 sameTranscript=No,dist=genomic +ERR5069949.501486 -82 sameTranscript=No,dist=genomic +ERR5069949.532979 -96 sameTranscript=No,dist=genomic +ERR5069949.540529 -61 sameTranscript=No,dist=genomic +ERR5069949.573706 -63 sameTranscript=No,dist=genomic +ERR5069949.576388 -77 sameTranscript=No,dist=genomic +ERR5069949.611123 -125 sameTranscript=No,dist=genomic +ERR5069949.651338 -33 sameTranscript=No,dist=genomic +ERR5069949.686090 -29 sameTranscript=No,dist=genomic +ERR5069949.786562 42 sameTranscript=No,dist=genomic +ERR5069949.870926 -22 sameTranscript=No,dist=genomic +ERR5069949.856527 -69 sameTranscript=No,dist=genomic +ERR5069949.885966 -32 sameTranscript=No,dist=genomic +ERR5069949.937422 18 sameTranscript=No,dist=genomic +ERR5069949.919671 -116 sameTranscript=No,dist=genomic +ERR5069949.973930 -79 sameTranscript=No,dist=genomic +ERR5069949.986441 -22 sameTranscript=No,dist=genomic +ERR5069949.1014693 -150 sameTranscript=No,dist=genomic +ERR5069949.1020777 -122 sameTranscript=No,dist=genomic +ERR5069949.1066259 -4 sameTranscript=No,dist=genomic +ERR5069949.1062611 -124 sameTranscript=No,dist=genomic +ERR5069949.1067032 -103 sameTranscript=No,dist=genomic +ERR5069949.1088785 -101 sameTranscript=No,dist=genomic +ERR5069949.1132353 -142 sameTranscript=No,dist=genomic +ERR5069949.1151736 -55 sameTranscript=No,dist=genomic +ERR5069949.1258508 62 sameTranscript=No,dist=genomic +ERR5069949.1189252 -98 sameTranscript=No,dist=genomic +ERR5069949.1261808 -88 sameTranscript=No,dist=genomic +ERR5069949.1246538 -122 sameTranscript=No,dist=genomic +ERR5069949.1328186 -64 sameTranscript=No,dist=genomic +ERR5069949.1331889 -132 sameTranscript=No,dist=genomic +ERR5069949.1372331 -29 sameTranscript=No,dist=genomic +ERR5069949.1340552 -140 sameTranscript=No,dist=genomic +ERR5069949.1412839 -117 sameTranscript=No,dist=genomic +ERR5069949.1476386 -98 sameTranscript=No,dist=genomic +ERR5069949.1538968 -133 sameTranscript=No,dist=genomic +ERR5069949.1552198 -67 sameTranscript=No,dist=genomic +ERR5069949.1561137 -59 sameTranscript=No,dist=genomic diff --git a/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt new file mode 100644 index 00000000..908326ff --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test1.inner_distance_freq.txt @@ -0,0 +1,100 @@ +-250 -245 0 +-245 -240 0 +-240 -235 0 +-235 -230 0 +-230 -225 0 +-225 -220 0 +-220 -215 0 +-215 -210 0 +-210 -205 0 +-205 -200 0 +-200 -195 0 +-195 -190 0 +-190 -185 0 +-185 -180 0 +-180 -175 0 +-175 -170 0 +-170 -165 0 +-165 -160 0 +-160 -155 0 +-155 -150 1 +-150 -145 0 +-145 -140 2 +-140 -135 1 +-135 -130 2 +-130 -125 1 +-125 -120 3 +-120 -115 2 +-115 -110 0 +-110 -105 2 +-105 -100 2 +-100 -95 3 +-95 -90 1 +-90 -85 1 +-85 -80 1 +-80 -75 2 +-75 -70 0 +-70 -65 3 +-65 -60 3 +-60 -55 2 +-55 -50 0 +-50 -45 1 +-45 -40 0 +-40 -35 0 +-35 -30 2 +-30 -25 2 +-25 -20 2 +-20 -15 1 +-15 -10 0 +-10 -5 0 +-5 0 2 +0 5 1 +5 10 0 +10 15 1 +15 20 1 +20 25 0 +25 30 0 +30 35 0 +35 40 0 +40 45 1 +45 50 1 +50 55 0 +55 60 0 +60 65 1 +65 70 0 +70 75 0 +75 80 0 +80 85 0 +85 90 0 +90 95 1 +95 100 0 +100 105 0 +105 110 0 +110 115 0 +115 120 0 +120 125 0 +125 130 0 +130 135 0 +135 140 0 +140 145 0 +145 150 0 +150 155 0 +155 160 0 +160 165 0 +165 170 0 +170 175 0 +175 180 0 +180 185 0 +185 190 0 +190 195 0 +195 200 0 +200 205 0 +205 210 0 +210 215 0 +215 220 0 +220 225 0 +225 230 0 +230 235 0 +235 240 0 +240 245 0 +245 250 0 diff --git a/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt new file mode 100644 index 00000000..a1930c9e --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance.txt @@ -0,0 +1,4 @@ +ERR5069949.29668 -4 sameTranscript=No,dist=genomic +ERR5069949.114870 -45 sameTranscript=No,dist=genomic +ERR5069949.147998 94 sameTranscript=No,dist=genomic +ERR5069949.155944 -105 sameTranscript=No,dist=genomic diff --git a/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt new file mode 100644 index 00000000..021311a2 --- /dev/null +++ b/src/rseqc/rseqc_inner_distance/test_data/test2.inner_distance_freq.txt @@ -0,0 +1,100 @@ +-250 -245 0 +-245 -240 0 +-240 -235 0 +-235 -230 0 +-230 -225 0 +-225 -220 0 +-220 -215 0 +-215 -210 0 +-210 -205 0 +-205 -200 0 +-200 -195 0 +-195 -190 0 +-190 -185 0 +-185 -180 0 +-180 -175 0 +-175 -170 0 +-170 -165 0 +-165 -160 0 +-160 -155 0 +-155 -150 0 +-150 -145 0 +-145 -140 0 +-140 -135 0 +-135 -130 0 +-130 -125 0 +-125 -120 0 +-120 -115 0 +-115 -110 0 +-110 -105 1 +-105 -100 0 +-100 -95 0 +-95 -90 0 +-90 -85 0 +-85 -80 0 +-80 -75 0 +-75 -70 0 +-70 -65 0 +-65 -60 0 +-60 -55 0 +-55 -50 0 +-50 -45 1 +-45 -40 0 +-40 -35 0 +-35 -30 0 +-30 -25 0 +-25 -20 0 +-20 -15 0 +-15 -10 0 +-10 -5 0 +-5 0 1 +0 5 0 +5 10 0 +10 15 0 +15 20 0 +20 25 0 +25 30 0 +30 35 0 +35 40 0 +40 45 0 +45 50 0 +50 55 0 +55 60 0 +60 65 0 +65 70 0 +70 75 0 +75 80 0 +80 85 0 +85 90 0 +90 95 1 +95 100 0 +100 105 0 +105 110 0 +110 115 0 +115 120 0 +120 125 0 +125 130 0 +130 135 0 +135 140 0 +140 145 0 +145 150 0 +150 155 0 +155 160 0 +160 165 0 +165 170 0 +170 175 0 +175 180 0 +180 185 0 +185 190 0 +190 195 0 +195 200 0 +200 205 0 +205 210 0 +210 215 0 +215 220 0 +220 225 0 +225 230 0 +230 235 0 +235 240 0 +240 245 0 +245 250 0 From ded981b556b74a36a249c8293eea31f4f2d2a3ea Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 29 Sep 2024 16:41:42 +0200 Subject: [PATCH 4/5] fix default values --- src/rseqc/rseqc_inner_distance/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rseqc/rseqc_inner_distance/config.vsh.yaml b/src/rseqc/rseqc_inner_distance/config.vsh.yaml index 1b698517..30c84e18 100644 --- a/src/rseqc/rseqc_inner_distance/config.vsh.yaml +++ b/src/rseqc/rseqc_inner_distance/config.vsh.yaml @@ -29,8 +29,8 @@ argument_groups: - name: "--sample_size" type: integer - example: 200000 - description: Numer of reads sampled from SAM/BAM file, default = 200000. + example: 1000000 + description: Numer of reads sampled from SAM/BAM file, default = 1000000. - name: "--map_qual" type: integer @@ -59,7 +59,7 @@ argument_groups: type: string required: true description: Rrefix of output files. - + - name: "--output_stats" type: file direction: output From 1fb6ab44799358f2449e226ec9b69b0f653ff0a3 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Tue, 1 Oct 2024 10:12:37 +0200 Subject: [PATCH 5/5] adjust argument names and container image --- .../rseqc_inner_distance/config.vsh.yaml | 21 ++++++++++++------- src/rseqc/rseqc_inner_distance/script.sh | 10 ++++----- src/rseqc/rseqc_inner_distance/test.sh | 6 +++--- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/rseqc/rseqc_inner_distance/config.vsh.yaml b/src/rseqc/rseqc_inner_distance/config.vsh.yaml index 30c84e18..e050bb24 100644 --- a/src/rseqc/rseqc_inner_distance/config.vsh.yaml +++ b/src/rseqc/rseqc_inner_distance/config.vsh.yaml @@ -17,37 +17,44 @@ authors: argument_groups: - name: "Input" arguments: - - name: "--input" + - name: "--input_file" + alternatives: ["-i"] type: file required: true description: input alignment file in BAM or SAM format - name: "--refgene" + alternatives: ["-r"] type: file required: true description: Reference gene model in bed format - name: "--sample_size" + alternatives: ["-k"] type: integer example: 1000000 description: Numer of reads sampled from SAM/BAM file, default = 1000000. - - name: "--map_qual" + - name: "--mapq" + alternatives: ["-q"] type: integer example: 30 description: Minimum mapping quality (phred scaled) to determine uniquely mapped reads, default=30. - - name: "--lower_bound_size" + - name: "--lower_bound" + alternatives: ["-l"] type: integer example: -250 description: Lower bound of inner distance (bp). This option is used for ploting histograme, default=-250. - - name: "--upper_bound_size" + - name: "--upper_bound" + alternatives: ["-u"] type: integer example: 250 description: Upper bound of inner distance (bp). This option is used for ploting histograme, default=250. - - name: "--step_size" + - name: "--step" + alternatives: ["-s"] type: integer example: 5 description: Step size (bp) of histograme. This option is used for plotting histogram, default=5. @@ -95,10 +102,10 @@ test_resources: engines: - type: docker - image: ubuntu:22.04 + image: python:3.10 setup: - type: apt - packages: [python3-pip, r-base] + packages: [r-base] - type: python packages: [ RSeQC ] - type: docker diff --git a/src/rseqc/rseqc_inner_distance/script.sh b/src/rseqc/rseqc_inner_distance/script.sh index 655f5b63..fe00c590 100644 --- a/src/rseqc/rseqc_inner_distance/script.sh +++ b/src/rseqc/rseqc_inner_distance/script.sh @@ -4,14 +4,14 @@ set -exo pipefail inner_distance.py \ - -i $par_input \ + -i $par_input_file \ -r $par_refgene \ -o $par_output_prefix \ ${par_sample_size:+-k "${par_sample_size}"} \ - ${par_lower_bound_size:+-l "${par_lower_bound_size}"} \ - ${par_upper_bound_size:+-u "${par_upper_bound_size}"} \ - ${par_step_size:+-s "${par_step_size}"} \ - ${par_map_qual:+-q "${par_map_qual}"} \ + ${par_lower_bound:+-l "${par_lower_bound}"} \ + ${par_upper_bound:+-u "${par_upper_bound}"} \ + ${par_step:+-s "${par_step}"} \ + ${par_mapq:+-q "${par_mapq}"} \ > stdout.txt if [[ -n $par_output_stats ]]; then head -n 2 stdout.txt > $par_output_stats; fi diff --git a/src/rseqc/rseqc_inner_distance/test.sh b/src/rseqc/rseqc_inner_distance/test.sh index 49405696..927a69a9 100644 --- a/src/rseqc/rseqc_inner_distance/test.sh +++ b/src/rseqc/rseqc_inner_distance/test.sh @@ -15,7 +15,7 @@ output_freq="inner_distance_freq.txt" echo "> Running $meta_functionality_name" "$meta_executable" \ - --input $input_bam \ + --input_file $input_bam \ --refgene $input_bed \ --output_prefix "test" \ --output_stats $output_stats \ @@ -50,11 +50,11 @@ rm "$output_stats" "$output_dist" "$output_plot" "$output_plot_r" "$output_freq" echo "> Running $meta_functionality_name with non-default parameters and default output file names" "$meta_executable" \ - --input $input_bam \ + --input_file $input_bam \ --refgene $input_bed \ --output_prefix "test" \ --sample_size 4 \ - --map_qual 10 + --mapq 10 exit_code=$? [[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1