From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:04:14 +0100 Subject: [PATCH 01/11] initial commit dedup --- CHANGELOG.md | 3 + src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 5 files changed, 409 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fd7f001..1bef9345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). +* `umi_tools`: + - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. + This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. + By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. + - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:38:57 +0100 Subject: [PATCH 02/11] Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. --- CHANGELOG.md | 3 - src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------ src/umi_tools/umi_tools_dedup/help.txt | 13 - src/umi_tools/umi_tools_dedup/script.sh | 65 ---- src/umi_tools/umi_tools_dedup/test.sh | 49 --- 5 files changed, 409 deletions(-) delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. - required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. - - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. - - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. - - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" -exit 0 \ No newline at end of file From 1f02f94844b2196c4208afe64d50cc8d14094227 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Sun, 2 Jun 2024 20:59:47 +0200 Subject: [PATCH 03/11] initial commit --- src/dupradar/config.vsh.yaml | 117 +++++++++++++++++++++++ src/dupradar/help.txt | 0 src/dupradar/script.R | 154 ++++++++++++++++++++++++++++++ src/dupradar/script.sh | 32 +++++++ src/dupradar/test.sh | 51 ++++++++++ src/dupradar/test_data/genes.gtf | 36 +++++++ src/dupradar/test_data/sample.bam | Bin 0 -> 8680 bytes src/dupradar/test_data/script.sh | 9 ++ 8 files changed, 399 insertions(+) create mode 100644 src/dupradar/config.vsh.yaml create mode 100644 src/dupradar/help.txt create mode 100644 src/dupradar/script.R create mode 100644 src/dupradar/script.sh create mode 100644 src/dupradar/test.sh create mode 100644 src/dupradar/test_data/genes.gtf create mode 100644 src/dupradar/test_data/sample.bam create mode 100644 src/dupradar/test_data/script.sh diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml new file mode 100644 index 00000000..cae6964f --- /dev/null +++ b/src/dupradar/config.vsh.yaml @@ -0,0 +1,117 @@ +name: dupradar +description: Assessment of duplication rates in RNA-Seq datasets +keywords: [ rnaseq, duplication, genomics ] +links: + homepage: https://bioconductor.org/packages/release/bioc/html/dupRadar.html + documentation: https://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html + repository: https://github.com/ssayols/dupRadar +references: + doi: "10.1186/s12859-016-1276-2" +license: GPL-3.0 + +argument_groups: +- name: "Input" + arguments: + - name: "--id" + type: string + description: Sample ID + + - name: "--input" + type: file + required: true + description: path to input alignment file in BAM format + + - name: "--gtf_annotation" + type: file + required: true + description: path to GTF annotation file. + + - name: "--paired" + type: boolean + description: add flag if input alignment file consists of paired reads + + - name: "--strandedness" + type: string + required: false + choices: ["forward", "reverse", "unstranded"] + description: strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads)) + + - name: "--versions" + type: file + must_exist: false + +- name: "Output" + arguments: + - name: "--output_dupmatrix" + type: file + direction: output + required: false + must_exist: true + default: $id.dup_matrix.txt + description: path to output file (txt) of duplicate tag counts + + - name: "--output_dup_intercept_mqc" + type: file + direction: output + required: false + must_exist: true + default: $id.dup_intercept_mqc.txt + description: path to output file (txt) of multiqc intercept value DupRadar + + - name: "--output_duprate_exp_boxplot" + type: file + direction: output + required: false + must_exist: true + default: $id.duprate_exp_boxplot.pdf + description: path to output file (pdf) of distribution of expression box plot + + - name: "--output_duprate_exp_densplot" + type: file + direction: output + required: false + must_exist: true + default: $id.duprate_exp_densityplot.pdf + description: path to output file (pdf) of 2D density scatter plot of duplicate tag counts + + - name: "--output_duprate_exp_denscurve_mqc" + type: file + direction: output + required: false + must_exist: true + default: $id.duprate_exp_density_curve_mqc.txt + description: path to output file (pdf) of density curve of gene duplication multiqc + + - name: "--output_expression_histogram" + type: file + direction: output + required: false + must_exist: true + default: $id.expression_hist.pdf + description: path to output file (pdf) of distribution of RPK values per gene histogram + + - name: "--output_intercept_slope" + type: file + direction: output + required: false + must_exist: true + default: $id.intercept_slope.txt + description: output file (txt) with progression of duplication rate values + +resources: + - type: bash_script + path: script.sh + # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r + - type: R_script + path: script.R + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0 +runners: + - type: executable + - type: nextflow diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt new file mode 100644 index 00000000..e69de29b diff --git a/src/dupradar/script.R b/src/dupradar/script.R new file mode 100644 index 00000000..82218c6e --- /dev/null +++ b/src/dupradar/script.R @@ -0,0 +1,154 @@ +#!/usr/bin/env Rscript + +# Command line argument processing +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 5) { + stop("Usage: dupRadar.r ", call.=FALSE) +} + +message("paired_end is", args[5]) +message("the type is is", class(args[5])) + +input_bam <- args[1] +output_prefix <- args[2] +annotation_gtf <- args[3] +stranded <- as.numeric(args[4]) +paired_end <- ifelse(args[5] == "true", TRUE, FALSE) +threads <- as.numeric(args[6]) + +bamRegex <- "(.+)\\.bam$" + +if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '' must be an existing file (not a directory) with '.bam' extension...") +if(!(file.exists(annotation_gtf) && (!file.info(annotation_gtf)$isdir))) stop("Third argument '' must be an existing file (and not a directory)...") +if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...") +if(is.na(threads) || (threads<=0)) stop("Fifth argument must be a strictly positive numeric value...") + +# Debug messages (stderr) +message("Input bam (Arg 1): ", input_bam) +message("Output basename(Arg 2): ", output_prefix) +message("Input gtf (Arg 3): ", annotation_gtf) +message("Strandness (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1]) +message("paired_end (Arg 5): ", paired_end) +message("Nb threads (Arg 6): ", threads) +message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified")) + + +# Load / install packages +if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) } +if (!require("dupRadar")){ + source("http://bioconductor.org/biocLite.R") + biocLite("dupRadar", suppressUpdates=TRUE) + library("dupRadar") +} +if (!require("parallel")) { + install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/') + library("parallel") +} + +# Duplicate stats +dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads) +write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t") + +# 2D density scatter plot +pdf(paste0(output_prefix, "_duprateExpDens.pdf")) +duprateExpDensPlot(DupMat=dm) +title("Density scatter plot") +mtext(output_prefix, side=3) +dev.off() +fit <- duprateExpFit(DupMat=dm) +cat( + paste("- dupRadar Int (duprate at low read counts):", fit$intercept), + paste("- dupRadar Sl (progression of the duplication rate):", fit$slope), + fill=TRUE, labels=output_prefix, + file=paste0(output_prefix, "_intercept_slope.txt"), append=FALSE +) + +# Create a multiqc file dupInt +sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix) +line="#id: DupInt +#plot_type: 'generalstats' +#pconfig: +# dupRadar_intercept: +# title: 'dupInt' +# namespace: 'DupRadar' +# description: 'Intercept value from DupRadar' +# max: 100 +# min: 0 +# scale: 'RdYlGn-rev' +# format: '{:.2f}%' +Sample dupRadar_intercept" + +write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) +write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) + +# Get numbers from dupRadar GLM +curve_x <- sort(log10(dm$RPK)) +curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response") +# Remove all of the infinite values +infs = which(curve_x %in% c(-Inf,Inf)) +curve_x = curve_x[-infs] +curve_y = curve_y[-infs] +# Reduce number of data points +curve_x <- curve_x[seq(1, length(curve_x), 10)] +curve_y <- curve_y[seq(1, length(curve_y), 10)] +# Convert x values back to real counts +curve_x = 10^curve_x +# Write to file +line="#id: dupradar +#section_name: 'DupRadar' +#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html' +#description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication. +# This plot shows the general linear models - a summary of the gene duplication distributions. \" +#pconfig: +# title: 'DupRadar General Linear Model' +# xLog: True +# xlab: 'expression (reads/kbp)' +# ylab: '% duplicate reads' +# ymax: 100 +# ymin: 0 +# tt_label: '{point.x:.1f} reads/kbp: {point.y:,.2f}% duplicates' +# xPlotLines: +# - color: 'green' +# dashStyle: 'LongDash' +# label: +# style: {color: 'green'} +# text: '0.5 RPKM' +# verticalAlign: 'bottom' +# y: -65 +# value: 0.5 +# width: 1 +# - color: 'red' +# dashStyle: 'LongDash' +# label: +# style: {color: 'red'} +# text: '1 read/bp' +# verticalAlign: 'bottom' +# y: -65 +# value: 1000 +# width: 1" + +write(line,file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),append=TRUE) +write.table( + cbind(curve_x, curve_y), + file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"), + quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE, +) + +# Distribution of expression box plot +pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf")) +duprateExpBoxplot(DupMat=dm) +title("Percent Duplication by Expression") +mtext(output_prefix, side=3) +dev.off() + +# Distribution of RPK values per gene +pdf(paste0(output_prefix, "_expressionHist.pdf")) +expressionHist(DupMat=dm) +title("Distribution of RPK values per gene") +mtext(output_prefix, side=3) +dev.off() + +# Print sessioninfo to standard out +print(output_prefix) +citation("dupRadar") +sessionInfo() diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh new file mode 100644 index 00000000..85d51167 --- /dev/null +++ b/src/dupradar/script.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -eo pipefail + +function num_strandness { + if [ $par_strandedness == 'unstranded' ]; then echo 0 + elif [ $par_strandedness == 'forward' ]; then echo 1 + elif [ $par_strandedness == 'reverse' ]; then echo 2 + else echo "strandedness must be unstranded, forward or reverse." && \ + exit 1 + fi +} + +Rscript "$meta_resources_dir/script.R" \ + $par_input \ + $par_id \ + $par_gtf_annotation \ + $(num_strandness) \ + $par_paired + +mv "$par_id"_dupMatrix.txt $par_output_dupmatrix +mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc +mv "$par_id"_duprateExpBoxplot.pdf $par_output_duprate_exp_boxplot +mv "$par_id"_duprateExpDens.pdf $par_output_duprate_exp_densplot +mv "$par_id"_duprateExpDensCurve_mqc.txt $par_output_duprate_exp_denscurve_mqc +mv "$par_id"_expressionHist.pdf $par_output_expression_histogram +mv "$par_id"_intercept_slope.txt $par_output_intercept_slope + + +dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))") +text="bioconductor-dupradar: ${dupradar_ver}" +echo "$text" \ No newline at end of file diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh new file mode 100644 index 00000000..01072e01 --- /dev/null +++ b/src/dupradar/test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# define input and output for script +input_bam="$meta_resources_dir/sample.bam" +input_gtf="$meta_resources_dir/genes.gtf" + +output_dupmatrix="dup_matrix.txt" +output_dup_intercept_mqc="dup_intercept_mqc.txt" +output_duprate_exp_boxplot="duprate_exp_boxplot.pdf" +output_duprate_exp_densplot="duprate_exp_densityplot.pdf" +output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.pdf" +output_expression_histogram="expression_hist.pdf" +output_intercept_slope="intercept_slope.txt" + +# Run executable +echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $tmpdir." + +"$meta_executable" \ + --input "$input_bam" \ + --id "test" \ + --gtf_annotation "$input_gtf" \ + --strandedness "forward" \ + --paired false \ + --output_dupmatrix $output_dupmatrix \ + --output_dup_intercept_mqc $output_dup_intercept_mqc \ + --output_duprate_exp_boxplot $output_duprate_exp_boxplot \ + --output_duprate_exp_densplot $output_duprate_exp_densplot \ + --output_duprate_exp_denscurve_mqc $output_duprate_exp_denscurve_mqc \ + --output_expression_histogram $output_expression_histogram \ + --output_intercept_slope $output_intercept_slope + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> asserting output has been created for paired read input" +[ ! -f "$output_dupmatrix" ] && echo "$output_dupmatrix was not created" && exit 1 +[ ! -s "$output_dupmatrix" ] && echo "$output_dupmatrix is empty" && exit 1 +[ ! -f "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc was not created" && exit 1 +[ ! -s "$output_dup_intercept_mqc" ] && echo "$output_dup_intercept_mqc is empty" && exit 1 +[ ! -f "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot was not created" && exit 1 +[ ! -s "$output_duprate_exp_boxplot" ] && echo "$output_duprate_exp_boxplot is empty" && exit 1 +[ ! -f "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot was not created" && exit 1 +[ ! -s "$output_duprate_exp_densplot" ] && echo "$output_duprate_exp_densplot is empty" && exit 1 +[ ! -f "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc was not created" && exit 1 +[ ! -s "$output_duprate_exp_denscurve_mqc" ] && echo "$output_duprate_exp_denscurve_mqc is empty" && exit 1 +[ ! -f "$output_expression_histogram" ] && echo "$output_expression_histogram was not created" && exit 1 +[ ! -s "$output_expression_histogram" ] && echo "$output_expression_histogram is empty" && exit 1 +[ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1 +[ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1 + +exit 0 \ No newline at end of file diff --git a/src/dupradar/test_data/genes.gtf b/src/dupradar/test_data/genes.gtf new file mode 100644 index 00000000..423ba8f8 --- /dev/null +++ b/src/dupradar/test_data/genes.gtf @@ -0,0 +1,36 @@ +chr1 unknown exon 14362 14829 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 14970 15038 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 15796 15947 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 16607 16765 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 16858 17055 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17233 17368 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17606 17742 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 17915 18061 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 18268 18366 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 24738 24891 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 29321 29370 . - . gene_id "WASH7P"; gene_name "WASH7P"; transcript_id "NR_024540"; tss_id "TSS7514"; +chr1 unknown exon 34611 35174 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 34611 35174 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown exon 35277 35481 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 35277 35481 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown exon 35721 36081 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818_1"; tss_id "TSS8403"; +chr1 unknown exon 35721 36081 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820_1"; tss_id "TSS8403"; +chr1 unknown CDS 69091 70005 . + 0 gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown exon 69091 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown start_codon 69091 69093 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown stop_codon 70006 70008 . + . gene_id "OR4F5"; gene_name "OR4F5"; p_id "P1230"; transcript_id "NM_001005484"; tss_id "TSS14428"; +chr1 unknown exon 134773 139696 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 139790 139847 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 140075 140566 . - . gene_id "LOC729737"; gene_name "LOC729737"; transcript_id "NR_039983"; tss_id "TSS18541"; +chr1 unknown exon 323892 324060 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr1 unknown exon 324288 324345 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr1 unknown exon 324439 328581 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322_1"; tss_id "TSS12303"; +chr19 unknown exon 76220 76783 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 76220 76783 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr19 unknown exon 76886 77090 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 76886 77090 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr19 unknown exon 77330 77690 . - . gene_id "FAM138F"; gene_name "FAM138F"; transcript_id "NR_026820"; tss_id "TSS16829"; +chr19 unknown exon 77330 77690 . - . gene_id "FAM138A"; gene_name "FAM138A"; transcript_id "NR_026818"; tss_id "TSS16829"; +chr5 unknown exon 180750507 180750675 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; +chr5 unknown exon 180750903 180750960 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; +chr5 unknown exon 180751054 180755196 . + . gene_id "LOC100132287"; gene_name "LOC100132287"; transcript_id "NR_028322"; tss_id "TSS11538"; diff --git a/src/dupradar/test_data/sample.bam b/src/dupradar/test_data/sample.bam new file mode 100644 index 0000000000000000000000000000000000000000..04d8c95eeba209f13a4704d596a800e73d6ff955 GIT binary patch literal 8680 zcmVwO=+86Ldf z`@MV5`~UB|QCU0pdjg<*YdLDvN?4Rp{Z6UX?;o~zdd*>Hwp_m()oZ2J_8~^AwGt^u zEHq&tC=n8!lST$Yo(gnUP;5mYR3Z>rkSBQ2wK9GWCO#_v08Y-p1B!bs&< ziO_Ijh$Tu#q45PcAz2xgq3{J*M8||Sn8}AE<8zbv%xE1;QTE^p>Ss;&}`!3*H@z3%cVhcf7tKu4Wjkh*}eZ2(G@AG zuI5h*qn*xCVPQ}ZML~syt-?|FX0O$6cdE_3VW+jd(QDQ_4_Ef%V8(9XBg5^=-cGkS z*ytQkgNj?t{n>c{n2N8{9$=~ZV=e`P7k{2)AmQrEuNPBL&^~Twz_9rIcs2zM*C%8K z2D;V9Q3@7DjSn*^I5_#VJ2RaQ74Ulga!v>;x|9>a$?m@k8A)i|`#GJFf@{+ca$+by zy^@oL%KNRUj0}AF`|W&27RvYE=H%ej;F z63VgTBn}sG_WhCx9D5w%q%=)5g&A%kA<(o^AF7Insw!l*YlL4aAgHJp2~b-^)fN?1 zDlb)ON>HS>6aoYuN=w2;3hDz;2^2yK;altfe|zsed!NkSPX~6+#oK4*#NVv7zO~l3 zmi?$uulG!ZxL(}qNwFt>1@E8HFZ$V_clHgprRlkIZ!ONg?#!*PxH&yD>W$ypopooU z@vJ+V^k)6RNPN-}BJ;B>qA;tfDoetAUznGa^wrXDgtUbv z>18>VCGmj+$F`+ybo zqZ+|X-N~%m(>}NcRMmjWa~Gh(>X70@(7(ThE5?PLa*t4}&AXXY8LJ#&}qYwgu zi;KsV6ZwB-@S_rh8kNY0ep$}<=&BuxCje22&8A#HRN67AKs33*W@qgzxM;`fnB$d< z#{@I_sYwhkJGDa=A0h6#gTZXj?=vPZ+KeY6Jgz%0&#!ep{YAwgF(U;YyCOLtS&vf! zY^k#zhoZ)GV$o1foNjBA$*CP{lkot#VKik_aUF&z{DZLc=UD<;p(4KwizIQ~Aiy!2 zg7DBFk^ISDE$x<6V-XI?Aq67>kkuw`Qx)5e=1XFd1xqoi|2o~e-{4h92G87TZJX|3 zt=_?`-y4hH!BzEd5aJL-86{umZbz(2(TjYyN@UROP7ri>UkvxekH~KHRcP$iMt~^* z8XMkpqzyN^jyqui$I=L-SF0OS#% zxTyRb)~)h&)|1@!v~T!jH*X#KVE?2PN7mCO)HJyS>*av;i2fD7lp?2zEn!V&0}$n4 zdZ60UBFgeQ&5JxO^1R5r3hlQ?n}pqdK~1{&Cn>#9#byKQqOGW+oBkS6?C*L52 z39}`D!(bb)dXXB}g zuo{=2doFw~2wfD9NmN#_7O7f67*^=+s@mHlqY=kjDu|?(d&u>uRi$?%&Qap#R*k-K z>Xxbuq5gpKxolGQk~e^m7zfuIDFZ znItGfK$Jd;$pE54U2tO46$sZ-w&d?b`y^`FP0?X5)QRS>087uMUSxx1K3;nhfOqCl~9<*1&Ei+3UuTqe<=v4zcT3L)tjC|doYAF%zX%3o)y zZ!VbX{#^pAH<^v8H#-<8M%?Ih8G1An9Vqib8AKLb`r1kp=DvjMUdq3c|I$b!|8FrdJ-1C>;zlI(DV88Obbn+8p>qxuQ{6g*JDTT^i`zR#bP!c>hPBv;&sZ$p3$ z5MZb}`h%{xLj$~JC6y71nF_Vs3Tu+X(%eEafl@AL(?O@rXG@aE62Q-$ zZ4LOxM;dVR1Y9>vR7xsen`z!yIpsR|>mvQtn7M;rsUm1ODkQ2Bg9Y zTr%iQSj!@Sq-|XUytp_N&E(1OW%Es{%%Bt=^$CXI>P9h#>AMl)Co@Zqy9S^gaX zw72Ay(GX<`=4xXl955MT(v+jG&fPSeCuN;@D2e?n4N(!7e&%K{F@>6%Qjn4?97XmW zTNP6{VGEW+U@Kji=+3&c5Q_GOp~l-!U46oU`iCPyZSu-KOXZWH0u^mWlq*D#%r9_9 z)ix^&5=t)Xsi1U7GIl7~EXDbvL1?MZb)fD7C{u(yzFQG8Bq`*8DiMWi89;&v+iH*Aj! z?MHfklnaY41X=(p}ZQS4(=lZ9Oz>BSrNr~vM3fbVw|PlG8b|uiBGB7 zg`E-T^n{I50(aZ?T2y_Y`ilM^d>5#a?d>%ka|?~sHXbTqrbofJ8Z-DLDXS12b(v>n zoYi@igm|?9X0@{!i#?LLM2$g79N@1snkQ0GG3L-)79IiBTAi*ju>BibdEmzYcD>v_ zzy?$gPiB*;5*&fZ+k_|vB+L!V6>_#0ZPehsB9h;3~HEMTD9mZg~ zCu4@c|J#o=^1#!%N?}_uJlMTA0@06EIb3O{D;J$EPpTp-z0C9c6mU^{aF*35Y&x{r z9*)*~)v@Yb?B;NhF`8p-t3^!j1}0N>gf}0RmN&JT14LF2n5NrP=d)K-1e9iPPIeCxR?YzPV@s~oC+;njJFBNro>G|rIKG8#h#33Pojmjx4ALPeIk ze4Ea*SxG#gf{}9pC2L-&AaAs6#hYwYmC3AiLzr()+Iiu7UZb+vmU82WFitVE==Hd0 zB#Z|v%6T4ecRW@T^o?%NxpfowvYVigA_2a+Z&g(yX%Tdm`0TyKN z9x49QZl9y?-anEYeHTKha*XLAmH0tH;~F5Yj;Q)o;E|+b5e|aIc})8vulZPXKr)fN zJgr|6>RvUF!lbxw7dxf6?g4dfqwdTskRWNNRT0#l7l0+bJn`V7dFY2p>A4ZjeoK$2 zMK*cclv{immZ~Jxl8@cI6ieD0YOTeI8UuUhnYP-0%WXSIBjk}NAz?d}-%3~uwMZ2E zvjb#>v822W18CI+QNMUE2zJ}7VAWkG_lq$7bx}#y#oU{dJaYZNgsy{w` zrxYf^Ikz9Be{X8CF;Qa?OKJL$xk<;uED-WYMCl;HyEOGN!YIAc_3Mn+vMk$_k2Vit zm>kHlB%h}|4PdISYPS!Tvs`CG>G-@}tLcB|*LJ`(gpH82dwqq;&*zZuYx1%bbN8(D zJ-4psZjPwR=V_9;rMKUVZB00t_S2?M0xAq!+FbWjNvx2nc_5`WI$_x>nAlFAdTlFV zow$Rxm$r01_}-{bn!c8>G>mE)M(78rThGDto*NYD9*I?UNbtgLwNyQqLAmWg6nA|8g)78?pqoo>Y1|((2F&<~zjV7$NQ_T{Z1$|P3T9Wa<_wu$BzT)*q zP0BV2!(^)-m-)(8Fi=wZm>sb;hZ%M0mvbMSkylk2poK~DXh|8q@I@(EQu9pyNR67M znixH6UwBD-wGvja{{My+);(vz>vD%y69ormx!r@V;`L%4)OA+-s2hnmYK&a+xQe_m zC~GecsMhezro_@>8Mn|>Zb(JtJb_RnG^tI-=}JQ*@3a?mRO5tG+Wz?8r%Ygd;_MFE zFzLd>5mk$d*3&Qn-9;gWm}QX7Gq-j_8l1&Z5)?%l1!b5e`@A3Quu+Hr1xuoQ^&Qe| z$TK^%&dJj=$$hgW6qb&yr|t}n7YQ!NRh^sKq08&f?WT83XpV$rHC96xKP^&Bqr==Q zDPN&)si5Kna6JmNtg0Y8t}UO;GLq^xdE<}QAgGH`@_=RA7p7oit0i1-KGjx*y*C_H z6&|%nJVJdj987g=#fYlX_uV{mBQK-1DwK&}KsOA7Is)b(Kd+jxX5M1Rp2`Y6;Z)n4 zl(Cw%9NTHXwr$xuSm$qSVV!!*QLr`*D8LrO9@18Kq9~*Gi>M6oktdjMhE;_YrL0oO zQ*{(VfJ0olyqqdcvBV3}l!M4Jf~RCWw&2sI^_0A@Ly6f@!Nr=D)sD!P67g4}R&~4T zuAM}Iac?&1qud=%#|l^F(jIYAgozJd1k?Kw3fl@6tji=U=W{nm^&%X1B&_Iiza=S# z%;tmTm?BWM&@51?{q|OVv)41mt$!ujWc=QHcHxZStbYJhM)*xb^!^bpKXMaPxB<~U z%K0#zgB9@!(huwNypv+Fs7u7HmcpcRFsT;c$ZQ>!gn3RT9g~ndYF5@!Kb7m8@MZnh z2|s+#4jtEMNc*Q$x{uX;B6C2PXEoh`;Q4`{rv*MSR1eTC`(!>p4(EqN434Uo94-Ng z)rBzj(PD?%U9ou1Db*$>@duajM$K{7RZji_th?^pAsw?m+Q*maP{FEd*3{-zT)DoR z7MMGfet@j(qSDPn2*OcO?#Xs&w%TsQS@gjt zp3^e?hwj@=aGPQTNM1P53pu%~uH8+8EKO3tLQmpXMeTt*puxFd#w}LhmR@Ahj;+7x z4IbcfR?Cdv4_eOn=zTk7u0E;^YLSN{Es!cmIYaUQ$y_v;_ ztNyMImtM}=+YhdIMcZgScHa(*Ml@JCm<=W>8>0aZ%L*)?A&zhyqRaBq03(*f^X3Sx zG%cXfFVLAKY%}#UGLOWr+s-hNBn7GwSWwTo3Gl=Iafo;`_MR(_g(tJP3zLT|cRF zSSu(apQITV-Iwd3-l$O;)>SYsvgD9}2}cbnd7P*B#dLmBTAJFxVc^y zi#^nhZnYf!1Q^{_3f)mQt)%?CM)BYtP>3y2psE-V zs}J@1O-|SDmAKZTCM7LR^Qed+JjIwAO3K*+)J^~J)*Fd z9M0yyMg6{ZKi1L~b~@ND-xD42IgRK;t_6m;bGvAQoIB~^o2n_|JWFu(=on)!#};-fs8;TKQ57OBR|1*ZHInv%+{t*Z1Rk@i?Ryr&=+14KoU+sO z0D6pqjGVsee?(Toyt|x zSz67I#xys)@E5~05B$UthPUs&@+jWkw46MJFiVTB{hl6w=iss$%7hDPtV-ti1dfo* zJ9nbLf3#d(_S!-uL(DHBu~}I9yZ^q?I>JA-AY4EEPJ7C z$ie1Hg7)pm6Bq7WV?^(GszvnPt9I}^T0ZWLCMf1~pQNw6qAN%J0|kjFg6bkv4j6t% zb(}BB%)DZuo)@6*6|xlF@p-;ynfJ5QqBu)^tXh%<7mDu&(GZCKQ~&mlzSzQqZA%*s z`m^zfwpx`4Mj`{PGI8AlAq*4XB6Jx|jk%i_qv7@U1o@SEUISf}npLYCAvsV-sK>cxXH}1G_+cA-%QdU~WECx+D1tDxskMe1q_N98uQ|#tn`ZId z*_4_htqVoaAj8Nn>o6?x5-G?BJtT9?3Nijkfg)5W_7%6mLXy zal{a0a5*0Wc$QE%2ai*GRlr%KK4d#ouKr<@?oEI*W7CY2A9X?tIz-r(sudPULzfhm z9)OAlP(uvYE|}=DAM9YF?npgUXr!yt+V@}`D-e9k zqik;R)HXV45arjlPS^jzQBJq1xtq+UlQAt0=$%ZMBh==|VU9A$>ZH#6h@s+rk_G#k zsEEI$y3@$qd?YQ!2Z|Y(_5_YahuO{Uu=3wg_en1{5RIxZ1)$GrK=%XC_0Mbqw0WJ6 zo*6McK&dcQn2J1#)YT_RmPB!xqnHZ|%>D|zr#txrw5!XjPgdIK^j6qLUu)V1FKj7p@YGQhcae=y8KpkYEMcQa!ShqvMWOdT z5^e&3`tkRVy z=E`Ft1X4|j*^&fpYyx`7qkZ*w5q@@&qZ?2)aj+bvQSt3I`WZ8hx<}pS+(fI0t2SC{ zRxbr2BPm>oR{VOogX1kr3MG*?GeH`K2_$=Z;1+tkGG@}QlQPM5^~0H`UBr-%ewL4< z-07%Zf}aJ^hz+7?IWF+KMp8KQk~R_j{f~EG2a>nD5bcM`?PA*QN(fd2(?mpYbbnDx zctM(#`_6?geQN+$c1jSTo{qNxxYAQk-BR+1e!2zUrn8z^|M{0&x9dE;Lm;gxS$W7j z9l1dcvlPHVk45t^CxTj!k|@V$B05|_qnA9@nz5zpALekgRxX$pY_s3H5K61ma32tv zO^rA0f@oE3q`SP}byLRI;I({V<8EnZJI93WpUX zD`_o`nfVr@cKW<>=Wmyry&tu+{}22G!L&V#uj;~{2k!$d#7PP@p~U#rk0=}I^I;sP zQF`cX*e=}RwV(?(xSsQ*Y$is3hEdCP+v8f97{y)*Du`Svdq|+;Y(!^#z^V!myyqt8 z8_$!nbwi=YS`L3u8KIu5$UE|SC?>_~Q|?wLM*SC^Y=(LP24dHUw`}Kt%R@c57$S&P z?$XK&xM~R93T~K}d6g$oIY+b|>X{a0BnlmFjn)kDgnV{et`7D7@yeDWz6(6pZp3qu zW7mSR5vjs34Ft!M?xsN@0cqF;6(UVh8NTl{#ht^q)NqArn|Z#z5mY}Wg>S1-ciyw@ zxK!d!mjBkOeE(RWN+S9j@458hlh=xSZu+^^84yZi0(I~EK6c4xG^oGa3Dj|s8va$I z_%u+&8&Mo{dBONsdD#x8I8Ks+@vqw_u5CblWG7I^NL4VP-u1x8E_pIa6*Jo_w zVt>1L6c-%#_S>NN+TIR~c5GVi&uSF!cOO1^z1+daj!BzssQ5Z>hk!put+k1hAO1;8 z#ed-K5GTi`yEZ7kvcpqR~ zv#nDdlP=i=#yf%?63DUXjDH3+EnOQfW&i*miwFb&00000{{{d;LjnLB00RI300000 G0001Z$Dr&0 literal 0 HcmV?d00001 diff --git a/src/dupradar/test_data/script.sh b/src/dupradar/test_data/script.sh new file mode 100644 index 00000000..2b1c1aa5 --- /dev/null +++ b/src/dupradar/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/genes.gtf +wget https://github.com/ssayols/dupRadar/raw/master/inst/extdata/wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam + +# subsample the bam file +samtools view -s 0.02 -b wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam > sample.bam + +rm wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2.bam \ No newline at end of file From d6e5a45043128afcc3a1be88a1a76584a9248022 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Fri, 5 Jul 2024 13:22:53 +0100 Subject: [PATCH 04/11] config formatting --- src/dupradar/config.vsh.yaml | 94 ++++++++++++++---------------------- 1 file changed, 35 insertions(+), 59 deletions(-) diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml index cae6964f..cb220f47 100644 --- a/src/dupradar/config.vsh.yaml +++ b/src/dupradar/config.vsh.yaml @@ -10,105 +10,81 @@ references: license: GPL-3.0 argument_groups: -- name: "Input" +- name: Input arguments: - - name: "--id" + - name: --id type: string description: Sample ID - - - name: "--input" + - name: --input type: file required: true - description: path to input alignment file in BAM format - - - name: "--gtf_annotation" + description: Path to input alignment file in BAM format + - name: --gtf_annotation type: file required: true - description: path to GTF annotation file. - - - name: "--paired" + description: Path to GTF annotation file. + - name: --paired type: boolean - description: add flag if input alignment file consists of paired reads - - - name: "--strandedness" + description: Add flag if input alignment file consists of paired reads + - name: --strandedness type: string - required: false - choices: ["forward", "reverse", "unstranded"] - description: strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads)) - - - name: "--versions" - type: file - must_exist: false + choices: [forward, reverse, unstranded] + description: | + Strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads)) -- name: "Output" +- name: Output arguments: - - name: "--output_dupmatrix" + - name: --output_dupmatrix type: file direction: output - required: false - must_exist: true - default: $id.dup_matrix.txt description: path to output file (txt) of duplicate tag counts - - - name: "--output_dup_intercept_mqc" + example: $id.dup_matrix.txt + - name: --output_dup_intercept_mqc type: file direction: output - required: false - must_exist: true - default: $id.dup_intercept_mqc.txt description: path to output file (txt) of multiqc intercept value DupRadar - - - name: "--output_duprate_exp_boxplot" + example: $id.dup_intercept_mqc.txt + - name: --output_duprate_exp_boxplot type: file direction: output required: false must_exist: true default: $id.duprate_exp_boxplot.pdf - description: path to output file (pdf) of distribution of expression box plot - - - name: "--output_duprate_exp_densplot" + description: | + Path to output file (pdf) of distribution of expression box plot + - name: --output_duprate_exp_densplot type: file direction: output - required: false - must_exist: true - default: $id.duprate_exp_densityplot.pdf - description: path to output file (pdf) of 2D density scatter plot of duplicate tag counts - - - name: "--output_duprate_exp_denscurve_mqc" + description: | + Path to output file (pdf) of 2D density scatter plot of duplicate tag counts + example: $id.duprate_exp_densityplot.pdf + - name: --output_duprate_exp_denscurve_mqc type: file direction: output - required: false - must_exist: true - default: $id.duprate_exp_density_curve_mqc.txt - description: path to output file (pdf) of density curve of gene duplication multiqc - - - name: "--output_expression_histogram" + description: | + Path to output file (pdf) of density curve of gene duplication multiqc + example: $id.duprate_exp_density_curve_mqc.txt + - name: --output_expression_histogram type: file direction: output - required: false - must_exist: true - default: $id.expression_hist.pdf - description: path to output file (pdf) of distribution of RPK values per gene histogram - - - name: "--output_intercept_slope" + description: | + Path to output file (pdf) of distribution of RPK values per gene histogram + example: $id.expression_hist.pdf + - name: --output_intercept_slope type: file direction: output - required: false - must_exist: true - default: $id.intercept_slope.txt description: output file (txt) with progression of duplication rate values + example: $id.intercept_slope.txt resources: - type: bash_script path: script.sh # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r - - type: R_script + - type: r_script path: script.R - test_resources: - type: bash_script path: test.sh - engines: - type: docker image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0 From 0a798864756f48f577b4810e1c02afcef7d7daaa Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Fri, 5 Jul 2024 14:58:03 +0100 Subject: [PATCH 05/11] scripts and tests --- src/dupradar/script.R | 0 src/dupradar/script.sh | 7 ++++++- src/dupradar/test.sh | 0 3 files changed, 6 insertions(+), 1 deletion(-) mode change 100644 => 100755 src/dupradar/script.R mode change 100644 => 100755 src/dupradar/script.sh mode change 100644 => 100755 src/dupradar/test.sh diff --git a/src/dupradar/script.R b/src/dupradar/script.R old mode 100644 new mode 100755 diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh old mode 100644 new mode 100755 index 85d51167..a590401a --- a/src/dupradar/script.sh +++ b/src/dupradar/script.sh @@ -1,4 +1,7 @@ #!/bin/bash +## VIASH START +## VIASH END + set -eo pipefail @@ -29,4 +32,6 @@ mv "$par_id"_intercept_slope.txt $par_output_intercept_slope dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))") text="bioconductor-dupradar: ${dupradar_ver}" -echo "$text" \ No newline at end of file +echo "$text" + +exit 0 diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh old mode 100644 new mode 100755 From aeb8419f4e17ca9c07d6fb02cb78181ffd68492f Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 10 Jul 2024 23:39:30 +0200 Subject: [PATCH 06/11] expected output (not correct) and update to R script package installation --- src/dupradar/config.vsh.yaml | 18 ++++----- src/dupradar/script.R | 10 +++-- src/dupradar/script.sh | 3 +- src/dupradar/test.sh | 24 +++++++++++- src/dupradar/test_data/test_dupMatrix.txt | 7 ++++ .../test_data/test_dup_intercept_mqc.txt | 39 +++++++++++++++++++ .../test_duprateExpDensCurve_mqc.txt | 33 ++++++++++++++++ .../test_data/test_intercept_slope.txt | 2 + 8 files changed, 120 insertions(+), 16 deletions(-) create mode 100644 src/dupradar/test_data/test_dupMatrix.txt create mode 100644 src/dupradar/test_data/test_dup_intercept_mqc.txt create mode 100644 src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt create mode 100644 src/dupradar/test_data/test_intercept_slope.txt diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml index cb220f47..9727fdec 100644 --- a/src/dupradar/config.vsh.yaml +++ b/src/dupradar/config.vsh.yaml @@ -38,18 +38,16 @@ argument_groups: type: file direction: output description: path to output file (txt) of duplicate tag counts - example: $id.dup_matrix.txt + example: dup_matrix.txt - name: --output_dup_intercept_mqc type: file direction: output description: path to output file (txt) of multiqc intercept value DupRadar - example: $id.dup_intercept_mqc.txt + example: dup_intercept_mqc.txt - name: --output_duprate_exp_boxplot type: file direction: output - required: false - must_exist: true - default: $id.duprate_exp_boxplot.pdf + default: duprate_exp_boxplot.pdf description: | Path to output file (pdf) of distribution of expression box plot - name: --output_duprate_exp_densplot @@ -57,24 +55,24 @@ argument_groups: direction: output description: | Path to output file (pdf) of 2D density scatter plot of duplicate tag counts - example: $id.duprate_exp_densityplot.pdf + example: duprate_exp_densityplot.pdf - name: --output_duprate_exp_denscurve_mqc type: file direction: output description: | Path to output file (pdf) of density curve of gene duplication multiqc - example: $id.duprate_exp_density_curve_mqc.txt + example: duprate_exp_density_curve_mqc.txt - name: --output_expression_histogram type: file direction: output description: | Path to output file (pdf) of distribution of RPK values per gene histogram - example: $id.expression_hist.pdf + example: expression_hist.pdf - name: --output_intercept_slope type: file direction: output description: output file (txt) with progression of duplication rate values - example: $id.intercept_slope.txt + example: intercept_slope.txt resources: - type: bash_script @@ -85,6 +83,8 @@ resources: test_resources: - type: bash_script path: test.sh + - type: file + path: test_data engines: - type: docker image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0 diff --git a/src/dupradar/script.R b/src/dupradar/script.R index 82218c6e..24ebce2d 100755 --- a/src/dupradar/script.R +++ b/src/dupradar/script.R @@ -35,10 +35,12 @@ message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not speci # Load / install packages if (length(args) > 5) { .libPaths( c( args[6], .libPaths() ) ) } -if (!require("dupRadar")){ - source("http://bioconductor.org/biocLite.R") - biocLite("dupRadar", suppressUpdates=TRUE) - library("dupRadar") +if (!require("dupRadar")) { + if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") + } + BiocManager::install("dupRadar", update = TRUE, ask=FALSE) + library("dupRadar") } if (!require("parallel")) { install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/') diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh index a590401a..c84440a7 100755 --- a/src/dupradar/script.sh +++ b/src/dupradar/script.sh @@ -19,7 +19,8 @@ Rscript "$meta_resources_dir/script.R" \ $par_id \ $par_gtf_annotation \ $(num_strandness) \ - $par_paired + $par_paired \ + ${meta_cpus:-1} mv "$par_id"_dupMatrix.txt $par_output_dupmatrix mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh index 01072e01..8a85af27 100755 --- a/src/dupradar/test.sh +++ b/src/dupradar/test.sh @@ -1,8 +1,8 @@ #!/bin/bash # define input and output for script -input_bam="$meta_resources_dir/sample.bam" -input_gtf="$meta_resources_dir/genes.gtf" +input_bam="${meta_resources_dir}/test_data/sample.bam" +input_gtf="${meta_resources_dir}/test_data/genes.gtf" output_dupmatrix="dup_matrix.txt" output_dup_intercept_mqc="dup_intercept_mqc.txt" @@ -48,4 +48,24 @@ echo ">> asserting output has been created for paired read input" [ ! -f "$output_intercept_slope" ] && echo "$output_intercept_slope was not created" && exit 1 [ ! -s "$output_intercept_slope" ] && echo "$output_intercept_slope is empty" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "$output_dupmatrix" ] \ + && echo "Output file $output_dupmatrix is empty" && exit 1 +[ ! -s "$output_dup_intercept_mqc" ] \ + && echo "Output file $output_dup_intercept_mqc is empty" && exit 1 +[ ! -s "$output_intercept_slope" ] \ + && echo "Output file $output_intercept_slope is empty" && exit 1 + +echo ">> Check if output is correct" +cat "$output_dupmatrix" +cat "$output_dup_intercept_mqc" +cat "$output_intercept_slope" +# diff ignoring white spaces +diff -B -b "test_dupMatrix.pdf" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || + (echo "Output file $output_dupmatrix is not correct" && exit 1) +diff -B -b "$output_dup_intercept_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \ + (echo "Output file $output_dup_intercept_mqc is not correct" && exit 1) +diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \ + (echo "Output file $output_intercept_slope is not correct" && exit 1) exit 0 \ No newline at end of file diff --git a/src/dupradar/test_data/test_dupMatrix.txt b/src/dupradar/test_data/test_dupMatrix.txt new file mode 100644 index 00000000..8ccda72e --- /dev/null +++ b/src/dupradar/test_data/test_dupMatrix.txt @@ -0,0 +1,7 @@ +ID geneLength allCountsMulti filteredCountsMulti dupRateMulti dupsPerIdMulti RPKMulti PKMMulti allCounts filteredCounts dupRate dupsPerId RPK RPKM +WASH7P 1769 41 41 0 0 23.1769361221029 188430.374976446 1 1 0 0 0.565291124929339 4595.86280430357 +FAM138A 2260 0 0 NA 0 0 0 0 0 NA 0 0 0 +FAM138F 2260 0 0 NA 0 0 0 0 0 NA 0 0 0 +OR4F5 918 0 0 NA 0 0 0 0 0 NA 0 0 0 +LOC729737 5474 18 18 0 0 3.28827183047132 26733.917320905 3 3 0 0 0.548045305078553 4455.65288681751 +LOC100132287 8740 39 39 0 0 4.46224256292906 36278.3948205615 1 1 0 0 0.11441647597254 930.215251809269 diff --git a/src/dupradar/test_data/test_dup_intercept_mqc.txt b/src/dupradar/test_data/test_dup_intercept_mqc.txt new file mode 100644 index 00000000..0d0b4e1a --- /dev/null +++ b/src/dupradar/test_data/test_dup_intercept_mqc.txt @@ -0,0 +1,39 @@ +#id: DupInt +#plot_type: 'generalstats' +#pconfig: +# dupRadar_intercept: +# title: 'dupInt' +# namespace: 'DupRadar' +# description: 'Intercept value from DupRadar' +# max: 100 +# min: 0 +# scale: 'RdYlGn-rev' +# format: '{:.2f}%' +Sample dupRadar_intercept +test 5.8262146393079e-11 +#id: DupInt +#plot_type: 'generalstats' +#pconfig: +# dupRadar_intercept: +# title: 'dupInt' +# namespace: 'DupRadar' +# description: 'Intercept value from DupRadar' +# max: 100 +# min: 0 +# scale: 'RdYlGn-rev' +# format: '{:.2f}%' +Sample dupRadar_intercept +test 5.8262146393079e-11 +#id: DupInt +#plot_type: 'generalstats' +#pconfig: +# dupRadar_intercept: +# title: 'dupInt' +# namespace: 'DupRadar' +# description: 'Intercept value from DupRadar' +# max: 100 +# min: 0 +# scale: 'RdYlGn-rev' +# format: '{:.2f}%' +Sample dupRadar_intercept +test 5.8262146393079e-11 diff --git a/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt b/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt new file mode 100644 index 00000000..ca9f199e --- /dev/null +++ b/src/dupradar/test_data/test_duprateExpDensCurve_mqc.txt @@ -0,0 +1,33 @@ +#id: dupradar +#section_name: 'DupRadar' +#section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html' +#description: "provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication. +# This plot shows the general linear models - a summary of the gene duplication distributions. " +#pconfig: +# title: 'DupRadar General Linear Model' +# xLog: True +# xlab: 'expression (reads/kbp)' +# ylab: '% duplicate reads' +# ymax: 100 +# ymin: 0 +# tt_label: '{point.x:.1f} reads/kbp: {point.y:,.2f}% duplicates' +# xPlotLines: +# - color: 'green' +# dashStyle: 'LongDash' +# label: +# style: {color: 'green'} +# text: '0.5 RPKM' +# verticalAlign: 'bottom' +# y: -65 +# value: 0.5 +# width: 1 +# - color: 'red' +# dashStyle: 'LongDash' +# label: +# style: {color: 'red'} +# text: '1 read/bp' +# verticalAlign: 'bottom' +# y: -65 +# value: 1000 +# width: 1 +0.11441647597254 5.82621463896864e-09 diff --git a/src/dupradar/test_data/test_intercept_slope.txt b/src/dupradar/test_data/test_intercept_slope.txt new file mode 100644 index 00000000..5294b8ba --- /dev/null +++ b/src/dupradar/test_data/test_intercept_slope.txt @@ -0,0 +1,2 @@ +test - dupRadar Int (duprate at low read counts): 5.8262146393079e-11 +test - dupRadar Sl (progression of the duplication rate): 0.999999999999966 From a893a37f70a665bfd238fa2833eb9bfdc1acc1c5 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Mon, 23 Sep 2024 11:15:35 +0200 Subject: [PATCH 07/11] Version info, functional test --- src/dupradar/config.vsh.yaml | 15 +- src/dupradar/{script.R => dupradar.r} | 23 +- src/dupradar/help.txt | 583 ++++++++++++++++++ src/dupradar/script.sh | 13 +- src/dupradar/test.sh | 13 +- src/dupradar/test_data/test_dupMatrix.txt | 2 +- .../test_data/test_dup_intercept_mqc.txt | 39 -- 7 files changed, 628 insertions(+), 60 deletions(-) rename src/dupradar/{script.R => dupradar.r} (92%) delete mode 100644 src/dupradar/test_data/test_dup_intercept_mqc.txt diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml index 9727fdec..d25547db 100644 --- a/src/dupradar/config.vsh.yaml +++ b/src/dupradar/config.vsh.yaml @@ -79,15 +79,26 @@ resources: path: script.sh # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r - type: r_script - path: script.R + path: dupradar.r test_resources: - type: bash_script path: test.sh - type: file path: test_data + engines: - type: docker - image: quay.io/biocontainers/bioconductor-dupradar:1.32.0--r43hdfd78af_0 + image: ubuntu:22.04 + setup: + - type: apt + packages: [ r-base ] + - type: r + bioc: [ dupRadar ] + packages: [ parallel ] + - type: docker + run: | + echo "DupRadar: $(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")" > /var/software_versions.txt + runners: - type: executable - type: nextflow diff --git a/src/dupradar/script.R b/src/dupradar/dupradar.r similarity index 92% rename from src/dupradar/script.R rename to src/dupradar/dupradar.r index 24ebce2d..4294a42e 100755 --- a/src/dupradar/script.R +++ b/src/dupradar/dupradar.r @@ -47,17 +47,29 @@ if (!require("parallel")) { library("parallel") } + # Duplicate stats dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads) +print(input_bam) +print(output_prefix) +print(annotation_gtf) +print(stranded) +print(paired_end) +print(threads) +print("analyzeDuprates done") write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t") +print("write.table done") # 2D density scatter plot pdf(paste0(output_prefix, "_duprateExpDens.pdf")) +print("pdf done") duprateExpDensPlot(DupMat=dm) title("Density scatter plot") mtext(output_prefix, side=3) dev.off() +print("duprateExpDensPlot done") fit <- duprateExpFit(DupMat=dm) +print("duprateExpFit done") cat( paste("- dupRadar Int (duprate at low read counts):", fit$intercept), paste("- dupRadar Sl (progression of the duplication rate):", fit$slope), @@ -82,6 +94,7 @@ Sample dupRadar_intercept" write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) +print("write dup_intercept_mqc done") # Get numbers from dupRadar GLM curve_x <- sort(log10(dm$RPK)) @@ -135,22 +148,24 @@ write.table( file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"), quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE, ) - +print("write duprateExpDensCurve_mqc done") # Distribution of expression box plot pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf")) duprateExpBoxplot(DupMat=dm) title("Percent Duplication by Expression") mtext(output_prefix, side=3) dev.off() - +print("duprateExpBoxplot done") # Distribution of RPK values per gene pdf(paste0(output_prefix, "_expressionHist.pdf")) expressionHist(DupMat=dm) title("Distribution of RPK values per gene") mtext(output_prefix, side=3) dev.off() +print("expressionHist done") # Print sessioninfo to standard out print(output_prefix) -citation("dupRadar") -sessionInfo() +# citation("dupRadar") +# print("citation done") +# sessionInfo() diff --git a/src/dupradar/help.txt b/src/dupradar/help.txt index e69de29b..819c352d 100644 --- a/src/dupradar/help.txt +++ b/src/dupradar/help.txt @@ -0,0 +1,583 @@ +--- +title: "Using the dupRadar package" +author: "Sergi Sayols, Holger Klein" +date: "June 23, 2015" +output: + BiocStyle::html_document: + toc: true +--- + + +```{r pre,echo=FALSE,results='hide'} +library(knitr) +opts_chunk$set(warning=FALSE,message=FALSE,cache=TRUE) +``` +```{r style,echo=FALSE,results='asis'} +BiocStyle::markdown() +``` + + + +# Introduction to dupRadar + +RNA-Seq experiments are a common strategy nowadays to quantify the gene +expression levels in cells. Due to its higher sensitivity compared to +arrays and the ability to discover novel features, makes them the choice for +most modern experiments. + +In RNA-Seq - as in all other NGS applications - quality control is essential. +Current NGS workflows usually involve PCR steps at some point, which involves +the danger of PCR artefacts and over-amplification of reads. +For common DNA-based assays PCR duplicates are often simply removed before +further analysis and their overall fraction or the read multiplicity taken +as quality metrics. +In RNA-Seq however, apart from the technical PCR duplicates, there is a strong +source for biological read duplicates: for a given gene length and sequencing +depth there exists an expression level beyond which it is not possible to add +place more reads inside a gene locus without placing a second read exactly +on the position of an already existing read. +For this reason the overall duplication rate is not a useful measure for +RNA-Seq. + +As in the NGS/RNA-Seq QC ecosystem there were no suitable tools to address this +question, we set out to develop a new tool. The `r Rpackage("dupRadar")` +package gives an insight into the duplication problem by graphically +relating the gene expression level and the duplication rate present on it. +Thus, failed experiments can be easily identified at a glance. + + +**Note: by now RNA-Seq protocols have matured so that for bulk RNA protocols data +rarely suffer from technical duplicates. With newer low-input or single cell +RNA-Seq protocols technical duplicates possible problems are worth to be +checked for by default, especially if protocols are pushed to or beyond +their boundaries. +Paired-end libraries make the distinction between duplicates due to highly +expressed genes and PCR duplicates easier, but the problem itself is not +completely solved, especially at higher sequencing depths.** + +# Getting started using dupRadar + +## Preparing your data + +Previous to running the duplication rate analysis, the BAM file with your +mapped reads has to be duplicate marked with a like Picard, or the faster +BamUtil dedup BioBamBam. +The `r Rpackage("dupRadar")` package only works with duplicate marked BAM files. + +If you do not have/want a duplication marking step in your default pipeline, +the `r Rpackage("dupRadar")` package includes, for your convenience, +wrappers to properly call some of these tools from your R session. Note that +you still have to supply the path of the dupmarker installation though: + +```{r loadLibrary} +library(dupRadar) +``` + +Now, simply call the wrapper: + +```{r eval=FALSE} +# call the duplicate marker and analyze the reads +bamDuprm <- markDuplicates(dupremover="bamutil", + bam="test.bam", + path="/opt/bamUtil-master/bin", + rminput=FALSE) +``` + +Simply specify which tool to use, the path where this tool is installed, +and the input bam file to be analyzed. After marking duplicates, it's safe +to remove to original BAM file in order to save space. + +The `r Rpackage("dupRadar")` package currently comes with support for: + +- [Picard MarkDuplicates](http://broadinstitute.github.io/picard) +- [BamUtil](http://genome.sph.umich.edu/wiki/BamUtil) + +After the BAM file is marked for duplicates, dupRadar is ready to +analyze how the duplication rate is related with the estimated gene expression +levels. + +## A GTF file + +Unless there is any specific reason, dupRadar can use the same GTF file +that will be later used to count the reads falling on features. + +A valid GTF file can be obtained from UCSC, Ensembl, the iGenomes or other +projects. + +Note that the resulting duplication rate plots depend on the GTF annotation +used. GTF files from the gencode projects result in a less clear picture of +duplication rates, as there are many more features and feature types +annotated, which overlap heavily as well. In some cases creating the plots +only using subsets of gencode annotation files (e.g. just protein +coding genes) serve the QC purpose of this tool better. + +## AnnotationHub as a source of GTF files + +The Bioconductor `r Rpackage("AnnotationHub")` package provides an alternative +approach to obtain annotation in GTF format from entities such as Ensembl, UCSC, +ENCODE, and 1000 Genomes projects. + +This is partly outlined in the AnnotationHub 'HOWTO' vignette +section "Ensembl GTF and FASTA files for TxDb gene models and sequence +queries"; for the Takifugu example, the downloaded GTF file is +available from the cache. + +```{r} +if(suppressWarnings(require(AnnotationHub))) { + ah = AnnotationHub() + query(ah, c("ensembl", "80", "Takifugu", "gtf")) # discovery + cache(ah["AH47101"]) # retrieve / file path +} +``` + +# dupRate demo data + +In the package we include two precomputed duplication matrices for two RNASeq +experiments used as examples of a good and a failed (in terms of high redundancy +of reads) experiments. The experiments come from the ENCODE project, as a source +of a wide variety of protocols, library types and sequencing facilities. + +Load the example dataset with: + +```{r} +attach(dupRadar_examples) +``` + +# The duplication rate analysis + +The analysis requires some info about the sequencing run and library +preparation protocol: + +- The strandess information about the reads: is the sequencing strand +specific? if so, are the reads reversely sequenced? +- Are the reads paired, or single? + +Due to its phenomenal performance, internally we use the featureCounts() +function from the Bioconductor `r Biocpkg("Rsubread")` package, which also +supports multiple threads. + +```{r eval=FALSE} +# The call parameters: +bamDuprm <- "test_duprm.bam" # the duplicate marked bam file +gtf <- "genes.gtf" # the gene model +stranded <- 2 # '0' (unstranded), '1' (stranded) and '2' (reversely stranded) +paired <- FALSE # is the library paired end? +threads <- 4 # number of threads to be used + +# Duplication rate analysis +dm <- analyzeDuprates(bamDuprm,gtf,stranded,paired,threads) +``` + +The duplication matrix contains read counts in different scenarios and RPK and RPKM values for every gene. + +|ID | geneLength| allCountsMulti| filteredCountsMulti| dupRateMulti| dupsPerIdMulti| RPKMulti| RPKMMulti| allCounts| filteredCounts| dupRate| dupsPerId| RPK| RPKM| +|:------|----------:|--------------:|-------------------:|------------:|--------------:|------------:|----------:|---------:|--------------:|---------:|---------:|-----------:|----------:| +|Xkr4 | 3634| 2| 2| 0.0000000| 0| 0.5503577| 0.0117159| 2| 2| 0.0000000| 0| 0.5503577| 0.0117159| +|Rp1 | 9747| 0| 0| NaN| 0| 0.0000000| 0.0000000| 0| 0| NaN| 0| 0.0000000| 0.0000000| +|Sox17 | 3130| 0| 0| NaN| 0| 0.0000000| 0.0000000| 0| 0| NaN| 0| 0.0000000| 0.0000000| +|Mrpl15 | 4203| 429| 261| 0.3916084| 168| 102.0699500| 2.1728436| 419| 258| 0.3842482| 161| 99.6906971| 2.1221946| +|Lypla1 | 2433| 1106| 576| 0.4792043| 530| 454.5828196| 9.6770634| 1069| 562| 0.4742750| 507| 439.3752569| 9.3533280| +|Tcea1 | 2847| 2890| 1046| 0.6380623| 1844| 1015.1036178| 21.6093121| 1822| 696| 0.6180022| 1126| 639.9719002| 13.6235871| + +# Plotting and interpretation + +The number of reads per base assigned to a gene in an ideal RNA-Seq data set +is expected to be proportional to the abundance of its transcripts in the +sample. For lowly expressed genes we expect read duplication to happen rarely by +chance, while for highly expressed genes - depending on the total sequencing +depth - we expect read duplication to happen often. + +A good way to learn if a dataset is following this trend is by relating the +normalized number of counts per gene (RPK, as a quantification of the gene +expression) and the fraction represented by duplicated reads. + +The `r Rpackage("dupRadar")` offers several functions to assess this +relationship. The most prominent may be the 2d density scatter plot: + +```{r fig.width=14,fig.height=7} +## make a duprate plot (blue cloud) +par(mfrow=c(1,2)) +duprateExpDensPlot(DupMat=dm) # a good looking plot +title("good experiment") +duprateExpDensPlot(DupMat=dm.bad) # a dataset with duplication problems +title("duplication problems") + +## duprate boxplot +duprateExpBoxplot(DupMat=dm) # a good looking plot +duprateExpBoxplot(DupMat=dm.bad) # a dataset with duplication problems +``` + +The *duprateExpDensPlot* has helper lines at the threshold of 1 read/bp and at +0.5 RPKM. Moreover by default a generalized linear model is fit to the data and +overplotted (see also below). + +## Fitting a model into the data + +To summarize the relationship between duplication rates and gene expression, +we propose fitting a logistic regression curve onto the data. With the +coefficients of the fitted model, one can get an idea of the initial +duplication rate at low read counts (Intercept) and the progression of the +duplication rate along with the progression of the read counts (Slope). + + +```{r fig.width=7,fig.height=7} +duprateExpDensPlot(DupMat=dm) + +# or, just to get the fitted model without plot +fit <- duprateExpFit(DupMat=dm) +cat("duprate at low read counts: ",fit$intercept,"\n", + "progression of the duplication rate: ",fit$slope,fill=TRUE) +``` + +Our main use case for that function is the condensation +of the plots into quality metrics that can be used for automatic flagging of +possibly problematic samples in large experiments or aggregation with other +quality metrics in large tables to analyse interdependencies. + + +The *duprateExpBoxplot* plot shows the range of the duplication rates at 5% +bins (default) along the distribution of RPK gene counts. The x-axis displays +the quantile of the RPK distribution, and the average RPK of the genes +contained in this quantile. + +Individual genes can be identified in the plot: + +```{r eval=FALSE} +## INTERACTIVE: identify single points on screen (name="ID" column of dm) +duprateExpPlot(DupMat=dm) # a good looking plot +duprateExpIdentify(DupMat=dm) +``` + +One can also call the function *duprateExpPlot* to get smooth color density +representation of the same data: + +```{r fig.width=14,fig.height=7} +par(mfrow=c(1,2)) +cols <- colorRampPalette(c("black","blue","green","yellow","red")) +duprateExpPlot(DupMat=dm,addLegend=FALSE) +duprateExpPlot(DupMat=dm.bad,addLegend=FALSE,nrpoints=10,nbin=500,colramp=cols) +``` + +Any further parm sent to the *duprateExpPlot* is also sent to the +*smoothScatter* function. + +## Comparing the fitted parameters to other datasets + +The parameters of the fitted model may mean very little (or just nothing) for +many, unless there's other data to compare with. We provide the pre-computed +duplication matrices for all the RNA-Seq experiments publicly available from +the ENCODE project, for human and mouse. + +![alt text](figures/encode.svg) + +With the experience from the ENCODE datasets, we expect from single read +experiments little duplication at low RPKM (low **intercept**) rapidly rising +because of natural duplication (high **slope**). In contrast, paired-end +experiments have a more mild rising of the natural duplication (low **slope**) +due to having higher diversity of reads pairs since pairs with same start may +still have different end. + +The common denominator for both designs is the importance of having a low +intercept, suggesting that duplication rate at lowly expressed genes may serve +as a quality measure. + +All the pre-computed duplication matrices are available in the [dupRadar Github site](https://github.com/ssayols/dupRadar). + + +## Other plots + +**CAVEAT: Sometimes in discussions duplicate reads (i.e. two physically +different reads are mapped to the exact same position) and multi-mapping +reads (i.e. a single read is mapped to two or more locations in the genome) +are mixed up. `r Rpackage("dupRadar")`'s main focus are PCR duplicates in RNA-Seq. +However internally we keep track of unique mappers and multimappers, and we use +both in some of the examples, to illustrate use cases of our package beyond +the main aim. Multi-mapping reads are completely independent of PCR duplicates.** + +Apart from the plots relating RPK and duplication rate, the +`r Rpackage("dupRadar")` package provides some other useful plots to extract +information about the gene expression levels. + +An interesting quality metric are the fraction of reads taken up by groups of +genes binned by 5% expression levels. +```{r fig.width=7,fig.height=7} +readcountExpBoxplot(DupMat=dm) +``` + +In the example we see that the 5% of +highest expressed genes in our sample data set take up around 60% of all reads. + +The distribution of RPK values per genes can be plotted with: + +```{r fig.width=7,fig.height=7} +expressionHist(DupMat=dm) +``` + +This would help in identifying skewed distributions with unusual amount of +lowly expressed genes, or to detect no consensus between replicates. + + +# Other information deduced from the data + +The duplication rate matrix calculated by the function *analyzeDuprates()* +contains some useful information about the sequencing experiment, that can be +used to assess the quality of the data. + +## Fraction of multimappers per gene + +Analogous to per gene duplication rate, the fraction of mutimappers +can be easily calculated fom the duplication matrix. + +Taking the counts from the column *allCountsMulti*, and substracting form it +the counts from the column *allCounts*, one can get the total number of +multihits. Thus, the fraction of multihits per gene can be calculating then +dividing by *allCountsMulti*. + +```{r} +# calculate the fraction of multimappers per gene +dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti +# how many genes are exclusively covered by multimappers +sum(dm$mhRate == 1, na.rm=TRUE) + +# and how many have an RPKM (including multimappers) > 5 +sum(dm$mhRate==1 & dm$RPKMMulti > 5, na.rm=TRUE) + +# and which are they? +dm[dm$mhRate==1 & dm$RPKMMulti > 5, "ID"] +``` + +We can also generate an overall picture about less extreme cases: +```{r fig.width=7,fig.height=7} +hist(dm$mhRate, + breaks=50, + col="red", + main="Frequency of multimapping rates", + xlab="Multimapping rate per gene", + ylab="Frequency") + +``` + +Also the direct comparison of reads per kilobase between uniquely and +multimappers is possible. + +```{r fig.width=7,fig.height=7} +# comparison of multi-mapping RPK and uniquely-mapping RPK +plot(log2(dm$RPK), + log2(dm$RPKMulti), + xlab="Reads per kb (uniquely mapping reads only)", + ylab="Reads per kb (all including multimappers, non-weighted)" +) +``` + +Use with identify() to annotate interesting cases interactively. +```{r, eval=F} +identify(log2(dm$RPK), + log2(dm$RPKMulti), + labels=dm$ID) +``` + +## Connection between possible PCR artefacts and GC content +In some cases we wondered about influence of GC content on PCR artefacts. An +easy way to check using our `r Rpackage("dupRadar")` package in conjunction +with `r Rpackage("biomaRt")` is demonstrated in the following. For simplicity +we use our demo data here in which we *do not* see a big influence. + + +```{r eval=F} +library(dupRadar) +library(biomaRt) + +## for detailed explanations on biomaRt, please see the respective +## vignette + +## set up biomart connection for mouse (needs internet connection) +ensm <- useMart("ensembl") +ensm <- useDataset("mmusculus_gene_ensembl", mart=ensm) + +## get a table which has the gene GC content for the IDs that have been +## used to generate the table (depends on the GTF annotation that you +## use) +tr <- getBM(attributes=c("mgi_symbol", "percentage_gc_content"), + values=TRUE, mart=ensm) + +## create a GC vector with IDs as element names +mgi.gc <- tr$percentage_gc_content +names(mgi.gc) <- tr$mgi_symbol + +## using dm demo duplication matrix that comes with the package +## add GC content to our demo data and keep only subset for which we can +## retrieve data +keep <- dm$ID %in% tr$mgi_symbol +dm.gc <- dm[keep,] +dm.gc$gc <- mgi.gc[dm.gc$ID] + +## check distribution of annotated gene GC content (in %) +boxplot(dm.gc$gc, main="Gene GC content", ylab="% GC") +``` + +![alt text](figures/GCcontent.svg) + +Now we can compare the dependence of duplication rate on expression level +independently for below and above median GC genes (and to mention again, +in this data set we don't have a big difference). + +```{r eval=F} +par(mfrow=c(1,2)) + +## below median GC genes +duprateExpDensPlot(dm.gc[dm.gc$gc<=45,], main="below median GC genes") + +## above median GC genes +duprateExpDensPlot(dm.gc[dm.gc$gc>=45,], main="above median GC genes") +``` + +![alt text](figures/GCdependence.png) + +# Conclusion + +The `r Rpackage("dupRadar")` package provides a framework for the analysis of +duplicate rates in RNAseq datasets. In addition, it includes a set of +convenient wrappers to correctly call some common tools used for marking PCR +duplicated reads in the BAM file. It's shipped as Bioconductor package in +order to offer a common framework for all the tools involved, and simplify its +use. + +# Including dupRadar into pipelines + +To include dupRadar as a single step in an RNA-Seq pipeline, integration into a short R-script can be done like in the following: + +```{r,eval=F} +#!/usr/bin/env Rscript + +######################################## +## +## dupRadar shell script +## call dupRadar R package from the shell for +## easy integration into pipelines +## +## Holger Klein & Sergi Sayols +## +## https://github.com/ssayols/dupRadar +## +## input: +## - _duplicate marked_ bam file +## - gtf file +## - parameters for duplication counting routine: +## stranded, paired, outdir, threads. +## +######################################## + +library(dupRadar) + +#################### +## +## get name patterns from command line +## +args <- commandArgs(TRUE) + +## the bam file to analyse +bam <- args[1] +## usually, same GTF file as used in htseq-count +gtf <- gsub("gtf=","",args[2]) +## no|yes|reverse +stranded <- gsub("stranded=","",args[3]) +## is a paired end experiment +paired <- gsub("paired=","",args[4]) +## output directory +outdir <- gsub("outdir=","",args[5]) +## number of threads to be used +threads <- as.integer(gsub("threads=","",args[6])) + +if(length(args) != 6) { + stop (paste0("Usage: ./dupRadar.sh ", + " paired=[yes|no] ", + "outdir=./ threads=1")) +} + +if(!file.exists(bam)) { + stop(paste("File",bam,"does NOT exist")) +} + +if(!file.exists(gtf)) { + stop(paste("File",gtf,"does NOT exist")) +} + +if(!file.exists(outdir)) { + stop(paste("Dir",outdir,"does NOT exist")) +} + +if(is.na(stranded) | !(grepl("no|yes|reverse",stranded))) { + stop("Stranded has to be no|yes|reverse") +} + +if(is.na(paired) | !(grepl("no|yes",paired))) { + stop("Paired has to be no|yes") +} + +if(is.na(threads)) { + stop("Threads has to be an integer number") +} + +stranded <- if(stranded == "no") 0 else if(stranded == "yes") 1 else 2 + +## end command line parsing +## +######################################## + +######################################## +## +## analyze duprates and create plots +## +cat("Processing file ", bam, " with GTF ", gtf, "\n") + +## calculate duplication rate matrix +dm <- analyzeDuprates(bam, + gtf, + stranded, + (paired == "yes"), + threads) + +## produce plots + +## duprate vs. expression smooth scatter +png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drescatter.png"), + width=1000, height=1000) +duprateExpDensPlot(dm, main=basename(bam)) +dev.off() + +## expression histogram +png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_ehist.png"), + width=1000, height=1000) +expressionHist(dm) +dev.off() + +## duprate vs. expression boxplot +png(file=paste0(outdir,"/",gsub("(.*)\\.[^.]+","\\1",basename(bam)),"_dupRadar_drebp.png"), + width=1000, height=1000) +par(mar=c(10,4,4,2)+.1) +duprateExpBoxplot(dm, main=basename(bam)) +dev.off() +``` + +## Citing dupRadar + +Please consider citing dupRadar if used in support of your own research: + +```{r citation} +citation("dupRadar") +``` + +## Reporting problems or bugs + +If you run into problems using dupRadar, the [Bioconductor Support site](https://support.bioconductor.org/) is a good first place to ask for help. If you think there is a bug or an unreported feature, you can report it using the [dupRadar Github site](https://github.com/ssayols/dupRadar). + +# Session info + +The following package and versions were used in the production of this vignette. + +```{r echo=FALSE} +sessionInfo() +``` \ No newline at end of file diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh index c84440a7..6ac70bf0 100755 --- a/src/dupradar/script.sh +++ b/src/dupradar/script.sh @@ -5,6 +5,7 @@ set -eo pipefail + function num_strandness { if [ $par_strandedness == 'unstranded' ]; then echo 0 elif [ $par_strandedness == 'forward' ]; then echo 1 @@ -14,12 +15,16 @@ function num_strandness { fi } -Rscript "$meta_resources_dir/script.R" \ + +if [ $par_paired ]; then paired=TRUE; else paired=FALSE; fi + + +Rscript "$meta_resources_dir/dupradar.r" \ $par_input \ $par_id \ $par_gtf_annotation \ $(num_strandness) \ - $par_paired \ + ${paired} \ ${meta_cpus:-1} mv "$par_id"_dupMatrix.txt $par_output_dupmatrix @@ -31,8 +36,4 @@ mv "$par_id"_expressionHist.pdf $par_output_expression_histogram mv "$par_id"_intercept_slope.txt $par_output_intercept_slope -dupradar_ver=$(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))") -text="bioconductor-dupradar: ${dupradar_ver}" -echo "$text" - exit 0 diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh index 8a85af27..c4afd67a 100755 --- a/src/dupradar/test.sh +++ b/src/dupradar/test.sh @@ -58,14 +58,11 @@ echo ">> Check if output is empty" && echo "Output file $output_intercept_slope is empty" && exit 1 echo ">> Check if output is correct" -cat "$output_dupmatrix" -cat "$output_dup_intercept_mqc" -cat "$output_intercept_slope" # diff ignoring white spaces -diff -B -b "test_dupMatrix.pdf" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || - (echo "Output file $output_dupmatrix is not correct" && exit 1) -diff -B -b "$output_dup_intercept_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \ - (echo "Output file $output_dup_intercept_mqc is not correct" && exit 1) +diff -B -b "$meta_resources_dir/test_data/test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \ + { echo "Output file $output_dupmatrix is not correct"; exit 1; } +diff -B -b "$output_duprate_exp_denscurve_mqc" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \ + { echo "Output file $output_duprate_exp_denscurve_mqc is not correct"; exit 1; } diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \ - (echo "Output file $output_intercept_slope is not correct" && exit 1) + { echo "Output file $output_intercept_slope is not correct"; exit 1; } exit 0 \ No newline at end of file diff --git a/src/dupradar/test_data/test_dupMatrix.txt b/src/dupradar/test_data/test_dupMatrix.txt index 8ccda72e..9035e993 100644 --- a/src/dupradar/test_data/test_dupMatrix.txt +++ b/src/dupradar/test_data/test_dupMatrix.txt @@ -1,4 +1,4 @@ -ID geneLength allCountsMulti filteredCountsMulti dupRateMulti dupsPerIdMulti RPKMulti PKMMulti allCounts filteredCounts dupRate dupsPerId RPK RPKM +ID geneLength allCountsMulti filteredCountsMulti dupRateMulti dupsPerIdMulti RPKMulti RPKMMulti allCounts filteredCounts dupRate dupsPerId RPK RPKM WASH7P 1769 41 41 0 0 23.1769361221029 188430.374976446 1 1 0 0 0.565291124929339 4595.86280430357 FAM138A 2260 0 0 NA 0 0 0 0 0 NA 0 0 0 FAM138F 2260 0 0 NA 0 0 0 0 0 NA 0 0 0 diff --git a/src/dupradar/test_data/test_dup_intercept_mqc.txt b/src/dupradar/test_data/test_dup_intercept_mqc.txt deleted file mode 100644 index 0d0b4e1a..00000000 --- a/src/dupradar/test_data/test_dup_intercept_mqc.txt +++ /dev/null @@ -1,39 +0,0 @@ -#id: DupInt -#plot_type: 'generalstats' -#pconfig: -# dupRadar_intercept: -# title: 'dupInt' -# namespace: 'DupRadar' -# description: 'Intercept value from DupRadar' -# max: 100 -# min: 0 -# scale: 'RdYlGn-rev' -# format: '{:.2f}%' -Sample dupRadar_intercept -test 5.8262146393079e-11 -#id: DupInt -#plot_type: 'generalstats' -#pconfig: -# dupRadar_intercept: -# title: 'dupInt' -# namespace: 'DupRadar' -# description: 'Intercept value from DupRadar' -# max: 100 -# min: 0 -# scale: 'RdYlGn-rev' -# format: '{:.2f}%' -Sample dupRadar_intercept -test 5.8262146393079e-11 -#id: DupInt -#plot_type: 'generalstats' -#pconfig: -# dupRadar_intercept: -# title: 'dupInt' -# namespace: 'DupRadar' -# description: 'Intercept value from DupRadar' -# max: 100 -# min: 0 -# scale: 'RdYlGn-rev' -# format: '{:.2f}%' -Sample dupRadar_intercept -test 5.8262146393079e-11 From b51e6fb4111ddb12415644b3eff355414cdf3a6a Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Mon, 23 Sep 2024 11:24:50 +0200 Subject: [PATCH 08/11] update chagelof --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d380e54..d8016713 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). +* `dupradar`: Assessment of duplication rates in RNA-Seq datasets (PR #154). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. From 0aa46130fcb04c9baf2748001335702ded651f99 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 26 Sep 2024 14:24:14 +0200 Subject: [PATCH 09/11] Remove script.sh middleman, add test --- src/dupradar/config.vsh.yaml | 15 ++++--- src/dupradar/dupradar.r | 78 ++++++++++++++++++++++-------------- src/dupradar/script.sh | 39 ------------------ src/dupradar/test.sh | 45 +++++++++++++++++++-- 4 files changed, 98 insertions(+), 79 deletions(-) delete mode 100755 src/dupradar/script.sh diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml index d25547db..ce4f79b1 100644 --- a/src/dupradar/config.vsh.yaml +++ b/src/dupradar/config.vsh.yaml @@ -24,13 +24,15 @@ argument_groups: required: true description: Path to GTF annotation file. - name: --paired - type: boolean + type: boolean_true description: Add flag if input alignment file consists of paired reads - name: --strandedness - type: string - choices: [forward, reverse, unstranded] + type: integer + choices: [0, 1, 2] description: | - Strandedness of input bam file reads (forward, reverse or unstranded (default, applicable to paired reads)) + Strandedness of input bam file reads (1 = forward, 2 = reverse or 0 = unstranded (default, + applicable to paired reads)) + example: 0 - name: Output arguments: @@ -47,7 +49,6 @@ argument_groups: - name: --output_duprate_exp_boxplot type: file direction: output - default: duprate_exp_boxplot.pdf description: | Path to output file (pdf) of distribution of expression box plot - name: --output_duprate_exp_densplot @@ -75,9 +76,7 @@ argument_groups: example: intercept_slope.txt resources: - - type: bash_script - path: script.sh - # Copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r + # Adapted from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/dupradar.r - type: r_script path: dupradar.r test_resources: diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r index 4294a42e..a3497e7a 100755 --- a/src/dupradar/dupradar.r +++ b/src/dupradar/dupradar.r @@ -1,24 +1,48 @@ #!/usr/bin/env Rscript -# Command line argument processing -args = commandArgs(trailingOnly=TRUE) -if (length(args) < 5) { +## VIASH START +par <- list( + "input" = 'test_data/sample.bam', + "id" = 'test', + "gtf_annotation" = 'test_data/genes.gtf', + "strandedness" = 1, + "paired" = TRUE, + "threads" = 1, + "output_dupmatrix"="dup_matrix.txt", + "output_dup_intercept_mqc"="dup_intercept_mqc.txt", + "output_duprate_exp_boxplot"="duprate_exp_boxplot.pdf", + "output_duprate_exp_densplot"="duprate_exp_densityplot.pdf", + "output_duprate_exp_denscurve_mqc"="duprate_exp_density_curve_mqc.pdf", + "output_expression_histogram"="expression_hist.pdf", + "output_intercept_slope"="intercept_slope.txt" +) +## VIASH END + + +if (length(par) < 5) { stop("Usage: dupRadar.r ", call.=FALSE) } -message("paired_end is", args[5]) -message("the type is is", class(args[5])) -input_bam <- args[1] -output_prefix <- args[2] -annotation_gtf <- args[3] -stranded <- as.numeric(args[4]) -paired_end <- ifelse(args[5] == "true", TRUE, FALSE) -threads <- as.numeric(args[6]) +input_bam <- par$input +output_prefix <- par$id +annotation_gtf <- par$gtf_annotation +stranded <- ifelse(is.null(par$strandedness), 0, as.integer(par$strandedness)) +paired_end <- ifelse(par$paired, TRUE, FALSE) +threads <- ifelse(is.null(par$threads), 1, as.integer(par$threads)) + +output_dupmatrix <- ifelse(is.null(par$output_dupmatrix), paste0(output_prefix, "_dupMatrix.txt", sep=""), par$output_dupmatrix) +output_dup_intercept_mqc <- ifelse(is.null(par$output_dup_intercept_mqc), paste0(output_prefix, "_dup_intercept_mqc.txt", sep=""), par$output_dup_intercept_mqc) +output_duprate_exp_boxplot <- ifelse(is.null(par$output_duprate_exp_boxplot), paste0(output_prefix, "_duprateExpBoxplot.pdf", sep=""), par$output_duprate_exp_boxplot) +output_duprate_exp_densplot <- ifelse(is.null(par$output_duprate_exp_densplot), paste0(output_prefix, "_duprate_exp_densplot.pdf", sep=""), par$output_duprate_exp_densplot) +output_duprate_exp_denscurve_mqc <- ifelse(is.null(par$output_duprate_exp_denscurve_mqc), paste0(output_prefix, "_duprateExpDensCurve_mqc.txt", sep=""), par$output_duprate_exp_denscurve_mqc) +output_expression_histogram <- ifelse(is.null(par$output_expression_histogram), paste0(output_prefix, "_expressionHist.pdf", sep=""), par$output_expression_histogram) +output_intercept_slope <- ifelse(is.null(par$output_intercept_slope), paste0(output_prefix, "_intercept_slope.txt", sep=""), par$output_intercept_slope) bamRegex <- "(.+)\\.bam$" -if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '' must be an existing file (not a directory) with '.bam' extension...") + +if(!(grepl(bamRegex, input_bam) && file.exists(input_bam) && (!file.info(input_bam)$isdir))) stop("First argument '' must be an existing file (not a directory) with '.bam' extension...") if(!(file.exists(annotation_gtf) && (!file.info(annotation_gtf)$isdir))) stop("Third argument '' must be an existing file (and not a directory)...") if(is.na(stranded) || (!(stranded %in% (0:2)))) stop("Fourth argument must be a numeric value in 0(unstranded)/1(forward)/2(reverse)...") if(is.na(threads) || (threads<=0)) stop("Fifth argument must be a strictly positive numeric value...") @@ -27,7 +51,7 @@ if(is.na(threads) || (threads<=0)) stop("Fifth argument must be a st message("Input bam (Arg 1): ", input_bam) message("Output basename(Arg 2): ", output_prefix) message("Input gtf (Arg 3): ", annotation_gtf) -message("Strandness (Arg 4): ", c("unstranded", "forward", "reverse")[stranded+1]) +message("Strandness (Arg 4): ", c("unstranded", "forward", "reverse")[stranded]) message("paired_end (Arg 5): ", paired_end) message("Nb threads (Arg 6): ", threads) message("R package loc. (Arg 7): ", ifelse(length(args) > 4, args[5], "Not specified")) @@ -43,25 +67,18 @@ if (!require("dupRadar")) { library("dupRadar") } if (!require("parallel")) { - install.packages("parallel", dependencies=TRUE, repos='http://cloud.r-project.org/') library("parallel") } # Duplicate stats dm <- analyzeDuprates(input_bam, annotation_gtf, stranded, paired_end, threads) -print(input_bam) -print(output_prefix) -print(annotation_gtf) -print(stranded) -print(paired_end) -print(threads) print("analyzeDuprates done") -write.table(dm, file=paste(output_prefix, "_dupMatrix.txt", sep=""), quote=F, row.name=F, sep="\t") +write.table(dm, file=output_dupmatrix, quote=F, row.name=F, sep="\t") print("write.table done") # 2D density scatter plot -pdf(paste0(output_prefix, "_duprateExpDens.pdf")) +pdf(output_duprate_exp_densplot) print("pdf done") duprateExpDensPlot(DupMat=dm) title("Density scatter plot") @@ -74,7 +91,7 @@ cat( paste("- dupRadar Int (duprate at low read counts):", fit$intercept), paste("- dupRadar Sl (progression of the duplication rate):", fit$slope), fill=TRUE, labels=output_prefix, - file=paste0(output_prefix, "_intercept_slope.txt"), append=FALSE + file=output_intercept_slope, append=FALSE ) # Create a multiqc file dupInt @@ -92,8 +109,8 @@ line="#id: DupInt # format: '{:.2f}%' Sample dupRadar_intercept" -write(line,file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) -write(paste(sample_name, fit$intercept),file=paste0(output_prefix, "_dup_intercept_mqc.txt"),append=TRUE) +write(line,file=output_dup_intercept_mqc, append=TRUE) +write(paste(sample_name, fit$intercept),file=output_dup_intercept_mqc, append=TRUE) print("write dup_intercept_mqc done") # Get numbers from dupRadar GLM @@ -142,28 +159,31 @@ line="#id: dupradar # value: 1000 # width: 1" -write(line,file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"),append=TRUE) +write(line,file=output_duprate_exp_denscurve_mqc, append=TRUE) write.table( cbind(curve_x, curve_y), - file=paste0(output_prefix, "_duprateExpDensCurve_mqc.txt"), + file=output_duprate_exp_denscurve_mqc, quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE, ) print("write duprateExpDensCurve_mqc done") # Distribution of expression box plot -pdf(paste0(output_prefix, "_duprateExpBoxplot.pdf")) +pdf(output_duprate_exp_boxplot) duprateExpBoxplot(DupMat=dm) title("Percent Duplication by Expression") mtext(output_prefix, side=3) dev.off() print("duprateExpBoxplot done") # Distribution of RPK values per gene -pdf(paste0(output_prefix, "_expressionHist.pdf")) +pdf(output_expression_histogram) expressionHist(DupMat=dm) title("Distribution of RPK values per gene") mtext(output_prefix, side=3) dev.off() print("expressionHist done") + +# + # Print sessioninfo to standard out print(output_prefix) # citation("dupRadar") diff --git a/src/dupradar/script.sh b/src/dupradar/script.sh deleted file mode 100755 index 6ac70bf0..00000000 --- a/src/dupradar/script.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -## VIASH START -## VIASH END - - -set -eo pipefail - - -function num_strandness { - if [ $par_strandedness == 'unstranded' ]; then echo 0 - elif [ $par_strandedness == 'forward' ]; then echo 1 - elif [ $par_strandedness == 'reverse' ]; then echo 2 - else echo "strandedness must be unstranded, forward or reverse." && \ - exit 1 - fi -} - - -if [ $par_paired ]; then paired=TRUE; else paired=FALSE; fi - - -Rscript "$meta_resources_dir/dupradar.r" \ - $par_input \ - $par_id \ - $par_gtf_annotation \ - $(num_strandness) \ - ${paired} \ - ${meta_cpus:-1} - -mv "$par_id"_dupMatrix.txt $par_output_dupmatrix -mv "$par_id"_dup_intercept_mqc.txt $par_output_dup_intercept_mqc -mv "$par_id"_duprateExpBoxplot.pdf $par_output_duprate_exp_boxplot -mv "$par_id"_duprateExpDens.pdf $par_output_duprate_exp_densplot -mv "$par_id"_duprateExpDensCurve_mqc.txt $par_output_duprate_exp_denscurve_mqc -mv "$par_id"_expressionHist.pdf $par_output_expression_histogram -mv "$par_id"_intercept_slope.txt $par_output_intercept_slope - - -exit 0 diff --git a/src/dupradar/test.sh b/src/dupradar/test.sh index c4afd67a..99610357 100755 --- a/src/dupradar/test.sh +++ b/src/dupradar/test.sh @@ -8,7 +8,7 @@ output_dupmatrix="dup_matrix.txt" output_dup_intercept_mqc="dup_intercept_mqc.txt" output_duprate_exp_boxplot="duprate_exp_boxplot.pdf" output_duprate_exp_densplot="duprate_exp_densityplot.pdf" -output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.pdf" +output_duprate_exp_denscurve_mqc="duprate_exp_density_curve_mqc.txt" output_expression_histogram="expression_hist.pdf" output_intercept_slope="intercept_slope.txt" @@ -19,8 +19,7 @@ echo "> Running $meta_functionality_name for unpaired reads, writing to tmpdir $ --input "$input_bam" \ --id "test" \ --gtf_annotation "$input_gtf" \ - --strandedness "forward" \ - --paired false \ + --strandedness 1 \ --output_dupmatrix $output_dupmatrix \ --output_dup_intercept_mqc $output_dup_intercept_mqc \ --output_duprate_exp_boxplot $output_duprate_exp_boxplot \ @@ -65,4 +64,44 @@ diff -B -b "$output_duprate_exp_denscurve_mqc" "${meta_resources_dir}/test_data/ { echo "Output file $output_duprate_exp_denscurve_mqc is not correct"; exit 1; } diff -B -b "$output_intercept_slope" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \ { echo "Output file $output_intercept_slope is not correct"; exit 1; } + + +echo ">>> Test 2: Example without specified output files" + +"$meta_executable" \ + --input "$input_bam" \ + --id "test" \ + --gtf_annotation "$input_gtf" \ + --strandedness 1 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> asserting output has been created for paired read input" +# output files with original names from DupRadar and id prefix +[ ! -f "test_dupMatrix.txt" ] && echo "test_dupMatrix.txt was not created" && exit 1 +[ ! -s "test_dupMatrix.txt" ] && echo "test_dupMatrix.txt is empty" && exit 1 +[ ! -f "test_dup_intercept_mqc.txt" ] && echo "test_dup_intercept_mqc.txt was not created" && exit 1 +[ ! -s "test_dup_intercept_mqc.txt" ] && echo "test_dup_intercept_mqc.txt is empty" && exit 1 +[ ! -f "test_duprateExpBoxplot.pdf" ] && echo "test_duprateExpBoxplot.pdf was not created" && exit 1 +[ ! -s "test_duprateExpBoxplot.pdf" ] && echo "test_duprateExpBoxplot.pdf is empty" && exit 1 +[ ! -f "test_duprate_exp_densplot.pdf" ] && echo "test_duprate_exp_densplot.pdf was not created" && exit 1 +[ ! -s "test_duprate_exp_densplot.pdf" ] && echo "test_duprate_exp_densplot.pdf is empty" && exit 1 +[ ! -f "test_duprateExpDensCurve_mqc.txt" ] && echo "test_duprateExpDensCurve_mqc.txt was not created" && exit 1 +[ ! -s "test_duprateExpDensCurve_mqc.txt" ] && echo "test_duprateExpDensCurve_mqc.txt is empty" && exit 1 +[ ! -f "test_expressionHist.pdf" ] && echo "test_expressionHist.pdf was not created" && exit 1 +[ ! -s "test_expressionHist.pdf" ] && echo "test_expressionHist.pdf is empty" && exit 1 +[ ! -f "test_intercept_slope.txt" ] && echo "test_intercept_slope.txt was not created" && exit 1 +[ ! -s "test_intercept_slope.txt" ] && echo "test_intercept_slope.txt is empty" && exit 1 + + +echo ">> Check if output is correct" +diff -B -b "test_dupMatrix.txt" "${meta_resources_dir}/test_data/test_dupMatrix.txt" || \ + { echo "Output file test_dupMatrix.txt is not correct"; exit 1; } +diff -B -b "test_duprateExpDensCurve_mqc.txt" "${meta_resources_dir}/test_data/test_duprateExpDensCurve_mqc.txt" || \ + { echo "Output file test_duprateExpDensCurve_mqc.txt is not correct"; exit 1; } +diff -B -b "test_intercept_slope.txt" "${meta_resources_dir}/test_data/test_intercept_slope.txt" || \ + { echo "Output file test_intercept_slope.txt is not correct"; exit 1; } + +echo ">>> All tests passed" exit 0 \ No newline at end of file From 81fe5001b1bfdf5e05a503e89e4acfa81b0fb43e Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Wed, 30 Oct 2024 15:06:33 +0100 Subject: [PATCH 10/11] R-related syntax fixes --- src/dupradar/config.vsh.yaml | 2 +- src/dupradar/dupradar.r | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/dupradar/config.vsh.yaml b/src/dupradar/config.vsh.yaml index ce4f79b1..52063368 100644 --- a/src/dupradar/config.vsh.yaml +++ b/src/dupradar/config.vsh.yaml @@ -93,7 +93,7 @@ engines: packages: [ r-base ] - type: r bioc: [ dupRadar ] - packages: [ parallel ] + packages: [ parallel, rlang ] - type: docker run: | echo "DupRadar: $(Rscript -e "library(dupRadar); cat(as.character(packageVersion('dupRadar')))")" > /var/software_versions.txt diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r index a3497e7a..700e3428 100755 --- a/src/dupradar/dupradar.r +++ b/src/dupradar/dupradar.r @@ -18,6 +18,7 @@ par <- list( ) ## VIASH END +library(rlang) if (length(par) < 5) { stop("Usage: dupRadar.r ", call.=FALSE) @@ -27,11 +28,11 @@ if (length(par) < 5) { input_bam <- par$input output_prefix <- par$id annotation_gtf <- par$gtf_annotation -stranded <- ifelse(is.null(par$strandedness), 0, as.integer(par$strandedness)) +stranded <- par$strandedness %||% 0L paired_end <- ifelse(par$paired, TRUE, FALSE) threads <- ifelse(is.null(par$threads), 1, as.integer(par$threads)) -output_dupmatrix <- ifelse(is.null(par$output_dupmatrix), paste0(output_prefix, "_dupMatrix.txt", sep=""), par$output_dupmatrix) +output_dupmatrix <- par$output_dupmatrix %||% paste0(output_prefix, "_dupMatrix.txt") output_dup_intercept_mqc <- ifelse(is.null(par$output_dup_intercept_mqc), paste0(output_prefix, "_dup_intercept_mqc.txt", sep=""), par$output_dup_intercept_mqc) output_duprate_exp_boxplot <- ifelse(is.null(par$output_duprate_exp_boxplot), paste0(output_prefix, "_duprateExpBoxplot.pdf", sep=""), par$output_duprate_exp_boxplot) output_duprate_exp_densplot <- ifelse(is.null(par$output_duprate_exp_densplot), paste0(output_prefix, "_duprate_exp_densplot.pdf", sep=""), par$output_duprate_exp_densplot) @@ -96,7 +97,7 @@ cat( # Create a multiqc file dupInt sample_name <- gsub("Aligned.sortedByCoord.out.markDups", "", output_prefix) -line="#id: DupInt +line <- "#id: DupInt #plot_type: 'generalstats' #pconfig: # dupRadar_intercept: From feb97e0f19ae286e2a5aa8799e5143d82567cbf4 Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Thu, 31 Oct 2024 09:26:50 +0000 Subject: [PATCH 11/11] uniform R syntax changes --- src/dupradar/dupradar.r | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/dupradar/dupradar.r b/src/dupradar/dupradar.r index 700e3428..f813e902 100755 --- a/src/dupradar/dupradar.r +++ b/src/dupradar/dupradar.r @@ -8,13 +8,13 @@ par <- list( "strandedness" = 1, "paired" = TRUE, "threads" = 1, - "output_dupmatrix"="dup_matrix.txt", - "output_dup_intercept_mqc"="dup_intercept_mqc.txt", - "output_duprate_exp_boxplot"="duprate_exp_boxplot.pdf", - "output_duprate_exp_densplot"="duprate_exp_densityplot.pdf", - "output_duprate_exp_denscurve_mqc"="duprate_exp_density_curve_mqc.pdf", - "output_expression_histogram"="expression_hist.pdf", - "output_intercept_slope"="intercept_slope.txt" + "output_dupmatrix" = "dup_matrix.txt", + "output_dup_intercept_mqc" = "dup_intercept_mqc.txt", + "output_duprate_exp_boxplot" = "duprate_exp_boxplot.pdf", + "output_duprate_exp_densplot" = "duprate_exp_densityplot.pdf", + "output_duprate_exp_denscurve_mqc" = "duprate_exp_density_curve_mqc.pdf", + "output_expression_histogram" = "expression_hist.pdf", + "output_intercept_slope" = "intercept_slope.txt" ) ## VIASH END @@ -30,15 +30,15 @@ output_prefix <- par$id annotation_gtf <- par$gtf_annotation stranded <- par$strandedness %||% 0L paired_end <- ifelse(par$paired, TRUE, FALSE) -threads <- ifelse(is.null(par$threads), 1, as.integer(par$threads)) +threads <- par$threads %||% 1L output_dupmatrix <- par$output_dupmatrix %||% paste0(output_prefix, "_dupMatrix.txt") -output_dup_intercept_mqc <- ifelse(is.null(par$output_dup_intercept_mqc), paste0(output_prefix, "_dup_intercept_mqc.txt", sep=""), par$output_dup_intercept_mqc) -output_duprate_exp_boxplot <- ifelse(is.null(par$output_duprate_exp_boxplot), paste0(output_prefix, "_duprateExpBoxplot.pdf", sep=""), par$output_duprate_exp_boxplot) -output_duprate_exp_densplot <- ifelse(is.null(par$output_duprate_exp_densplot), paste0(output_prefix, "_duprate_exp_densplot.pdf", sep=""), par$output_duprate_exp_densplot) -output_duprate_exp_denscurve_mqc <- ifelse(is.null(par$output_duprate_exp_denscurve_mqc), paste0(output_prefix, "_duprateExpDensCurve_mqc.txt", sep=""), par$output_duprate_exp_denscurve_mqc) -output_expression_histogram <- ifelse(is.null(par$output_expression_histogram), paste0(output_prefix, "_expressionHist.pdf", sep=""), par$output_expression_histogram) -output_intercept_slope <- ifelse(is.null(par$output_intercept_slope), paste0(output_prefix, "_intercept_slope.txt", sep=""), par$output_intercept_slope) +output_dup_intercept_mqc <- par$output_dup_intercept_mqc %||% paste0(output_prefix, "_dup_intercept_mqc.txt") +output_duprate_exp_boxplot <- par$output_duprate_exp_boxplot %||% paste0(output_prefix, "_duprateExpBoxplot.pdf") +output_duprate_exp_densplot <- par$output_duprate_exp_densplot %||% paste0(output_prefix, "_duprate_exp_densplot.pdf") +output_duprate_exp_denscurve_mqc <- par$output_duprate_exp_denscurve_mqc %||% paste0(output_prefix, "_duprateExpDensCurve_mqc.txt") +output_expression_histogram <- par$output_expression_histogram %||% paste0(output_prefix, "_expressionHist.pdf") +output_intercept_slope <- par$output_intercept_slope %||% paste0(output_prefix, "_intercept_slope.txt") bamRegex <- "(.+)\\.bam$" @@ -116,18 +116,18 @@ print("write dup_intercept_mqc done") # Get numbers from dupRadar GLM curve_x <- sort(log10(dm$RPK)) -curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response") +curve_y <- 100*predict(fit$glm, data.frame(x=curve_x), type="response") # Remove all of the infinite values infs = which(curve_x %in% c(-Inf,Inf)) -curve_x = curve_x[-infs] -curve_y = curve_y[-infs] +curve_x <- curve_x[-infs] +curve_y <- curve_y[-infs] # Reduce number of data points curve_x <- curve_x[seq(1, length(curve_x), 10)] curve_y <- curve_y[seq(1, length(curve_y), 10)] # Convert x values back to real counts -curve_x = 10^curve_x +curve_x <- 10^curve_x # Write to file -line="#id: dupradar +line <- "#id: dupradar #section_name: 'DupRadar' #section_href: 'bioconductor.org/packages/release/bioc/html/dupRadar.html' #description: \"provides duplication rate quality control for RNA-Seq datasets. Highly expressed genes can be expected to have a lot of duplicate reads, but high numbers of duplicates at low read counts can indicate low library complexity with technical duplication.