From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:04:14 +0100 Subject: [PATCH 1/5] initial commit dedup --- CHANGELOG.md | 3 + src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 5 files changed, 409 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fd7f001..1bef9345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). +* `umi_tools`: + - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. + This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. + By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. + - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:38:57 +0100 Subject: [PATCH 2/5] Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. --- CHANGELOG.md | 3 - src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------ src/umi_tools/umi_tools_dedup/help.txt | 13 - src/umi_tools/umi_tools_dedup/script.sh | 65 ---- src/umi_tools/umi_tools_dedup/test.sh | 49 --- 5 files changed, 409 deletions(-) delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. - required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. - - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. - - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. - - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" -exit 0 \ No newline at end of file From 5be4d45d17c10d008131043f905affc382b4c48e Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 4 Sep 2024 19:05:38 +0200 Subject: [PATCH 3/5] functional component, one test, changelog update --- CHANGELOG.md | 3 +- .../umi_tools_prepareforrsem/config.vsh.yaml | 63 ++++ .../umi_tools_prepareforrsem/help.txt | 13 + .../prepare-for-rsem.py | 270 ++++++++++++++++++ .../umi_tools_prepareforrsem/script.sh | 10 + .../umi_tools_prepareforrsem/test.sh | 20 ++ .../test_data/test.sam | 119 ++++++++ .../test_data/test_dedup.sam | 201 +++++++++++++ 8 files changed, 698 insertions(+), 1 deletion(-) create mode 100644 src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_prepareforrsem/help.txt create mode 100755 src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py create mode 100755 src/umi_tools/umi_tools_prepareforrsem/script.sh create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test.sh create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.sam diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e9f40fc..a18686c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,7 +123,8 @@ - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53). * `umi_tools`: - -`umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + - `umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + - `umi_tools/umi_tools_prepareforquant`: Fix paired-end reads in name sorted BAM file to prepare for RSEM (PR #148). * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). diff --git a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml new file mode 100644 index 00000000..3355a3d2 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml @@ -0,0 +1,63 @@ +name: "umi_tools_prepareforrsem" +namespace: "umi_tools" +description: Make the output from umi-tools dedup or group compatible with RSEM +keywords: [umi_tools, rsem, bam, sam] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: https://umi-tools.readthedocs.io/en/latest/reference/extract.html + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: +- name: "Input" + arguments: + - name: "--input" + type: file + required: true + example: $id.transcriptome.bam + +- name: "Output" + arguments: + - name: "--output" + type: file + direction: output + example: $id.transcriptome_sorted.bam + - name: "--log" + type: file + direction: output + +- name: "Options" + arguments: + - name: "--tags" + type: string + description: | + Comma-seperated list of tags to transfer from read1 to read2 (Default: 'UG,BX') + example: "UG,BX" + - name: "--sam" + type: boolean_true + description: Input and output SAM rather than BAM. + +resources: + - type: bash_script + path: script.sh + # copied from https://github.com/nf-core/rnaseq/blob/3.12.0/bin/prepare-for-rsem.py + - path: prepare-for-rsem.py +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +engines: +- type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: [pip] + - type: python + packages: [umi_tools, pysam] +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/help.txt b/src/umi_tools/umi_tools_prepareforrsem/help.txt new file mode 100644 index 00000000..58923242 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/help.txt @@ -0,0 +1,13 @@ +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is omited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + + --input,--stdin Input bam file. + --output, --stdout Output bam file. + --log Output log file (optional). + --tags Comma-seperated list of tags to transfer from read1 to read2 (Default: "UG,BX"). + --sam Input and output SAM rather than BAM. \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py new file mode 100755 index 00000000..59dd01ad --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 + +""" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Credits +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This script is a clone of the "prepare-for-rsem.py" script written by +Ian Sudbury, Tom Smith and other contributors to the UMI-tools package: +https://github.com/CGATOxford/UMI-tools + +It has been included here to address problems encountered with +Salmon quant and RSEM as discussed in the issue below: +https://github.com/CGATOxford/UMI-tools/issues/465 + +When the "umi_tools prepare-for-rsem" command becomes available in an official +UMI-tools release this script will be replaced and deprecated. + +Commit: +https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +prepare_for_rsem - make the output from dedup or group compatible with RSEM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The SAM format specification states that the mnext and mpos fields should point +to the primary alignment of a read's mate. However, not all aligners adhere to +this standard. In addition, the RSEM software requires that the mate of a read1 +appears directly after it in its input BAM. This requires that there is exactly +one read1 alignment for every read2 and vice versa. + +In general (except in a few edge cases) UMI tools outputs only the read2 to that +corresponds to the read specified in the mnext and mpos positions of a selected +read1, and only outputs this read once, even if multiple read1s point to it. +This makes UMI-tools outputs incompatible with RSEM. This script takes the output +from dedup or groups and ensures that each read1 has exactly one read2 (and vice +versa), that read2 always appears directly after read1,and that pairs point to +each other (note this is technically not valid SAM format). Copy any specified +tags from read1 to read2 if they are present (by default, UG and BX, the unique +group and correct UMI tags added by _group_) + +Input must to name sorted. + + +https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE + +""" + +from umi_tools import Utilities as U +from collections import defaultdict, Counter +import pysam +import sys + +usage = """ +prepare_for_rsem - make output from dedup or group compatible with RSEM + +Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is omited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr """ + + +def chunk_bam(bamfile): + """Take in a iterator of pysam.AlignmentSegment entries and yield + lists of reads that all share the same name""" + + last_query_name = None + output_buffer = list() + + for read in bamfile: + if last_query_name is not None and last_query_name != read.query_name: + yield (output_buffer) + output_buffer = list() + + last_query_name = read.query_name + output_buffer.append(read) + + yield (output_buffer) + + +def copy_tags(tags, read1, read2): + """Given a list of tags, copies the values of these tags from read1 + to read2, if the tag is set""" + + for tag in tags: + try: + read1_tag = read1.get_tag(tag, with_value_type=True) + read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1]) + except KeyError: + pass + + return read2 + + +def pick_mate(read, template_dict, mate_key): + """Find the mate of read in the template dict using key. It will retrieve + all reads at that key, and then scan to pick the one that refers to _read_ + as it's mate. If there is no such read, it picks a first one it comes to""" + + mate = None + + # get a list of secondary reads at the correct alignment position + potential_mates = template_dict[not read.is_read1][mate_key] + + # search through one at a time to find a read that points to the current read + # as its mate. + for candidate_mate in potential_mates: + if ( + candidate_mate.next_reference_name == read.reference_name + and candidate_mate.next_reference_start == read.pos + ): + mate = candidate_mate + + # if no such read is found, then pick any old secondary alignment at that position + # note: this happens when UMI-tools outputs the wrong read as something's pair. + if mate is None and len(potential_mates) > 0: + mate = potential_mates[0] + + return mate + + +def main(argv=None): + if argv is None: + argv = sys.argv + + # setup command line parser + parser = U.OptionParser(version="%prog version: $Id$", usage=usage, description=globals()["__doc__"]) + group = U.OptionGroup(parser, "RSEM preparation specific options") + + group.add_option( + "--tags", + dest="tags", + type="string", + default="UG,BX", + help="Comma-separated list of tags to transfer from read1 to read2", + ) + group.add_option( + "--sam", dest="sam", action="store_true", default=False, help="input and output SAM rather than BAM" + ) + + parser.add_option_group(group) + + # add common options (-h/--help, ...) and parse command line + (options, args) = U.Start( + parser, argv=argv, add_group_dedup_options=False, add_umi_grouping_options=False, add_sam_options=False + ) + + skipped_stats = Counter() + + if options.stdin != sys.stdin: + in_name = options.stdin.name + options.stdin.close() + else: + in_name = "-" + + if options.sam: + mode = "" + else: + mode = "b" + + inbam = pysam.AlignmentFile(in_name, "r" + mode) + + if options.stdout != sys.stdout: + out_name = options.stdout.name + options.stdout.close() + else: + out_name = "-" + + outbam = pysam.AlignmentFile(out_name, "w" + mode, template=inbam) + + options.tags = options.tags.split(",") + + for template in chunk_bam(inbam): + assert len(set(r.query_name for r in template)) == 1 + current_template = {True: defaultdict(list), False: defaultdict(list)} + + for read in template: + key = (read.reference_name, read.pos, not read.is_secondary) + current_template[read.is_read1][key].append(read) + + output = set() + + for read in template: + mate = None + + # if this read is a non_primary alignment, we first want to check if it has a mate + # with the non-primary alignment flag set. + + mate_key_primary = True + mate_key_secondary = (read.next_reference_name, read.next_reference_start, False) + + # First look for a read that has the same primary/secondary status + # as read (i.e. secondary mate for secondary read, and primary mate + # for primary read) + mate_key = (read.next_reference_name, read.next_reference_start, read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If none was found then look for the opposite (primary mate of secondary + # read or seconadary mate of primary read) + if mate is None: + mate_key = (read.next_reference_name, read.next_reference_start, not read.is_secondary) + mate = pick_mate(read, current_template, mate_key) + + # If we still don't have a mate, then their can't be one? + if mate is None: + skipped_stats["no_mate"] += 1 + U.warn( + "Alignment {} has no mate -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + # because we might want to make changes to the read, but not have those changes reflected + # if we need the read again,we copy the read. This is only way I can find to do this. + read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header) + mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header) + + # Make it so that if our read is secondary, the mate is also secondary. We don't make the + # mate primary if the read is primary because we would otherwise end up with mulitple + # primary alignments. + if read.is_secondary: + mate.is_secondary = True + + # In a situation where there is already one mate for each read, then we will come across + # each pair twice - once when we scan read1 and once when we scan read2. Thus we need + # to make sure we don't output something already output. + if read.is_read1: + mate = copy_tags(options.tags, read, mate) + output_key = str(read) + str(mate) + + if output_key not in output: + output.add(output_key) + outbam.write(read) + outbam.write(mate) + skipped_stats["pairs_output"] += 1 + + elif read.is_read2: + read = copy_tags(options.tags, mate, read) + output_key = str(mate) + str(read) + + if output_key not in output: + output.add(output_key) + outbam.write(mate) + outbam.write(read) + skipped_stats["pairs_output"] += 1 + + else: + skipped_stats["skipped_not_read_12"] += 1 + U.warn( + "Alignment {} is neither read1 nor read2 -- skipped".format( + "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)])) + ) + ) + continue + + if not out_name == "-": + outbam.close() + + U.info( + "Total pairs output: {}, Pairs skipped - no mates: {}," + " Pairs skipped - not read1 or 2: {}".format( + skipped_stats["pairs_output"], skipped_stats["no_mate"], skipped_stats["skipped_not_read12"] + ) + ) + U.Stop() + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/src/umi_tools/umi_tools_prepareforrsem/script.sh b/src/umi_tools/umi_tools_prepareforrsem/script.sh new file mode 100755 index 00000000..84a40413 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -eo pipefail + +python3 "$meta_resources_dir/prepare-for-rsem.py" \ + --stdin="${par_input}" \ + ${par_output:+--stdout "${par_output}"} \ + ${par_log:+--log "${par_log}"} \ + ${par_tags:+--tags "${par_tags}"} \ + ${par_sam:+--sam} diff --git a/src/umi_tools/umi_tools_prepareforrsem/test.sh b/src/umi_tools/umi_tools_prepareforrsem/test.sh new file mode 100644 index 00000000..177f44c3 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +test_dir="$meta_resources_dir/test_data" + +"${meta_executable}" \ + --input "$test_dir/test_dedup.sam" \ + --output "$test_dir/test_output.sam" \ + --sam + + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.sam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.sam" ]] && echo "Output file is empty" && exit 1 + +echo ">>> Check if output is correct" +# use diff but ignoring the header lines (which start with @) as they may differ slightly +diff <(grep -v "^@" "$test_dir/test_output.sam") <(grep -v "^@" "$test_dir/test.sam") && echo "Output is correct" || (echo "Output is incorrect" && exit 1) + +echo ">>> All test succeeded" +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam new file mode 100644 index 00000000..6465827d --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.sam @@ -0,0 +1,119 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MT192765.1 LN:29829 +@RG ID:1 LB:lib1 PL:ILLUMINA SM:test PU:barcode1 +@PG ID:minimap2 PN:minimap2 VN:2.17-r941 CL:minimap2 -ax sr tests/data/fasta/sarscov2/GCA_011545545.1_ASM1154554v1_genomic.fna tests/data/fastq/dna/sarscov2_1.fastq.gz tests/data/fastq/dna/sarscov2_2.fastq.gz +@PG ID:samtools PN:samtools PP:minimap2 VN:1.11 CL:samtools view -Sb sarscov2_aln.sam +@PG ID:samtools.1 PN:samtools PP:samtools VN:1.11 CL:samtools sort -o sarscov2_paired_aln.sorted.bam sarscov2_paired_aln.bam +@PG ID:samtools.2 PN:samtools PP:samtools.1 VN:1.20 CL:samtools view -h test_data/test_dedup.bam +ERR5069949.29668 83 MT192765.1 267 60 89M = 121 -235 CCTTGTCCCTGGTTACAACTAGAAACCACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAG E////6/E/EE/EE/< Date: Tue, 10 Sep 2024 09:26:03 +0200 Subject: [PATCH 4/5] additional tests and version information --- .../umi_tools_prepareforrsem/config.vsh.yaml | 4 + .../umi_tools_prepareforrsem/script.sh | 6 +- .../umi_tools_prepareforrsem/test.sh | 37 ++++++- .../test_data/log.log | 103 ++++++++++++++++++ .../test_data/test.bam | Bin 0 -> 11123 bytes .../test_data/test_dedup.bam | Bin 0 -> 18822 bytes 6 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/log.log create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam create mode 100644 src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.bam diff --git a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml index 3355a3d2..b7f12754 100644 --- a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml +++ b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml @@ -58,6 +58,10 @@ engines: packages: [pip] - type: python packages: [umi_tools, pysam] + - type: docker + run: | + pip list --format=freeze | sed 's/==/:/' | sort > /var/software_versions.txt + runners: - type: executable - type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/script.sh b/src/umi_tools/umi_tools_prepareforrsem/script.sh index 84a40413..3ac2fd03 100755 --- a/src/umi_tools/umi_tools_prepareforrsem/script.sh +++ b/src/umi_tools/umi_tools_prepareforrsem/script.sh @@ -3,8 +3,8 @@ set -eo pipefail python3 "$meta_resources_dir/prepare-for-rsem.py" \ - --stdin="${par_input}" \ - ${par_output:+--stdout "${par_output}"} \ ${par_log:+--log "${par_log}"} \ ${par_tags:+--tags "${par_tags}"} \ - ${par_sam:+--sam} + ${par_sam:+--sam} \ + --stdin="${par_input}" \ + ${par_output:+--stdout "${par_output}"} diff --git a/src/umi_tools/umi_tools_prepareforrsem/test.sh b/src/umi_tools/umi_tools_prepareforrsem/test.sh index 177f44c3..5f97a103 100644 --- a/src/umi_tools/umi_tools_prepareforrsem/test.sh +++ b/src/umi_tools/umi_tools_prepareforrsem/test.sh @@ -2,12 +2,14 @@ test_dir="$meta_resources_dir/test_data" +################################################################################ +echo ">>> Test 1: with --sam:" + "${meta_executable}" \ --input "$test_dir/test_dedup.sam" \ --output "$test_dir/test_output.sam" \ --sam - echo ">>> Check if output is present" [[ ! -f "$test_dir/test_output.sam" ]] && echo "Output file not found" && exit 1 [[ ! -s "$test_dir/test_output.sam" ]] && echo "Output file is empty" && exit 1 @@ -16,5 +18,38 @@ echo ">>> Check if output is correct" # use diff but ignoring the header lines (which start with @) as they may differ slightly diff <(grep -v "^@" "$test_dir/test_output.sam") <(grep -v "^@" "$test_dir/test.sam") && echo "Output is correct" || (echo "Output is incorrect" && exit 1) +################################################################################ +echo ">>> Test 2: without --sam:" + +"${meta_executable}" \ + --input "$test_dir/test_dedup.bam" \ + --output "$test_dir/test_output.bam" + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.bam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.bam" ]] && echo "Output file is empty" && exit 1 + +echo ">>> Check if output is correct" +apt-get update && apt-get install -y samtools +diff <(samtools view "$test_dir/test_output.bam") <(samtools view "$test_dir/test.bam") || (echo "Output is incorrect" && exit 1) + +################################################################################ +echo ">>> Test 3: with --log:" + +"${meta_executable}" \ + --log "$test_dir/test_log.log" \ + --input "$test_dir/test_dedup.sam" \ + --output "$test_dir/test_output.sam" \ + --sam + +echo ">>> Check if output is present" +[[ ! -f "$test_dir/test_output.sam" ]] && echo "Output file not found" && exit 1 +[[ ! -s "$test_dir/test_output.sam" ]] && echo "Output file is empty" && exit 1 +[[ ! -f "$test_dir/test_log.log" ]] && echo "Log file not found" && exit 1 +[[ ! -s "$test_dir/test_log.log" ]] && echo "Log file is empty" && exit 1 + +echo ">>> Check if log file is correct" +diff <(grep -v '^#' "$test_dir/test_log.log" | sed 's/^[0-9-]* [0-9:]*,[0-9]\{3\} //') <(grep -v '^#' "$test_dir/log.log" | sed 's/^[0-9-]* [0-9:]*,[0-9]\{3\} //') || (echo "Log file is incorrect" && exit 1) + echo ">>> All test succeeded" exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log b/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log new file mode 100644 index 00000000..e4b56e57 --- /dev/null +++ b/src/umi_tools/umi_tools_prepareforrsem/test_data/log.log @@ -0,0 +1,103 @@ +# UMI-tools version: 1.1.5 +# output generated by prepare-for-rsem.py --log test_data/log.log --sam --stdin test_data/test_dedup.sam --stdout jnfgioeurg.sam +# job started at Tue Sep 10 06:43:30 2024 on 4855b4607095 -- 07ae7548-56e8-4772-9b48-7406710fd838 +# pid: 28, system: Linux 6.10.0-linuxkit #1 SMP PREEMPT_DYNAMIC Wed Jul 17 10:54:05 UTC 2024 x86_64 +# compresslevel : 6 +# log2stderr : False +# loglevel : 1 +# random_seed : None +# sam : True +# short_help : None +# stderr : <_io.TextIOWrapper name='' mode='w' encoding='utf-8'> +# stdin : <_io.TextIOWrapper name='test_data/test_dedup.sam' mode='r' encoding='UTF-8'> +# stdlog : <_io.TextIOWrapper name='test_data/log.log' mode='a' encoding='UTF-8'> +# stdout : <_io.TextIOWrapper name='jnfgioeurg.sam' mode='w' encoding='UTF-8'> +# tags : UG,BX +# timeit_file : None +# timeit_header : None +# timeit_name : all +# tmpdir : None +2024-09-10 06:43:30,918 WARNING Alignment ERR5069949.114870 99 MT192765.1 642 has no mate -- skipped +2024-09-10 06:43:30,918 WARNING Alignment ERR5069949.147998 163 MT192765.1 673 has no mate -- skipped +2024-09-10 06:43:30,919 WARNING Alignment ERR5069949.114870 147 MT192765.1 747 has no mate -- skipped +2024-09-10 06:43:30,920 WARNING Alignment ERR5069949.147998 83 MT192765.1 918 has no mate -- skipped +2024-09-10 06:43:30,921 WARNING Alignment ERR5069949.184542 99 MT192765.1 1054 has no mate -- skipped +2024-09-10 06:43:30,921 WARNING Alignment ERR5069949.184542 147 MT192765.1 1254 has no mate -- skipped +2024-09-10 06:43:30,922 WARNING Alignment ERR5069949.376959 99 MT192765.1 4104 has no mate -- skipped +2024-09-10 06:43:30,924 WARNING Alignment ERR5069949.376959 147 MT192765.1 4189 has no mate -- skipped +2024-09-10 06:43:30,925 WARNING Alignment ERR5069949.532979 99 MT192765.1 5567 has no mate -- skipped +2024-09-10 06:43:30,926 WARNING Alignment ERR5069949.540529 163 MT192765.1 5569 has no mate -- skipped +2024-09-10 06:43:30,926 WARNING Alignment ERR5069949.532979 147 MT192765.1 5620 has no mate -- skipped +2024-09-10 06:43:30,927 WARNING Alignment ERR5069949.540529 83 MT192765.1 5658 has no mate -- skipped +2024-09-10 06:43:30,930 WARNING Alignment ERR5069949.856527 99 MT192765.1 10117 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.870926 99 MT192765.1 10117 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.856527 147 MT192765.1 10198 has no mate -- skipped +2024-09-10 06:43:30,931 WARNING Alignment ERR5069949.885966 99 MT192765.1 10229 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.870926 147 MT192765.1 10244 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.885966 147 MT192765.1 10276 has no mate -- skipped +2024-09-10 06:43:30,932 WARNING Alignment ERR5069949.937422 99 MT192765.1 10421 has no mate -- skipped +2024-09-10 06:43:30,933 WARNING Alignment ERR5069949.937422 147 MT192765.1 10590 has no mate -- skipped +2024-09-10 06:43:30,934 WARNING Alignment ERR5069949.1066259 99 MT192765.1 11336 has no mate -- skipped +2024-09-10 06:43:30,935 WARNING Alignment ERR5069949.1062611 163 MT192765.1 11426 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1067032 163 MT192765.1 11433 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1062611 83 MT192765.1 11453 has no mate -- skipped +2024-09-10 06:43:30,936 WARNING Alignment ERR5069949.1066259 147 MT192765.1 11479 has no mate -- skipped +2024-09-10 06:43:30,937 WARNING Alignment ERR5069949.1067032 83 MT192765.1 11480 has no mate -- skipped +2024-09-10 06:43:30,938 WARNING Alignment ERR5069949.1258508 163 MT192765.1 12424 has no mate -- skipped +2024-09-10 06:43:30,939 WARNING Alignment ERR5069949.1261808 99 MT192765.1 12592 has no mate -- skipped +2024-09-10 06:43:30,940 WARNING Alignment ERR5069949.1258508 83 MT192765.1 12637 has no mate -- skipped +2024-09-10 06:43:30,940 WARNING Alignment ERR5069949.1261808 147 MT192765.1 12653 has no mate -- skipped +2024-09-10 06:43:30,941 WARNING Alignment ERR5069949.1372331 163 MT192765.1 13010 has no mate -- skipped +2024-09-10 06:43:30,941 WARNING Alignment ERR5069949.1372331 83 MT192765.1 13131 has no mate -- skipped +2024-09-10 06:43:30,942 WARNING Alignment ERR5069949.1552198 99 MT192765.1 13943 has no mate -- skipped +2024-09-10 06:43:30,943 WARNING Alignment ERR5069949.1561137 163 MT192765.1 13990 has no mate -- skipped +2024-09-10 06:43:30,943 WARNING Alignment ERR5069949.1552198 147 MT192765.1 14026 has no mate -- skipped +2024-09-10 06:43:30,944 WARNING Alignment ERR5069949.1561137 83 MT192765.1 14080 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2098070 99 MT192765.1 17114 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2064910 99 MT192765.1 17122 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2125592 99 MT192765.1 17179 has no mate -- skipped +2024-09-10 06:43:30,947 WARNING Alignment ERR5069949.2064910 147 MT192765.1 17179 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2098070 147 MT192765.1 17269 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2125592 147 MT192765.1 17288 has no mate -- skipped +2024-09-10 06:43:30,948 WARNING Alignment ERR5069949.2185111 163 MT192765.1 17405 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2151832 163 MT192765.1 17415 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2176303 99 MT192765.1 17441 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2151832 83 MT192765.1 17452 has no mate -- skipped +2024-09-10 06:43:30,949 WARNING Alignment ERR5069949.2205229 99 MT192765.1 17475 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2216307 163 MT192765.1 17503 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2176303 147 MT192765.1 17518 has no mate -- skipped +2024-09-10 06:43:30,950 WARNING Alignment ERR5069949.2185111 83 MT192765.1 17536 has no mate -- skipped +2024-09-10 06:43:30,951 WARNING Alignment ERR5069949.2205229 147 MT192765.1 17584 has no mate -- skipped +2024-09-10 06:43:30,951 WARNING Alignment ERR5069949.2216307 83 MT192765.1 17600 has no mate -- skipped +2024-09-10 06:43:30,952 WARNING Alignment ERR5069949.2270078 163 MT192765.1 17969 has no mate -- skipped +2024-09-10 06:43:30,953 WARNING Alignment ERR5069949.2270078 83 MT192765.1 18102 has no mate -- skipped +2024-09-10 06:43:30,953 WARNING Alignment ERR5069949.2328704 163 MT192765.1 18285 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2342766 99 MT192765.1 18396 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2328704 83 MT192765.1 18411 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2361683 99 MT192765.1 18425 has no mate -- skipped +2024-09-10 06:43:30,954 WARNING Alignment ERR5069949.2342766 147 MT192765.1 18468 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2361683 147 MT192765.1 18512 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2415814 99 MT192765.1 18597 has no mate -- skipped +2024-09-10 06:43:30,955 WARNING Alignment ERR5069949.2385514 99 MT192765.1 18602 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2417063 99 MT192765.1 18648 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2388984 99 MT192765.1 18653 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2385514 147 MT192765.1 18684 has no mate -- skipped +2024-09-10 06:43:30,956 WARNING Alignment ERR5069949.2388984 147 MT192765.1 18693 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2431709 99 MT192765.1 18748 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2415814 147 MT192765.1 18764 has no mate -- skipped +2024-09-10 06:43:30,957 WARNING Alignment ERR5069949.2417063 147 MT192765.1 18765 has no mate -- skipped +2024-09-10 06:43:30,958 WARNING Alignment ERR5069949.2431709 147 MT192765.1 18776 has no mate -- skipped +2024-09-10 06:43:30,959 WARNING Alignment ERR5069949.2668880 99 MT192765.1 23124 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2674295 163 MT192765.1 23133 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2668880 147 MT192765.1 23145 has no mate -- skipped +2024-09-10 06:43:30,960 WARNING Alignment ERR5069949.2674295 83 MT192765.1 23203 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2953930 99 MT192765.1 25344 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2972968 163 MT192765.1 25425 has no mate -- skipped +2024-09-10 06:43:30,963 WARNING Alignment ERR5069949.2953930 147 MT192765.1 25464 has no mate -- skipped +2024-09-10 06:43:30,964 WARNING Alignment ERR5069949.2972968 83 MT192765.1 25518 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3273002 163 MT192765.1 28442 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3277445 99 MT192765.1 28508 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3273002 83 MT192765.1 28543 has no mate -- skipped +2024-09-10 06:43:30,966 WARNING Alignment ERR5069949.3277445 147 MT192765.1 28573 has no mate -- skipped +2024-09-10 06:43:30,968 INFO Total pairs output: 56, Pairs skipped - no mates: 82, Pairs skipped - not read1 or 2: 0 +# job finished in 0 seconds at Tue Sep 10 06:43:30 2024 -- 4.44 0.25 0.00 0.00 -- 07ae7548-56e8-4772-9b48-7406710fd838 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test.bam new file mode 100644 index 0000000000000000000000000000000000000000..7793c7e3635e9c6a428647be7d067194f1d9d522 GIT binary patch literal 11123 zcmV-(D~!}1iwFb&00000{{{d;LjnM70fmy^PQox4#fx{vm)Hx?@l^({R~wKZ$u_4G zZkO&4xP+~^Ry5xGYCe}?$QGBOcc=Zn{!Y$Gr?%VsxVY=k@0u*NBTVWES6tt4skknADPwV< z`eJ5>LjYrB7$bu~Xb0}k=>Xw2EkvHhWK-}q;zdu21``6QF3I-epG8_Po&!mqD520^6NV7@DeX#>$<_dF}pM*P-JgGE!-FF(?Tr{A67(_wKp=z#&5V}SPHNO>oDuliP znC$J!Zged{rgwP){h&b`-j?>%RqHM93U>o?!?_kC;4Hx~A| zb!)L;cfBm$THL$%Q8)JY@NfCV6HnZI&pij*hbJfdC(qwKIXpaG{KAckMWNe5G&*J? zVR4?9T^@IdR)x|!PTH=nN+n8Js#>e+Vj1CBMrBl*ziCQ8mZcev(k#nT{8?r=8D-@% zi^>cq{XcxgXjZ1~=g7V}!)W&4&ZU3%;O?cxJum$mJBuHD<)!PZdoR83Z~xejUw_N3 zFMjJ!U0S^HO_y$Z^P4X%-u9O2Rd0Im()-%2VQO}|zqEM&2!e0m`^t1*f6HPaC6}BD zreoE$Do(1rOk~`(f{RY&ji`0p=uQ`HtMamtN$ceA)md&b`cG`S|sVMeIb!WBECZy?81*I|m0R z`}+n=#kgd;$yvp;ieu3+)l|GK7}rYbF4l~9x#W2gOP$Ag(&&mMj3q+xMrhu&3DYGP z0KX`+6fmS2&5{6HM$088l$8!CI$;i)Iej(28gPs#Ib;nwB}C8u5V^Y&c`2mNUM`1Q zm1dJWw;b-J`Og66e?AS&TW7)iTo2|Tm;~^Dp9p;Yz~=Uz_E3~Cy=@yvg>7t%1K zk>E`+n5HRRK&S8`HJ@j$OO4!GMK70WMAw?D(lP{d0WhCOU=lVCOf~}Zd0XFfz`g7S z59S*euNjTKCIIvB9f^|)g$Vn4s z{hDCbmp(q7^ixKb?ct2*Je>NsPuRxmv+ z7ZsO6=0&WUTvu9FDp6GnI~4N^B(5o$-^^Oo%*bqJEeIqRqa?{B;@+zFC`DtqjDn@^ zl~u9jnF00C{e3{UJyw$210~rSDaq|yFWee{@)qa4iviI02CmS8_ep?P3mzxBin~VU zRU(;6bXh2++a|A+XmiP!&^0R(*(o?o9mlz7YOc6W0CbZ`Rm)tSS$VQ+bj`>(h3|9d zO-u~mU}E^zD;#=%_o;)4`G7<3wx=x?gzgBvz3r3zo$bY2V6Ur$Rk|T46i+&qGtsIx zk6BgLi9%e#VZUQZ3K=uqD#TK(=$K9rJuni)M7JF|#wt-442F@Bo@{BRW8}?82)$n# zO?L!=d)2G%;52&=N1;R1UvJ%IWJ41p`V@Ci+(K`)UXuPjxN~p372^5)0MFY;cs_sY z<^0v{fc2cjXuGe4EfrH$_h&1@CTxo9GuL=!hUH1ECuG+$&5;{%If?EO>2 zxOaGXa&)k`eA8k{X9O1^dGxPc7#%7m4JBEMg=+1!9c0N;F5* zu3|=4=0LoI^dVNM8rXgY%O&$T024-qLXj++2I*_S>;sXqoDKS~`pXbPq{;1Mx`x1`p(YGYZD&{dsCv=+z}+nBd`4ad;M zF}wpD1CNWC!5uVOLYQ#|v0Wv-O?Dz3)TUf&5oW0v~_RtX!29heEKY! zgj|?Q62d!yt`BIo;B)de!5597BhSkm(MCh~0b~hwxZgZ`9`()%s!;y~thx>U-saQ40($Q@>E1Jm!%}Wl==Pw7@{HG4hKLDC{PJQXY z_RjwC;UXi>s6k|$w3R~ehF}gq#JUBc!Rwqt;SMn}t6HI0soEB}31twb_zw{ZXrqMH z3Q)!_2%%{#M^jJT%!au~=>Wi4j%@bjDerN9_%k7l<<{Et6z4p_@dwGOP!hCXn0jId zqY3chBmoB`8U;86QVXSW0Y8)!QnXAaadKv$uZd93e84Fm>&T|mhTkTfwBfdW-yE2s zOvV$L^k81ReemWU%nxAf{{~Yo5f75|rt(q!W3?@x3i1C}G~} z4scZrh7(>1i~?+B4x=fF0OfGwttJZM0+C^9KPgWyVl*9DJD-h=l8(*jT)9j4TK?zY?by zP0*lVkmmQ2Fo^!(TF&Is2Pd<0Hf%h*SB7eL-r0GT>VNZWNHCYf(2z=5LYv6^G!5om zo?&*kj%QamJ$ThKJcD@e;%64a*gu@!yThHGo!z~~XTN8$;1U82z9(mrCmJypOg5_; z-5?T6I6^eoZK)*kfljta)wzV`U>Hf1NN`W5Vw$DH#KJkPvlH2sNV=EtU7+;#F-qT_ zqIB~JrEgDAaxySR>1U=gaP3eUeAm?IZkl(xerT=ji2RrfeT~5ZN^cKkDL^S2qf~hZ z^=ymMb&#c>b141xUqTof$P&2dz2ilLvzKqiJHQ)Oh|UX+yb?O2G-^Lh2PY1ISb`*p z4o>3&cp*crYE7qL;)uXUI22RV2veWsaIAPZR_)0G?m4VrMhT zsAmX!bb5)I@L;Yi8M(Ja7En)dd+IywFE=D2-JH|~?Z(Oz--rP!4llE19Otmi z0!Hk@2+ORGm@a?cYKD0S#=betFb|Ipw@)m;h%x{O#Y7-(qH3B_9myPrhIUeb{6M9* z?iwmmD%1zSfJ?+pEa5GFq^lTEhu7~^%zJ?dH7_%0jB}VQu<`4TYy$szZJ-y-@1904 z-Zr5Z|Jv37oEJ<_CK13FrU|A4(~#d?f=s+^!71s6M6LwUQ72$<6@(2%bY$f*93szS ztyw8S*PrQIa4s<6#(1GbjM&40gq*_Lp zRwxv=v64+**02W^!yQ3d)wU0r77NRITlG%;jNFus+Gb-3&Q6!QE4x8G%uCAHFI-_Q zfv^ixO~Uv`rsC=FBwepLj3M~9R_jEeKnTnL9^lR4d=Rq(x>%XYAkS;vwwio4v>bFW ztSWfY4sWURHV(>L+1Vi2z-IcNQ~z=r?)CY?GIPAA{pwAv|KwRTH}fA}*>8C6s&sx; zEEX@Fy7H5qlf$DOBb_|qQpGTsorY~hHV#-6%BkRwnndUDP9T+uweEc-wyk#Ps-VqUW zp>Pt!pO`vz2PX+9I3d+`yoFzfWP-Z9a(vIggJgZ=Gm+x(bvFGnbi!m~k+r9M=ZC#_ z?sv$t-h$478OkP~$mW-QNO|XwZvK!P`?nbTg+Mls_D=S;7vH273EhFss1ph*!Li3p zOIj|dN+@!$M$qycpj}K&0W=yaosoHku+n#5+4Fv~Ygmktul=!row)R~nBgB)pSf!$ z;xCzL(-z*V_OPU>Mm(9EnP4&YSc=FB69);h`JgOro2#8GYEF!bEND_?kri1zj4JAT$!4!~__*YZ^i{?roh;xOj@lB5W9A=KxmB@!l z2iis=WSXLuro3(?%(n2E>r=|-aS6cr%^)mJ5Mfn{0J|m7 z2MZ1v7v8j7(wsZ66bf`d;;{})AgDreiUKqhtOsxrLP2U|jk3GsfKEzPfJM{|(u@*# zRXO<}@1s&X_sn&D**+PYU`0M~wPmZJWv>S|>YEBiC9UBu_*c<1$XGR3Uh{tQG~;sn zY#HT$0?gY2l9k5ZA9Rs{u|7Q6vn;FNycRV=Rs^{@&th5@Jcgi1g) zA$NI~h?X)i$qNFphFMY=w=Anu$farA^qqwC2ND(i2Mm(3h#>Tw1pf(-2X5t?G_*$&Om z5}MBOy>l^){i6`g-R+~Jqs3*I!K(t(N`qx7qs+WAseRhF#HKDq$eiDgvME_GZciE= z%Lzgb(xn~t7`+)7J$LLU{&+oC`^OW6oaY=P^v+OwuMtA$D=}r~JUy~>VI!AEXt4pI z?;RuLoyCtlf3Rx(0PDYaB~>imH1{~g?DRQAY{|9}NdEw2>W z#`)ea8xv*=_-X%Edzoo^H{R9JROY`mz8ItAjg7bh%8v!h3hsR>DdeXf{qS9@SmK;F zZ|~{B9LyU5{K8NnEts~2VKZon1x-276fj5ds`93-i7l2aPf(|;62@z9kW3)qx)!og zuX>u5V?Ua@ws%`i$yUT}o@&pc-p#DYXPCiy)&1CM&LWz17I(hDqdBm3ImYe}(A?SE z-8-<(f)yyS)Kb&{86> zfx0RXE9W&-IBp?xNXU32st!`C;)3xO75Jox3GfGsU_Ws-r({mU4B)HdKQmw1 z=<^ecLpoZIF>QO>L=_K%X*o*&C+k0gtE15edinh7X?U_(JYPE_pM>$Bh4MMzc`gZT zU}@Lm|Ik{`y~4$%HcpjhZ9_%hM>Uu>LEtfEbvQ&ziL{LGBHB!L5CMSiO5zE1mB(SGEk#JmB*-_&& zxDBT<|NFwQ=CSn)VIG^ze&2fH@$5eoYUfqV{t8U`*$=NxH2ZYJ?5`Wo{-sWrj(3K8 z|2{YSdocDNf_uNae|UhR;YI3Zp$0$Rq2#P(&6}bF`5|NYP1e*HrgR9F)e)Uk!SwjUfgs5ddP0Wtm5_U%hB&nD!H0@p@0nF$xczU_N@9S-NS~%>T39 zRkvATEDrG8+dbYnwq~9+9APdblP5_F#T2SVBm&stM3x;zCynIDM5+X2RKYX~g;Jw} zEn13E5G5<*bBOz5lnqU|i_|_fHrLiuOe!Bap-Qu58VwQiw2Cu0#8En%&7oADGB`SY zR=0WEES}Zws@o)t|9_zBBRty}#cP>^6sh98fJFnnhaW)*OKR-{Pt01bd5rQlODfG` zh7+=CBuX+G^#{mioukaviul4<**$FQm^k;-Y-BNjF{FIi$UkJG5mTge(YUK6q^aX{ z?C)>|c)q|d!R_&U<3cTlF%{yuw{v`avbcCN@qIBBQ5hl%1RJ#=jb{SXL5F;e2vtE& zLjhQ7qO7Q)MY3D-$ajX2-MpbT zc=Hr8N9v6sdsnE^*A}uVAlYOxn~rF(nNfKbi#ey@UTdET$lkdLvg;?1d9D50j1~Od z-PP1~Z-4v1t;9K#eFK0J+*Q+5d5kg-oO#PAth_fVs+Xu71n;1@HP{wD**ICalf zU-dNSsbH-ut)4RK@%$&}^C~{z`Q4Ef^Z^fH945C8PYQU>>@!ZqbaTKHZJc(w@fgK2 z6P6(jE*q_MaK{{!n**L7a8P!8gJ?eDpu8VruLwkQe`oi2?_{xUdi3KGL={5_iDZmu zzP14^ib8d#kf4#*4ex_at_^IKa1w()nVRbUf3TTvp1P{W9+v=G^y{}20Dng z3(+><>vJOb5VHWZvQuRXIshO^1x<$@bCB3&$%iR= zt1!^XHeEJ0rqRd(N1j`a9RHp636GQgNty#ZfAutznarBZyL&tz=}jhKd=%=r#gj~? z0$b8(&?Sh!Td2Ay>MAZkN9Ao^qgoQP4s~J{1M#|1G6Bs}w6toyt(zPIEl_Yr%G2`L z&LYi9ANfRX;!9f%h^Cb_N4KtN^pD&Xwt{!=_64(>(9($JubyTyZ=IM-ujt=8@Bz?3 zRR2E4?hE(8**iXQI|>nrCleIcm02|hIP)APv0}(>x|o~A|2U;kmhTv3w}DCEq}~aG zoK{lmxCV|j3}+^I1}mZIXGUB`Q371e5SoilQcommp@Kp0iSfW6 z>N>5Ewc7-7Y3DQNV0x7v*1g!FrM)=n#k61W7>$K4Ez_(gLu@_I7WDf8+n;5_T{1WH zfe-H7yWBc$ZPRkHw#hd_K6lSa)m9uu+pHkoy`%X6;zey)S_IWHVq802CYd-oF|oF{vSM~e%{ z9UJo6njtwXSc@nV7#666GB|FOlnbt**s5t+BNC!8px!yJ;Oi3#P7w(_WVb%++Y-!< zb8fNAxZfsnJ<+xgsn^%Y>8tfo&K!hI4l74CilQ>Np61SRW7oHUR{4P^UNqCq@U010 z&N7a{l8aE2uO(P3$<3-7pB87%8^>(&Eg0>OaPiR$Sl>Ga>(xGwxyw4`>qjZ}yD;{K zP_mAW!AzJX+cB-;q79G)io{4q5>(q|46Ovk*ybg|aN0#C)<_g%su0AC>AHlj^0uhK zo9A&%D5^wtAlTq^5`MAwi&nJKY%LC}KDPI~lX2&j*CsTl&>y`B9s$*dH|=>b4t9n; zF9^extOjEr3Wd<(Y8KHH@wI|r!Xf0rigpm zIu^8a2I+T_pl;ks+Rm&IQWn4!0nkG$xcFIR&S&B}76r3BH+Tn*=#DwT9HRK07XxmX>aV38Jk+Xx)<3 zRJ4gXNR1p6X2B`b0f!-y%}mghBC_tsNlD+g)29x@)UAt-%zC*`U0J*4Z`Wi)Up`4} z{!6`**z9ia?Hz90ekPRUbyLV1>MG(wi@K|!^e8vf8e_aJA-+&}sZ_!mip`}~1vDGj zO4`|z$D&5y3G#^0Ox^Nsn|!Bcr>vz*RHC7?*m&iM-Al7RtsY17Zod!+mfe z&Y75T@JpjMe`>qSB4-K_+T%ht_#t$CSsCzhdm6J*9y#~%U>N&@rb7Qh?$lj`)HtjZi?ACrdXKYF||{4Q%l_n zKu-olo+McMZ2;ZY!S=3}k?kN{8jiu}#mX!{yng%65Br4b*X|xXptmvG82j?$(^>BB z?``kecmiVHao)j$#i*CFyv9onON=UUr6s(fp#4me9EQEgrO-8JV4_hyjs^9DC)z|7 zNvA93oA}H&o@La#FSm|v6s|cSERC7_|{1N$G3RB>)$tCjQV>gcRB6 zYq>@^=gzuqFeenX?a%IYfVBOw)9{REBeb9H@f-*g!uSP&RdaZnu3qY;sRX#Cgu|Vo zaD*~k)5^RP!_62NCBb`wE+0Tev+B@R%2Aw??rjd zqq_csT}<)VY3lxz*$C~uuNb7$qxpvz`%buj@9vK8krpI}u{I6T)O?P_3~?>^Jk~(u zbY-?7YZMr$9DE)}Fd<{mc@Rr#r&NHMQ$*2flZ{yy;J2r7&cybly1sie&E=RyiL4)q zCVRZtGuD^(Xs3NYL*#Z)Ht%s;+b>uUM5$LiomTE9n7ENMfryq^*FZfyAM74BAeaZSh#HV}bm(rS@}$+FdP^GN)Z? zi-a6h3u8cO7}rhPHnG|9!tNAnw~}@%0;yXG(S%tYv2@r_s!EX+MH!i22H|!OO6|{? zi@v-s&s>;dv9gR4&(yE zMb{=18$KbBQ@0R*Za;*yHzQ_Kugu(!Xufm5+`thGjLVl_y(*Q2?frpNIvhzvQbE%OhfIm@9oKyQ8Z94>p8Ku!e5(8)i7fEdmUxoAmM1C$gE)uBJ>n0oO?2|mH zuacC#>W}frR{hT!XwQ3$*Fik{>{*yc^H=U$jW@gdR_dO+hX>ooN46>tr^0vz00k}b zq9RkmJ5DB(bQqR0Cq}jI@}`FGso>X94W(y>;Ei#PY7>lSBWh6obvep^X884iY6_3> z(%1lx@@fu`<|j|{KTn(Gue$qI!%af?w?ZAaXqxs|M3hAhzXEYX4T%=fRj3{n5KLlN zxByj%*Q%o>4?5QhK+4Z zMn@MW(a}pBo?ri)1LNYi%%ZWk2GV)7x4n0~yZDOPFAKqU+fuBPgPzJ8DOD+G2jNQK zFVg~vU)dnIM@lG?j?t1RMSK+kcG%b&wNK=WhB5MFi*Z1iFLtK3%;9u?9>9>|Pd!Fw z)c>si950dEK=9|s2tI2=`S@8=1f4{V5&XG8BA)~V*9hYoRy<{+B{yq&nrwh-?u6Or z*`=noJ!-k^H+!+=K@*|>d?sq!Fib_u@;9V!mx&faDstV&dFQE1{anh*96VeEPI%G>a?&$eS0-#>7)w#%% znC0ME&Dt5O!`T!SUxD+jou0%PKWS;Y{CdM44xa09SNkVTy>Prndyc1i%7mx#Z8qQO z#qnSCZ8ik(`&Pu!fJtpOS0#i>NFgXhZB*Kq@*_SHoLi5KFLG|OM1i?g`;zu2Z#-FDf$&ne?GE}Zy1ezXjLwckM_)( zo^vBvp6$p^8(*_{)>^h_gpaui*7V#kx1#CKM|N#I`!}o#(iPACNvU5FEL=60cF17}&RAaLh-?w##)#HfPV@|m=d}g1Z6XD@AR}4PK&SEP%&IxZt9L89XKbNS@3;wEN9>0Wa>9g zYYrO6Z&@D=FJ8M^FC(nK6iSuF)$#`=_${gGg!atIb4q7vhiF8;9JHZgU>6v`Em%cY zp!P@)v*8+pc@fj)L?Y5I3j9p+cJ-S~&&15mta&>&`7P$q+`J{?O%Bbc|E(e8^$!}M zu?IplkB<%x4$YHBY9)x5E@@wLB3NYH6gsA5{#1+8tFoS{-KeR_h#uFYA;NW4GP*Q-QgQpPhgG`&&w=W_gQ<@I@R zusU`c5?f;={1dkBd)*)t{_$iq_HZB*$9ubnM~CM5WNjYjjM{0TRV*)sqBU3o#lM_P z6}aUNSrbkv&I{PAR;ZY$xkgkdL^IGMjjp<(^KLSg(=bTOsL$}%r0}Db7B}D?7d2u) zJT-5PAh1)R&%b>GTRc3U%%Hbz8$9TVX2I-tUU+lsZHqJI6=>6=gIx6-6g;B`o-eb5WQn0=lk}7KeYY;)CN?r zp?xhCD`31qWYP-3m^yb_(ZJS&1V&0{e%$sn zVbXHcw%)gLI4)w^PBWSKCJ?icmXXzZ{f1|hjr##U@7JF_IFbs#{*1;R38i9pcXw~c ztUudu?;#~I?d~YR-Il7Qr@;V*TqFgj;Gz^Aim)v%QUWpUxvQ<`Vq@0@VAKC`{2S1V zx!%G%HnghWu{A-`$l-bj>%WJa`uCQ;w?=rLG*(I9?R zbxynUCZKPW^s64%Ky$T+iRE7!+NO`Jf7r!GCbQq8**`Lw{l5w2=;~%)K52))`!xG} z!|bmc&;AatMSo{to!-3|#$+)2o!#A&qwU2v=~*9nQo+yDGn6_cFtyA(=v9rP9H)NW zrmA(B*PvhWMu8}8k=@ihPC8uJa8+MG@pkEYEr!j4GT)IkYIku_r(dyPwsPx3_Hn3h z^Icb-3$lW2Bx7zl$EAHB^^P%8kFV!w9xum8x#;L0jF5VFpd}8euUtJ+PGOd#nUpiH zM`Ukbo$Wy_%kicOhPJdHOJ(=*Q{T5TKJN(i>AP#Uy1MjTA4UGJ2UaVP zAH`S|O4RZGp{wVj_?6J=^Csb_EV7d23T!WIFwAisqnMkE5)4%fykr+kP%ka&YNkg# zq32YvgcZn~HL7Md6r#k2C|T+WSlTC&xeXYp`5^5V45niU({kud7_Es3V$w77UO|_^ zCr*nZZ=S8^e)NIWn4A#)Z=uu;XaXad04Az-OD44dDMgP-sTnRTrkA}>Hs#Is@oKLvYae(jk6<|{PWPAiLx6PJ8`)`*rM~&yD-0Z z{MYY$@l0^}uI1J+mD}#H!t%D!l)oMbi<{m>W1>{t(q0BygDK9tinI@ZOiL-m1+zwe zRubb-l*p4CaGO;jib}igC^r#vV@#2RnuwRS&ScKpCkokHow>&gndc0-&B4tpb9HK; zSQOb!Ps@Hgdfzz}Ekj^#tnR(^F#vP&qAxbJ(YnI72H z^#yD(5NW{4bsZ4lNd8ewP)Sa!BRVGX28LbasPfS>S?Tfcw3joZC%n`s{{dC-lgVcYWLvu4z@E19ArW&A#001A02m}BC000301^_}s0stET0{{R300000 F000^eh;IM@ literal 0 HcmV?d00001 diff --git a/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.bam b/src/umi_tools/umi_tools_prepareforrsem/test_data/test_dedup.bam new file mode 100644 index 0000000000000000000000000000000000000000..0694dec737c76228e68a5e50bb5bbffca8429716 GIT binary patch literal 18822 zcmV)DK*7HsiwFb&00000{{{d;LjnM70fmy^PQox4#fx{vm)Hxe&k`Z3G%vZz z`gZt$L!Wt1VA1i&B!FF?QR|jkefx2a7c&Ofov>{l0}foaaNVW1@Esg_Ol4BxV98=$ zB_hqBZwK{*A}{lT?StqS(p4A@nWQ9i-~~Lmt^zt&mwIWPk|&aln`5a z2-ND}S2!S0=Q_=`eV)B(dKhaBm~TrWd2vo9aL#RX7gvf(X}V~!JDyirdZdd=Whsey z(K(Jg%;_a_0!Mo~=!o@PcNojD4(ABzYC000000RIL6LPG)o{71EY3$!d*c~+l!F*6L6TQ$|ghC)(RR>sZ9 z8qWIx1Kz4z14V>5nZYiIG1Uf21QWVN7_FN z5?aSA34KY^LSGjI^h8}6(8c;Wy+q6O6I~zRvIH+DiTX`+b&9pL2$t$QJ)^%Xp|5z3 zC3-tn@C4VsY3I^Ee$(!y#l0_h>CWOCUVLe*yzjnz>(9LQTYuv1zxJH({Ps(W*S!AH zliu)#ON;M%V|mZ(--Op5b8ypG-naPX#enbfD_{A_1j~0`zgXk|h1W6)c^pPrR;#I_}cQmOYg@0UVGEc&o%dZ!*IWs-u*%&I_))!MWk-`HNEIOhPAJZMQ3Mc z|M+NI$xaxTRoB!xiy@>=b0osgpI*+>B!DAxBj>R^L;&-4=#Shfcc(l7mNG0#$fIrotzvm-Uu)m595}} zl8ZWz%1YK5FIz6Uu#_^3!;sfu#>0rkSyOj1%9}E3S=Hdmuqf*~6B#c!YYG--7gF^o zmIgp2Wa&~rR}!UEO{=Q`-Rf0+v`NjB7Ip}Q8faV zdN8Dgz41?aH-70uH5o_#wwIN9GveAq(c?jtp5a3E<~? z3ORt;g$%&d85{vz1avIy>XJ1%K2;6_-@C99h_lm+7m zOmf@+8JLtJ0176S!4k$sn`|XGctKmh-WO=K(gGQnPeK_pKw{9e_y-nv`rXUr@S!AQ z)m#prl9kJ!*%WBpJPI_dvv~S-gXgu*;E6<#gx@g7a^=a9#sApQ@e3{dXBSZ*ky!>elBBbT4$^e3`TMGEdPDPY!nW z77xL`dCFRbVI|=cr08lXG--2m9tdjv+FaMv&MjoZ(zHf}#Vy60`} zAl-ZZF;5@Qd(W%?*;j;wRh$&}{IWIoe64%W|H)Ai8x&NFL6v^j^J7|VQo6dS5KT2A ztTNseWUnd8DkR2G?$;7FJIlW5P6jR4;-v{je zqVJB_8h6CDPPPW5q8;sy-o6+R{Vp$6)9C#a(94B@aEH8TqBv||4w0X6B)MG`v#M)Z z7)G*#A8HscncyPJqpr?mDMg7ev20*hBX)uO4`m!hs?*O@xz<#>lSCQ7Brt}~q`y5# zfs2ANl!Zu7v~a=^Q7>qQ2<@3>{^er|ozV)5*;``Oz*+27e# z?x&2AmDD6h9gEvA3whHu@H=IJ)D9E^L!cB1vMy05n+~~en75&fK@@}pWh|SPTvZu& zI#)3X$x$e8%{A7hN{`X_MWbmZeNHPg!6&%PlEJB^{P*hvAGAAxo$k>ONG!x40fq7pn*4jTV{2pkXWD_i#SAc->-Hdz}a8;o-^A!Qz1% z77G$UqIohNA%?F+RU%U85M4G^7R69-#=;!#1$RfJLXpEj`C8;|HL4r5`bU zVvNL1j>G-L2#Dusm*E2Op0UfgQXo!yPs`b>lC(%qfrWJdn}E2^1;VCsFTXG~SEO`H z4+yPbid#AD&3B@#VjnhH7 z2B*z0SSTOelox-^C@;3Y^qZoowR!|=PaXTx{lkO(1D(o6oT4?6RSgTfMzo03cq<`m zSq|%IgtbULFn9`n>LO^W( zZv6==H?Ca@tpD{}XCwZsFMTAM3MXOwTVr2p@Fd~v(J0{?83`R@VEkgia*;ZXfAx-tXeEuu8cBXelg^`)2tTjO2I)c}OX2^ko!K2L^#5l7l|#D@G7Ze>(FAveF${+1!lglV=&l--)NPdFOJm zcz7I`9BhN*Kh#lk6vyBR5zB>D$C2!I9pbiD#<}24DByKOjiflHI1s8GQHC7J1_T+a zGRlHmq{m^|@GiV6vN`h$HvC#cNG3X`Q8D_!bF0wimo|YJjljILH2`Nt^RF)kfbSUl zbOUBeb>P!NRiJhUmFAsNeoFL^iXg;}@gYFVBa>~=?JXUxOza)^Y8lw2v%`DpkIH)U z35>UB4H3r6i)5G|h#osV6ejaPT=*i^X?#a=~;vT@5K{+!HH{FH64XheG3A7cJ zVd#7(t;9`POY=dI z1<|RY$?Y_+FDgJy9=DYeZlZ18O({C-96lNA3%9&9J$}eOL z(GYaEY^a*mh>ko?gm9CC?J7}-sTe3lDWKvUbPj^2S}>8v_zf9nVRS(e{Rw%QK$T(= zgNe>o0v*q5^-j!<2YNk8hakQWiUJ2=k5JaKzH}ndw3rirtLGotloY#qerK=eB!|{q zT0K|t>cOdHll1(&*da?obs56UQ7h~+Q5F|dTP@L@2e!>+X{vO_bgJdTEUB`el{70^ z=}f`Y0TRF1?XOrm*O*=zTLm*%etMIhUzpYNi?>bNNPYxs|HISsqrIbT?Z=zwRKPJ0 zxTeC@5a%Xu;}G8|DN_ITWpTA0!TjgZ1w`bJ;;KbXOU35fiJv!%iURE$4-Xxe$8VTkj2DiW8WBK6X#m?G` zeZf53J3d}~4CSKJfq2;pon@dM)=tY}|MRl2mnbGW&`^$0!TmkYXK{XXe0xq2`O=S$ z?rv9`F!%2Mxi2Yay*t~SmyAA%Wd0NCzmkO+2P#Alq^G#ME%)wnqX+jqW%%s=(qdTq zOXJV(aA#*{cW?2rCoC4AaX~rdaTW>|OJq{W7C;5d8u?btkcc5=5a6dWh5!Q8I}^|x zEQ846vH47$kI?&+W*^DwrIlLIyET)#DOCbWZ}w68(io+{LFr2)l(w}DxG4RVu?$>2 zl)RfslR>y=?snFirsuH-))MRjr8loZ=|&eNYg2#m4-K-^+tjBo7LR+fghKh=v9hV) z+Q7Xb&Sa7aq)1`e6>*IG2uv`c7DpKiK`-$#k3rputZh&i>wqOLMc$q>@ORL&j+vld zJK~@(RvFK&fpDrSh0T@%!>6gY=+aDT>xH(CnDm#cdKQ`J#4sL8g;UjfL4T5Ex(rNX z5!rPh`RhJCmwLq+*V(6A<<@{3VR;QC?uQlwzSnpXr?IT@>H|089j%9pLNXoDfs2G+=*|QLwQiG~C zs*5p4&;@GVO*7A^h?6j=UkK(dR9{_MZbDEw*Abjw-W1FyqhQ_!pbvcQl$IliKlk`p zCN-QY(HtQXc3n$i$6)YUfog?BMp?BrwWM{jL=Fmr7(y0VgCFTC)VMPQQEeA7KRF!T zeNwojWaaAc|B~h9s9|IUsLV_tpEu_U;s;r5;-jwuo zH@~K0;iOEded7J{RM#;~Z9{*x_SODoaGV#;$Qaw|5Fuwp(`q^){13;XY0xB14?)0J zc?bBAYa+=6$zD<6tVM}~qeKKc1PnU>j$=YPk`Pc1h&xedMFkgChAdVoEP=lWvyeFo3J`bRXL*E-dK5WB%Pwl*$2sa4W!80!R}tqW{7@ysoQG-$gW#DgMBEC7 zx`yN+m8cdBdz)Q254q%au^7Xcx*So^zoI#nK(}m=BU8@{<=IpZnrcZ`n_-kwSJVMH zDk%Lg%zE_|MYpuKpcYn__PmZS7PpO+{`hG7WLIaMXNFToEo+;w;bF_DT8G@fri{O_ zk+WLB{IGy^KEG z%jiJW5x`F#he$?77h76J-$-Fn4hDqwf2_i)g%@fXA!|fTq64wlbqMEU{EzrEit`XA zqJ)iz#bt_>s9SoO^o=@+3J?)KB|0pm=5UQ(YJ;6rg-VHwJ>FpGyg+#`-PlI(>&x5^ zkHRgVI06;tmIN=kf=&&p=ySujxiFFz9gV~993Qs=>K$XVuIyFZ`2zy z-#~}lnknwP#-X z@eCwqV{?77ceKB|ySV3iq-WPGR6n#t;@VUKwdfp*p&B2c6o`J2oWXK8$YEEG^5BoXDl>O|cz?aR|s|Ywh@Fjp>FjhVTrrHj9 zZtUVLL%s%ESW{;O6WK)vGILAIaS)K2=pilE zn1CtKDbg~vk0kB4Vks=u7D*FqjigdPg1~C{5b6!mPiA1b21qNgtNNnW`-Z^a`QvvC zy8cFk=P%x&t-p)s(ca14_G0k_(shKnsQ-Z`K&2Tjlgm7$e3$lJWei+_$Y{$z1KDjU zwJmuITNzKH+w(nw73=8`tm(f6GWJ@ZN`7=@wdB&HpK|WrmJ@7vMJT<~Q&^1%g;$Ym zQH!f$qVb}zjp~MSeoj5>JVOB=vXenmFD8MBnQY&^39e`-joAwy1kPs2U?m7viC()Q zymBiFKkyX0fwBL{U$c0NxxsJwql?A=^FH$9!~K0&Bz)xK`vC3Jk=Wv5oJ=jJjc@xY5`lrq+ozfhz`Z`SW>rs0ldmAT@L-eiP;=&wzisX6qS_(eT#}} z%N%xp_P_@HhK|u_fx!mpXWrC-sW-K9_8&L-zT0No1phO@ywxj+)7pEy?TFhu`-dlc zx+2aQt9S*j7VK^XWWg+=D8ulj6aWh-!(P+YkdnjpK<=_O<`k|)LQP!ADkx!xbzKoR z7KZ{+o3H4USMLcnNsVP00|%6JB&hZmnVB1&^qS!Sf$H2&^w18ouO~4+kMx|DfTlcK zXWIzP?XhUm+UvZHjt-jjRYcPaAW?Uy_%z&_y4zH3Z&i*gHP|i#BVV>~le&GX%Otf@ z?l``_qTnj~*)r~kM*+o;Np80!`k=wvbno`bab?M z0A}!tz&u-$ke=~nN00q2VuaBfd`I~Cl^y+;K0iXpI6@bp@Ac*FDj}r)u_~BaQYJ~y zY}#WwFcL%2+E%I{9oG-t&2!FU>AqxzZmE~86cIwtSc8!3Nk9JrgHZF#!M?rAAoNMB z-M;0?(Ba|k!O3D8U!}dY$^bDElry!B%;tp0zMELT*ip0a)y#&Q&+8I zVM|2?nf8u#UKnOmBMXELF~?hPj28(~*CC{VQO=|%Y5T9KTNTth2d8}Y?Q7}-xA$@V z-+ShujyA6UFxLK=ua1YiDB`QYejPeI+E_^)9=6_puO0`WBG4E$myB77<$8{k|n zmu7EoFa#b0*=17Bz)X?R0+$Xln)G64MlR@n07KQ961NJW6Fpo(H|v1wP0YUm*U^M2 zs*3~7AK&D9FU(f2KHQ@@B=ID%-{70VK{~G(L*B+6-VSSex#uFRqKz%SBQP`hdFox3 zx{WRF5k#xRzYcP_xdRaUf(?=fn@l=LWQM^|Kp~O|Kj4Nn-ClpcU z)}WM(E$s&OH0_HbCcaS&o7>r=1vXe?V@DrAy85Lto~I}W#oQZ(; z>V8%m8L}uMik#8TVM$kUQ3GxeCrWj-vsUxmw5%dPRFtqCX=f29jkHa-dg@kc!gDi` zCLJ{krhRxG1jC>J^D&G){m@=y#Pi2DmHuxSmHurK_^0k1{JO0IejnEUy>C2^kB^QI zbQ!RTTN$-&jPy688bmBgmZ8i-rIi|~a)X?inobcCwU9(=qM&Wqh_bto_PHaq29E&F z*<{RmvY)Q2B*rtF9_U^WgQ9m@lotZLL^t>ZW=xI!qf!%%WzxS@U#42wRn~0!tJ)rUp}Q7F!QmtU>DxXpr2B zoE9ot6?N>X4HBsT$Dx{FrqMCQJS7^=IHM$WOsTqMjaf2M&-IjyF&6VPVf?NPE1DnN zwZH3+D%{)MJJ{2nD2nSW;sT(sIEQ3{!fA6vOrVvsis%|%wKQ%4 z)ew{in~non>ZPDr6%w*3fn>v};b+)pyPBI?|CkN368BZ_3N@9m=c&Ed%%(rW^WB>a z=W^C?-tnxd;UtV-<&W3Xc&a_%ISghcM8&hRk+IBq3q@xYWUk{;$Fmq^GFDb7NCI1i zRH*Gju%T)ncn6?cNN?9wB5Wt}l8iif9~7lyACh|CXI?t(!(*7ObiR9&bY4FzoiCq} z&JRCpLM`s>9PAwJ>C)|))&Uqtg36&Z1q3D`fyk>8+T`a zZ(?4hpAmQ7vN7(wbp+^_40ylL1Nxf9tDUu1`wD-2vU{+*NNJV`3$skX7K)k=aLzagb(2ZeGSu798F2Kb)vgZ*4+Iy?%mrcd3$FXCBGhP@Ap1?q{PSDdOHDXgfgbt38<3s5Or5p zL#tRPA~?^mt}?hv-ee(u4h3AOj9d5v1bJmvf*V74m)FQzqA*gn-Xs?)Xw`$7EN8E& zgym8%1$w+zpu=M`2TNT*om{3*KClBE&5&T#XsBlVs{NjcsmdtbQT&1$kCz(GKe3Y= zzZXzGYh9Fl{U}PdRN}D%=DQ7;_hRjz`e5!K9_$@2F4BHC0+OurSM`OY*5AxHY|$2~j;rzsmaN-+I(9!g%J z9Lhvif;a#~O^&EI5=9iFI^1=w>x(+v2u!%|6pSo~ zV4Cafji$30tiz``#XCm7)sQu}oRO@t^S55_25-ka+nuW!_@?O?*8}s{`2E2dDTlFx0+h+ zPQ$UkxjPw*fZXeN=U9igxJMMuGt^JYD9hm}QC`Q7S(Xdh1|7yowU}fP2&ynHWf+Bc zL4eglY$j_EfibVLtce;Pjk?!ukm-f4zNXGbtbwq@V9>nrz zrwVUxolO`1=I&He3FF8UEC|3Vb1Mr6eOH1Umm3D{L8T1i_e!4Obv zhZGkYoP{NACgpKg*N{`hL;^9OifUHf>N?}5K~CJ}jSNxpYBl|b&lgFPR{MC8_Ivwtt){CPnPlH&NFBG-Rf@VM{?FutWW%tQtY+YxTQ2LasaSzaIj+hZIB^!E2!(7L#ffpnN1MRyN`*J zTK}+-^w!q-*!x%QO-*NuwSVF1`u@)D@!p9&+2o2)M7^^FlOKauO;0^>$0D*^d&Ssx zn7&UV)2S&sr-f}_w1lXh+ifj`63_RaZM4?&w+7^_c-nR*Lik0Vx*IgN)gZ#CKshc1 z6#)vLikVFf@)O15wgatSNCey9nZar?3ecJCx(JkhEkIttnTs509`b3yF6s=~rf`Xx z;;oC2w%M!b2B8il&-tXHK~DPhXY&l3`|`*>8T#`X^jXT;Q67mWz5#6-`(T+uHL5;#z($n zYCOrOcc^;Q5@iqMGYy23=T#ZyC~IU*R&}h3!nTpf4}o}9cOpi)ByVU!e^b>NOaqSv zux%R_>AOf&DaVMPzVW5bgM)D=t0~*97!wqjD|E-~nfF7elkS#EG(WV-r{6s5(|_w= znsa{g4#PiIn-=UR9nQ* z&mZ-BMQ-D~2(uAeR+Pr`|!S=Gz5kw^6dR z^Qg%^1xr(>>oqw8GIO|w1jZ#kUq z82BjGerIx0+3wDXMw2m76hagSOvdV* zvbII7IYh+SF~O&`s}9mJfv zLPNfXCX#TJGhr_2oCskxi+5fddHwZ&-bkI+)t)_^X9FV7*6AlT|yf|x|cewJ;q z)9aGiIDY=u4TXCrMsS;abh9!QaVE2h=B63AVBXOi&W|2W%L*i}w@uD{Q=+Lll+Otr zY7s(a>c|rg@s4;G=@U(qf~!r4`lQsQZaYv!u9`2D^h}7^d78F7{6fQ}xpnu2x zFpR4I<|d#1HM6lh38LS5Ljrrj$5RWZirvo$T2{l*(tx8T%R1_;N92z5GnWz5yEn}l zm3j#d-?A=1)9IUN!6b;YdCK_nHFO|fVD%9L^|5uKRDKtOG<={ zsA*Apo;9GwBNSaw$ddFkHH3zF0p}!9geiF|XmAm48Xh8Gq9Pgoqv1Ox=wr>KFAV@Q zcC(L#hEb7vX{j#`dVkap%@}QhPt>`NCXUn%in=qmU(lrt%@(K%hMruq+&H;pIns4Y z$$j8t+OtRqf7DCV4VucSXEhX?cuJJ=K~k0+m5PS?zoQJO!U@@gr_J&6f72ZxGGYr#}3h z^iFi&|8jef(ejLkFFeD*`E}18{E%1-YhvLG=gHpT(c%I~;hNGN3FiZnw?RS)40Az! z1K>TS%$Y=B)YVN`^O$NqFbo+hktD^Gha!cV^FC9_+~niekQ`G#?Oo-MvZNiTUj$mk zk9_4hGuy7dI0DOTyK=#L+gE(qyP9APW>1BbL1Bk-+L~%=yol+Tb?OKDy>`4^VnTR- zW7aH+sblyEtUK0#wdI0k&E8w@njwI&_FiAIj*d}s&?E2ih-#!zH5yb&vnpf*HoeIT zkU-=;Xs`#5I&#w?9q9`-Wldg%bS_Fn0CsWLq6k^eot<#DjLxHb3U`;JsTB5mKN9=h zDOgy8H+U?)@%`;*)6=-K(F!m z3~~7BA1jUDs0Y-JQArGji5Z`)XSDtN+4e!(K>SyFGzTdqgbzI_HE62MLk*o9SW>$k zox(_`SR*Wlmj)k+WRQ~X7-EVnys08iqe8)3#&O0Wy>td<&4tJzsz}E#Pj0%eUOD}i z>0sW>rS9G>AHcKjzVaScI)8psAb!gz5Vu+4zpMu7w4viKRue{OcYAN|a9i(oA^JQ- zR87s>vd%>X<>gT>dDT`h0YYSz#2Tv#AP@ByT^ENnksLyHIrJPtE9hiqI~I6!|6?$c zx##{{hTc8%8-{lcy6=n#q0X2zFrQbD^aosqB-#BZ@#oxY;Sp@N#{&$nuNR zBJn3}ooz!wKibOy)arTyK!4KL^~3#>oo%J-rv@`y3np?(I}64V5GKdd16+#n220(q zX0_C)xRy_~^-{I#((8d9+Nd>KouW=+Z(vLZLWG{D`rH;Z3RX{l#2TdY2RDrXcO6mM z>pB5v!L+(AvGxTI%$?nXgA?u6SyPrYz&d11f~Eio$(op4jL5sJ=s4KRk_H1~y`^3@ zI)+M=a86xI8!bEFo8gURSwv@a>%*nV#w$iY_7du@j$NPL#A%82|9LjD^2sim%5uJI z@#e)KuJ`(u(}?C16q?N$(2NE&FRL^816|Qc^ia#xZerCBC0V+>l@23cGzV8K)yTud zUY+Q4eQ70CpWCC4II|{Ik2$;@;QXLFvFTx8^Wx<@=EfX8Yz6Jjy`T+ep^$0(*yA2F z+6=10P}a&$OjU#i>nJmXC^B>icS6bw5r-pmMyWIY^Si z+19Hz;s?5*qFqa(wN@$n(&W)5anUCy4Kk#A^3y?_T$4NVu$y~lY~cE_vv#z5n6S#}{PNml0+jf0(p%yqytkC}iF zhRRHeOz*bUo0!e`P?~b0Bay)DQFR{4#u~S;up6y`^~Mtoy|6|@FZBHZf6xQ>VBa5b zxn3-u;(6npnhZ@t2UJ68L}dSD2hnLbkJEGW^pJ4UdtLdj0l1l zMncpr2(wP+Z5?tmWYY{AwT^Vy0Z!kRW(=4*03y)yifB}?9$chnLj-!}iAtFhJwPlm zN~&l3nUOQL8(PhcNc8rP)c9lgW&O7T$uIhc8%2cV_ka8qcg^hLje5`VgnF6~J`-#I z%J&=xJI8z0b8G@J2Z(Ylh;3cSCigc~1slcY4A)vg8@<)}+su*2iF1hxbyPz5Gw$A* zGwg}3r%~b?u=X21nn#Cw+j>v*l>x|MG@CApI+)85)3}NmOuc3Ig86Ny&UJ|`Wd-(P zOUb1}4|U?aMPCFw|I7J?Nj$U1bI4&xV(<11xi&ssk0W=Q7GlMkcY1Vfob2 zCL!E&lgANjT3(cqY>Em-l2{Ff zoUJlrXB1yDXkP0?i9dz4Ydx#EyM3^`d$M@vjha;Z7zy^dg;Lhw)L5t{qSFB#r7NU? zi-dW$l7jFzh>yN43p@3C^6s9tOJ|F_dTs7>j;F6_yLkH6uNwqk=;-`C&f2}c&L2{D z>SA#{?b|;clqyTGai0!MObC+(&XA_1)$E|3-mM3ErutiwnBm~K$-DjCF%PWFelI=n zY4;Y7bewXpv-Xp|8tPk2&v1*m`Zp$K{Exo1#2k@f`$0@vR@mJ_hV?g2lWt56%UG&5 z6Qv-NyZo`0?eCX<%%0c#EsiVvVRM%cVeRwY2Z`I<+eW!VZOPuK2<40-AP{3=j2JD0 z2gNJl1+{spKVq{FD%3<)l_#2gqCsEsP-y}+*d%@Ss&YwHp;JK!#T|h{ELFOpd&cy@ z8#T#&sUS9e@2Sc{=AnR@R8+sUriY58QKjaIG_JL}S z#i>oQU^Mt$8xmDjq9Fmux*93qJ3Gmop>TtlG4qfULxWH{KOXQg>n-QZkTFl`G@MxV z9&5Az%lWG5muEa6MZceY+(T1Gx{9zr6^LRDhzLhI{FJq+CFc~Ap;E123Hb=z+tYKD zOul6}qQQ#mV^WMRCuXRynhmdbkW>qD-wu{ow~!>xhtgBCOM~VYH;q$rr>Esx-?%k+ zJM967^p(8gA9!dQ5BQKd2;`l}@I4XqH5jE2$><1f(MA3tTrTqxrbi6!V8u(FJF`8bJx*|GpfKOX< zJk`_sWb7eFL~Sxgw)7TH;f>v-WNfn@_afM3Otdkv0-`R6>o4hSgscz|>}z+?_QDc{ zw$u?CrA*6oX*cSs9)&<{Y*f;f=$_bg*k`Iqxn5v(il#47BAsZjyHrbV1E}7)ujSRq zxJL4kStP&zWx4pY(M^BCsDa#P`PL6^Ss&x$h z7XdJ8@)f5IYqBn>38W1pb%qcZPs_(I?!ABfP48nL-I*PiG0(pWE+p$1j{PH>)cuLG>i)s!Pum;F1boJ~o{qYo9<4Y+h{Rb5@uctxq(AIL z*fk>OArdKZc>pgBoU}+iOY}f{Lr7_t?ryzyXq(kU;ig`jRd}94-Jd+`j$Z$QDcSlM z)?&|o?(H5QZSO1b%(_z6ye+%7tZR6tN^r!HRVQna!w%%o`$_=8R%8g9!W^{}-au$+ zn-(39l_Sy;@LaOJ;@LVbvFBhiT}uaPAAFoo_1I4oaNf9ezV@5{`huzTESkmQzj@Yk zZ-4jbQ15NaY6%z4=-4&EqL@huUoMfI!`tT-uVa=m8IlZ(Ji?_>2;0yiszN!hD}*wu zi02y-cjq~BGEL8^?c)snhvnUb_n03aqx@xOKigFTeMKX@|o-5YNYa@jTo)Jl@ld@K*uHw(TN&c)sZ} z7(yA-o1P3%xv9Lnsd6|+UblabE^}koL>H&*$EDIQ^ZS?j@9t0gBH8?nv-QDS&rW*n zdv?4*Ht&u>GT*L3;qYLh?juA5E8{`hT;LMQBQ>%$KAKuIX&39caEqxI6j5nhqEKCAv^|D1 zBgn|IIEj!SglKLI6_ApHQ7zTX|y9^BX zVxjzvO}kjb(JoeNH2;1xZM}KX3m1#$c|LuA=iqop8%+>NF#@7is0mRWN>ZYZcv&IX z1Y?J2iNz7MeS%kNGhWF|9mxUk>a1n0kTpZ{9sx-h&2#vmm9HclRy|kmD}iQQ_3**_ zuGwfbm$Q85KbnE_%UJtu-)J5m92hv)3R_bmE~mi>60kuE?3UV(US<2C4*4#NkzRkxID2d36eH>E_Z90iK7i&I&bBGRqWP&AG_MDmAD+nO@yW54%`>WVK_w#ao^6=1 z8c{vMc!cy>ScN$otNPjjKEqPF{`Da@oenq)oS)K>rmHW?P$wjt@7siCIE&_&XFNJ- zKl6Ripy}t9#0X?EZ&=rcO|JIFMW)9t>0_b57KD<-2(6Q_rw*8X2c8RV>aV2U_x`bp zOjXgEujzA@jhjZ54I6Bd9QuuUB(IkyXe!xM2SuDu4$HO^w5g`e)TFcw-124RMVE%i z>8X#vOw6~`*$5A{OEBs0-M*&I*s^EVt&WOL86LfFQ#n;Bkt*>7dawo({+lT{;ii@nn+0>7_m{25zup*tAaXF)+!s|%Li&nbEj;6;eJ%4gOF`cJ%rz z*mF%dRWl-~4mF_zcq$Y!5tgEyUc+a>yq0v546>&vYY+ltG89qU;Z1aWMA-pSScOnF zQf-XYy+KQJ*TYtiz;x4_X6L05UIT?{FbayopI#LNLFzU`n<=M(lKVdXSC`?hU7AlW zvHMC?m71caf8wHf9nidv(EP&BJ#yE~2F~kxH2?RDraDguU++n&gC^=3jcg-2@`ZsT zE;~`Ojy7_NkPeAJ(kaM@(gs9XS1_5>tsmwwl{qP&qP>Pp)=g7K)aQ`|x)~`k#Juj^ z&?R=-^|CbBs4YlA+IQ%yu5%S7=ps7(F0=iuFu7>^24hLSM}B<=6yi2lnmZUr6T5nrXvuq!hH8;QC7l;-d2C;q4tf>ju- zjz^n2ZKqh184aH3Q!klHC1Lw2Pbv+Lik)kbSwgELS<4({4Uu(VF6)Sf((tgXX|GHU zLn&xzfD&A!_Z20hbOcYBfp;!zj+j-}Mra0VxUbfxBOeot@ko7dv=Z90K3t3W^waM( zv~`X4{%M*0r|+N4?Atv&*giheXA&}oVlX8{00`wYI?}d;r^x{&ZOaBrS4ljlbQENL zCOg4IS<}=nqXH%uyd>?0vy!~xun?%n)md3) z062;+UlpTy`d^?22Iw#>8Siqi!$n9g-+Q)`hOJz_{H4=!@Pk-;#7FXIfA>UN&74kd z;}Q8|nkok-z5zfPVkL$;JM}BX4SA@J+zcI9)^<5uRRq+lIHsKu2%sXG-QZKlBT$xW zK^grtQ)y4ewl%dYJ-umm#rChK10@PHfB9_XHjC!By>#k>NLK%(ujN`Ub+6Y|z_I99 z9>w-BerwbcJCD~6OBsRZ+dP*yy7HJn|01L1@7;vv4YL)Z2YWQ{>nlVguebYXYS~m3 zqB8*o7OyNNr*U*y&hS>6cHCs}Tq5!jvJG?kt4%C*jx=--CZlgx5l{=|FK?<41)~bl z^9+=q{nLY*|IlJs`yo%wkM_3rj&~Q2(zctHG5Bw4n#q6*vP;nwpgCApa{M1kp0{;T zXN72p{%Auw_pzfHE3C;7lvZ&_J1cm@*v~W7vyzedI489z z8XpM$H6OvJtx@f#jS)1XEL;S?(-X$80)mO2qhR>iB(;SabZ6tu-JyB={ zCfUX-62cBq5hsd6$VwH}TmwaUL4p$o3j%>7z&$-IdUn{{n%uhb3K+WxxW#oy206M|63q z;ta4dr;>^*pS(PiyczYg`ZoGkoTdPH>!<)|>FY}dO?P4*A^df|ENL_sW@6qOX<{C7 zS>Tg_SOLWnA-P2Yh6ENsW;qh!801w$UAmHJL`M4`$m)kt7H3F@!EwnlW_12}IHX6@ z^RsZ|`R13Vp;QHiO4C(Ic7Q@=>d?`IXIyg97JOCZ7 zI!@a`3NWgQkrq*%BQgfIG(eAM;0Hi1$O^n_#@i0`tf=a9S*f%0E_!R{H~8Z~KME+` ze9cp5xaUhB9U-x+6D${r_xKWag-E!9qrKA5fH=b$ozJvesn~?XmW#x*ZK>#62AO#K zVpw~-Clkke$d3*eKSf82p)wTGSr4?$Eh{)mKQsUk3=%0QKgZ$Cg%GVAuIR|F$)4+qZ3v z&@Q#W|%&e%!=MAPIIz*)V~hsb?eZZ zFQ@(VE2myFe$`^}yT0rlgUi%yGXY2mxAtvLo-hnc&>LCQ@w@}4j`Ad{!>~dgj)=Ys zQ8-6EvJF|S4!(kWji_lB{5mTmC~wOIk7kM%=jpRPg~}%W8xVsoQ$03*h~std4x#zk zvmMc9sdhrn-{L|D-|WkxMpK=GMTa)gv9eiM!y}3=q#eGfqtW4?5e=MeIL*~Z4H7In z5CqRjvt_BJ21YN7cnR*EXEKW~rfD&?`i}gisqZ9D*Y~yNvEgcK7e9M8s@?W@z#vYx zWc&rJ-QqPIo*e9*?CA|{4Xl4$sxyv31!W>EpxM;bMYDLr79z{KGJ-gwWLZH>fpuJV zJZgDMfr%^%@Jd-!ioCpXdN`e5oBy45rVNOn(-cIi-;Z51S!h|5s9ZNr-O3+0mbXYjVd^C~P@g#W{eXFTVk zD5<#Ki5hWrB&jn&;_3=@wlEwWOPdBR8{}YRb8-}Zm&wwUZ}kR#wJjhW-m)|_K%fsv zhb}MmTQzDys$S4ZX%H+^wN3*rkbdHq+ZyHQxS{LKc<%RHdg60u2G@PD*&2ktH4SyaDhc*8cohsrR;bj*fSaRf`crq@0&UM29(4O$Bs93Wrd^ZK6``aA{fF$f88N z9XC+}&m-;F*ky6pLY}%!VZB{||ZKFY(RS-UZV=PP-3(6{R z(-Dv;BEjPVWrmJsnb7%ju~5E4z_}Js(-39DS`;!AfFpH>2`BCthjgf6Db9O5i+HSb z8$hyhL9yK{T_irSdj8NSY+?0H-#RU<(%QRxso34!-P1!2*L?Pn5*T#YeUz|^u3=C+ zz>x7cXOx1$T9t@Ya8U=RAkOSL(^IVUkCk&?)A3K^{Zn(N2i(496UpscBP5MXxk$d= zm#MD?lFra6ZiPHH4n zhDC9U3v1R@MhvVZuVHy?HN!i+uJ4$&+Xc;eTJ&pGTk>-JI}?PUYAzV!2*T+tzLb1< zMmoo}n;Cto*hYa@h9U+xAp85|oObY91- zW0YxU9V(3QXmD;-)Uhz*NO)ue`4a#gWt66AggUJx790dy9OhtaC9gvrSQ42=Wk8C_ zm#GS>)U>!jWpzn^;%DMwsw42!KfcN1l}=q9@LGX4qIu&8&80!}gxP_bcZOhI%~>Y{rr3erT1+=v-Vu9GFZ+ohXp#oG39b+Hs(&V?9^` znpgRm^9O;<1K+>>+?lZSgTLm0aO>A;F?;c10P889m>FO`cHLr8FzTtQ+ZqVek~%O# zP!A1tOGgC<^N>Zz_AaP@q8f4+=yv&`+umehW`e2nLF>N#-X^Jc*-w6Sd)JotU3x2S z?=AoOYv*q7tyYVk`W;i7^fs*hp8vTY?QL)C5yM>t=C90U$lAOuYusresZNGsnW*9x zMU9AzI5M>y`Ud+`79mBBu;sNty@L*W#bwBN#fq@oo6zR~vm7afqa%s^qCS0lTtK1m zA(O6tkpf#Qp+1N+)e5ZOB$ZPqNdwEr&bCF?@~JQTjw%25cd+*U37SXy`v-c0-KoKn zbe;;2K?c@2&A=&nn+wqYbdqL7n;0*ogE#c?oB-y%Vni(MGdXM6gYkQNYn9rKBjy|t zvtKS9{T=#5I{!@PIr`7F@xyWBZ~KmE8y-pQ6MW^<(n+*c$!dVtltn{RmFSq)26h~I zizJL`JZ)WNWz1=g2%Y0a-L_>>fr#m9Vq8i@AG`}imZybbTg>!Xb9xqyS1gu_jY||S zuU=D4Q2H!!HSf+0ATCTBi8;E$%LV@MnqV)T(Pbh3idKgD z4^hdeza23s_+wtt-hqyq+j$QAjvJ=#ud<3ej(P^-)QX2lpsN`5qAaeUvXu~sGNddJ zniW?}&U^CUeP8()@AlOH3LS*eS77&p+Zfk2ZQ`X zWh0BiHtXVyPRwgVp4CVScm|+zl;3K&lA0rXj2hX}BnEJxB_M53{kS}`V#(k!oSF5T z(Oz*ynih%wdOF<;H+fe&kLR6hDw}t1{eRFwe8D&I001A02m}BC000301^_}s0stET N0{{R300000007o0qmlps literal 0 HcmV?d00001 From f0c1b5736172fd3f5c913b9abb29e1bb82474f29 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 11 Sep 2024 19:50:32 +0200 Subject: [PATCH 5/5] use command directly from umi_tools, full help message, update changelog --- CHANGELOG.md | 2 +- .../umi_tools_prepareforrsem/config.vsh.yaml | 60 +++++++++++++++---- .../umi_tools_prepareforrsem/help.txt | 55 ++++++++++++++--- .../prepare-for-rsem.py | 1 + .../umi_tools_prepareforrsem/script.sh | 26 +++++++- .../umi_tools_prepareforrsem/test.sh | 2 +- 6 files changed, 125 insertions(+), 21 deletions(-) mode change 100755 => 100644 src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a18686c4..4670b0ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -124,7 +124,7 @@ * `umi_tools`: - `umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). - - `umi_tools/umi_tools_prepareforquant`: Fix paired-end reads in name sorted BAM file to prepare for RSEM (PR #148). + - `umi_tools/umi_tools_prepareforrsem`: Fix paired-end reads in name sorted BAM file to prepare for RSEM (PR #148). * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). diff --git a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml index b7f12754..ceac2052 100644 --- a/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml +++ b/src/umi_tools/umi_tools_prepareforrsem/config.vsh.yaml @@ -14,6 +14,7 @@ argument_groups: - name: "Input" arguments: - name: "--input" + alternatives: ["-I", "--stdin"] type: file required: true example: $id.transcriptome.bam @@ -21,12 +22,33 @@ argument_groups: - name: "Output" arguments: - name: "--output" + alternatives: ["-S", "--stdout"] type: file direction: output example: $id.transcriptome_sorted.bam - name: "--log" + alternatives: ["-L"] type: file direction: output + description: File with logging information [default = stdout]. + - name: "--error" + alternatives: ["-E"] + type: file + direction: output + description: File with error information [default = stderr]. + - name: "--log2stderr" + type: boolean_true + description: Send logging information to stderr [default = False]. + - name: "--temp_dir" + type: string + description: | + Directory for temporary files. If not set, the bash environmental variable + TMPDIR is used. + - name: "--compresslevel" + type: integer + description: | + Level of Gzip compression to use. Default (6) matchesGNU gzip rather than python + gzip default (which is 9). - name: "Options" arguments: @@ -38,6 +60,27 @@ argument_groups: - name: "--sam" type: boolean_true description: Input and output SAM rather than BAM. + - name: "--timeit" + type: string + description: | + Store timeing information in file [none]. + - name: "--timeit_name" + type: string + description: | + Name in timing file for this class of jobs [all]. + - name: "--timeit_header" + type: boolean_true + description: Add header for timing information [none]. + - name: "--verbose" + alternatives: ["-v"] + type: integer + description: | + Loglevel [1]. The higher, the more output. + - name: "--random_seed" + type: integer + description: | + Random seed to initialize number generator with [none]. + resources: - type: bash_script @@ -51,16 +94,13 @@ test_resources: path: test_data engines: -- type: docker - image: ubuntu:22.04 - setup: - - type: apt - packages: [pip] - - type: python - packages: [umi_tools, pysam] - - type: docker - run: | - pip list --format=freeze | sed 's/==/:/' | sort > /var/software_versions.txt + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py38h0020b31_3 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt + runners: - type: executable diff --git a/src/umi_tools/umi_tools_prepareforrsem/help.txt b/src/umi_tools/umi_tools_prepareforrsem/help.txt index 58923242..efaf4de6 100644 --- a/src/umi_tools/umi_tools_prepareforrsem/help.txt +++ b/src/umi_tools/umi_tools_prepareforrsem/help.txt @@ -1,13 +1,54 @@ +``` +umi_tools prepare-for-rsem --help +``` + prepare_for_rsem - make output from dedup or group compatible with RSEM Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - note: If --stdout is omited, standard out is output. To + note: If --stdout is ommited, standard out is output. To generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ + +Options: + --version show program's version number and exit + + RSEM preparation specific options: + --tags=TAGS Comma-seperated list of tags to transfer from read1 to + read2 + --sam input and output SAM rather than BAM + + input/output options: + -I FILE, --stdin=FILE + file to read stdin from [default = stdin]. + -L FILE, --log=FILE + file with logging information [default = stdout]. + -E FILE, --error=FILE + file with error information [default = stderr]. + -S FILE, --stdout=FILE + file where output is to go [default = stdout]. + --temp-dir=FILE Directory for temporary files. If not set, the bash + environmental variable TMPDIR is used[default = None]. + --log2stderr send logging information to stderr [default = False]. + --compresslevel=COMPRESSLEVEL + Level of Gzip compression to use. Default (6) + matchesGNU gzip rather than python gzip default (which + is 9) + + profiling options: + --timeit=TIMEIT_FILE + store timeing information in file [none]. + --timeit-name=TIMEIT_NAME + name in timing file for this class of jobs [all]. + --timeit-header add header for timing information [none]. - --input,--stdin Input bam file. - --output, --stdout Output bam file. - --log Output log file (optional). - --tags Comma-seperated list of tags to transfer from read1 to read2 (Default: "UG,BX"). - --sam Input and output SAM rather than BAM. \ No newline at end of file + common options: + -v LOGLEVEL, --verbose=LOGLEVEL + loglevel [1]. The higher, the more output. + -h, --help output short help (command line options only). + --help-extended Output full documentation + --random-seed=RANDOM_SEED + random seed to initialize number generator with + [none]. \ No newline at end of file diff --git a/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py old mode 100755 new mode 100644 index 59dd01ad..b53d30ac --- a/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py +++ b/src/umi_tools/umi_tools_prepareforrsem/prepare-for-rsem.py @@ -50,6 +50,7 @@ import pysam import sys + usage = """ prepare_for_rsem - make output from dedup or group compatible with RSEM diff --git a/src/umi_tools/umi_tools_prepareforrsem/script.sh b/src/umi_tools/umi_tools_prepareforrsem/script.sh index 3ac2fd03..d6b3775f 100755 --- a/src/umi_tools/umi_tools_prepareforrsem/script.sh +++ b/src/umi_tools/umi_tools_prepareforrsem/script.sh @@ -2,9 +2,31 @@ set -eo pipefail -python3 "$meta_resources_dir/prepare-for-rsem.py" \ +unset_if_false=( + par_sam + par_error + par_log2stderr + par_timeit_header ) + +for var in "${unset_if_false[@]}"; do + test_val="${!var}" + [[ "$test_val" == "false" ]] && unset $var +done + +umi_tools prepare-for-rsem \ ${par_log:+--log "${par_log}"} \ ${par_tags:+--tags "${par_tags}"} \ ${par_sam:+--sam} \ --stdin="${par_input}" \ - ${par_output:+--stdout "${par_output}"} + ${par_output:+--stdout "${par_output}"} \ + ${par_error:+--error "${par_error}"} \ + ${par_temp_dir:+--temp-dir "${par_temp_dir}"} \ + ${par_log2stderr:+--log2stderr} \ + ${par_verbose:+--verbose "${par_verbose}"} \ + ${par_random_seed:+--random-seed "${par_random_seed}"} \ + ${par_compresslevel:+--compresslevel "${par_compresslevel}"} + ${par_timeit:+--timeit "${par_timeit}"} \ + ${par_timeit_name:+--timeit-name "${par_timeit_name}"} \ + ${par_timeit_header:+--timeit-header} + + diff --git a/src/umi_tools/umi_tools_prepareforrsem/test.sh b/src/umi_tools/umi_tools_prepareforrsem/test.sh index 5f97a103..c94a202d 100644 --- a/src/umi_tools/umi_tools_prepareforrsem/test.sh +++ b/src/umi_tools/umi_tools_prepareforrsem/test.sh @@ -1,6 +1,7 @@ #!/bin/bash test_dir="$meta_resources_dir/test_data" +apt-get -q update && apt-get -q install -y samtools ################################################################################ echo ">>> Test 1: with --sam:" @@ -30,7 +31,6 @@ echo ">>> Check if output is present" [[ ! -s "$test_dir/test_output.bam" ]] && echo "Output file is empty" && exit 1 echo ">>> Check if output is correct" -apt-get update && apt-get install -y samtools diff <(samtools view "$test_dir/test_output.bam") <(samtools view "$test_dir/test.bam") || (echo "Output is incorrect" && exit 1) ################################################################################