From 38f586bec0ac9e4312b016e29c3aa0bd53f292b2 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:04:14 +0100 Subject: [PATCH 1/6] initial commit dedup --- CHANGELOG.md | 3 + src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ++++++++++++++++++ src/umi_tools/umi_tools_dedup/help.txt | 13 + src/umi_tools/umi_tools_dedup/script.sh | 65 ++++ src/umi_tools/umi_tools_dedup/test.sh | 49 +++ 5 files changed, 409 insertions(+) create mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_dedup/help.txt create mode 100644 src/umi_tools/umi_tools_dedup/script.sh create mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fd7f001..1bef9345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). +* `umi_tools`: + - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml new file mode 100644 index 00000000..75306541 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -0,0 +1,279 @@ +name: umi_tool_dedup +namespace: umi_tools +description: | + Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. +keywords: [umi_tools, deduplication, dedup] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: -I + type: file + description: Input BAM or SAM file. Use --in_sam to specify SAM format. + required: true + - name: --in_sam + type: boolean_true + description: | + By default, inputs are assumed to be in BAM format. Use this options + to specify the use of SAM format for input. + - name: --bai + type: file + description: BAM index + - name: --get_output_stats + type: boolean + description: Whether or not to generate output stats. + - name: --random_seed + type: integer + description: | + Random seed to initialize number generator with. + default: none + + - name: Outputs + arguments: + - name: --output + alternatives: -S + type: file + description: Deduplicated BAM file + required: true + direction: output + - name: --out_sam + type: boolean_true + description: | + By default, outputa are written in BAM format. Use this options to + specify the use of SAM format for output. + - name: --paired + type: boolean_true + description: | + BAM is paired end - output both read pairs. This will also force the + use of the template length to determine reads with the same mapping + coordinates. + - name: --output_stats + type: file + description: Directory containing UMI based deduplication statistics files + direction: output + - name: --extract_umi_method + type: string + description: | + Specify the method by which the barcodes were encoded in the read. + The options are: [read_id, tag, umis]. + default: read_id + - name: --umi_tag + type: string + description: | + The tag containing the UMI sequence. + This is only required if the extract_umi_method is set to tag. + - name: --umi_separator + type: string + description: | + The separator used to separate the UMI from the read sequence. + This is only required if the extract_umi_method is set to id_read. + default: '_' + - name: --umi_tag_split + type: string + description: | + Separate the UMI in tag by and take the first element. + - name: --umi_tag_delimiter + type: string + description: | + Separate the UMI in by and concatenate the elements + - name: --cell_tag + type: string + description: | + The tag containing the cell barcode sequence. + This is only required if the extract_umi_method is set to tag. + - name: --cell_tag_split + type: string + description: | + Separate the cell barcode in tag by and take the first element. + - name: --cell_tag_delimiter + type: string + description: | + Separate the cell barcode in by and concatenate the elements + + - name: Grouping Options + arguments: + - name: --method + type: string + description: | + The method to use for grouping reads. The options are: + [unique, percentile, cluster, adjacency, directional]. + default: directional + - name: --edit_distance_threshold + type: integer + description: | + For the adjacency and cluster methods the threshold for the edit + distance to connect two UMIs in the network can be increased. The + default value of 1 works best unless the UMI is very long (>14bp). + default: 1 + - name: --spliced_is_unique + type: boolean_true + description: | + Causes two reads that start in the same position on the same strand + and having the same UMI to be considered unique if one is spliced + and the other is not. (Uses the ‘N’ cigar operation to test for splicing). + - name: --soft_clip_threshold + type: integer + description: | + Mappers that soft clip will sometimes do so rather than mapping a + spliced read if there is only a small overhang over the exon junction. + By setting this option, you can treat reads with at least this many + bases soft-clipped at the 3’ end as spliced. + default: 4 + - name: --multimapping_detection_method + type: string + description: | + If the sam/bam contains tags to identify multimapping reads, you can + specify for use when selecting the best read at a given loci. Supported + tags are “NH”, “X0” and “XT”. If not specified, the read with the highest + mapping quality will be selected. + - name: --read_length + type: integer + description: | + Use the read length as a criteria when deduping, for e.g sRNA-Seq. + + - name: Single-cell RNA-Seq Options + arguments: + - name: --per_gene + type: boolean_true + description: | + Reads will be grouped together if they have the same gene. This is useful + if your library prep generates PCR duplicates with non identical alignment + positions such as CEL-Seq. Note this option is hardcoded to be on with the + count command. I.e counting is always performed per-gene. Must be combined + with either --gene_tag or --per_contig option. + - name: --gene_tag + type: string + description: | + Deduplicate per gene. The gene information is encoded in the bam read tag + specified. + - name: --assigned_status_tag + type: string + description: | + BAM tag which describes whether a read is assigned to a gene. Defaults to + the same value as given for --gene_tag. + - name: --skip_tags_regex + type: string + description: | + Use in conjunction with the --assigned_status_tag option to skip any reads + where the tag matches this regex. Default ("^[__|Unassigned]") matches + anything which starts with “__” or “Unassigned”. + - name: --per_contig + type: boolean_true + description: | + Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same + contig will be considered to have the same alignment position. This is + useful if you have aligned to a reference transcriptome with one + transcript per gene. If you have aligned to a transcriptome with more + than one transcript per gene, you can supply a map between transcripts + and gene using the --gene_transcript_map option. + - name: --gene_transcript_map + type: file + description: | + A file containing a mapping between gene names and transcript names. + The file should be tab separated with the gene name in the first column + and the transcript name in the second column. + - name: --per_cell + type: boolean_true + description: | + Reads will only be grouped together if they have the same cell barcode. + Can be combined with --per_gene. + + - name: SAM/BAM Options + arguments: + - name: --mapping_quality + type: integer + description: | + Minimium mapping quality (MAPQ) for a read to be retained. + default: 0 + - name: --unmapped_reads + type: string + description: | + How unmapped reads should be handled. + The options are: + "discard": Discard all unmapped reads. + "use": If read2 is unmapped, deduplicate using read1 only. + Requires --paired. + "output": Output unmapped reads/read pairs without UMI + grouping/deduplication. Only available in umi_tools group. + default: discard + - name: --chimeric_pairs + type: string + description: | + How chimeric pairs should be handled. + The options are: + "discard": Discard all chimeric read pairs. + "use": Deduplicate using read1 only. + "output": Output chimeric pairs without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --unapired_reads + type: string + description: | + How unpaired reads should be handled. + The options are: + "discard": Discard all unpaired reads. + "use": Deduplicate using read1 only. + "output": Output unpaired reads without UMI grouping/deduplication. + Only available in umi_tools group. + default: use + - name: --ignore_umi + type: boolean_true + description: | + Ignore the UMI and group reads using mapping coordinates only. + - name: --subset + type: boolean_true + description: | + Only consider a fraction of the reads, chosen at random. This is useful + for doing saturation analyses. + - name: --chrom + type: string + description: | + Only consider a single chromosome. This is useful for debugging/testing + purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted + file (saved in --temp-dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for + deduplication. This is the only way to absolutely guarantee that all reads + with the same start position are grouped together for deduplication since + dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another + 1000bp before outputting read groups which will avoid any reads being missed + with short read sequencing (<1000bp). + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..d3c8fa44 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,13 @@ +``` +umi_tools dedup +``` + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh new file mode 100644 index 00000000..57c01258 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/script.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -e + +test_dir="${metal_executable}/test_data" + +[[ "$par_paired" == "false" ]] && unset par_paired +[[ "$par_in_sam" == "false" ]] && unset par_in_sam +[[ "$par_out_sam" == "false" ]] && unset par_out_sam +[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique +[[ "$par_per_gene" == "false" ]] && unset par_per_gene +[[ "$par_per_contig" == "false" ]] && unset par_per_contig +[[ "$par_per_cell" == "false" ]] && unset par_per_cell +[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output +[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig +[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi +[[ "$par_subset" == "false" ]] && unset par_subset + + +$(which umi_tools) dedup \ + -I "$par_input" \ + ${par_in_sam:+--in-sam} \ + ${par_bai:+--bai "$par_bai"} \ + ${par_get_output_stats:+--get-output-stats} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + -S "$par_output" \ + ${par_out_sam:+--out-sam} \ + ${par_paired:+--paired} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ + ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ + ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ + ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ + ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ + ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ + ${par_method:+--method "$par_method"} \ + ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ + ${par_spliced_is_unique:+--spliced-is-unique} \ + ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ + ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ + ${par_read_length:+--read-length "$par_read_length"} \ + ${par_per_gene:+--per-gene} \ + ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ + ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ + ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ + ${par_per_contig:+--per-contig} + ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ + ${par_per_cell:+--per-cell} \ + ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ + ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ + ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ + ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ + ${par_ignore_umi:+--ignore-umi} \ + ${par_subset:+--subset} \ + ${par_chrom:+--chrom "$par_chrom"} \ + ${par_no_sort_output:+--no-sort-output} \ + ${par_buffer_whole_contig:+--buffer-whole-contig} + + +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh new file mode 100644 index 00000000..1459ec08 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/test.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" +echo ">>> Testing $meta_functionality_name" + +"$meta_executable" \ + --bam "$test_dir/a.sorted.bam" \ + --bai "$test_dir/a.sorted.bam.bai" \ + --output "$test_dir/a.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ + (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/a.sorted.idxstats" + +############################################################################################ + +echo ">>> Testing $meta_functionality_name with singletons in the input" + +"$meta_executable" \ + --bam "$test_dir/test.paired_end.sorted.bam" \ + --bai "$test_dir/test.paired_end.sorted.bam.bai" \ + --output "$test_dir/test.paired_end.sorted.idxstats" + +echo ">>> Checking whether output exists" +[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 + +echo ">>> Checking whether output is non-empty" +[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ + echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 + +echo ">>> Checking whether output is correct" +diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ + (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) + +rm "$test_dir/test.paired_end.sorted.idxstats" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file From 2c269682620a407803e528652198646435ef2c03 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Thu, 11 Apr 2024 11:38:57 +0100 Subject: [PATCH 2/6] Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. --- CHANGELOG.md | 3 - src/umi_tools/umi_tools_dedup/config.vsh.yaml | 279 ------------------ src/umi_tools/umi_tools_dedup/help.txt | 13 - src/umi_tools/umi_tools_dedup/script.sh | 65 ---- src/umi_tools/umi_tools_dedup/test.sh | 49 --- 5 files changed, 409 deletions(-) delete mode 100644 src/umi_tools/umi_tools_dedup/config.vsh.yaml delete mode 100644 src/umi_tools/umi_tools_dedup/help.txt delete mode 100644 src/umi_tools/umi_tools_dedup/script.sh delete mode 100644 src/umi_tools/umi_tools_dedup/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bef9345..4fd7f001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,9 +39,6 @@ - `samtools/flagstat`: Counts the number of alignments in SAM/BAM/CRAM files for each FLAG type (PR #31). - `samtools/idxstats`: Reports alignment summary statistics for a SAM/BAM/CRAM file (PR #32). -* `umi_tools`: - - `umi_tools/umi_tools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #). - ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml deleted file mode 100644 index 75306541..00000000 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ /dev/null @@ -1,279 +0,0 @@ -name: umi_tool_dedup -namespace: umi_tools -description: | - Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read. -keywords: [umi_tools, deduplication, dedup] -links: - homepage: https://umi-tools.readthedocs.io/en/latest/ - documentation: [ https://umi-tools.readthedocs.io/en/latest/reference/dedup.html, - https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options ] - repository: https://github.com/CGATOxford/UMI-tools -references: - doi: 10.1101/gr.209601.116 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: -I - type: file - description: Input BAM or SAM file. Use --in_sam to specify SAM format. - required: true - - name: --in_sam - type: boolean_true - description: | - By default, inputs are assumed to be in BAM format. Use this options - to specify the use of SAM format for input. - - name: --bai - type: file - description: BAM index - - name: --get_output_stats - type: boolean - description: Whether or not to generate output stats. - - name: --random_seed - type: integer - description: | - Random seed to initialize number generator with. - default: none - - - name: Outputs - arguments: - - name: --output - alternatives: -S - type: file - description: Deduplicated BAM file - required: true - direction: output - - name: --out_sam - type: boolean_true - description: | - By default, outputa are written in BAM format. Use this options to - specify the use of SAM format for output. - - name: --paired - type: boolean_true - description: | - BAM is paired end - output both read pairs. This will also force the - use of the template length to determine reads with the same mapping - coordinates. - - name: --output_stats - type: file - description: Directory containing UMI based deduplication statistics files - direction: output - - name: --extract_umi_method - type: string - description: | - Specify the method by which the barcodes were encoded in the read. - The options are: [read_id, tag, umis]. - default: read_id - - name: --umi_tag - type: string - description: | - The tag containing the UMI sequence. - This is only required if the extract_umi_method is set to tag. - - name: --umi_separator - type: string - description: | - The separator used to separate the UMI from the read sequence. - This is only required if the extract_umi_method is set to id_read. - default: '_' - - name: --umi_tag_split - type: string - description: | - Separate the UMI in tag by and take the first element. - - name: --umi_tag_delimiter - type: string - description: | - Separate the UMI in by and concatenate the elements - - name: --cell_tag - type: string - description: | - The tag containing the cell barcode sequence. - This is only required if the extract_umi_method is set to tag. - - name: --cell_tag_split - type: string - description: | - Separate the cell barcode in tag by and take the first element. - - name: --cell_tag_delimiter - type: string - description: | - Separate the cell barcode in by and concatenate the elements - - - name: Grouping Options - arguments: - - name: --method - type: string - description: | - The method to use for grouping reads. The options are: - [unique, percentile, cluster, adjacency, directional]. - default: directional - - name: --edit_distance_threshold - type: integer - description: | - For the adjacency and cluster methods the threshold for the edit - distance to connect two UMIs in the network can be increased. The - default value of 1 works best unless the UMI is very long (>14bp). - default: 1 - - name: --spliced_is_unique - type: boolean_true - description: | - Causes two reads that start in the same position on the same strand - and having the same UMI to be considered unique if one is spliced - and the other is not. (Uses the ‘N’ cigar operation to test for splicing). - - name: --soft_clip_threshold - type: integer - description: | - Mappers that soft clip will sometimes do so rather than mapping a - spliced read if there is only a small overhang over the exon junction. - By setting this option, you can treat reads with at least this many - bases soft-clipped at the 3’ end as spliced. - default: 4 - - name: --multimapping_detection_method - type: string - description: | - If the sam/bam contains tags to identify multimapping reads, you can - specify for use when selecting the best read at a given loci. Supported - tags are “NH”, “X0” and “XT”. If not specified, the read with the highest - mapping quality will be selected. - - name: --read_length - type: integer - description: | - Use the read length as a criteria when deduping, for e.g sRNA-Seq. - - - name: Single-cell RNA-Seq Options - arguments: - - name: --per_gene - type: boolean_true - description: | - Reads will be grouped together if they have the same gene. This is useful - if your library prep generates PCR duplicates with non identical alignment - positions such as CEL-Seq. Note this option is hardcoded to be on with the - count command. I.e counting is always performed per-gene. Must be combined - with either --gene_tag or --per_contig option. - - name: --gene_tag - type: string - description: | - Deduplicate per gene. The gene information is encoded in the bam read tag - specified. - - name: --assigned_status_tag - type: string - description: | - BAM tag which describes whether a read is assigned to a gene. Defaults to - the same value as given for --gene_tag. - - name: --skip_tags_regex - type: string - description: | - Use in conjunction with the --assigned_status_tag option to skip any reads - where the tag matches this regex. Default ("^[__|Unassigned]") matches - anything which starts with “__” or “Unassigned”. - - name: --per_contig - type: boolean_true - description: | - Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same - contig will be considered to have the same alignment position. This is - useful if you have aligned to a reference transcriptome with one - transcript per gene. If you have aligned to a transcriptome with more - than one transcript per gene, you can supply a map between transcripts - and gene using the --gene_transcript_map option. - - name: --gene_transcript_map - type: file - description: | - A file containing a mapping between gene names and transcript names. - The file should be tab separated with the gene name in the first column - and the transcript name in the second column. - - name: --per_cell - type: boolean_true - description: | - Reads will only be grouped together if they have the same cell barcode. - Can be combined with --per_gene. - - - name: SAM/BAM Options - arguments: - - name: --mapping_quality - type: integer - description: | - Minimium mapping quality (MAPQ) for a read to be retained. - default: 0 - - name: --unmapped_reads - type: string - description: | - How unmapped reads should be handled. - The options are: - "discard": Discard all unmapped reads. - "use": If read2 is unmapped, deduplicate using read1 only. - Requires --paired. - "output": Output unmapped reads/read pairs without UMI - grouping/deduplication. Only available in umi_tools group. - default: discard - - name: --chimeric_pairs - type: string - description: | - How chimeric pairs should be handled. - The options are: - "discard": Discard all chimeric read pairs. - "use": Deduplicate using read1 only. - "output": Output chimeric pairs without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --unapired_reads - type: string - description: | - How unpaired reads should be handled. - The options are: - "discard": Discard all unpaired reads. - "use": Deduplicate using read1 only. - "output": Output unpaired reads without UMI grouping/deduplication. - Only available in umi_tools group. - default: use - - name: --ignore_umi - type: boolean_true - description: | - Ignore the UMI and group reads using mapping coordinates only. - - name: --subset - type: boolean_true - description: | - Only consider a fraction of the reads, chosen at random. This is useful - for doing saturation analyses. - - name: --chrom - type: string - description: | - Only consider a single chromosome. This is useful for debugging/testing - purposes. - - - name: Group/Dedup Options - arguments: - - name: --no_sort_output - type: boolean_true - description: | - By default, output is sorted. This involves the use of a temporary unsorted - file (saved in --temp-dir). Use this option to turn off sorting. - - name: --buffer_whole_contig - type: boolean_true - description: | - Forces dedup to parse an entire contig before yielding any reads for - deduplication. This is the only way to absolutely guarantee that all reads - with the same start position are grouped together for deduplication since - dedup uses the start position of the read, not the alignment coordinate on - which the reads are sorted. However, by default, dedup reads for another - 1000bp before outputting read groups which will avoid any reads being missed - with short read sequencing (<1000bp). - - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -engines: - - type: docker - image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 - setup: - - type: docker - run: | - umi_tools -v | sed 's/ version//g' > /var/software_versions.txt -runners: -- type: executable -- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt deleted file mode 100644 index d3c8fa44..00000000 --- a/src/umi_tools/umi_tools_dedup/help.txt +++ /dev/null @@ -1,13 +0,0 @@ -``` -umi_tools dedup -``` - -dedup - Deduplicate reads using UMI and mapping coordinates - -Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] - - note: If --stdout is ommited, standard out is output. To - generate a valid BAM file on standard out, please - redirect log with --log=LOGFILE or --log2stderr - -For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/script.sh b/src/umi_tools/umi_tools_dedup/script.sh deleted file mode 100644 index 57c01258..00000000 --- a/src/umi_tools/umi_tools_dedup/script.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -set -e - -test_dir="${metal_executable}/test_data" - -[[ "$par_paired" == "false" ]] && unset par_paired -[[ "$par_in_sam" == "false" ]] && unset par_in_sam -[[ "$par_out_sam" == "false" ]] && unset par_out_sam -[[ "$par_spliced_is_unique" == "false" ]] && unset par_spliced_is_unique -[[ "$par_per_gene" == "false" ]] && unset par_per_gene -[[ "$par_per_contig" == "false" ]] && unset par_per_contig -[[ "$par_per_cell" == "false" ]] && unset par_per_cell -[[ "$par_no_sort_output" == "false" ]] && unset par_no_sort_output -[[ "$par_buffer_whole_contig" == "false" ]] && unset par_buffer_whole_contig -[[ "$par_ignore_umi" == "false" ]] && unset par_ignore_umi -[[ "$par_subset" == "false" ]] && unset par_subset - - -$(which umi_tools) dedup \ - -I "$par_input" \ - ${par_in_sam:+--in-sam} \ - ${par_bai:+--bai "$par_bai"} \ - ${par_get_output_stats:+--get-output-stats} \ - ${par_random_seed:+--random-seed "$par_random_seed"} \ - -S "$par_output" \ - ${par_out_sam:+--out-sam} \ - ${par_paired:+--paired} \ - ${par_output_stats:+--output-stats "$par_output_stats"} \ - ${par_extract_umi_method:+--extract-umi-method "$par_extract_umi_method"} \ - ${par_umi_tag:+--umi-tag "$par_umi_tag"} \ - ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ - ${par_umi_tag_split:+--umi-tag-split "$par_umi_tag_split"} \ - ${par_umi_tag_delimiter:+--umi-tag-delimiter "$par_umi_tag_delimiter"} \ - ${par_cell_tag:+--cell-tag "$par_cell_tag"} \ - ${par_cell_tag_split:+--cell-tag-split "$par_cell_tag_split"} \ - ${par_cell_tag_delimiter:+--cell-tag-delimiter "$par_cell_tag_delimiter"} \ - ${par_method:+--method "$par_method"} \ - ${par_edit_distance_threshold:+--edit-distance-threshold "$par_edit_distance_threshold"} \ - ${par_spliced_is_unique:+--spliced-is-unique} \ - ${par_soft_clip_threshold:+--soft-clip-threshold "$par_soft_clip_threshold"} \ - ${par_multimapping_detection_method:+--multimapping-detection-method "$par_multimapping_detection_method"} \ - ${par_read_length:+--read-length "$par_read_length"} \ - ${par_per_gene:+--per-gene} \ - ${par_gene_tag:+--gene-tag "$par_gene_tag"} \ - ${par_assigned_status_tag:+--assigned-status-tag "$par_assigned_status_tag"} \ - ${par_skip_tags_regex:+--skip-tags-regex "$par_skip_tags_regex"} \ - ${par_per_contig:+--per-contig} - ${par_gene_transcript_map:+--gene-transcript-map "$par_gene_transcript_map"} \ - ${par_per_cell:+--per-cell} \ - ${par_mapping_quality:+--mapping-quality "$par_mapping_quality"} \ - ${par_unmapped_reads:+--unmapped-reads "$par_unmapped_reads"} \ - ${par_chimeric_pairs:+--chimeric-pairs "$par_chimeric_pairs"} \ - ${par_unapired_reads:+--unapired-reads "$par_unapired_reads"} \ - ${par_ignore_umi:+--ignore-umi} \ - ${par_subset:+--subset} \ - ${par_chrom:+--chrom "$par_chrom"} \ - ${par_no_sort_output:+--no-sort-output} \ - ${par_buffer_whole_contig:+--buffer-whole-contig} - - -exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/test.sh b/src/umi_tools/umi_tools_dedup/test.sh deleted file mode 100644 index 1459ec08..00000000 --- a/src/umi_tools/umi_tools_dedup/test.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -test_dir="${meta_resources_dir}/test_data" -echo ">>> Testing $meta_functionality_name" - -"$meta_executable" \ - --bam "$test_dir/a.sorted.bam" \ - --bai "$test_dir/a.sorted.bam.bai" \ - --output "$test_dir/a.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/a.sorted.idxstats" ] && echo "File 'a.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/a.sorted.idxstats" "$test_dir/a_ref.sorted.idxstats" || \ - (echo "Output file a.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/a.sorted.idxstats" - -############################################################################################ - -echo ">>> Testing $meta_functionality_name with singletons in the input" - -"$meta_executable" \ - --bam "$test_dir/test.paired_end.sorted.bam" \ - --bai "$test_dir/test.paired_end.sorted.bam.bai" \ - --output "$test_dir/test.paired_end.sorted.idxstats" - -echo ">>> Checking whether output exists" -[ ! -f "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' does not exist!" && exit 1 - -echo ">>> Checking whether output is non-empty" -[ ! -s "$test_dir/test.paired_end.sorted.idxstats" ] && \ - echo "File 'test.paired_end.sorted.idxstats' is empty!" && exit 1 - -echo ">>> Checking whether output is correct" -diff "$test_dir/test.paired_end.sorted.idxstats" "$test_dir/test_ref.paired_end.sorted.idxstats" || \ - (echo "Output file test.paired_end.sorted.idxstats does not match expected output" && exit 1) - -rm "$test_dir/test.paired_end.sorted.idxstats" - -############################################################################################ - -echo "All tests succeeded!" -exit 0 \ No newline at end of file From 20b858983dffefc9e0c03fe46b5c3c244848e7fd Mon Sep 17 00:00:00 2001 From: Emma Rousseau Date: Thu, 31 Oct 2024 18:54:40 +0000 Subject: [PATCH 3/6] mkref component - config, test data, scripts, changelog --- CHANGELOG.md | 5 ++ .../cellranger_mkref/config.vsh.yaml | 73 ++++++++++++++++++ src/cellranger/cellranger_mkref/help.txt | 71 +++++++++++++++++ src/cellranger/cellranger_mkref/script.sh | 38 +++++++++ src/cellranger/cellranger_mkref/test.sh | 42 ++++++++++ .../test_data/reference_small.fa.gz | Bin 0 -> 584 bytes .../test_data/reference_small.gtf.gz | Bin 0 -> 514 bytes .../cellranger_mkref/test_data/script.sh | 51 ++++++++++++ 8 files changed, 280 insertions(+) create mode 100644 src/cellranger/cellranger_mkref/config.vsh.yaml create mode 100644 src/cellranger/cellranger_mkref/help.txt create mode 100644 src/cellranger/cellranger_mkref/script.sh create mode 100644 src/cellranger/cellranger_mkref/test.sh create mode 100644 src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz create mode 100644 src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz create mode 100755 src/cellranger/cellranger_mkref/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index a2aa5387..54138852 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). +* `cellranger`: + - `cellranger/cellranger_count`: Align fastq files using Cell Ranger count (PR #163). + - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164). + + ## BREAKING CHANGES * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157). diff --git a/src/cellranger/cellranger_mkref/config.vsh.yaml b/src/cellranger/cellranger_mkref/config.vsh.yaml new file mode 100644 index 00000000..b0ee993f --- /dev/null +++ b/src/cellranger/cellranger_mkref/config.vsh.yaml @@ -0,0 +1,73 @@ +name: cellranger_mkref +namespace: cellranger +description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. +keywords: [ cellranger, single-cell, rna-seq, alignment, reference, gtf, fasta ] +links: + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references#header + repository: https://github.com/10XGenomics/cellranger/blob/main/lib/python/cellranger/reference_builder.py + homepage: https://www.10xgenomics.com/support/software/cell-ranger/latest + issue_tracker: https://github.com/10XGenomics/cellranger/issues +references: + doi: 10.1038/ncomms14049 +license: Copyright (c) 2023 10x Genomics +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author ] +arguments: + # inputs + - type: file + name: --genome_fasta + required: true + description: Reference genome fasta. + example: genome_sequence.fa.gz + - type: file + name: --transcriptome_gtf + required: true + description: Reference transcriptome annotation. + example: transcriptome_annotation.gtf.gz + - type: string + name: "--reference_version" + description: "Optional reference version string to include with reference" + - type: file + name: --output + direction: output + required: true + description: Output folder + example: cellranger_reference +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: quay.io/nf-core/cellranger:8.0.0 + setup: + - type: docker + run: | + DEBIAN_FRONTEND=noninteractive apt update && \ + apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/* + test_setup: + - type: apt + packages: [ git, wget ] + - type: docker + run: | + TARGETARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \ + TARGETOS="${TARGETOS:-linux}" && \ + PATH="${PATH}:/usr/local/go/bin" && \ + wget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ + rm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ + git clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \ + cd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go + - type: docker + run: | + cellranger --version | sed 's/ cellranger-/: /' > /var/software_versions.txt + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/cellranger/cellranger_mkref/help.txt b/src/cellranger/cellranger_mkref/help.txt new file mode 100644 index 00000000..9cd45cb9 --- /dev/null +++ b/src/cellranger/cellranger_mkref/help.txt @@ -0,0 +1,71 @@ +``` +cellranger mkref -h +``` +Prepare a reference for use with 10x analysis software. Requires a GTF and +FASTA + +Usage: cellranger mkref [OPTIONS] --genome --fasta --genes + +Options: + --genome + Unique genome name, used to name output folder [a-zA-Z0-9_-]+. + Specify multiple genomes by specifying this argument multiple + times; the output folder will be _and_ + --fasta + Path to FASTA file containing your genome reference. Specify + multiple genomes by specifying this argument multiple times + --genes + Path to genes GTF file containing annotated genes for your genome + reference. Specify multiple genomes by specifying this argument + multiple times + --nthreads + Number of threads used during STAR genome index generation. + Defaults to 1 [default: 1] + --memgb + Maximum memory (GB) used [default: 16] + --ref-version + Optional reference version string to include with reference + --dry + Do not execute the pipeline. Generate a pipeline invocation (.mro) + file and stop + --jobmode + Job manager to use. Valid options: local (default), sge, lsf, + slurm or path to a .template file. Search for help on "Cluster + Mode" at support.10xgenomics.com for more details on configuring + the pipeline to use a compute cluster + --localcores + Set max cores the pipeline may request at one time. Only applies + to local jobs + --localmem + Set max GB the pipeline may request at one time. Only applies to + local jobs + --localvmem + Set max virtual address space in GB for the pipeline. Only applies + to local jobs + --mempercore + Reserve enough threads for each job to ensure enough memory will + be available, assuming each core on your cluster has at least this + much memory available. Only applies to cluster jobmodes + --maxjobs + Set max jobs submitted to cluster at one time. Only applies to + cluster jobmodes + --jobinterval + Set delay between submitting jobs to cluster, in ms. Only applies + to cluster jobmodes + --overrides + The path to a JSON file that specifies stage-level overrides for + cores and memory. Finer-grained than --localcores, --mempercore + and --localmem. Consult https://support.10xgenomics.com/ for an + example override file + --output-dir + Output the results to this directory + --uiport + Serve web UI at http://localhost:PORT + --disable-ui + Do not serve the web UI + --noexit + Keep web UI running after pipestance completes or fails + --nopreflight + Skip preflight checks + -h, --help + Print help diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh new file mode 100644 index 00000000..9ee4b25e --- /dev/null +++ b/src/cellranger/cellranger_mkref/script.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_genome_fasta="resources_test/test_data/reference_small.fa.gz" +par_transcriptome_gtf="resources_test/test_data/reference_small.gtf.gz" +par_output="gencode_v41_annotation_cellranger.tar.gz" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# just to make sure +par_genome_fasta=$(realpath $par_genome_fasta) +par_transcriptome_gtf=$(realpath $par_transcriptome_gtf) +par_output=$(realpath $par_output) + + +echo "> Unzipping input files" +unpigz -c "$par_genome_fasta" > "$tmpdir/genome.fa" + +echo "> Building star index" +cd "$tmpdir" +cellranger mkref \ + --fasta "$tmpdir/genome.fa" \ + --genes "$par_transcriptome_gtf" \ + --genome output \ + ${par_reference_version:+--ref-version $par_reference_version} \ + ${meta_cpus:+--nthreads $meta_cpus} \ + ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itseld + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" . \ No newline at end of file diff --git a/src/cellranger/cellranger_mkref/test.sh b/src/cellranger/cellranger_mkref/test.sh new file mode 100644 index 00000000..663c1c59 --- /dev/null +++ b/src/cellranger/cellranger_mkref/test.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -eou pipefail + +## VIASH START +meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing $(basename $input)" + seqkit subseq -r 1:50000 "$input" | gzip > "$output" + fi +} + +seqkit_head "$meta_resources_dir/test_data/reference_small.fa.gz" "$tmpdir/reference_small.fa.gz" +zcat "$meta_resources_dir/test_data/reference_small.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz" + +echo "> Running $meta_name, writing to $tmpdir." +$meta_executable \ + --genome_fasta "$tmpdir/reference_small.fa.gz" \ + --transcriptome_gtf "$tmpdir/reference_small.gtf.gz" \ + --output "$tmpdir/myreference.tar.gz" \ + ---cpus ${meta_memory_gb:-1} \ + ---memory ${meta_memory_gb:-5}GB + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1 + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz new file mode 100644 index 0000000000000000000000000000000000000000..e24b75a949df25d147877afbaf9a46e91775bb40 GIT binary patch literal 584 zcmV-O0=NAiiwFpZ%_C<319D|%WpZV1V`X1+ZDDL|E@oi>ZIrQbLop0PdrgrQyg=@F zD7XMONRvAI7a<!kLPz=|xt5`kE%CR{wTgrZiJ$N7^a-_GgL^RmAW?d!9O5u+~>uH*~;>#&?zvyud`pbKj+S{Z+PwkZBG97rp5}H2Pko}`z~fM2%0ut<2FcQ??Bg^ zq#I*{`?TXzOrXIMd*Jj3BdM4zdvMBBs@M#6@AJmHvmG#unm~})vnDjcL5FG z0#3P9ke6yUMM86A;DnF^;taIlZAzt-aXRe4bAXldH&>-h_YDr36A?F6K#0H~6;<@# zopOh6P)2v>skqP#L?7_W-hU6}cBwS=Vn3gLasQckTK+kWovUT0$=DQJ@`}7eyDvTb z-nnzpSkDbd_Xea?n%q?rMZK*eIJWyt&WeU}QR`mLcJ{h0AaqZpIS#di4XVElAP{wv znVb!m1qEp_LdF}*buR7!ea6m8D}she1ENRObtoNey+;*O6``dWVs!$iu4W(dMd}8t zlRzP&xW+zPlOBS~xLT7QTF}9EYr9{r3{-R~lR_1Gn>3&m(6519D|%WpZV1V`X1+ZDDL|E@yOR0OgilZ`v>vhOfO}5#@F& z^7qGfMJoi829rA4+sGJ?MoJu%q-gv1vr{rcfOSJbvob&h`$2^n>OosQv z*$~*Tg#(N;z_|ni@YcU8>ui!Pqj)s?Jci-Hlu*eAEPfx=rFNAm({(xr9AwvuI0*S1Vd;l8A0ZkO%+LRYD22P0S6e32zayTYYLXW_GYqn!?B zeW0J_^U2-xDfRwQObzf(v-7`I&bLA=DGaf`F@3;F2!NovY{KLiVTlD7>c1wulLPFF z7$F2-#hEKC)fqXTja|#=ccDN*d01=#!fU(f-(b+KHFUJrDB|jjwRR?6O>?}&qIvfE zlL|sk{|R6rrs?&^h7KTH00!x4V3HDo`!w?OG>jh^BK;f?h8695(P_Ca@a5Fu;ZQ)u zU+C_^$!Ybc56)h%B|kd~jiIA?__f<^KU+xYCZCTREX^3Ue /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! -f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +touch $OUT_DIR/reference_small.fa +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +touch $OUT_DIR/reference_small.gtf +gunzip -c "$ORIG_GTF" | awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf + +gzip $OUT_DIR/reference_small.fa +gzip $OUT_DIR/reference_small.gtf From 88ed15285e5757cc3833a7a57b928a9af1b81af5 Mon Sep 17 00:00:00 2001 From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:26:23 +0000 Subject: [PATCH 4/6] Small updates --- CHANGELOG.md | 2 -- .../cellranger_mkref/config.vsh.yaml | 29 ++++++++----------- src/cellranger/cellranger_mkref/script.sh | 10 +++---- src/cellranger/cellranger_mkref/test.sh | 2 +- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6206a5a1..3905de17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,6 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). * `cellranger`: - - `cellranger/cellranger_count`: Align fastq files using Cell Ranger count (PR #163). - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164). @@ -30,7 +29,6 @@ * `nanoplot`: Plotting tool for long read sequencing data and alignments (PR #95). - ## BUG FIXES * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157). diff --git a/src/cellranger/cellranger_mkref/config.vsh.yaml b/src/cellranger/cellranger_mkref/config.vsh.yaml index b0ee993f..b00c8f90 100644 --- a/src/cellranger/cellranger_mkref/config.vsh.yaml +++ b/src/cellranger/cellranger_mkref/config.vsh.yaml @@ -3,13 +3,15 @@ namespace: cellranger description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. keywords: [ cellranger, single-cell, rna-seq, alignment, reference, gtf, fasta ] links: - documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references#header + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references repository: https://github.com/10XGenomics/cellranger/blob/main/lib/python/cellranger/reference_builder.py homepage: https://www.10xgenomics.com/support/software/cell-ranger/latest issue_tracker: https://github.com/10XGenomics/cellranger/issues references: doi: 10.1038/ncomms14049 -license: Copyright (c) 2023 10x Genomics +license: Proprietary +requirements: + commands: [cellranger, pigz, unpigz, tar] authors: - __merge__: /src/_authors/emma_rousseau.yaml roles: [ author ] @@ -27,6 +29,7 @@ arguments: example: transcriptome_annotation.gtf.gz - type: string name: "--reference_version" + required: false description: "Optional reference version string to include with reference" - type: file name: --output @@ -44,24 +47,16 @@ test_resources: engines: - type: docker - image: quay.io/nf-core/cellranger:8.0.0 + image: ghcr.io/data-intuitive/cellranger:8.0 setup: - - type: docker - run: | - DEBIAN_FRONTEND=noninteractive apt update && \ - apt upgrade -y && apt install -y procps pigz && rm -rf /var/lib/apt/lists/* + - type: apt + packages: + - procps + - pigz test_setup: - type: apt - packages: [ git, wget ] - - type: docker - run: | - TARGETARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \ - TARGETOS="${TARGETOS:-linux}" && \ - PATH="${PATH}:/usr/local/go/bin" && \ - wget https://go.dev/dl/go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && tar -C /usr/local/ -xzf go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ - rm go1.21.4.$TARGETOS-$TARGETARCH.tar.gz && \ - git clone --branch v2.5.0 https://github.com/shenwei356/seqkit.git && \ - cd seqkit/seqkit/ && go build && cp seqkit /usr/bin/ && cd ../../ && rm -rf seqkit && rm -r /usr/local/go + packages: + - seqkit - type: docker run: | cellranger --version | sed 's/ cellranger-/: /' > /var/software_versions.txt diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh index 9ee4b25e..7179faae 100644 --- a/src/cellranger/cellranger_mkref/script.sh +++ b/src/cellranger/cellranger_mkref/script.sh @@ -3,19 +3,19 @@ set -eo pipefail ## VIASH START -par_genome_fasta="resources_test/test_data/reference_small.fa.gz" -par_transcriptome_gtf="resources_test/test_data/reference_small.gtf.gz" -par_output="gencode_v41_annotation_cellranger.tar.gz" +par_genome_fasta="test_data/reference_small.fa.gz" +par_transcriptome_gtf="test_data/reference_small.gtf.gz" +par_output="output.tar.gz" ## VIASH END # create temporary directory -tmpdir=$(mktemp -d "$VIASH_TEMP/$meta_name-XXXXXXXX") +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") function clean_up { rm -rf "$tmpdir" } trap clean_up EXIT -# just to make sure +# We change into the tempdir later, so we need absolute paths. par_genome_fasta=$(realpath $par_genome_fasta) par_transcriptome_gtf=$(realpath $par_transcriptome_gtf) par_output=$(realpath $par_output) diff --git a/src/cellranger/cellranger_mkref/test.sh b/src/cellranger/cellranger_mkref/test.sh index 663c1c59..5c5c1f3d 100644 --- a/src/cellranger/cellranger_mkref/test.sh +++ b/src/cellranger/cellranger_mkref/test.sh @@ -3,7 +3,7 @@ set -eou pipefail ## VIASH START -meta_executable="bin/viash run src/reference/make_reference/config.vsh.yaml --" +meta_executable="viash run src/reference/make_reference/config.vsh.yaml --" ## VIASH END # create temporary directory From 48ce1beda9fcb65d6565346d18f952a384a4fb16 Mon Sep 17 00:00:00 2001 From: DriesSchaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:29:26 +0000 Subject: [PATCH 5/6] Update CHANGELOG --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76f73c03..d751ca81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,9 +20,6 @@ * `cellranger`: - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164). - -## BREAKING CHANGES - * `rseqc`: - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159). - `rseqc/rseqc_inferexperiment`: Infer strandedness from sequencing reads (PR #158). From 919df7a87e23962dc3248446c31a040593224378 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 8 Nov 2024 10:35:11 +0100 Subject: [PATCH 6/6] Update src/cellranger/cellranger_mkref/script.sh --- src/cellranger/cellranger_mkref/script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh index 7179faae..03755998 100644 --- a/src/cellranger/cellranger_mkref/script.sh +++ b/src/cellranger/cellranger_mkref/script.sh @@ -32,7 +32,7 @@ cellranger mkref \ --genome output \ ${par_reference_version:+--ref-version $par_reference_version} \ ${meta_cpus:+--nthreads $meta_cpus} \ - ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itseld + ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itself echo "> Creating archive" -tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" . \ No newline at end of file +tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" .