From 7d99065ecf66e6bc42b03f8ffcfcfc95ef2d2b72 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 17:46:44 +0200 Subject: [PATCH 01/10] `bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (#75) * `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline * add missing metadata * remove unicode * trigger * process comments * add authors * Apply suggestions from code review Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --------- Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --- CHANGELOG.md | 6 + src/_authors/robrecht_cannoodt.yaml | 14 ++ src/_authors/weiwei_schultz.yaml | 5 + .../config.vsh.yaml | 143 ++++++++++++++++ .../bd_rhapsody_make_reference/help.txt | 66 +++++++ .../make_rhap_reference_2.2.1_nodocker.cwl | 115 +++++++++++++ .../bd_rhapsody_make_reference/script.py | 161 ++++++++++++++++++ .../bd_rhapsody_make_reference/test.sh | 68 ++++++++ .../test_data/reference_small.fa | 27 +++ .../test_data/reference_small.gtf | 8 + .../test_data/script.sh | 47 +++++ 11 files changed, 660 insertions(+) create mode 100644 src/_authors/robrecht_cannoodt.yaml create mode 100644 src/_authors/weiwei_schultz.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/help.txt create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/script.py create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test.sh create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 80b8b9f3..9cfacdbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # biobox x.x.x +## NEW FEATURES + +* `bd_rhapsody`: + + - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). + ## BUG FIXES * `pear`: fix component not exiting with the correct exitcode when PEAR fails. diff --git a/src/_authors/robrecht_cannoodt.yaml b/src/_authors/robrecht_cannoodt.yaml new file mode 100644 index 00000000..d7c0f283 --- /dev/null +++ b/src/_authors/robrecht_cannoodt.yaml @@ -0,0 +1,14 @@ +name: Robrecht Cannoodt +info: + links: + email: robrecht@data-intuitive.com + github: rcannood + orcid: "0000-0003-3641-729X" + linkedin: robrechtcannoodt + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Science Engineer + - name: Open Problems + href: https://openproblems.bio + role: Core Member \ No newline at end of file diff --git a/src/_authors/weiwei_schultz.yaml b/src/_authors/weiwei_schultz.yaml new file mode 100644 index 00000000..324f9378 --- /dev/null +++ b/src/_authors/weiwei_schultz.yaml @@ -0,0 +1,5 @@ +name: Weiwei Schultz +info: + organizations: + - name: Janssen R&D US + role: Associate Director Data Sciences \ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml new file mode 100644 index 00000000..e596bf06 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml @@ -0,0 +1,143 @@ +name: bd_rhapsody_make_reference +namespace: bd_rhapsody +description: | + The Reference Files Generator creates an archive containing Genome Index + and Transcriptome annotation files needed for the BD Rhapsody Sequencing + Analysis Pipeline. The app takes as input one or more FASTA and GTF files + and produces a compressed archive in the form of a tar.gz file. The + archive contains: + + - STAR index + - Filtered GTF file +keywords: [genome, reference, index, align] +links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1/Extra_Utilities/ + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com/resources/extra_utilities.html#make-rhapsody-reference +license: Unknown +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --genome_fasta + required: true + description: Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + example: genome_sequence.fa.gz + multiple: true + info: + config_key: Genome_fasta + - type: file + name: --gtf + required: true + description: | + File path to the transcript annotation files in GTF or GTF.GZ format. The Sequence Analysis Pipeline requires the 'gene_name' or + 'gene_id' attribute to be set on each gene and exon feature. Gene and exon feature lines must have the same attribute, and exons + must have a corresponding gene with the same value. For TCR/BCR assays, the TCR or BCR gene segments must have the 'gene_type' or + 'gene_biotype' attribute set, and the value should begin with 'TR' or 'IG', respectively. + example: transcriptome_annotation.gtf.gz + multiple: true + info: + config_key: Gtf + - type: file + name: --extra_sequences + description: | + File path to additional sequences in FASTA format to use when building the STAR index. (e.g. transgenes or CRISPR guide barcodes). + GTF lines for these sequences will be automatically generated and combined with the main GTF. + required: false + multiple: true + info: + config_key: Extra_sequences + - name: Outputs + arguments: + - type: file + name: --reference_archive + direction: output + required: true + description: | + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an + input in the BD Rhapsody Sequencing Analysis Pipeline. + example: star_index.tar.gz + - name: Arguments + arguments: + - type: string + name: --mitochondrial_contigs + description: | + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are + identified as 'nuclear fragments' in the ATACseq analysis pipeline. + required: false + multiple: true + default: [chrM, chrMT, M, MT] + info: + config_key: Mitochondrial_contigs + - type: boolean_true + name: --filtering_off + description: | + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features + having the following attribute values are kept: + + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + info: + config_key: Filtering_off + - type: boolean_true + name: --wta_only_index + description: Build a WTA only index, otherwise builds a WTA + ATAC index. + info: + config_key: Wta_Only + - type: string + name: --extra_star_params + description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + required: false + info: + config_key: Extra_STAR_params + +resources: + - type: python_script + path: script.py + - path: make_rhap_reference_2.2.1_nodocker.cwl + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +requirements: + commands: [ "cwl-runner" ] + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps] + - type: python + packages: [cwlref-runner, cwl-runner] + - type: docker + run: | + echo "bdgenomics/rhapsody: 2.2.1" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt new file mode 100644 index 00000000..cd038b25 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt @@ -0,0 +1,66 @@ +```bash +cwl-runner src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl --help +``` + +usage: src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl + [-h] [--Archive_prefix ARCHIVE_PREFIX] + [--Extra_STAR_params EXTRA_STAR_PARAMS] + [--Extra_sequences EXTRA_SEQUENCES] [--Filtering_off] --Genome_fasta + GENOME_FASTA --Gtf GTF [--Maximum_threads MAXIMUM_THREADS] + [--Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS] [--WTA_Only] + [job_order] + +The Reference Files Generator creates an archive containing Genome Index and +Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing +Analysis Pipeline. The app takes as input one or more FASTA and GTF files and +produces a compressed archive in the form of a tar.gz file. The archive +contains:\n - STAR index\n - Filtered GTF file + +positional arguments: + job_order Job input json file + +options: + -h, --help show this help message and exit + --Archive_prefix ARCHIVE_PREFIX + A prefix for naming the compressed archive file + containing the Reference genome index and annotation + files. The default value is constructed based on the + input Reference files. + --Extra_STAR_params EXTRA_STAR_PARAMS + Additional parameters to pass to STAR when building + the genome index. Specify exactly like how you would + on the command line. Example: --limitGenomeGenerateRAM + 48000 --genomeSAindexNbases 11 + --Extra_sequences EXTRA_SEQUENCES + Additional sequences in FASTA format to use when + building the STAR index. (E.g. phiX genome) + --Filtering_off By default the input Transcript Annotation files are + filtered based on the gene_type/gene_biotype + attribute. Only features having the following + attribute values are are kept: - protein_coding - + lncRNA (lincRNA and antisense for Gencode < + v31/M22/Ensembl97) - IG_LV_gene - IG_V_gene - + IG_V_pseudogene - IG_D_gene - IG_J_gene - + IG_J_pseudogene - IG_C_gene - IG_C_pseudogene - + TR_V_gene - TR_V_pseudogene - TR_D_gene - TR_J_gene - + TR_J_pseudogene - TR_C_gene If you have already pre- + filtered the input Annotation files and/or wish to + turn-off the filtering, please set this option to + True. + --Genome_fasta GENOME_FASTA + Reference genome file in FASTA format. The BD + Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 + for Human and GRCm39 for Mouse. + --Gtf GTF Transcript annotation files in GTF format. The BD + Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode + v42 for Human and M31 for Mouse. + --Maximum_threads MAXIMUM_THREADS + The maximum number of threads to use in the pipeline. + By default, all available cores are used. + --Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS + Names of the Mitochondrial contigs in the provided + Reference Genome. Fragments originating from contigs + other than these are identified as 'nuclear fragments' + in the ATACseq analysis pipeline. + --WTA_Only Build a WTA only index, otherwise builds a WTA + ATAC + index. diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? + default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? + label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py new file mode 100644 index 00000000..ca635508 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py @@ -0,0 +1,161 @@ +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil + +## VIASH START +par = { + "genome_fasta": [], + "gtf": [], + "extra_sequences": [], + "mitochondrial_contigs": ["chrM", "chrMT", "M", "MT"], + "filtering_off": False, + "wta_only_index": False, + "extra_star_params": None, + "reference_archive": "output.tar.gz", +} +meta = { + "config": "target/nextflow/reference/build_bdrhap_2_reference/.config.vsh.yaml", + "resources_dir": os.path.abspath("src/reference/build_bdrhap_2_reference"), + "temp_dir": os.getenv("VIASH_TEMP"), + "memory_mb": None, + "cpus": None +} +## VIASH END + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["all_arguments"] = [ + clean_arg(arg) + for grp in config["argument_groups"] + for arg in grp["arguments"] + ] + + return config + +def strip_margin(text: str) -> str: + return re.sub("(\n?)[ \t]*\|", "\\1", text) + +def process_params(par: dict[str, Any], config) -> str: + # check input parameters + assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta." + assert par["gtf"], "Pass at least one set of inputs to --gtf." + assert par["reference_archive"].endswith(".tar.gz"), "Output reference_archive must end with .tar.gz." + + # make paths absolute + for argument in config["all_arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ os.path.abspath(f) for f in par[argument["clean_name"]] ] + else: + par[argument["clean_name"]] = os.path.abspath(par[argument["clean_name"]]) + + return par + +def generate_config(par: dict[str, Any], meta, config) -> str: + content_list = [strip_margin(f"""\ + |#!/usr/bin/env cwl-runner + | + |""")] + + + config_key_value_pairs = [] + for argument in config["all_arguments"]: + config_key = (argument.get("info") or {}).get("config_key") + arg_type = argument["type"] + par_value = par[argument["clean_name"]] + if par_value and config_key: + config_key_value_pairs.append((config_key, arg_type, par_value)) + + if meta["cpus"]: + config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"])) + + # print(config_key_value_pairs) + + for config_key, arg_type, par_value in config_key_value_pairs: + if arg_type == "file": + str = strip_margin(f"""\ + |{config_key}: + |""") + if isinstance(par_value, list): + for file in par_value: + str += strip_margin(f"""\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\ + | class: File + | location: "{par_value}" + |""") + content_list.append(str) + else: + content_list.append(strip_margin(f"""\ + |{config_key}: {par_value} + |""")) + + ## Write config to file + return "".join(content_list) + +def get_cwl_file(meta: dict[str, Any]) -> str: + # create cwl file (if need be) + cwl_file=os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl") + + return cwl_file + +def main(par: dict[str, Any], meta: dict[str, Any]): + config = read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config) + + # fetch cwl file + cwl_file = get_cwl_file(meta) + + # Create output dir if not exists + outdir = os.path.dirname(par["reference_archive"]) + if not os.path.exists(outdir): + os.makedirs(outdir) + + ## Run pipeline + with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"]) as temp_dir: + # Create params file + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, meta, config) + with open(config_file, "w") as f: + f.write(config_content) + + + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + temp_dir, + cwl_file, + config_file + ] + + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call( + cmd, + cwd=os.path.dirname(config_file), + env=env + ) + + shutil.move(os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"]) + +if __name__ == "__main__": + main(par, meta) diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh new file mode 100644 index 00000000..3637160a --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -e + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + # () will execute in a shubshell, could you use {;}? + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + # [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + # grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + # grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} + +in_fa="$meta_resources_dir/test_data/reference_small.fa" +in_gtf="$meta_resources_dir/test_data/reference_small.gtf" + +echo "#############################################" +echo "> Simple run" + +mkdir simple_run +cd simple_run + +out_tar="myreference.tar.gz" + +echo "> Running $meta_name." +$meta_executable \ + --genome_fasta "$in_fa" \ + --gtf "$in_gtf" \ + --reference_archive "$out_tar" \ + --extra_star_params "--genomeSAindexNbases 6" \ + ---cpus 2 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +assert_file_exists "$out_tar" +assert_file_not_empty "$out_tar" + +echo ">> Checking whether output contains the expected files" +tar -xvf "$out_tar" > /dev/null +assert_file_exists "BD_Rhapsody_Reference_Files/star_index/genomeParameters.txt" +assert_file_exists "BD_Rhapsody_Reference_Files/bwa-mem2_index/reference_small.ann" +assert_file_exists "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" +assert_file_exists "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" +assert_file_contains "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" "chr1.*HAVANA.*ENSG00000243485" +assert_file_contains "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" 'chrMT' + +cd .. + +echo "#############################################" + +echo "> Tests succeeded!" \ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa new file mode 100644 index 00000000..386d887c --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa @@ -0,0 +1,27 @@ +>chr1 1 +TGGGGAAGCAAGGCGGAGTTGGGCAGCTCGTGTTCAATGGGTAGAGTTTCAGGCTGGGGT +GATGGAAGGGTGCTGGAAATGAGTGGTAGTGATGGCGGCACAACAGTGTGAATCTACTTA +ATCCCACTGAACTGTATGCTGAAAAATGGTTTAGACGGTGAATTTTAGGTTATGTATGTT +TTACCACAATTTTTAAAAAGCTAGTGAAAAGCTGGTAAAAAGAAAGAAAAGAGGCTTTTT +TAAAAAGTTAAATATATAAAAAGAGCATCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCT +GGAATCCGTTGGCTTGCCTCCGGCATTTTTGGCCCTTGCCTTTTAGGGTTGCCAGATTAA +AAGACAGGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCGTAGCATAA +ATATGTCCCAAGCTTAGTTTGGGACATACTTATGCTAAAAAACATTATTGGTTGTTTATC +TGAGATTCAGAATTAAGCATTTTATATTTTATTTGCTGCCTCTGGCCACCCTACTCTCTT +CCTAACACTCTCTCCCTCTCCCAGTTTTGTCCGCCTTCCCTGCCTCCTCTTCTGGGGGAG +TTAGATCGAGTTGTAACAAGAACATGCCACTGTCTCGCTGGCTGCAGCGTGTGGTCCCCT +TACCAGAGGTAAAGAAGAGATGGATCTCCACTCATGTTGTAGACAGAATGTTTATGTCCT +CTCCAAATGCTTATGTTGAAACCCTAACCCCTAATGTGATGGTATGTGGAGATGGGCCTT +TGGTAGGTAATTACGGTTAGATGAGGTCATGGGGTGGGGCCCTCATTATAGATCTGGTAA +GAAAAGAGAGCATTGTCTCTGTGTCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTC +TCTATCTCATTTCTCTCTCTCTCGCTATCTCATTTTTCTCTCTCTCTCTTTCTCTCCTCT +GTCTTTTCCCACCAAGTGAGGATGCGAAGAGAAGGTGGCTGTCTGCAAACCAGGAAGAGA +GCCCTCACCGGGAACCCGTCCAGCTGCCACCTTGAACTTGGACTTCCAAGCCTCCAGAAC +TGTGAGGGATAAATGTATGATTTTAAAGTCGCCCAGTGTGTGGTATTTTGTTTTGACTAA +TACAACCTGAAAACATTTTCCCCTCACTCCACCTGAGCAATATCTGAGTGGCTTAAGGTA +CTCAGGACACAACAAAGGAGAAATGTCCCATGCACAAGGTGCACCCATGCCTGGGTAAAG +CAGCCTGGCACAGAGGGAAGCACACAGGCTCAGGGATCTGCTATTCATTCTTTGTGTGAC +CCTGGGCAAGCCATGAATGGAGCTTCAGTCACCCCATTTGTAATGGGATTTAATTGTGCT +TGCCCTGCCTCCTTTTGAGGGCTGTAGAGAAAAGATGTCAAAGTATTTTGTAATCTGGCT +GGGCGTGGTGGCTCATGCCTGTAATCCTAGCACTTTGGTAGGCTGACGCGAGAGGACTGC +T diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf new file mode 100644 index 00000000..7ba83523 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf @@ -0,0 +1,8 @@ +chr1 HAVANA exon 565 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 2; exon_id "ENSE00001922571.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA exon 977 1098 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 3; exon_id "ENSE00001827679.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA transcript 268 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 268 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 1; exon_id "ENSE00001841699.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 977 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 2; exon_id "ENSE00001890064.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 ENSEMBL gene 367 504 . + . gene_id "ENSG00000284332.1"; gene_type "miRNA"; gene_name "MIR1302-2"; level 3; hgnc_id "HGNC:35294"; +chr1 ENSEMBL transcript 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; +chr1 ENSEMBL exon 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh new file mode 100644 index 00000000..8d468064 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +TMP_DIR=/tmp/bd_rhapsody_make_reference +OUT_DIR=src/bd_rhapsody/bd_rhapsody_make_reference/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! -f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +zcat "$ORIG_GTF" | \ + awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf From c2e340d92ea7f153d0c5c9de1cffbc6b88fc4124 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 17 Jul 2024 18:10:37 +0200 Subject: [PATCH 02/10] Remove multiple_sep (#78) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * get rid of multiple_sep fields in configs * Fix coverage argument's format in config --- src/gffread/config.vsh.yaml | 5 +-- src/gffread/script.sh | 2 ++ src/gffread/test.sh | 2 +- src/samtools/samtools_stats/config.vsh.yaml | 40 ++++++++++----------- src/samtools/samtools_stats/script.sh | 3 ++ src/samtools/samtools_stats/test.sh | 2 +- 6 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index d2c41a87..7477a284 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -8,8 +8,6 @@ links: references: doi: 10.12688/f1000research.23297.2 license: MIT -requirements: - commands: [ gffread ] argument_groups: - name: Inputs arguments: @@ -52,7 +50,7 @@ argument_groups: required: true description: | Write the output records into . - default: output.gff + example: output.gff - name: --force_exons type: boolean_true description: | @@ -154,7 +152,6 @@ argument_groups: - name: --table type: string multiple: true - multiple_sep: "," description: | Output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in ; special pseudo-attributes (prefixed by @) are diff --git a/src/gffread/script.sh b/src/gffread/script.sh index 9c4a2b8f..cd4abf14 100644 --- a/src/gffread/script.sh +++ b/src/gffread/script.sh @@ -50,6 +50,8 @@ [[ "$par_expose_dups" == "false" ]] && unset par_expose_dups [[ "$par_cluster_only" == "false" ]] && unset par_cluster_only +# if par_table is not empty, replace ";" with "," +par_table=$(echo "$par_table" | tr ';' ',') $(which gffread) \ "$par_input" \ diff --git a/src/gffread/test.sh b/src/gffread/test.sh index 326fce50..ea23edcb 100755 --- a/src/gffread/test.sh +++ b/src/gffread/test.sh @@ -86,7 +86,7 @@ diff "$expected_output_dir/transcripts.fa" "$test_output_dir/transcripts.fa" || echo "> Test 4 - Generate table from GFF annotation file" "$meta_executable" \ - --table @id,@chr,@start,@end,@strand,@exons,Name,gene,product \ + --table "@id;@chr;@start;@end;@strand;@exons;Name;gene;product" \ --outfile "$test_output_dir/annotation.tbl" \ --input "$test_dir/sequence.gff3" diff --git a/src/samtools/samtools_stats/config.vsh.yaml b/src/samtools/samtools_stats/config.vsh.yaml index 0d8f57a4..ca630876 100644 --- a/src/samtools/samtools_stats/config.vsh.yaml +++ b/src/samtools/samtools_stats/config.vsh.yaml @@ -30,10 +30,10 @@ argument_groups: - name: --coverage alternatives: -c type: integer - description: | - Coverage distribution min,max,step [1,1000,1]. multiple: true - multiple_sep: ',' + description: | + Coverage distribution min;max;step. Default: [1, 1000, 1]. + example: [1, 1000, 1] - name: --remove_dups alternatives: -d type: boolean_true @@ -48,25 +48,25 @@ argument_groups: alternatives: -f type: string description: | - Required flag, 0 for unset. See also `samtools flags`. - default: "0" + Required flag, 0 for unset. See also `samtools flags`. Default: `"0"`. + example: "0" - name: --filtering_flag alternatives: -F type: string description: | - Filtering flag, 0 for unset. See also `samtools flags`. - default: "0" + Filtering flag, 0 for unset. See also `samtools flags`. Default: `0`. + example: "0" - name: --GC_depth type: double description: | - The size of GC-depth bins (decreasing bin size increases memory requirement). - default: 20000.0 + The size of GC-depth bins (decreasing bin size increases memory requirement). Default: `20000`. + example: 20000.0 - name: --insert_size alternatives: -i type: integer description: | - Maximum insert size. - default: 8000 + Maximum insert size. Default: `8000`. + example: 8000 - name: --id alternatives: -I type: string @@ -76,14 +76,14 @@ argument_groups: alternatives: -l type: integer description: | - Include in the statistics only reads with the given read length. - default: -1 + Include in the statistics only reads with the given read length. Default: `-1`. + example: -1 - name: --most_inserts alternatives: -m type: double description: | - Report only the main part of inserts. - default: 0.99 + Report only the main part of inserts. Default: `0.99`. + example: 0.99 - name: --split_prefix alternatives: -P type: string @@ -93,8 +93,8 @@ argument_groups: alternatives: -q type: integer description: | - The BWA trimming parameter. - default: 0 + The BWA trimming parameter. Default: `0`. + example: 0 - name: --ref_seq alternatives: -r type: file @@ -124,8 +124,8 @@ argument_groups: alternatives: -g type: integer description: | - Only bases with coverage above this value will be included in the target percentage computation. - default: 0 + Only bases with coverage above this value will be included in the target percentage computation. Default: `0`. + example: 0 - name: --input_fmt_option type: string description: | @@ -141,7 +141,7 @@ argument_groups: type: file description: | Output file. - default: "out.txt" + example: "out.txt" required: true direction: output diff --git a/src/samtools/samtools_stats/script.sh b/src/samtools/samtools_stats/script.sh index 6e32e9a5..e3872fc6 100644 --- a/src/samtools/samtools_stats/script.sh +++ b/src/samtools/samtools_stats/script.sh @@ -10,6 +10,9 @@ set -e [[ "$par_sparse" == "false" ]] && unset par_sparse [[ "$par_remove_overlaps" == "false" ]] && unset par_remove_overlaps +# change the coverage input from X;X;X to X,X,X +par_coverage=$(echo "$par_coverage" | tr ';' ',') + samtools stats \ ${par_coverage:+-c "$par_coverage"} \ ${par_remove_dups:+-d} \ diff --git a/src/samtools/samtools_stats/test.sh b/src/samtools/samtools_stats/test.sh index 05d70d30..b515100e 100644 --- a/src/samtools/samtools_stats/test.sh +++ b/src/samtools/samtools_stats/test.sh @@ -17,7 +17,7 @@ echo ">>> Checking whether output is non-empty" [ ! -s "$test_dir/test.paired_end.sorted.txt" ] && echo "File 'test.paired_end.sorted.txt' is empty!" && exit 1 echo ">>> Checking whether output is correct" -# compare using diff, ignoring the line stating the command that was passed. +# compare using diff, ignoring the line stating the command that was passed. diff <(grep -v "^# The command" "$test_dir/test.paired_end.sorted.txt") \ <(grep -v "^# The command" "$test_dir/ref.paired_end.sorted.txt") || \ (echo "Output file ref.paired_end.sorted.txt does not match expected output" && exit 1) From 8e9abad885b27120a56a580ca7d961c64b96ad60 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 18:14:21 +0200 Subject: [PATCH 03/10] Update CONTRIBUTING.md (#82) * Update CONTRIBUTING.md * update ctb * clean up helper functions * update changelog * update changelog --- CHANGELOG.md | 28 +++- CONTRIBUTING.md | 151 +++++++++++------- .../bd_rhapsody_make_reference/test.sh | 5 +- src/cutadapt/test.sh | 14 +- src/star/star_align_reads/test.sh | 21 ++- 5 files changed, 130 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cfacdbc..2aad0cb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,19 +6,33 @@ - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). -## BUG FIXES +## MINOR CHANGES -* `pear`: fix component not exiting with the correct exitcode when PEAR fails. +* `busco` components: update BUSCO to `5.7.1` (PR #72). -* `cutadapt`: fix `--par_quality_cutoff_r2` argument. +## DOCUMENTATION -* `cutadapt`: demultiplexing is now disabled by default. It can be re-enabled by using `demultiplex_mode`. +* Extend the contributing guidelines (PR #82): -* `multiqc`: update multiple separator to `;` (PR #81). + - Update format to Viash 0.9. -## MINOR CHANGES + - Descriptions should be formatted in markdown. + + - Add defaults to descriptions, not as a default of the argument. + + - Explain parameter expansion. -* `busco` components: update BUSCO to `5.7.1`. + - Mention that the contents of the output of components in tests should be checked. + +## BUG FIXES + +* `pear`: fix component not exiting with the correct exitcode when PEAR fails (PR #70). + +* `cutadapt`: fix `--par_quality_cutoff_r2` argument (PR #69). + +* `cutadapt`: demultiplexing is now disabled by default. It can be re-enabled by using `demultiplex_mode` (PR #69). + +* `multiqc`: update multiple separator to `;` (PR #81). # biobox 0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7393bc7e..cee4249a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -65,22 +65,21 @@ runners: Fill in the relevant metadata fields in the config. Here is an example of the metadata of an existing component. ```yaml -functionality: - name: arriba - description: Detect gene fusions from RNA-Seq data - keywords: [Gene fusion, RNA-Seq] - links: - homepage: https://arriba.readthedocs.io/en/latest/ - documentation: https://arriba.readthedocs.io/en/latest/ - repository: https://github.com/suhrig/arriba - issue_tracker: https://github.com/suhrig/arriba/issues - references: - doi: 10.1101/gr.257246.119 - bibtex: | - @article{ - ... a bibtex entry in case the doi is not available ... - } - license: MIT +name: arriba +description: Detect gene fusions from RNA-Seq data +keywords: [Gene fusion, RNA-Seq] +links: + homepage: https://arriba.readthedocs.io/en/latest/ + documentation: https://arriba.readthedocs.io/en/latest/ + repository: https://github.com/suhrig/arriba + issue_tracker: https://github.com/suhrig/arriba/issues +references: + doi: 10.1101/gr.257246.119 + bibtex: | + @article{ + ... a bibtex entry in case the doi is not available ... + } +license: MIT ``` ### Step 4: Find a suitable container @@ -162,7 +161,7 @@ argument_groups: type: file description: | File in SAM/BAM/CRAM format with main alignments as generated by STAR - (Aligned.out.sam). Arriba extracts candidate reads from this file. + (`Aligned.out.sam`). Arriba extracts candidate reads from this file. required: true example: Aligned.out.bam ``` @@ -175,7 +174,7 @@ Several notes: * Input arguments can have `multiple: true` to allow the user to specify multiple files. - +* The description should be formatted in markdown. ### Step 8: Add arguments for the output files @@ -220,7 +219,7 @@ argument_groups: Note: -* Preferably, these outputs should not be directores but files. For example, if a tool outputs a directory `foo/` containing files `foo/bar.txt` and `foo/baz.txt`, there should be two output arguments `--bar` and `--baz` (as opposed to one output argument which outputs the whole `foo/` directory). +* Preferably, these outputs should not be directories but files. For example, if a tool outputs a directory `foo/` containing files `foo/bar.txt` and `foo/baz.txt`, there should be two output arguments `--bar` and `--baz` (as opposed to one output argument which outputs the whole `foo/` directory). ### Step 9: Add arguments for the other arguments @@ -230,6 +229,8 @@ Finally, add all other arguments to the config file. There are a few exceptions: * Arguments related to printing the information such as printing the version (`-v`, `--version`) or printing the help (`-h`, `--help`) should not be added to the config file. +* If the help lists defaults, do not add them as defaults but to the description. Example: `description: . Default: 10.` + ### Step 10: Add a Docker engine @@ -275,10 +276,13 @@ Next, we need to write a runner script that runs the tool with the input argumen ## VIASH START ## VIASH END +# unset flags +[[ "$par_option" == "false" ]] && unset par_option + xxx \ --input "$par_input" \ --output "$par_output" \ - $([ "$par_option" = "true" ] && echo "--option") + ${par_option:+--option} ``` When building a Viash component, Viash will automatically replace the `## VIASH START` and `## VIASH END` lines (and anything in between) with environment variables based on the arguments specified in the config. @@ -291,6 +295,11 @@ As an example, this is what the Bash script for the `arriba` component looks lik ## VIASH START ## VIASH END +# unset flags +[[ "$par_skip_duplicate_marking" == "false" ]] && unset par_skip_duplicate_marking +[[ "$par_extra_information" == "false" ]] && unset par_extra_information +[[ "$par_fill_gaps" == "false" ]] && unset par_fill_gaps + arriba \ -x "$par_bam" \ -a "$par_genome" \ @@ -298,26 +307,30 @@ arriba \ -o "$par_fusions" \ ${par_known_fusions:+-k "${par_known_fusions}"} \ ${par_blacklist:+-b "${par_blacklist}"} \ - ${par_structural_variants:+-d "${par_structural_variants}"} \ - $([ "$par_skip_duplicate_marking" = "true" ] && echo "-u") \ - $([ "$par_extra_information" = "true" ] && echo "-X") \ - $([ "$par_fill_gaps" = "true" ] && echo "-I") + # ... + ${par_extra_information:+-X} \ + ${par_fill_gaps:+-I} ``` +Notes: -### Step 12: Create test script +* If your arguments can contain special variables (e.g. `$`), you can use quoting (need to find a documentation page for this) to make sure you can use the string as input. Example: `-x ${par_bam@Q}`. +* Optional arguments can be passed to the command conditionally using Bash [parameter expansion](https://www.gnu.org/software/bash/manual/html_node/Shell-Parameter-Expansion.html). For example: `${par_known_fusions:+-k ${par_known_fusions@Q}}` + +* If your tool allows for multiple inputs using a separator other than `;` (which is the default Viash multiple separator), you can substitute these values with a command like: `par_disable_filters=$(echo $par_disable_filters | tr ';' ',')`. + + +### Step 12: Create test script If the unit test requires test resources, these should be provided in the `test_resources` section of the component. ```yaml -functionality: - # ... - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data ``` Create a test script at `src/xxx/test.sh` that runs the component with the test data. This script should run the component (available with `$meta_executable`) with the test data and check if the output is as expected. The script should exit with a non-zero exit code if the output is not as expected. For example: @@ -325,48 +338,64 @@ Create a test script at `src/xxx/test.sh` that runs the component with the test ```bash #!/bin/bash +set -e + ## VIASH START ## VIASH END -echo "> Run xxx with test data" +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} +assert_file_contains_regex() { + grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains_regex() { + grep -q -E "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} +############################################# + +echo "> Run $meta_name with test data" "$meta_executable" \ - --input "$meta_resources_dir/test_data/input.txt" \ + --input "$meta_resources_dir/test_data/reads_R1.fastq" \ --output "output.txt" \ --option -echo ">> Checking output" -[ ! -f "output.txt" ] && echo "Output file output.txt does not exist" && exit 1 -``` +echo ">> Check if output exists" +assert_file_exists "output.txt" +echo ">> Check if output is empty" +assert_file_not_empty "output.txt" -For example, this is what the test script for the `arriba` component looks like: +echo ">> Check if output is correct" +assert_file_contains "output.txt" "some expected output" -```bash -#!/bin/bash +echo "> All tests succeeded!" +``` -## VIASH START -## VIASH END +Notes: -echo "> Run arriba with blacklist" -"$meta_executable" \ - --bam "$meta_resources_dir/test_data/A.bam" \ - --genome "$meta_resources_dir/test_data/genome.fasta" \ - --gene_annotation "$meta_resources_dir/test_data/annotation.gtf" \ - --blacklist "$meta_resources_dir/test_data/blacklist.tsv" \ - --fusions "fusions.tsv" \ - --fusions_discarded "fusions_discarded.tsv" \ - --interesting_contigs "1,2" - -echo ">> Checking output" -[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1 -[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1 +* Do always check the contents of the output file. If the output is not deterministic, you can use regular expressions to check the output. -echo ">> Check if output is empty" -[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1 -[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1 -``` +* If possible, generate your own test data instead of copying it from an external resource. -### Step 12: Create a `/var/software_versions.txt` file +### Step 13: Create a `/var/software_versions.txt` file For the sake of transparency and reproducibility, we require that the versions of the software used in the component are documented. @@ -378,6 +407,8 @@ engines: image: quay.io/biocontainers/xxx:0.1.0--py_0 setup: - type: docker + # note: /var/software_versions.txt should contain: + # arriba: "2.4.0" run: | echo "xxx: \"0.1.0\"" > /var/software_versions.txt ``` diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh index 3637160a..845c1739 100644 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh @@ -11,21 +11,18 @@ assert_file_doesnt_exist() { [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - # () will execute in a shubshell, could you use {;}? [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - # [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - # grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - # grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } +############################################# in_fa="$meta_resources_dir/test_data/reference_small.fa" in_gtf="$meta_resources_dir/test_data/reference_small.gtf" diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 1d6d9c18..28248742 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -6,25 +6,25 @@ set -eo pipefail ############################################# # helper functions assert_file_exists() { - [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } } assert_file_doesnt_exist() { - [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } - ############################################# + mkdir test_multiple_output cd test_multiple_output diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh index a15ea599..bd78094d 100644 --- a/src/star/star_align_reads/test.sh +++ b/src/star/star_align_reads/test.sh @@ -7,35 +7,34 @@ meta_executable="target/docker/star/star_align_reads/star_align_reads" meta_resources_dir="src/star/star_align_reads" ## VIASH END -######################################################################################### - +############################################# # helper functions assert_file_exists() { - [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } } assert_file_doesnt_exist() { - [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } assert_file_contains_regex() { - grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains_regex() { - grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q -E "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } +############################################# -######################################################################################### echo "> Prepare test data" cat > reads_R1.fastq <<'EOF' From 13c5439a0c36f8a1bd3889e68d68ca85672daa62 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 17 Jul 2024 18:15:08 +0200 Subject: [PATCH 04/10] Add agat convertspgff2gtf (#76) * Fill in the metadata * add help.txt * add test data * update help.txt * add arguments for input file, output file and other arguments * add a Docker engine * Write a runner script * correct --gtf_version choices * update description * update keywords * Create test script * Create a /var/software_versions.txt file * remove duplicated argument * update config * change name to agat_convert_sp_gff2gtf * update license * replace module name by $meta_name in test.sh * Add more info to --gtf_version description * remove extra \ * add additional test: check if the D column in the first line of the GFF was correctly converted into GTF format * update changelog * Markdown: add newline before listing * add test to check if the header contains the right GTF version * cleanup * fix formatting --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 3 + .../agat_convert_sp_gff2gtf/config.vsh.yaml | 90 ++++++++++++++++ src/agat/agat_convert_sp_gff2gtf/help.txt | 102 ++++++++++++++++++ src/agat/agat_convert_sp_gff2gtf/script.sh | 10 ++ src/agat/agat_convert_sp_gff2gtf/test.sh | 37 +++++++ .../test_data/0_test.gff | 36 +++++++ .../test_data/script.sh | 9 ++ 7 files changed, 287 insertions(+) create mode 100644 src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml create mode 100644 src/agat/agat_convert_sp_gff2gtf/help.txt create mode 100644 src/agat/agat_convert_sp_gff2gtf/script.sh create mode 100644 src/agat/agat_convert_sp_gff2gtf/test.sh create mode 100644 src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff create mode 100755 src/agat/agat_convert_sp_gff2gtf/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 2aad0cb8..8f56b22e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,6 +100,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `agat`: + - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). + ## MINOR CHANGES * Uniformize component metadata (PR #23). diff --git a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml new file mode 100644 index 00000000..b788c7c7 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml @@ -0,0 +1,90 @@ +name: agat_convert_sp_gff2gtf +namespace: agat +description: | + The script aims to convert any GTF/GFF file into a proper GTF file. Full + information about the format can be found here: + https://agat.readthedocs.io/en/latest/gxf.html You can choose among 7 + different GTF types (1, 2, 2.1, 2.2, 2.5, 3 or relax). Depending the + version selected the script will filter out the features that are not + accepted. For GTF2.5 and 3, every level1 feature (e.g nc_gene + pseudogene) will be converted into gene feature and every level2 feature + (e.g mRNA ncRNA) will be converted into transcript feature. Using the + "relax" option you will produce a GTF-like output keeping all original + feature types (3rd column). No modification will occur e.g. mRNA to + transcript. + + To be fully GTF compliant all feature have a gene_id and a transcript_id + attribute. The gene_id is unique identifier for the genomic source of + the transcript, which is used to group transcripts into genes. The + transcript_id is a unique identifier for the predicted transcript, which + is used to group features into transcripts. +keywords: [gene annotations, GTF conversion] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/ + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +argument_groups: + - name: Inputs + arguments: + - name: --gff + alternatives: [-i] + description: Input GFF/GTF file that will be read + type: file + required: true + direction: input + example: input.gff + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gtf] + description: Output GTF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: true + example: output.gtf + - name: Arguments + arguments: + - name: --gtf_version + description: | + Version of the GTF output (1,2,2.1,2.2,2.5,3 or relax). Default value from AGAT config file (relax for the default config). The script option has the higher priority. + + * relax: all feature types are accepted. + * GTF3 (9 feature types accepted): gene, transcript, exon, CDS, Selenocysteine, start_codon, stop_codon, three_prime_utr and five_prime_utr. + * GTF2.5 (8 feature types accepted): gene, transcript, exon, CDS, UTR, start_codon, stop_codon, Selenocysteine. + * GTF2.2 (9 feature types accepted): CDS, start_codon, stop_codon, 5UTR, 3UTR, inter, inter_CNS, intron_CNS and exon. + * GTF2.1 (6 feature types accepted): CDS, start_codon, stop_codon, exon, 5UTR, 3UTR. + * GTF2 (4 feature types accepted): CDS, start_codon, stop_codon, exon. + * GTF1 (5 feature types accepted): CDS, start_codon, stop_codon, exon, intron. + type: string + choices: [relax, "1", "2", "2.1", "2.2", "2.5", "3"] + required: false + example: "3" + - name: --config + alternatives: [-c] + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the orignal agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). + type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_sp_gff2gtf/help.txt b/src/agat/agat_convert_sp_gff2gtf/help.txt new file mode 100644 index 00000000..fdd45507 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/help.txt @@ -0,0 +1,102 @@ +```sh +agat_convert_sp_gff2gtf.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_convert_sp_gff2gtf.pl + +Description: + The script aims to convert any GTF/GFF file into a proper GTF file. Full + information about the format can be found here: + https://agat.readthedocs.io/en/latest/gxf.html You can choose among 7 + different GTF types (1, 2, 2.1, 2.2, 2.5, 3 or relax). Depending the + version selected the script will filter out the features that are not + accepted. For GTF2.5 and 3, every level1 feature (e.g nc_gene + pseudogene) will be converted into gene feature and every level2 feature + (e.g mRNA ncRNA) will be converted into transcript feature. Using the + "relax" option you will produce a GTF-like output keeping all original + feature types (3rd column). No modification will occur e.g. mRNA to + transcript. + + To be fully GTF compliant all feature have a gene_id and a transcript_id + attribute. The gene_id is unique identifier for the genomic source of + the transcript, which is used to group transcripts into genes. The + transcript_id is a unique identifier for the predicted transcript, which + is used to group features into transcripts. + +Usage: + agat_convert_sp_gff2gtf.pl --gff infile.gff [ -o outfile ] + agat_convert_sp_gff2gtf -h + +Options: + --gff, --gtf or -i + Input GFF/GTF file that will be read + + --gtf_version version of the GTF output (1,2,2.1,2.2,2.5,3 or relax). + Default value from AGAT config file (relax for the default config). The + script option has the higher priority. + relax: all feature types are accepted. + + GTF3 (9 feature types accepted): gene, transcript, exon, CDS, + Selenocysteine, start_codon, stop_codon, three_prime_utr and + five_prime_utr + + GTF2.5 (8 feature types accepted): gene, transcript, exon, CDS, + UTR, start_codon, stop_codon, Selenocysteine + + GTF2.2 (9 feature types accepted): CDS, start_codon, stop_codon, + 5UTR, 3UTR, inter, inter_CNS, intron_CNS and exon + + GTF2.1 (6 feature types accepted): CDS, start_codon, stop_codon, + exon, 5UTR, 3UTR + + GTF2 (4 feature types accepted): CDS, start_codon, stop_codon, + exon + + GTF1 (5 feature types accepted): CDS, start_codon, stop_codon, + exon, intron + + -o , --output , --out , --outfile or --gtf + Output GTF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md + diff --git a/src/agat/agat_convert_sp_gff2gtf/script.sh b/src/agat/agat_convert_sp_gff2gtf/script.sh new file mode 100644 index 00000000..69d66739 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +agat_convert_sp_gff2gtf.pl \ + -i "$par_gff" \ + -o "$par_output" \ + ${par_gtf_version:+--gtf_version "${par_gtf_version}"} \ + ${par_config:+--config "${par_config}"} diff --git a/src/agat/agat_convert_sp_gff2gtf/test.sh b/src/agat/agat_convert_sp_gff2gtf/test.sh new file mode 100644 index 00000000..1e7cc142 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --gff "$test_dir/0_test.gff" \ + --output "output.gtf" + +echo ">> Checking output" +[ ! -f "output.gtf" ] && echo "Output file output.gtf does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "output.gtf" ] && echo "Output file output.gtf is empty" && exit 1 + +echo ">> Check if the conversion resulted in the right GTF format" +idGFF=$(head -n 2 "$test_dir/0_test.gff" | grep -o 'ID=[^;]*' | cut -d '=' -f 2-) +expectedGTF="gene_id \"$idGFF\"; ID \"$idGFF\";" +extractedGTF=$(head -n 3 "output.gtf" | grep -o 'gene_id "[^"]*"; ID "[^"]*";') +[ "$extractedGTF" != "$expectedGTF" ] && echo "Output file output.gtf does not have the right format" && exit 1 + +rm output.gtf + +echo "> Run $meta_name with test data and GTF version 2.5" +"$meta_executable" \ + --gff "$test_dir/0_test.gff" \ + --output "output.gtf" \ + --gtf_version "2.5" + +echo ">> Check if the output file header display the right GTF version" +grep -q "##gtf-version 2.5" "output.gtf" +[ $? -ne 0 ] && echo "Output file output.gtf header does not display the right GTF version" && exit 1 + +echo "> Test successful" \ No newline at end of file diff --git a/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff b/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff new file mode 100644 index 00000000..fafe86ed --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff @@ -0,0 +1,36 @@ +##gff-version 3 +scaffold625 maker gene 337818 343277 . + . ID=CLUHARG00000005458;Name=TUBB3_2 +scaffold625 maker mRNA 337818 343277 . + . ID=CLUHART00000008717;Parent=CLUHARG00000005458 +scaffold625 maker exon 337818 337971 . + . ID=CLUHART00000008717:exon:1404;Parent=CLUHART00000008717 +scaffold625 maker exon 340733 340841 . + . ID=CLUHART00000008717:exon:1405;Parent=CLUHART00000008717 +scaffold625 maker exon 341518 341628 . + . ID=CLUHART00000008717:exon:1406;Parent=CLUHART00000008717 +scaffold625 maker exon 341964 343277 . + . ID=CLUHART00000008717:exon:1407;Parent=CLUHART00000008717 +scaffold625 maker CDS 337915 337971 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 340733 340841 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341518 341628 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341964 343033 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker five_prime_UTR 337818 337914 . + . ID=CLUHART00000008717:five_prime_utr;Parent=CLUHART00000008717 +scaffold625 maker three_prime_UTR 343034 343277 . + . ID=CLUHART00000008717:three_prime_utr;Parent=CLUHART00000008717 +scaffold789 maker gene 558184 564780 . + . ID=CLUHARG00000003852;Name=PF11_0240 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006146;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006146:exon:995;Parent=CLUHART00000006146 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006146:exon:996;Parent=CLUHART00000006146 +scaffold789 maker exon 564171 564235 . + . ID=CLUHART00000006146:exon:997;Parent=CLUHART00000006146 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006146:exon:998;Parent=CLUHART00000006146 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564171 564235 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006146:five_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006146:three_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006147;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006147:exon:997;Parent=CLUHART00000006147 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006147:exon:998;Parent=CLUHART00000006147 +scaffold789 maker exon 562057 562121 . + . ID=CLUHART00000006147:exon:999;Parent=CLUHART00000006147 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006147:exon:1000;Parent=CLUHART00000006147 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 562057 562121 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006147:five_prime_utr;Parent=CLUHART00000006147 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006147:three_prime_utr;Parent=CLUHART00000006147 diff --git a/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh b/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh new file mode 100755 index 00000000..e453e772 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/gff_syntax/in/0_test.gff src/agat/agat_convert_sp_gff2gtf/test_data From e615d2abb92e56cfc1e1ace9baa308ce10656f9f Mon Sep 17 00:00:00 2001 From: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> Date: Wed, 17 Jul 2024 19:44:21 +0200 Subject: [PATCH 05/10] Seqtk sample (#68) * tests added * tests extended * changelog entry added * reorganized seqtk namespace + added seqtk subseq config and script * added subseq help.txt * revert to seqtk sample only * remove subseq * updated tests, added tags * Update two_pass_mode Co-authored-by: Robrecht Cannoodt * author added to config --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 2 + src/_authors/jakub_majercik.yaml | 10 +++ src/seqtk/seqtk_sample/config.vsh.yaml | 57 ++++++++++++++ src/seqtk/seqtk_sample/help.txt | 9 +++ src/seqtk/seqtk_sample/script.sh | 11 +++ src/seqtk/seqtk_sample/test.sh | 104 +++++++++++++++++++++++++ src/seqtk/test_data/reads/a.1.fastq.gz | Bin 0 -> 100 bytes src/seqtk/test_data/reads/a.2.fastq.gz | Bin 0 -> 100 bytes src/seqtk/test_data/reads/a.fastq | 4 + src/seqtk/test_data/reads/a.fastq.gz | Bin 0 -> 44 bytes src/seqtk/test_data/reads/id.list | 1 + src/seqtk/test_data/script.sh | 9 +++ 12 files changed, 207 insertions(+) create mode 100644 src/_authors/jakub_majercik.yaml create mode 100644 src/seqtk/seqtk_sample/config.vsh.yaml create mode 100644 src/seqtk/seqtk_sample/help.txt create mode 100644 src/seqtk/seqtk_sample/script.sh create mode 100644 src/seqtk/seqtk_sample/test.sh create mode 100644 src/seqtk/test_data/reads/a.1.fastq.gz create mode 100644 src/seqtk/test_data/reads/a.2.fastq.gz create mode 100644 src/seqtk/test_data/reads/a.fastq create mode 100644 src/seqtk/test_data/reads/a.fastq.gz create mode 100644 src/seqtk/test_data/reads/id.list create mode 100755 src/seqtk/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f56b22e..f6a8676f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,8 @@ * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). +* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68). + * `umitools`: - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54). diff --git a/src/_authors/jakub_majercik.yaml b/src/_authors/jakub_majercik.yaml new file mode 100644 index 00000000..3b75fffe --- /dev/null +++ b/src/_authors/jakub_majercik.yaml @@ -0,0 +1,10 @@ +name: Jakub Majercik +info: + links: + email: jakub@data-intuitive.com + github: jakubmajercik + linkedin: jakubmajercik + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatics Engineer \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml new file mode 100644 index 00000000..0cd369e7 --- /dev/null +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -0,0 +1,57 @@ +name: seqtk_sample +namespace: seqtk +description: Subsamples sequences from FASTA/Q files. +keywords: [sample, FASTA, FASTQ] +links: + repository: https://github.com/lh3/seqtk/tree/v1.4 +license: MIT +authors: + - __merge__: /src/_authors/jakub_majercik.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: The input FASTA/Q file. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + description: The output FASTA/Q file. + required: true + direction: output + + - name: Options + arguments: + - name: --seed + type: integer + description: Seed for random generator. + example: 42 + - name: --fraction_number + type: double + description: Fraction or number of sequences to sample. + required: true + example: 0.1 + - name: --two_pass_mode + type: boolean_true + description: Twice as slow but with much reduced memory + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: ../test_data + +engines: + - type: docker + image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/help.txt b/src/seqtk/seqtk_sample/help.txt new file mode 100644 index 00000000..49f8001b --- /dev/null +++ b/src/seqtk/seqtk_sample/help.txt @@ -0,0 +1,9 @@ +``` +seqtk_subseq +``` +Usage: seqtk subseq [options] | +Options: + -t TAB delimited output + -s strand aware + -l INT sequence line length [0] +Note: Use 'samtools faidx' if only a few regions are intended. \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/script.sh b/src/seqtk/seqtk_sample/script.sh new file mode 100644 index 00000000..01d981b3 --- /dev/null +++ b/src/seqtk/seqtk_sample/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +seqtk sample \ + ${par_two_pass_mode:+-2} \ + ${par_seed:+-s "$par_seed"} \ + "$par_input" \ + "$par_fraction_number" \ + > "$par_output" \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh new file mode 100644 index 00000000..cba5f613 --- /dev/null +++ b/src/seqtk/seqtk_sample/test.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +set -e + +## VIASH START +meta_executable="target/executable/seqtk/seqtk_sample" +meta_resources_dir="src/seqtk" +## VIASH END + +######################################################################################### +mkdir seqtk_sample_se +cd seqtk_sample_se + +echo "> Run seqtk_sample on fastq SE" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled.fastq" ]; then + echo ">> sampled.fastq does not exist" + exit 1 +fi + +echo ">> Count number of samples" +num_samples=$(grep -c '^@' sampled.fastq) +if [ "$num_samples" -ne 3 ]; then + echo ">> sampled.fastq does not contain 3 samples" + exit 1 +fi + +######################################################################################### +cd .. +mkdir seqtk_sample_pe_number +cd seqtk_sample_pe_number + +echo ">> Run seqtk_sample on fastq.gz PE with number of reads" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled_1.fastq" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.2.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled_2.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled_1.fastq" ] || [ ! -f "sampled_2.fastq" ]; then + echo ">> One or both output files do not exist" + exit 1 +fi + +echo ">> Compare reads" +# Extract headers +headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# Compare headers +diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } + +echo ">> Count number of samples" +num_headers=$(echo "$headers1" | wc -l) +if [ "$num_headers" -ne 3 ]; then + echo ">> sampled_1.fastq does not contain 3 headers" + exit 1 +fi + +######################################################################################### +cd .. +mkdir seqtk_sample_pe_fraction +cd seqtk_sample_pe_fraction + +echo ">> Run seqtk_sample on fastq.gz PE with fraction of reads" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 0.5 \ + --output "sampled_1.fastq" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.2.fastq.gz" \ + --seed 42 \ + --fraction_number 0.5 \ + --output "sampled_2.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled_1.fastq" ] || [ ! -f "sampled_2.fastq" ]; then + echo ">> One or both output files do not exist" + exit 1 +fi + +echo ">> Compare reads" +# Extract headers +headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# Compare headers +diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } + diff --git a/src/seqtk/test_data/reads/a.1.fastq.gz b/src/seqtk/test_data/reads/a.1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..97a72ce5d48317556a145f93c32c87f0e9e5500f GIT binary patch literal 100 zcmV-q0Gt0GiwFRnrn+7N10Bw(6~jOf1wpPTJlJGMx7b%C&OZykT2i1C{p;n6 zLRM|nP{^ij8VcF9T|*&&2 literal 0 HcmV?d00001 diff --git a/src/seqtk/test_data/reads/a.2.fastq.gz b/src/seqtk/test_data/reads/a.2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..038bc976ac32e8f26be16949bf5632c7090e635b GIT binary patch literal 100 zcmV-q0Gt0GiwFRnrn+7N10Bw*5yUVM1wrm8Zt)RG{ Date: Wed, 17 Jul 2024 23:23:51 +0200 Subject: [PATCH 06/10] switch to viash actions for ci (#86) * switch to viash actions for ci * add changelog entry * ci force --- .github/workflows/test.yaml | 6 ++++-- CHANGELOG.md | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2591978f..30f98b03 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,9 +1,11 @@ -name: Component Testing +name: Test components on: pull_request: push: + branches: + - main jobs: test: - uses: viash-hub/toolbox/.github/workflows/test.yaml@main \ No newline at end of file + uses: viash-io/viash-actions/.github/workflows/test.yaml@v6 \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index f6a8676f..c9f8b222 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ * `busco` components: update BUSCO to `5.7.1` (PR #72). +* Update CI to reusable workflow in `viash-io/viash-actions` (PR #86). + ## DOCUMENTATION * Extend the contributing guidelines (PR #82): From e8b82b5d968524f495e80afa8092098408d66d1d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 23:25:07 +0200 Subject: [PATCH 07/10] fix authorship (#88) * fix authorship * add author * add missing newlines * update changelog * update changelog --- CHANGELOG.md | 2 ++ src/_authors/angela_o_pisco.yaml | 14 ++++++++++++++ src/_authors/dorien_roosen.yaml | 10 ++++++++++ src/_authors/dries_schaumont.yaml | 11 +++++++++++ src/_authors/emma_rousseau.yaml | 10 ++++++++++ src/_authors/jakub_majercik.yaml | 2 +- src/_authors/kai_waldrant.yaml | 14 ++++++++++++++ src/_authors/leila_paquay.yaml | 10 ++++++++++ src/_authors/robrecht_cannoodt.yaml | 2 +- src/_authors/sai_nirmayi_yasa.yaml | 10 ++++++++++ src/_authors/toni_verbeiren.yaml | 9 +++++++++ src/_authors/weiwei_schultz.yaml | 2 +- src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml | 4 ++++ src/arriba/config.vsh.yaml | 3 +++ src/bcl_convert/config.vsh.yaml | 11 +++++++++++ src/bedtools/bedtools_getfasta/config.vsh.yaml | 3 +++ src/busco/busco_download_datasets/config.vsh.yaml | 3 +++ src/busco/busco_list_datasets/config.vsh.yaml | 3 +++ src/busco/busco_run/config.vsh.yaml | 3 +++ src/cutadapt/config.vsh.yaml | 3 +++ src/falco/config.vsh.yaml | 3 +++ src/fastp/config.vsh.yaml | 3 +++ src/featurecounts/config.vsh.yaml | 4 +++- src/gffread/config.vsh.yaml | 3 +++ src/lofreq/call/config.vsh.yaml | 3 +++ src/lofreq/indelqual/config.vsh.yaml | 3 +++ src/multiqc/config.vsh.yaml | 4 +++- src/pear/config.vsh.yaml | 5 ++++- src/salmon/salmon_index/config.vsh.yaml | 4 +++- src/salmon/salmon_quant/config.vsh.yaml | 4 +++- src/samtools/samtools_collate/config.vsh.yaml | 4 +++- src/samtools/samtools_faidx/config.vsh.yaml | 4 +++- src/samtools/samtools_fasta/config.vsh.yaml | 4 +++- src/samtools/samtools_fastq/config.vsh.yaml | 4 +++- src/samtools/samtools_flagstat/config.vsh.yaml | 4 +++- src/samtools/samtools_idxstats/config.vsh.yaml | 4 +++- src/samtools/samtools_index/config.vsh.yaml | 4 +++- src/samtools/samtools_sort/config.vsh.yaml | 4 +++- src/samtools/samtools_stats/config.vsh.yaml | 4 +++- src/samtools/samtools_view/config.vsh.yaml | 4 +++- src/star/star_align_reads/config.vsh.yaml | 5 +++++ src/star/star_genome_generate/config.vsh.yaml | 4 +++- src/umi_tools/umi_tools_dedup/config.vsh.yaml | 4 +++- 43 files changed, 198 insertions(+), 20 deletions(-) create mode 100644 src/_authors/angela_o_pisco.yaml create mode 100644 src/_authors/dorien_roosen.yaml create mode 100644 src/_authors/dries_schaumont.yaml create mode 100644 src/_authors/emma_rousseau.yaml create mode 100644 src/_authors/kai_waldrant.yaml create mode 100644 src/_authors/leila_paquay.yaml create mode 100644 src/_authors/sai_nirmayi_yasa.yaml create mode 100644 src/_authors/toni_verbeiren.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index c9f8b222..4e6a0369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ - Mention that the contents of the output of components in tests should be checked. +* Add authorship to existing components (PR #88). + ## BUG FIXES * `pear`: fix component not exiting with the correct exitcode when PEAR fails (PR #70). diff --git a/src/_authors/angela_o_pisco.yaml b/src/_authors/angela_o_pisco.yaml new file mode 100644 index 00000000..1f0bf58f --- /dev/null +++ b/src/_authors/angela_o_pisco.yaml @@ -0,0 +1,14 @@ +name: Angela Oliveira Pisco +info: + role: Contributor + links: + github: aopisco + orcid: "0000-0003-0142-2355" + linkedin: aopisco + organizations: + - name: Insitro + href: https://insitro.com + role: Director of Computational Biology + - name: Open Problems + href: https://openproblems.bio + role: Core Member diff --git a/src/_authors/dorien_roosen.yaml b/src/_authors/dorien_roosen.yaml new file mode 100644 index 00000000..d67448d8 --- /dev/null +++ b/src/_authors/dorien_roosen.yaml @@ -0,0 +1,10 @@ +name: Dorien Roosen +info: + links: + email: dorien@data-intuitive.com + github: dorien-er + linkedin: dorien-roosen + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist diff --git a/src/_authors/dries_schaumont.yaml b/src/_authors/dries_schaumont.yaml new file mode 100644 index 00000000..b2678081 --- /dev/null +++ b/src/_authors/dries_schaumont.yaml @@ -0,0 +1,11 @@ +name: Dries Schaumont +info: + links: + email: dries@data-intuitive.com + github: DriesSchaumont + orcid: "0000-0002-4389-0440" + linkedin: dries-schaumont + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist diff --git a/src/_authors/emma_rousseau.yaml b/src/_authors/emma_rousseau.yaml new file mode 100644 index 00000000..1a9ac456 --- /dev/null +++ b/src/_authors/emma_rousseau.yaml @@ -0,0 +1,10 @@ +name: Emma Rousseau +info: + links: + email: emma@data-intuitive.com + github: emmarousseau + linkedin: emmarousseau1 + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician diff --git a/src/_authors/jakub_majercik.yaml b/src/_authors/jakub_majercik.yaml index 3b75fffe..c2a7867d 100644 --- a/src/_authors/jakub_majercik.yaml +++ b/src/_authors/jakub_majercik.yaml @@ -7,4 +7,4 @@ info: organizations: - name: Data Intuitive href: https://www.data-intuitive.com - role: Bioinformatics Engineer \ No newline at end of file + role: Bioinformatics Engineer diff --git a/src/_authors/kai_waldrant.yaml b/src/_authors/kai_waldrant.yaml new file mode 100644 index 00000000..a132c528 --- /dev/null +++ b/src/_authors/kai_waldrant.yaml @@ -0,0 +1,14 @@ +name: Kai Waldrant +info: + links: + email: kai@data-intuitive.com + github: KaiWaldrant + orcid: "0009-0003-8555-1361" + linkedin: kaiwaldrant + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician + - name: Open Problems + href: https://openproblems.bio + role: Contributor diff --git a/src/_authors/leila_paquay.yaml b/src/_authors/leila_paquay.yaml new file mode 100644 index 00000000..21aa532d --- /dev/null +++ b/src/_authors/leila_paquay.yaml @@ -0,0 +1,10 @@ +name: Leïla Paquay +info: + links: + email: leila@data-intuitive.com + github: Leila011 + linkedin: leilapaquay + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Software Developer diff --git a/src/_authors/robrecht_cannoodt.yaml b/src/_authors/robrecht_cannoodt.yaml index d7c0f283..c4c1bdec 100644 --- a/src/_authors/robrecht_cannoodt.yaml +++ b/src/_authors/robrecht_cannoodt.yaml @@ -11,4 +11,4 @@ info: role: Data Science Engineer - name: Open Problems href: https://openproblems.bio - role: Core Member \ No newline at end of file + role: Core Member diff --git a/src/_authors/sai_nirmayi_yasa.yaml b/src/_authors/sai_nirmayi_yasa.yaml new file mode 100644 index 00000000..9f560c58 --- /dev/null +++ b/src/_authors/sai_nirmayi_yasa.yaml @@ -0,0 +1,10 @@ +name: Sai Nirmayi Yasa +info: + links: + email: nirmayi@data-intuitive.com + github: sainirmayi + linkedin: sai-nirmayi-yasa + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Junior Bioinformatics Researcher diff --git a/src/_authors/toni_verbeiren.yaml b/src/_authors/toni_verbeiren.yaml new file mode 100644 index 00000000..2f2f851f --- /dev/null +++ b/src/_authors/toni_verbeiren.yaml @@ -0,0 +1,9 @@ +name: Toni Verbeiren +info: + links: + github: tverbeiren + linkedin: verbeiren + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist and CEO diff --git a/src/_authors/weiwei_schultz.yaml b/src/_authors/weiwei_schultz.yaml index 324f9378..e4945078 100644 --- a/src/_authors/weiwei_schultz.yaml +++ b/src/_authors/weiwei_schultz.yaml @@ -2,4 +2,4 @@ name: Weiwei Schultz info: organizations: - name: Janssen R&D US - role: Associate Director Data Sciences \ No newline at end of file + role: Associate Director Data Sciences diff --git a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml index b788c7c7..757cbd85 100644 --- a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml +++ b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml @@ -27,6 +27,10 @@ links: references: doi: 10.5281/zenodo.3552717 license: GPL-3.0 +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + argument_groups: - name: Inputs arguments: diff --git a/src/arriba/config.vsh.yaml b/src/arriba/config.vsh.yaml index 8d72d7eb..db5960cf 100644 --- a/src/arriba/config.vsh.yaml +++ b/src/arriba/config.vsh.yaml @@ -11,6 +11,9 @@ license: MIT requirements: cpus: 1 commands: [ arriba ] +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/bcl_convert/config.vsh.yaml b/src/bcl_convert/config.vsh.yaml index 657fb1f0..81103776 100644 --- a/src/bcl_convert/config.vsh.yaml +++ b/src/bcl_convert/config.vsh.yaml @@ -4,6 +4,17 @@ description: | Information about upgrading from bcl2fastq via [Upgrading from bcl2fastq to BCL Convert](https://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html) and [BCL Convert Compatible Products](https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html) +keywords: [demultiplex, fastq, bcl, illumina] +links: + homepage: https://support.illumina.com/sequencing/sequencing_software/bcl-convert.html + documentation: https://support.illumina.com/downloads/bcl-convert-user-guide.html +license: Proprietary +authors: + - __merge__: /src/_authors/toni_verbeiren.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author ] + argument_groups: - name: Input arguments arguments: diff --git a/src/bedtools/bedtools_getfasta/config.vsh.yaml b/src/bedtools/bedtools_getfasta/config.vsh.yaml index f1f49a87..fe160b20 100644 --- a/src/bedtools/bedtools_getfasta/config.vsh.yaml +++ b/src/bedtools/bedtools_getfasta/config.vsh.yaml @@ -10,6 +10,9 @@ references: license: GPL-2.0 requirements: commands: [bedtools] +authors: + - __merge__: /src/_authors/dries_schaumont.yaml + roles: [ author, maintainer ] argument_groups: - name: Input arguments diff --git a/src/busco/busco_download_datasets/config.vsh.yaml b/src/busco/busco_download_datasets/config.vsh.yaml index 5297af2e..cce3faa0 100644 --- a/src/busco/busco_download_datasets/config.vsh.yaml +++ b/src/busco/busco_download_datasets/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.1007/978-1-4939-9173-0_14 license: MIT +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/busco/busco_list_datasets/config.vsh.yaml b/src/busco/busco_list_datasets/config.vsh.yaml index cac34cc6..93fd0559 100644 --- a/src/busco/busco_list_datasets/config.vsh.yaml +++ b/src/busco/busco_list_datasets/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.1007/978-1-4939-9173-0_14 license: MIT +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: Outputs arguments: diff --git a/src/busco/busco_run/config.vsh.yaml b/src/busco/busco_run/config.vsh.yaml index 23ee95fb..435e9d2a 100644 --- a/src/busco/busco_run/config.vsh.yaml +++ b/src/busco/busco_run/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.1007/978-1-4939-9173-0_14 license: MIT +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index b315d0ce..7e36a8e0 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.14806/ej.17.1.200 license: MIT +authors: + - __merge__: /src/_authors/toni_verbeiren.yaml + roles: [ author, maintainer ] argument_groups: #################################################################### - name: Specify Adapters for R1 diff --git a/src/falco/config.vsh.yaml b/src/falco/config.vsh.yaml index 4d9cf656..de9906ef 100644 --- a/src/falco/config.vsh.yaml +++ b/src/falco/config.vsh.yaml @@ -9,6 +9,9 @@ references: license: GPL-3.0 requirements: commands: [falco] +authors: + - __merge__: /src/_authors/toni_verbeiren.yaml + roles: [ author, maintainer ] # Notes: # - falco as arguments similar to -subsample and we update those to --subsample diff --git a/src/fastp/config.vsh.yaml b/src/fastp/config.vsh.yaml index b7d9062a..f1f8f1ed 100644 --- a/src/fastp/config.vsh.yaml +++ b/src/fastp/config.vsh.yaml @@ -26,6 +26,9 @@ links: references: doi: "10.1093/bioinformatics/bty560" license: MIT +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs description: | diff --git a/src/featurecounts/config.vsh.yaml b/src/featurecounts/config.vsh.yaml index 8697b1fe..e17d9ac0 100644 --- a/src/featurecounts/config.vsh.yaml +++ b/src/featurecounts/config.vsh.yaml @@ -11,7 +11,9 @@ references: license: GPL-3.0 requirements: commands: [ featureCounts ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index 7477a284..bd985ffb 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -8,6 +8,9 @@ links: references: doi: 10.12688/f1000research.23297.2 license: MIT +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/lofreq/call/config.vsh.yaml b/src/lofreq/call/config.vsh.yaml index c547de9d..286a040a 100644 --- a/src/lofreq/call/config.vsh.yaml +++ b/src/lofreq/call/config.vsh.yaml @@ -17,6 +17,9 @@ references: license: "MIT" requirements: commands: [ lofreq ] +authors: + - __merge__: /src/_authors/kai_waldrant.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/lofreq/indelqual/config.vsh.yaml b/src/lofreq/indelqual/config.vsh.yaml index 0524458e..29696c81 100644 --- a/src/lofreq/indelqual/config.vsh.yaml +++ b/src/lofreq/indelqual/config.vsh.yaml @@ -18,6 +18,9 @@ references: license: "MIT" requirements: commands: [ lofreq ] +authors: + - __merge__: /src/_authors/kai_waldrant.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/multiqc/config.vsh.yaml b/src/multiqc/config.vsh.yaml index df5e38e1..ba305025 100644 --- a/src/multiqc/config.vsh.yaml +++ b/src/multiqc/config.vsh.yaml @@ -11,7 +11,9 @@ info: references: doi: 10.1093/bioinformatics/btw354 licence: GPL v3 or later - +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: "Input" arguments: diff --git a/src/pear/config.vsh.yaml b/src/pear/config.vsh.yaml index d6dbe6c9..acae10cc 100644 --- a/src/pear/config.vsh.yaml +++ b/src/pear/config.vsh.yaml @@ -12,7 +12,10 @@ references: doi: 10.1093/bioinformatics/btt593 license: "CC-BY-NC-SA-3.0" requirements: - commands: [ pear , gzip ] + commands: [ pear, gzip ] +authors: + - __merge__: /src/_authors/kai_waldrant.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/salmon/salmon_index/config.vsh.yaml b/src/salmon/salmon_index/config.vsh.yaml index 41c1e05b..925c3000 100644 --- a/src/salmon/salmon_index/config.vsh.yaml +++ b/src/salmon/salmon_index/config.vsh.yaml @@ -12,7 +12,9 @@ references: license: GPL-3.0 requirements: commands: [ salmon ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/salmon/salmon_quant/config.vsh.yaml b/src/salmon/salmon_quant/config.vsh.yaml index b7e303f4..1f96f0c9 100644 --- a/src/salmon/salmon_quant/config.vsh.yaml +++ b/src/salmon/salmon_quant/config.vsh.yaml @@ -12,7 +12,9 @@ references: license: GPL-3.0 requirements: commands: [ salmon ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: Common input options arguments: diff --git a/src/samtools/samtools_collate/config.vsh.yaml b/src/samtools/samtools_collate/config.vsh.yaml index 669f4cdf..84a3195c 100644 --- a/src/samtools/samtools_collate/config.vsh.yaml +++ b/src/samtools/samtools_collate/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_faidx/config.vsh.yaml b/src/samtools/samtools_faidx/config.vsh.yaml index c1c9325d..937b0804 100644 --- a/src/samtools/samtools_faidx/config.vsh.yaml +++ b/src/samtools/samtools_faidx/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_fasta/config.vsh.yaml b/src/samtools/samtools_fasta/config.vsh.yaml index 23517f6c..70ba72b9 100644 --- a/src/samtools/samtools_fasta/config.vsh.yaml +++ b/src/samtools/samtools_fasta/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_fastq/config.vsh.yaml b/src/samtools/samtools_fastq/config.vsh.yaml index cac7653b..09014ced 100644 --- a/src/samtools/samtools_fastq/config.vsh.yaml +++ b/src/samtools/samtools_fastq/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_flagstat/config.vsh.yaml b/src/samtools/samtools_flagstat/config.vsh.yaml index 9b4dfbe1..b30f1867 100644 --- a/src/samtools/samtools_flagstat/config.vsh.yaml +++ b/src/samtools/samtools_flagstat/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_idxstats/config.vsh.yaml b/src/samtools/samtools_idxstats/config.vsh.yaml index 30f21348..16e901d7 100644 --- a/src/samtools/samtools_idxstats/config.vsh.yaml +++ b/src/samtools/samtools_idxstats/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_index/config.vsh.yaml b/src/samtools/samtools_index/config.vsh.yaml index 8c59a20e..4220c691 100644 --- a/src/samtools/samtools_index/config.vsh.yaml +++ b/src/samtools/samtools_index/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_sort/config.vsh.yaml b/src/samtools/samtools_sort/config.vsh.yaml index a78800da..e0776c2d 100644 --- a/src/samtools/samtools_sort/config.vsh.yaml +++ b/src/samtools/samtools_sort/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_stats/config.vsh.yaml b/src/samtools/samtools_stats/config.vsh.yaml index ca630876..b115b4df 100644 --- a/src/samtools/samtools_stats/config.vsh.yaml +++ b/src/samtools/samtools_stats/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_view/config.vsh.yaml b/src/samtools/samtools_view/config.vsh.yaml index 206b87ac..86dde146 100644 --- a/src/samtools/samtools_view/config.vsh.yaml +++ b/src/samtools/samtools_view/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml index eab65b35..bdc956d3 100644 --- a/src/star/star_align_reads/config.vsh.yaml +++ b/src/star/star_align_reads/config.vsh.yaml @@ -11,6 +11,11 @@ references: license: MIT requirements: commands: [ STAR, python, ps, zcat, bzcat ] +authors: + - __merge__: /src/_authors/angela_o_pisco.yaml + roles: [ author ] + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] # manually taking care of the main input and output arguments argument_groups: - name: Inputs diff --git a/src/star/star_genome_generate/config.vsh.yaml b/src/star/star_genome_generate/config.vsh.yaml index 3adaf7a2..60fa3839 100644 --- a/src/star/star_genome_generate/config.vsh.yaml +++ b/src/star/star_genome_generate/config.vsh.yaml @@ -11,7 +11,9 @@ references: license: MIT requirements: commands: [ STAR ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: "Input" arguments: diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml index a02e70a1..e6953e6e 100644 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -10,7 +10,9 @@ links: references: doi: 10.1101/gr.209601.116 license: MIT - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: From 2f9c7f7b38c45e856eda9e35204c8f6befc32f84 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:19:44 -0300 Subject: [PATCH 08/10] Seqtk subseq (#85) * Config file and help.txt file Created: - config.vsh.yaml - help.txt * Added script.sh File added: - script.sh * Created test.sh updates: - changes to config.vsh.yaml - created test.sh - created some test files Problems: - there is some error in config file that is preventing me from running the component and testing * Update on test.sh * update * Bug fixes - config required: false bug * Update test * Update CHANGELOG.md * Improvement on test.sh * Added more test I tried out different option of the command with different fasta and fastq files and different list, but the output does not seem to change. * Update on tests - got unstuck - I need to create a docker image with the lastest version of seqtk - * Bug fixed - removed some test files - fixed bug with the help of Toni - added correct software_versions.txt to config Still Needs: - add one more test to strand aware - fix tab test * Update CHANGELOG.md * Fixed Tabular test bug * Strand Aware Test - implementation of strand aware test - change of format for reg.bed file * Input validation for list file - input validation for name_list parameter * Sugested Changes - removed test_data dir - removed input validation * Added author info * Update CHANGELOG.md * Update theodoro_gasperin.yaml * add newline * add newline * Update src/seqtk/seqtk_subseq/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Version Fix * Update on config * Helper bed.sh * Deleted _helpers * don't forget exit when a test fails --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 30 ++-- src/_authors/theodoro_gasperin.yaml | 10 ++ src/seqtk/seqtk_subseq/config.vsh.yaml | 78 +++++++++++ src/seqtk/seqtk_subseq/help.txt | 9 ++ src/seqtk/seqtk_subseq/script.sh | 15 ++ src/seqtk/seqtk_subseq/test.sh | 182 +++++++++++++++++++++++++ 6 files changed, 305 insertions(+), 19 deletions(-) create mode 100644 src/_authors/theodoro_gasperin.yaml create mode 100644 src/seqtk/seqtk_subseq/config.vsh.yaml create mode 100644 src/seqtk/seqtk_subseq/help.txt create mode 100644 src/seqtk/seqtk_subseq/script.sh create mode 100644 src/seqtk/seqtk_subseq/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e6a0369..d6256e72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,16 @@ ## NEW FEATURES -* `bd_rhapsody`: +* `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). - - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). +* `umitools/umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54). + +* `seqtk`: + - `seqtk/seqtk_sample`: Subsamples sequences from FASTA/Q files (PR #68). + - `seqtk/seqtk_subseq`: Extract the sequences (complete or subsequence) from the FASTA/FASTQ files + based on a provided sequence IDs or region coordinates file (PR #85). + +* `agat/agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). ## MINOR CHANGES @@ -38,14 +45,8 @@ * `multiqc`: update multiple separator to `;` (PR #81). -# biobox 0.1.0 - -## BREAKING CHANGES - -* Change default `multiple_sep` to `;` (PR #25). This aligns with an upcoming breaking change in - Viash 0.9.0 in order to avoid issues with the current default separator `:` unintentionally - splitting up certain file paths. +# biobox 0.1.0 ## NEW FEATURES @@ -94,21 +95,12 @@ - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52). - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53). - * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). -* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68). - -* `umitools`: - - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54). - * `bedtools`: - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). -* `agat`: - - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). - ## MINOR CHANGES * Uniformize component metadata (PR #23). @@ -129,4 +121,4 @@ * Add escaping character before leading hashtag in the description field of the config file (PR #50). -* Format URL in biobase/bcl_convert description (PR #55). \ No newline at end of file +* Format URL in biobase/bcl_convert description (PR #55). diff --git a/src/_authors/theodoro_gasperin.yaml b/src/_authors/theodoro_gasperin.yaml new file mode 100644 index 00000000..47af96a9 --- /dev/null +++ b/src/_authors/theodoro_gasperin.yaml @@ -0,0 +1,10 @@ +name: Theodoro Gasperin Terra Camargo +info: + links: + email: theodorogtc@gmail.com + github: tgaspe + linkedin: theodoro-gasperin-terra-camargo + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml new file mode 100644 index 00000000..1c2e8c08 --- /dev/null +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -0,0 +1,78 @@ +name: seqtk_subseq +namespace: seqtk +description: | + Extract subsequences from FASTA/Q files. Takes as input a FASTA/Q file and a name.lst (sequence ids file) or a reg.bed (genomic regions file). +keywords: [subseq, FASTA, FASTQ] +links: + repository: https://github.com/lh3/seqtk/tree/v1.4 +license: MIT +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + direction: input + description: The input FASTA/Q file. + required: true + example: input.fa + + - name: "--name_list" + type: file + direction: input + description: | + List of sequence names (name.lst) or genomic regions (reg.bed) to extract. + required: true + example: list.lst + + - name: Outputs + arguments: + - name: "--output" + alternatives: -o + type: file + direction: output + description: The output FASTA/Q file. + required: true + default: output.fa + + - name: Options + arguments: + - name: "--tab" + alternatives: -t + type: boolean_true + description: TAB delimited output. + + - name: "--strand_aware" + alternatives: -s + type: boolean_true + description: Strand aware. + + - name: "--sequence_line_length" + alternatives: -l + type: integer + description: | + Sequence line length of input fasta file. Default: 0. + example: 0 + + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 + setup: + - type: docker + run: | + echo $(echo $(seqtk 2>&1) | sed -n 's/.*\(Version: [^ ]*\).*/\1/p') > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/seqtk/seqtk_subseq/help.txt b/src/seqtk/seqtk_subseq/help.txt new file mode 100644 index 00000000..5768e4ff --- /dev/null +++ b/src/seqtk/seqtk_subseq/help.txt @@ -0,0 +1,9 @@ +```bash +seqtk subseq +``` +Usage: seqtk subseq [options] | +Options: + -t TAB delimited output + -s strand aware + -l INT sequence line length [0] +Note: Use 'samtools faidx' if only a few regions are intended. \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh new file mode 100644 index 00000000..0aceaf29 --- /dev/null +++ b/src/seqtk/seqtk_subseq/script.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +[[ "$par_tab" == "false" ]] && unset par_tab +[[ "$par_strand_aware" == "false" ]] && unset par_strand_aware + +seqtk subseq \ + ${par_tab:+-t} \ + ${par_strand_aware:+-s} \ + ${par_sequence_line_length:+-l "$par_sequence_line_length"} \ + "$par_input" \ + "$par_name_list" \ + > "$par_output" diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh new file mode 100644 index 00000000..f19cfa4a --- /dev/null +++ b/src/seqtk/seqtk_subseq/test.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +# exit on error +set -e + +## VIASH START +meta_executable="target/executable/seqtk/seqtk_subseq" +meta_resources_dir="src/seqtk" +## VIASH END + +# Create directories for tests +echo "Creating Test Data..." +mkdir test_data + +# Create and populate input.fasta +cat > "test_data/input.fasta" <KU562861.1 +GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA +AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA +>GU056837.1 +CTAATTTTATTTTTTTATAATAATTATTGGAGGAACTAAAACATTAATGAAATAATAATTATCATAATTA +TTAATTACATATTTATTAGGTATAATATTTAAGGAAAAATATATTTTATGTTAATTGTAATAATTAGAAC +>CP097510.1 +CGATTTAGATCGGTGTAGTCAACACACATCCTCCACTTCCATTAGGCTTCTTGACGAGGACTACATTGAC +AGCCACCGAGGGAACCGACCTCCTCAATGAAGTCAGACGCCAAGAGCCTATCAACTTCCTTCTGCACAGC +>JAMFTS010000002.1 +CCTAAACCCTAAACCCTAAACCCCCTACAAACCTTACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA +ACCCGAAACCCTATACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCAAACCTAATCCCTAAACC +>MH150936.1 +TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTC +AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG +EOL + +# Update id.list with new entries +cat > "test_data/id.list" < "test_data/reg.bed" < Run seqtk_subseq on FASTA/Q file" +"$meta_executable" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/id.list" \ + --output "sub_sample.fq" + +expected_output_basic=">KU562861.1 +GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCAAGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA +>MH150936.1 +TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTCAAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG" +output_basic=$(cat sub_sample.fq) + +if [ "$output_basic" != "$expected_output_basic" ]; then + echo "Test failed" + echo "Expected:" + echo "$expected_output_basic" + echo "Got:" + echo "$output_basic" + exit 1 +fi + +######################################################################################### +# Run reg.bed as name list input test +cd .. +mkdir test2 +cd test2 + +echo "> Run seqtk_subseq on FASTA/Q file with BED file as name list" +"$meta_executable" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_basic=">KU562861.1:11-20 +AGTGTTCGAG +>MH150936.1:11-20 +TGAAAACTTT" +output_basic=$(cat sub_sample.fq) + +if [ "$output_basic" != "$expected_output_basic" ]; then + echo "Test failed" + echo "Expected:" + echo "$expected_output_basic" + echo "Got:" + echo "$output_basic" + exit 1 +fi + +######################################################################################### +# Run tab option output test +cd .. +mkdir test3 +cd test3 + +echo "> Run seqtk_subseq with TAB option" +"$meta_executable" \ + --tab \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_tabular=$'KU562861.1\t11\tAGTGTTCGAG\nMH150936.1\t11\tTGAAAACTTT' +output_tabular=$(cat sub_sample.fq) + +if [ "$output_tabular" != "$expected_output_tabular" ]; then + echo "Test failed" + echo "Expected:" + echo "$expected_output_tabular" + echo "Got:" + echo "$output_tabular" + exit 1 +fi + +######################################################################################### +# Run line option output test +cd .. +mkdir test4 +cd test4 + +echo "> Run seqtk_subseq with line length option" +"$meta_executable" \ + --sequence_line_length 5 \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_wrapped=">KU562861.1:11-20 +AGTGT +TCGAG +>MH150936.1:11-20 +TGAAA +ACTTT" +output_wrapped=$(cat sub_sample.fq) + +if [ "$output_wrapped" != "$expected_output_wrapped" ]; then + echo "Test failed" + echo "Expected:" + echo "$expected_output_wrapped" + echo "Got:" + echo "$output_wrapped" + exit 1 +fi + +######################################################################################### +# Run Strand Aware option output test +cd .. +mkdir test5 +cd test5 + +echo "> Run seqtk_subseq with strand aware option" +"$meta_executable" \ + --strand_aware \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_wrapped=">KU562861.1:11-20 +AGTGTTCGAG +>MH150936.1:11-20 +AAAGTTTTCA" +output_wrapped=$(cat sub_sample.fq) + +if [ "$output_wrapped" != "$expected_output_wrapped" ]; then + echo "Test failed" + echo "Expected:" + echo "$expected_output_wrapped" + echo "Got:" + echo "$output_wrapped" + exit 1 +fi + +echo "All tests succeeded!" From 3566232d39446d6ac19db154e6046e8b000f51af Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 24 Jul 2024 17:46:02 +0200 Subject: [PATCH 09/10] UMI-tools extract (#71) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * config and help files * tests template * script, preliminary version of tests * new test data * include arguments and script from the rnaseq.vsh version * new version of the script * Fixed bugs, component functional * Small formatting changes, update chagelog * Argument formatting in config * md formatting and ordering changes * remove second link to doc * Update src/umi_tools/umi_tools_extract/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * Reduce the size of the test data and add back missing arguments to config * simplify argument names * Update src/umi_tools/umi_tools_extract/config.vsh.yaml Co-authored-by: Robrecht Cannoodt * more consistent arg names, remove "paired" argument and adapt script --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 3 + .../umi_tools_extract/config.vsh.yaml | 197 ++++++++++++++++++ src/umi_tools/umi_tools_extract/help.txt | 106 ++++++++++ src/umi_tools/umi_tools_extract/script.sh | 88 ++++++++ src/umi_tools/umi_tools_extract/test.sh | 86 ++++++++ .../test_data/scrb_seq_fastq.1_30 | 120 +++++++++++ .../test_data/scrb_seq_fastq.1_30.extract | 120 +++++++++++ .../test_data/scrb_seq_fastq.2_30 | 120 +++++++++++ .../test_data/scrb_seq_fastq.2_30.extract | 120 +++++++++++ .../umi_tools_extract/test_data/script.sh | 34 +++ .../test_data/slim_30.extract | 120 +++++++++++ .../umi_tools_extract/test_data/slim_30.fastq | 120 +++++++++++ 12 files changed, 1234 insertions(+) create mode 100644 src/umi_tools/umi_tools_extract/config.vsh.yaml create mode 100644 src/umi_tools/umi_tools_extract/help.txt create mode 100644 src/umi_tools/umi_tools_extract/script.sh create mode 100644 src/umi_tools/umi_tools_extract/test.sh create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30.extract create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30 create mode 100644 src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.2_30.extract create mode 100755 src/umi_tools/umi_tools_extract/test_data/script.sh create mode 100644 src/umi_tools/umi_tools_extract/test_data/slim_30.extract create mode 100644 src/umi_tools/umi_tools_extract/test_data/slim_30.fastq diff --git a/CHANGELOG.md b/CHANGELOG.md index d6256e72..665b587d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,9 @@ - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52). - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53). +* `umi_tools`: + -`umi_tools/umi_tools_extract`: Flexible removal of UMI sequences from fastq reads (PR #71). + * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). * `bedtools`: diff --git a/src/umi_tools/umi_tools_extract/config.vsh.yaml b/src/umi_tools/umi_tools_extract/config.vsh.yaml new file mode 100644 index 00000000..b93c8cb9 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/config.vsh.yaml @@ -0,0 +1,197 @@ +name: umi_tools_extract +namespace: umi_tools +description: | + Flexible removal of UMI sequences from fastq reads. + UMIs are removed and appended to the read name. Any other barcode, for example a library barcode, + is left on the read. Can also filter reads by quality or against a whitelist. +keywords: [ extract, umi-tools, umi, fastq ] +links: + homepage: https://umi-tools.readthedocs.io/en/latest/ + documentation: https://umi-tools.readthedocs.io/en/latest/reference/extract.html + repository: https://github.com/CGATOxford/UMI-tools +references: + doi: 10.1101/gr.209601.116 +license: MIT + +argument_groups: + + - name: Input + arguments: + - name: --input + type: file + required: true + description: File containing the input data. + example: sample.fastq + - name: --read2_in + type: file + required: false + description: File containing the input data for the R2 reads (if paired). If provided, a need to be provided. + example: sample_R2.fastq + - name: --bc_pattern + alternatives: -p + type: string + description: | + The UMI barcode pattern to use e.g. 'NNNNNN' indicates that the first 6 nucleotides + of the read are from the UMI. + - name: --bc_pattern2 + type: string + description: The UMI barcode pattern to use for read 2. + + - name: "Output" + arguments: + - name: --output + type: file + required: true + description: Output file for read 1. + direction: output + - name: --read2_out + type: file + description: Output file for read 2. + direction: output + - name: --filtered_out + type: file + description: | + Write out reads not matching regex pattern or cell barcode whitelist to this file. + - name: --filtered_out2 + type: file + description: | + Write out read pairs not matching regex pattern or cell barcode whitelist to this file. + + - name: Extract Options + arguments: + - name: --extract_method + type: string + choices: [string, regex] + description: | + UMI pattern to use. Default: `string`. + example: "string" + - name: --error_correct_cell + type: boolean_true + description: Error correct cell barcodes to the whitelist. + - name: --whitelist + type: file + description: | + Whitelist of accepted cell barcodes tab-separated format, where column 1 is the whitelisted + cell barcodes and column 2 is the list (comma-separated) of other cell barcodes which should + be corrected to the barcode in column 1. If the --error_correct_cell option is not used, this + column will be ignored. + - name: --blacklist + type: file + description: BlackWhitelist of cell barcodes to discard. + - name: --subset_reads + type: integer + description: Only parse the first N reads. + - name: --quality_filter_threshold + type: integer + description: Remove reads where any UMI base quality score falls below this threshold. + - name: --quality_filter_mask + type: string + description: | + If a UMI base has a quality below this threshold, replace the base with 'N'. + - name: --quality_encoding + type: string + choices: [phred33, phred64, solexa] + description: | + Quality score encoding. Choose from: + * phred33 [33-77] + * phred64 [64-106] + * solexa [59-106] + - name: --reconcile_pairs + type: boolean_true + description: | + Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols + where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard + to the read2. + - name: --three_prime + alternatives: --3prime + type: boolean_true + description: | + By default the barcode is assumed to be on the 5' end of the read, but use this option to sepecify that it is + on the 3' end instead. This option only works with --extract_method=string since 3' encoding can be specified + explicitly with a regex, e.g `.*(?P.{5})$`. + - name: --ignore_read_pair_suffixes + type: boolean_true + description: | + Ignore "/1" and "/2" read name suffixes. Note that this options is required if the suffixes are not whitespace + separated from the rest of the read name. + arguments: + - name: --umi_separator + type: string + description: | + The character that separates the UMI in the read name. Most likely a colon if you skipped the extraction with + UMI-tools and used other software. Default: `_` + example: "_" + - name: --grouping_method + type: string + choices: [unique, percentile, cluster, adjacency, directional] + description: | + Method to use to determine read groups by subsuming those with similar UMIs. All methods start by identifying + the reads with the same mapping position, but treat similar yet nonidentical UMIs differently. Default: `directional` + example: "directional" + - name: --umi_discard_read + type: integer + choices: [0, 1, 2] + description: | + After UMI barcode extraction discard either R1 or R2 by setting this parameter to 1 or 2, respectively. Default: `0` + example: 0 + + - name: Common Options + arguments: + - name: --log + type: file + description: File with logging information. + direction: output + - name: --log2stderr + type: boolean_true + description: Send logging information to stderr. + direction: output + - name: --verbose + type: integer + description: Log level. The higher, the more output. + - name: --error + type: file + description: File with error information. + direction: output + - name: --temp_dir + type: string + description: | + Directory for temporary files. If not set, the bash environmental variable TMPDIR is used. + - name: --compresslevel + type: integer + description: | + Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9). + Default `6`. + example: 6 + - name: --timeit + type: file + description: Store timing information in file. + direction: output + - name: --timeit_name + type: string + description: Name in timing file for this class of jobs. + default: all + - name: --timeit_header + type: boolean_true + description: Add header for timing information. + - name: --random_seed + type: integer + description: Random seed to initialize number generator with. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.4--py310h4b81fae_2 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/help.txt b/src/umi_tools/umi_tools_extract/help.txt new file mode 100644 index 00000000..46c77ed0 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/help.txt @@ -0,0 +1,106 @@ +''' +Generated from the following UMI-tools documentation: + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options + https://umi-tools.readthedocs.io/en/latest/reference/extract.html +''' + +extract - Extract UMI from fastq + +Usage: + + Single-end: + umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]] + + Paired end: + umi_tools extract [OPTIONS] -p PATTERN [-I IN_FASTQ[.gz]] [-S OUT_FASTQ[.gz]] --read2-in=IN2_FASTQ[.gz] --read2-out=OUT2_FASTQ[.gz] + + note: If -I/-S are ommited standard in and standard out are used + for input and output. To generate a valid BAM file on + standard out, please redirect log with --log=LOGFILE or + --log2stderr. Input/Output will be (de)compressed if a + filename provided to -S/-I/--read2-in/read2-out ends in .gz + +Common UMI-tools Options: + + -S, --stdout File where output is to go [default = stdout]. + -L, --log File with logging information [default = stdout]. + --log2stderr Send logging information to stderr [default = False]. + -v, --verbose Log level. The higher, the more output [default = 1]. + -E, --error File with error information [default = stderr]. + --temp-dir Directory for temporary files. If not set, the bash environmental variable TMPDIR is used[default = None]. + --compresslevel Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9) + + profiling and debugging options: + --timeit Store timing information in file [default=none]. + --timeit-name Name in timing file for this class of jobs [default=all]. + --timeit-header Add header for timing information [default=none]. + --random-seed Random seed to initialize number generator with [default=none]. + +Extract Options: + -I, --stdin File containing the input data [default = stdin]. + --error-correct-cell Error correct cell barcodes to the whitelist (see --whitelist) + --whitelist Whitelist of accepted cell barcodes. The whitelist should be in the following format (tab-separated): + AAAAAA AGAAAA + AAAATC + AAACAT + AAACTA AAACTN,GAACTA + AAATAC + AAATCA GAATCA + AAATGT AAAGGT,CAATGT + Where column 1 is the whitelisted cell barcodes and column 2 is the list (comma-separated) of other cell + barcodes which should be corrected to the barcode in column 1. If the --error-correct-cell option is not + used, this column will be ignored. Any additional columns in the whitelist input, such as the counts columns + from the output of umi_tools whitelist, will be ignored. + --blacklist BlackWhitelist of cell barcodes to discard + --subset-reads=[N] Only parse the first N reads + --quality-filter-threshold Remove reads where any UMI base quality score falls below this threshold + --quality-filter-mask If a UMI base has a quality below this threshold, replace the base with 'N' + --quality-encoding Quality score encoding. Choose from: + 'phred33' [33-77] + 'phred64' [64-106] + 'solexa' [59-106] + --reconcile-pairs Allow read 2 infile to contain reads not in read 1 infile. This enables support for upstream protocols + where read one contains cell barcodes, and the read pairs have been filtered and corrected without regard + to the read2s. + +Experimental options: + Note: These options have not been extensively testing to ensure behaviour is as expected. If you have some suitable input files which + we can use for testing, please contact us. + If you have a library preparation method where the UMI may be in either read, you can use the following options to search for the + UMI in either read: + + --either-read --extract-method --bc-pattern=[PATTERN1] --bc-pattern2=[PATTERN2] + + Where both patterns match, the default behaviour is to discard both reads. If you want to select the read with the UMI with highest + sequence quality, provide --either-read-resolve=quality. + + + --bc-pattern Pattern for barcode(s) on read 1. See --extract-method + --bc-pattern2 Pattern for barcode(s) on read 2. See --extract-method + --extract-method There are two methods enabled to extract the umi barcode (+/- cell barcode). For both methods, the patterns + should be provided using the --bc-pattern and --bc-pattern2 options.x + string: + This should be used where the barcodes are always in the same place in the read. + N = UMI position (required) + C = cell barcode position (optional) + X = sample position (optional) + Bases with Ns and Cs will be extracted and added to the read name. The corresponding sequence qualities will + be removed from the read. Bases with an X will be reattached to the read. + regex: + This method allows for more flexible barcode extraction and should be used where the cell barcodes are variable + in length. Alternatively, the regex option can also be used to filter out reads which do not contain an expected + adapter sequence. The regex must contain groups to define how the barcodes are encoded in the read. + The expected groups in the regex are: + umi_n = UMI positions, where n can be any value (required) + cell_n = cell barcode positions, where n can be any value (optional) + discard_n = positions to discard, where n can be any value (optional) + --3prime By default the barcode is assumed to be on the 5' end of the read, but use this option to sepecify that it is + on the 3' end instead. This option only works with --extract-method=string since 3' encoding can be specified + explicitly with a regex, e.g .*(?P.{5})$ + --read2-in Filename for read pairs + --filtered-out Write out reads not matching regex pattern or cell barcode whitelist to this file + --filtered-out2 Write out read pairs not matching regex pattern or cell barcode whitelist to this file + --ignore-read-pair-suffixes Ignore SOH and STX read name suffixes. Note that this options is required if the suffixes are not whitespace + separated from the rest of the read name + +For full UMI-tools documentation, see https://umi-tools.readthedocs.io/en/latest/ \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/script.sh b/src/umi_tools/umi_tools_extract/script.sh new file mode 100644 index 00000000..5e41865d --- /dev/null +++ b/src/umi_tools/umi_tools_extract/script.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -exo pipefail + +test_dir="${metal_executable}/test_data" + +[[ "$par_error_correct_cell" == "false" ]] && unset par_error_correct_cell +[[ "$par_reconcile_pairs" == "false" ]] && unset par_reconcile_pairs +[[ "$par_three_prime" == "false" ]] && unset par_three_prime +[[ "$par_ignore_read_pair_suffixes" == "false" ]] && unset par_ignore_read_pair_suffixes +[[ "$par_timeit_header" == "false" ]] && unset par_timeit_header +[[ "$par_log2stderr" == "false" ]] && unset par_log2stderr + + +# Check if we have the correct number of input files and patterns for paired-end or single-end reads + +# For paired-end rends, check that we have two read files, two patterns +# Check for paired-end inputs +if [ -n "$par_input" ] && [ -n "$par_read2_in" ]; then + # Paired-end checks: Ensure both UMI patterns are provided + if [ -z "$par_bc_pattern" ] || [ -z "$par_bc_pattern2" ]; then + echo "Paired end input requires two UMI patterns." + exit 1 + fi +elif [ -n "$par_input" ]; then + # Single-end checks: Ensure no second read or UMI pattern for the second read is provided + if [ -n "$par_bc_pattern2" ]; then + echo "Single end input requires only one read file and one UMI pattern." + exit 1 + fi + # Check that discard_read is not set or set to 0 for single-end reads + if [ -n "$par_umi_discard_read" ] && [ "$par_umi_discard_read" != 0 ]; then + echo "umi_discard_read is only valid when processing paired end reads." + exit 1 + fi +else + # No inputs provided + echo "No input files provided." + exit 1 +fi + + + + +umi_tools extract \ + -I "$par_input" \ + ${par_read2_in:+ --read2-in "$par_read2_in"} \ + -S "$par_output" \ + ${par_read2_out:+--read2-out "$par_read2_out"} \ + ${par_extract_method:+--extract-method "$par_extract_method"} \ + --bc-pattern "$par_bc_pattern" \ + ${par_bc_pattern2:+ --bc-pattern2 "$par_bc_pattern2"} \ + ${par_umi_separator:+--umi-separator "$par_umi_separator"} \ + ${par_output_stats:+--output-stats "$par_output_stats"} \ + ${par_error_correct_cell:+--error-correct-cell} \ + ${par_whitelist:+--whitelist "$par_whitelist"} \ + ${par_blacklist:+--blacklist "$par_blacklist"} \ + ${par_subset_reads:+--subset-reads "$par_subset_reads"} \ + ${par_quality_filter_threshold:+--quality-filter-threshold "$par_quality_filter_threshold"} \ + ${par_quality_filter_mask:+--quality-filter-mask "$par_quality_filter_mask"} \ + ${par_quality_encoding:+--quality-encoding "$par_quality_encoding"} \ + ${par_reconcile_pairs:+--reconcile-pairs} \ + ${par_three_prime:+--3prime} \ + ${par_filtered_out:+--filtered-out "$par_filtered_out"} \ + ${par_filtered_out2:+--filtered-out2 "$par_filtered_out2"} \ + ${par_ignore_read_pair_suffixes:+--ignore-read-pair-suffixes} \ + ${par_random_seed:+--random-seed "$par_random_seed"} \ + ${par_temp_dir:+--temp-dir "$par_temp_dir"} \ + ${par_compresslevel:+--compresslevel "$par_compresslevel"} \ + ${par_timeit:+--timeit "$par_timeit"} \ + ${par_timeit_name:+--timeit-name "$par_timeit_name"} \ + ${par_timeit_header:+--timeit-header} \ + ${par_log:+--log "$par_log"} \ + ${par_log2stderr:+--log2stderr} \ + ${par_verbose:+--verbose "$par_verbose"} \ + ${par_error:+--error "$par_error"} + + +if [ "$par_umi_discard_read" == 1 ]; then + # discard read 1 + rm "$par_read1_out" +elif [ "$par_umi_discard_read" == 2 ]; then + # discard read 2 (-f to bypass file existence check) + rm -f "$par_read2_out" +fi \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/test.sh b/src/umi_tools/umi_tools_extract/test.sh new file mode 100644 index 00000000..0de5d5b3 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +test_dir="${meta_resources_dir}/test_data" + +echo ">>> Testing $meta_functionality_name" + +############################################################################################################ + +echo ">>> Test 1: Testing for paired-end reads" +"$meta_executable" \ + --input "$test_dir/scrb_seq_fastq.1_30"\ + --read2_in "$test_dir/scrb_seq_fastq.2_30" \ + --bc_pattern "CCCCCCNNNNNNNNNN"\ + --bc_pattern2 "CCCCCCNNNNNNNNNN" \ + --extract_method string \ + --umi_separator '_' \ + --grouping_method directional \ + --umi_discard_read 0 \ + --output scrb_seq_fastq.1_30.extract \ + --read2_out scrb_seq_fastq.2_30.extract \ + --random_seed 1 + +echo ">> Checking if the correct files are present" +[[ ! -f "scrb_seq_fastq.1_30.extract" ]] || [[ ! -f "scrb_seq_fastq.2_30.extract" ]] && echo "Reads file missing" && exit 1 +[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1 +[ ! -s "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 file is empty" && exit 1 + + +echo ">> Checking if the files are correct" +diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \ + (echo "Read 1 file is not correct" && exit 1) +diff -q "${meta_resources_dir}/scrb_seq_fastq.2_30.extract" "$test_dir/scrb_seq_fastq.2_30.extract" || \ + (echo "Read 2 file is not correct" && exit 1) + +rm scrb_seq_fastq.1_30.extract scrb_seq_fastq.2_30.extract + +############################################################################################################ + +echo ">>> Test 2: Testing for paired-end reads with umi_discard_reads option" +"$meta_executable" \ + --input "$test_dir/scrb_seq_fastq.1_30" \ + --read2_in "$test_dir/scrb_seq_fastq.2_30" \ + --bc_pattern CCCCCCNNNNNNNNNN \ + --bc_pattern2 CCCCCCNNNNNNNNNN \ + --extract_method string \ + --umi_separator '_' \ + --grouping_method directional \ + --umi_discard_read 2 \ + --output scrb_seq_fastq.1_30.extract \ + --random_seed 1 + +echo ">> Checking if the correct files are present" +[ ! -f "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is missing" && exit 1 +[ ! -s "scrb_seq_fastq.1_30.extract" ] && echo "Read 1 file is empty" && exit 1 +[ -f "scrb_seq_fastq.2_30.extract" ] && echo "Read 2 is not discarded" && exit 1 + +echo ">> Checking if the files are correct" +diff -q "${meta_resources_dir}/scrb_seq_fastq.1_30.extract" "$test_dir/scrb_seq_fastq.1_30.extract" || \ + (echo "Read 1 file is not correct" && exit 1) + +rm scrb_seq_fastq.1_30.extract + +############################################################################################################ + +echo ">>> Test 3: Testing for single-end reads" +"$meta_executable" \ + --input "$test_dir/slim_30.fastq" \ + --bc_pattern "^(?P.{3}).{4}(?P.{2})" \ + --extract_method regex \ + --umi_separator '_' \ + --grouping_method directional \ + --output slim_30.extract \ + --random_seed 1 + +echo ">> Checking if the correct files are present" +[ ! -f "slim_30.extract" ] && echo "Trimmed reads file missing" && exit 1 +[ ! -s "slim_30.extract" ] && echo "Trimmed reads file is empty" && exit 1 + +echo ">> Checking if the files are correct" +diff -q "${meta_resources_dir}/slim_30.extract" "$test_dir/slim_30.extract" || \ + (echo "Trimmed reads file is not correct" && exit 1) + +rm slim_30.extract + +echo ">>> Test finished successfully" +exit 0 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 new file mode 100644 index 00000000..639f6243 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test_data/scrb_seq_fastq.1_30 @@ -0,0 +1,120 @@ +@SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17 +AATAACTTCCCGCGTCG ++SRR1058032.1 HISEQ:653:H12WDADXX:1:1101:1210:2217 length=17 +@@@DDDBDDF>FFHGIB +@SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17 +AGCGGGGTGCTCGTCGT ++SRR1058032.2 HISEQ:653:H12WDADXX:1:1101:1191:2236 length=17 +CCCFFFFFHHHHHJJJJ +@SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17 +CTTTAGTACCAGTCCTT ++SRR1058032.3 HISEQ:653:H12WDADXX:1:1101:1715:2245 length=17 +BBCFFDADHHHHHHIJJ +@SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17 +AGGCGTTGTTTTTTTTT ++SRR1058032.4 HISEQ:653:H12WDADXX:1:1101:1905:2212 length=17 +CCCFFFFFHHHHHJJJJ +@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17 +ATCGAGACATAATTGAT ++SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=17 +@B@FFFFFHHHHHJJJJ +@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17 +TGGGGGCGGTACATGAT ++SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=17 +BBBFFFFFHHHHHJJJJ +@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17 +CTATATGTTTGCGCTGT ++SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=17 +1=BDFFFFHHHHHJJJJ +@SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17 +CTCCCGCATGCTGCTGT ++SRR1058032.8 HISEQ:653:H12WDADXX:1:1101:2513:2219 length=17 +?BBFFFFFHHHHHJJJJ +@SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17 +GAGCCCTGAGGGGATCT ++SRR1058032.9 HISEQ:653:H12WDADXX:1:1101:2604:2231 length=17 +1??DDDFD>DFDGFGHG +@SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17 +AGCGGGGTTCGCGGTTT ++SRR1058032.10 HISEQ:653:H12WDADXX:1:1101:2936:2218 length=17 +CCCFFFFFHHHHHJIJI +@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17 +AGAATTGCCTGGATTTT ++SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=17 +@CCFFFFAFHHHGJJJJ +@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17 +AGGCGGGGCAACGGGTT ++SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=17 +CCCFFFFFHHGHHJJHH +@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17 +GTCCCCGCGTCGTGTAG ++SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=17 +@C@FFFFFHFFGHJJJJ +@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17 +CCACGCATTCACTCGGT ++SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=17 +BBBDFFFFHHHHHJJJJ +@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17 +TGCGCAATAAGCGCTAT ++SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=17 ++:=DDDDDBHHGDIBEH +@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17 +CGCTGGCAGAGCCCGGT ++SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=17 +@BCFFFFFHHHHHJJJJ +@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17 +AGGCGGTGCATAGTCTT ++SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=17 +CCCFFFFFHHHHHIJIH +@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17 +GTCCCCCGCGTGTGACT ++SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=17 +GEGCDG9FD# +@SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34 +GTGTAGGGAAAGAGTGTAAGGAAAGAGTGTAGCN ++SRR1058032.5 HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34 +?=??B?DB2ACCAEAEFHHIHHHIHFHCEHHIG# +@SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34 +CCTATATAGTATAGCTTCCCATCTTCTTTGAGAN ++SRR1058032.6 HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34 +CCCFFFFFHDHBHEIIJJJJIIIJJJGGGIGIE# +@SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34 +ATTAAAGACAAACTACAACTCATATGAGGCATTN ++SRR1058032.7 HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34 +@@@DDDADDHHHFBFAHIGBHHFAH;E@@?AB>F@BF3;3?1C?<# +@SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34 +CCCACACTCTTTCCCTACACGACGCTACACTCTN ++SRR1058032.11 HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34 +@@@DDFDDBHBFHGI)C:D@@@B# +@SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34 +GTGTATGGAAAGAGTGTAGGGAAAGAGTGTAGGN ++SRR1058032.12 HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34 +@@@DDDDAHHHFHIABEEEAB??CFBF?C@BFF# +@SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34 +CTCTTTCCCTACACTCTTTCCCTACACGACGCTN ++SRR1058032.13 HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34 +@@@DDDAAADHDHDGDGIIIIIJJJJJJIJIIJ# +@SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34 +GTGTAGCGTCGTGTAGGGAAAGAGTGTGTGGAAN ++SRR1058032.14 HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34 +@@@DDDDD?DFDCAEFHIGGFHEH:D1C:CG@F# +@SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34 +GTGTATGGAAAGAGTGTGCGTCGTACGTGTAGAN ++SRR1058032.15 HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34 +@?@DDFFFHHHHGDAC:CHGGIIGIIIFHFGHB# +@SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34 +ACTCTTTCCCTACACTCTTTCCCTACACGACGCN ++SRR1058032.16 HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34 +@@BCBE@9;EGGGGGIHJJIJHIGG# +@SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34 +GATTCTTCAAATGAGGACTATGCGGGACATGAAN ++SRR1058032.17 HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34 +@@@DDDDDFHHFAHB;FHIIIIIIIIFHEHIHI# +@SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34 +GCGTCGTGTAGGGAAAGAGTGTAGCGTCGTGTAN ++SRR1058032.18 HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34 +@@@DDDDDBHEGGFGHGGIEGII# +@SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34 +AGACGGACCAGAGCGAAAGCATTTGCCAAGAATN ++SRR1058032.24 HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34 +CCCFFFDFGHHHGJIIJJIJHEDD919CGGHJ@# +@SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34 +GAGTATAGGGAAAGAGTTTTTTTTTTTTTTTTTN ++SRR1058032.25 HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34 +?=?DDDD>AB:ACEEGHIJJIJJJJIIJJHFDD# +@SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34 +CCTTTTATACAATACAAAGCTTTGCTTTTTTTTN ++SRR1058032.26 HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34 +???DDDDDDDDD4EEEII@A<:33<33,22110# +@SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34 +ATCACAGCTGGAGAGATCTTGATCTTCATGGTGN ++SRR1058032.27 HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34 +CCCFFFFFHHFHGGIIIIJIEAHCEHHEFECGD# +@SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34 +GTACTAGGCATCGTCATCCAATGCGACGAGTCCN ++SRR1058032.28 HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34 +@@CFFDDFHHGHHIJJJIJJJIGGHIDGGEGCDG9FD# +@SRR1058032.5_ATCGAGGTGTAG_ACATAATTGAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:1927:2237 length=34 +TAAGGAAAGAGTGTAGCN ++ +FHHIHHHIHFHCEHHIG# +@SRR1058032.6_TGGGGGCCTATA_CGGTACATGATAGTATAGCT HISEQ:653:H12WDADXX:1:1101:1876:2243 length=34 +TCCCATCTTCTTTGAGAN ++ +JJJJIIIJJJGGGIGIE# +@SRR1058032.7_CTATATATTAAA_GTTTGCGCTGGACAAACTAC HISEQ:653:H12WDADXX:1:1101:2491:2207 length=34 +AACTCATATGAGGCATTN ++ +HIGBHHF@BF3;3?1C?<# +@SRR1058032.11_AGAATTCCCACA_GCCTGGATTTCTCTTTCCCT HISEQ:653:H12WDADXX:1:1101:3447:2241 length=34 +ACACGACGCTACACTCTN ++ +F@GFBFEE>)C:D@@@B# +@SRR1058032.12_AGGCGGGTGTAT_GGCAACGGGTGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:3620:2196 length=34 +TAGGGAAAGAGTGTAGGN ++ +EEEAB??CFBF?C@BFF# +@SRR1058032.13_GTCCCCCTCTTT_GCGTCGTGTACCCTACACTC HISEQ:653:H12WDADXX:1:1101:3875:2206 length=34 +TTTCCCTACACGACGCTN ++ +GIIIIIJJJJJJIJIIJ# +@SRR1058032.14_CCACGCGTGTAG_ATTCACTCGGCGTCGTGTAG HISEQ:653:H12WDADXX:1:1101:4131:2215 length=34 +GGAAAGAGTGTGTGGAAN ++ +HIGGFHEH:D1C:CG@F# +@SRR1058032.15_TGCGCAGTGTAT_ATAAGCGCTAGGAAAGAGTG HISEQ:653:H12WDADXX:1:1101:4284:2241 length=34 +TGCGTCGTACGTGTAGAN ++ +:CHGGIIGIIIFHFGHB# +@SRR1058032.16_CGCTGGACTCTT_CAGAGCCCGGTCCCTACACT HISEQ:653:H12WDADXX:1:1101:4599:2232 length=34 +CTTTCCCTACACGACGCN ++ +;EGGGGGIHJJIJHIGG# +@SRR1058032.17_AGGCGGGATTCT_TGCATAGTCTTCAAATGAGG HISEQ:653:H12WDADXX:1:1101:5428:2200 length=34 +ACTATGCGGGACATGAAN ++ +FHIIIIIIIIFHEHIHI# +@SRR1058032.18_GTCCCCGCGTCG_CGCGTGTGACTGTAGGGAAA HISEQ:653:H12WDADXX:1:1101:5336:2218 length=34 +GAGTGTAGCGTCGTGTAN ++ +DGF+<BHEGGFGHGGIEGII# +@SRR1058032.24_CGTTAAAGACGG_TAATTGTGGTACCAGAGCGA HISEQ:653:H12WDADXX:1:1101:5649:2244 length=34 +AAGCATTTGCCAAGAATN ++ +JJIJHEDD919CGGHJ@# +@SRR1058032.25_AAAAAAGAGTAT_AAAAAAAAAAAGGGAAAGAG HISEQ:653:H12WDADXX:1:1101:5910:2207 length=34 +TTTTTTTTTTTTTTTTTN ++ +HIJJIJJJJIIJJHFDD# +@SRR1058032.26_GCCGACCCTTTT_CAACGATTTTATACAATACA HISEQ:653:H12WDADXX:1:1101:5757:2217 length=34 +AAGCTTTGCTTTTTTTTN ++ +II@A<:33<33,22110# +@SRR1058032.27_AATCAAATCACA_GACCACTGAAGCTGGAGAGA HISEQ:653:H12WDADXX:1:1101:5790:2248 length=34 +TCTTGATCTTCATGGTGN ++ +IIJIEAHCEHHEFECGD# +@SRR1058032.28_CGCGCTGTACTA_TTTGTTTTTTGGCATCGTCA HISEQ:653:H12WDADXX:1:1101:6079:2195 length=34 +TCCAATGCGACGAGTCCN ++ +JIJJJIGGHIDG slim_30.fastq +head -n 120 scrb_seq_fastq.1 > scrb_seq_fastq.1_30 +head -n 120 scrb_seq_fastq.2 > scrb_seq_fastq.2_30 +rm slim.fastq scrb_seq_fastq.1 scrb_seq_fastq.2 + +# Generate expected output +# Test 1 and 2 +umi_tools extract \ + --stdin "scrb_seq_fastq.1_30" \ + --read2-in "scrb_seq_fastq.2_30" \ + --bc-pattern "CCCCCCNNNNNNNNNN" \ + --bc-pattern2 "CCCCCCNNNNNNNNNN" \ + --extract-method string \ + --stdout scrb_seq_fastq.1_30.extract \ + --read2-out scrb_seq_fastq.2_30.extract \ + --random-seed 1 + +# Test 3 +umi_tools extract \ + --stdin "slim_30.fastq" \ + --bc-pattern "^(?P.{3}).{4}(?P.{2})" \ + --extract-method regex \ + --stdout slim_30.extract \ + --random-seed 1 \ No newline at end of file diff --git a/src/umi_tools/umi_tools_extract/test_data/slim_30.extract b/src/umi_tools/umi_tools_extract/test_data/slim_30.extract new file mode 100644 index 00000000..1c20f782 --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test_data/slim_30.extract @@ -0,0 +1,120 @@ +@SRR2057595.7_CAGAA +GTTCTCTCGGTGGGACCTC ++ +FFFFHHHJJJFGIJIJJIJ +@SRR2057595.9_TTGAA +GTTCTCTGATGCCCTCTTCTGGTGCATCTGAAGACAGCTACAGTGTACTTAGATATAATAAATAAATCTT ++ +FDBDFHHIGGEHJGGIHGHGGCAFCHGIGEHIJJJJIJJJIHIIIIIIJIIIIIGHIIGGIJGIIJIIJ@ +@SRR2057595.14_TGGAT +GTTAGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +FFFFHHHJJIJJJJIGHJJIIJJJJJIJHFHHFFEDEEEEDDDDBDDDD +@SRR2057595.22_ACGAT +GTTAGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGC ++ +FFFFHHHJJJJJJJJIJJJJJJJJJJJJHHHFFFEDEEEEDDDDBDDD +@SRR2057595.23_GCGTT +GTTACCTAAGGCGAGCTCAGGGAGGACAGAAACCTCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCT ++ +FFFFHHHJJJJJJJJJJJJJJJIJJIIJJJJJJJJJJJJIJJHHHHHFFFFDDDDDDDDDDDDDDDDDDA +@SRR2057595.29_ACGTT +GTTCGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTT ++ +FFFFHHHJJJJJJJJHIJJJJJJIJJJJHHHFFDEDEEDDCDDDBDDDDD +@SRR2057595.30_GAGAA +GTTGAATCCGTGCTAAGAAGAA ++ +DFFFHHHJJJJIJJJJJJJJJJ +@SRR2057595.33_TCGAT +GTTTCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTGGTTAGTACTTGGATGGGAGACCGCCTGGG ++ +FFFFHHHJJJJJJJJJJJJJJJJJJJJJJJJJDHIJJJJIJJJHGGEEHFFFFFFEDDEDDDDDDDDDDB +@SRR2057595.35_ACGCT +GTTACCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +DFFFHHHJJJJJJIJJJJJJIJJJJJJJHIIJJ +@SRR2057595.38_GGGCC +GTTATGCATGTTTATAGTTTCTAGTTTTGGCATTTTGTGTGGTCTCTTTTTTGTT ++ +DFFFHHHJJJJJJJJJJHJJIJJJIJJJJJJJJJJJJGIGHJHIJJIJJJJJJJJ +@SRR2057595.42_TAGGA +GTTGTAAGTTATACACTGACTAAGTCATCTGTTACTGCCTTCACTGAGTTTTTATTTCCTTT ++ +DFFFHHHJJJJJJJJJJJJJJJJJIIJJJJGJJJJJJJJJJJJJJJIIHIJJJJJJJIJJJI +@SRR2057595.45_CTGGC +GTTTTGCGGAAGGATCATTA ++ +DDDDFFDFFAGFEB@ACB9< +@SRR2057595.65_GCGCG +GTTTGAGCTTGCTCCGTCCACTCAACGCATCGACCTGGTATTGCAGTACCTCCAGGAACGGTGCACCAAG ++ +FFFFHHHJJJJJJHJIHHIIIIIIIJHJBHIHBFHHJI@EHJJHHHHHHHFFFBDE?AEBD=AB@CDBD? +@SRR2057595.67_AAGGT +GTTGTTTTGAGGTCCTGCTCGTGCAGGGT ++ +DDDFHHHHGFHGFGGIIDGHHIGIJJJJ9 +@SRR2057595.69_ATTAT +GGTTTTTGTTTTTCCTCCTTCTCTTTCTAAA ++ +FFFFHHHHJJJJJJJJJJJJJJJJJJIJIJJ +@SRR2057595.70_TTAAA +GGTTTTGTAATTTTATGAGGTCCCATTTGTCAATTCTT ++ +DDDD2CDFA@FBGHCCHFHGBFHGHIGGDHGHIIFCFF +@SRR2057595.71_TGCCA +GGTTTATTAGCATGGCCCCTGCGCAAGGATGACACGCAAATTCGTGAAGCGTTCCATATTT ++ +FFFFHGHHJJJJJJJJJJIIJJIJIJJIFHJIIIJJJIJJJJJJHIIHHHHFFFDEECEEE +@SRR2057595.73_TGACA +GGTTGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACTGGCGCTGCGGGA ++ +FFFFGFFHC@EBHGHGAEGIIHIIIIJJJJGHIIIJIJIIGHIJIJJIGGEFD +@SRR2057595.74_AATTC +GGTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ++ +FFFFDFFHFIJJJGGGGJJGDDDDDDDDDDDDDBDDDDDDBBDDDDDDDDDDDDDDDDDDDBBBDDDDBD> +@SRR2057595.77_GCGGA +GTTCTCCCACTTCTGAC ++ +FFFFDHHHIJJJIJJJJ +@SRR2057595.82_GAGAC +GGTTTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +FFFFHHHHJJJJJJJJJJJJJJJJIJJJJJIJIIJJJH +@SRR2057595.83_TGGAT +GTTGCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++ +DFFFHHHJJIJJJJJJIJJJIJJIGGHIFHGEH +@SRR2057595.86_ACCAC +GGTTTTTTTTTAAATGTAAAGCATAAATAAAAAGCCTTTGTGGACTGTGAAAAAAAAAAAAAAAAAAAAAA ++ +FFFFHHHHJJJJJJJJIIJJJJJJJJJIJJJJJJJJJJJJGIJIIJJIJJJJJJHFDDDDDDDDDDDDDB> +@SRR2057595.88_TCAGC +GGTTCTAAGCATAGATAACCATATATCAGGGGGAGCTCCATGTTCTAGTCCTGCAAGCGCCTGGGCAATAA ++ +FFFFHHHHJJJJJJIJJJJJIJJJJJJIJJIJJIJJJJJJJJJHIJJJJJJIIIHJIHHHFFDDDDEDDD@ +@SRR2057595.99_TGACA +GGTTTCGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGT ++ +FFFFDHHHIHIIIJJIJJJJJIGEHGFHIJJGHIHADHIIJIJJJIJG diff --git a/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq b/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq new file mode 100644 index 00000000..444a7a7a --- /dev/null +++ b/src/umi_tools/umi_tools_extract/test_data/slim_30.fastq @@ -0,0 +1,120 @@ +@SRR2057595.7 +CAGGTTCAATCTCGGTGGGACCTC ++SRR2057595.7 +1=DFFFFHHHHHJJJFGIJIJJIJ +@SRR2057595.9 +TTGGTTCAATCTGATGCCCTCTTCTGGTGCATCTGAAGACAGCTACAGTGTACTTAGATATAATAAATAAATCTT ++SRR2057595.9 +4=DFDBDHHFHHIGGEHJGGIHGHGGCAFCHGIGEHIJJJJIJJJIHIIIIIIJIIIIIGHIIGGIJGIIJIIJ@ +@SRR2057595.14 +TGGGTTAATGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.14 +1=DFFFFHHHHHJJIJJJJIGHJJIIJJJJJIJHFHHFFEDEEEEDDDDBDDDD +@SRR2057595.22 +ACGGTTAATGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGC ++SRR2057595.22 +1=DFFFFHHHHHJJJJJJJJIJJJJJJJJJJJJHHHFFFEDEEEEDDDDBDDD +@SRR2057595.23 +GCGGTTATTCCTAAGGCGAGCTCAGGGAGGACAGAAACCTCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCT ++SRR2057595.23 +1=DFFFFHHHHHJJJJJJJJJJJJJJJIJJIIJJJJJJJJJJJJIJJHHHHHFFFFDDDDDDDDDDDDDDDDDDA +@SRR2057595.29 +ACGGTTCTTGCGGCCCCGGGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTT ++SRR2057595.29 +1=DFFFFHHHHHJJJJJJJJHIJJJJJJIJJJJHHHFFDEDEEDDCDDDBDDDDD +@SRR2057595.30 +GAGGTTGAAAATCCGTGCTAAGAAGAA ++SRR2057595.30 +4=DDFFFHHHHHJJJJIJJJJJJJJJJ +@SRR2057595.33 +TCGGTTTATCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTGGTTAGTACTTGGATGGGAGACCGCCTGGG ++SRR2057595.33 +1=DFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJDHIJJJJIJJJHGGEEHFFFFFFEDDEDDDDDDDDDDB +@SRR2057595.35 +ACGGTTACTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.35 +1=DDFFFHHHHHJJJJJJIJJJJJJIJJJJJJJHIIJJ +@SRR2057595.38 +GGGGTTACCTGCATGTTTATAGTTTCTAGTTTTGGCATTTTGTGTGGTCTCTTTTTTGTT ++SRR2057595.38 +1=DDFFFHHHHHJJJJJJJJJJHJJIJJJIJJJJJJJJJJJJGIGHJHIJJIJJJJJJJJ +@SRR2057595.42 +TAGGTTGGATAAGTTATACACTGACTAAGTCATCTGTTACTGCCTTCACTGAGTTTTTATTTCCTTT ++SRR2057595.42 +1=DDFFFHHHHHJJJJJJJJJJJJJJJJJIIJJJJGJJJJJJJJJJJJJJJIIHIJJJJJJJIJJJI +@SRR2057595.45 +CTGGTTTGCTGCGGAAGGATCATTA ++SRR2057595.45 +1:DDDDDDDFFDFFAGFEB@ACB9< +@SRR2057595.65 +GCGGTTTCGGAGCTTGCTCCGTCCACTCAACGCATCGACCTGGTATTGCAGTACCTCCAGGAACGGTGCACCAAG ++SRR2057595.65 +1=DFFFFHHHHHJJJJJJHJIHHIIIIIIIJHJBHIHBFHHJI@EHJJHHHHHHHFFFBDE?AEBD=AB@CDBD? +@SRR2057595.67 +AAGGTTGGTTTTTGAGGTCCTGCTCGTGCAGGGT ++SRR2057595.67 +1:BDDDFHFHHHHGFHGFGGIIDGHHIGIJJJJ9 +@SRR2057595.69 +ATTGGTTATTTTGTTTTTCCTCCTTCTCTTTCTAAA ++SRR2057595.69 +CCCFFFFFHHHHHJJJJJJJJJJJJJJJJJJIJIJJ +@SRR2057595.70 +TTAGGTTAATTGTAATTTTATGAGGTCCCATTTGTCAATTCTT ++SRR2057595.70 +@@@DDDDD+2CDFA@FBGHCCHFHGBFHGHIGGDHGHIIFCFF +@SRR2057595.71 +TGCGGTTCATATTAGCATGGCCCCTGCGCAAGGATGACACGCAAATTCGTGAAGCGTTCCATATTT ++SRR2057595.71 +CCCFFFFFHHGHHJJJJJJJJJJIIJJIJIJJIFHJIIIJJJIJJJJJJHIIHHHHFFFDEECEEE +@SRR2057595.73 +TGAGGTTCAGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACTGGCGCTGCGGGA ++SRR2057595.73 +@@@FFFFFHGFFHC@EBHGHGAEGIIHIIIIJJJJGHIIIJIJIIGHIJIJJIGGEFD +@SRR2057595.74 +AATGGTTTCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ++SRR2057595.74 +@CCFFFFFGDFFHFIJJJGGGGJJGDDDDDDDDDDDDDBDDDDDDBBDDDDDDDDDDDDDDDDDDDBBBDDDDBD> +@SRR2057595.77 +GCGGTTCGATCCCACTTCTGAC ++SRR2057595.77 +1=DFFFFHGDHHHIJJJIJJJJ +@SRR2057595.82 +GAGGGTTACTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.82 +CBCFFFFFHHHHHJJJJJJJJJJJJJJJJIJJJJJIJIIJJJH +@SRR2057595.83 +TGGGTTGATCCCGGGGCTACGCCTGTCTGAGCGTCGCT ++SRR2057595.83 +1=DDFFFHHHHHJJIJJJJJJIJJJIJJIGGHIFHGEH +@SRR2057595.86 +ACCGGTTACTTTTTTTAAATGTAAAGCATAAATAAAAAGCCTTTGTGGACTGTGAAAAAAAAAAAAAAAAAAAAAA ++SRR2057595.86 +BCCFFFFFHHHHHJJJJJJJJIIJJJJJJJJJIJJJJJJJJJJJJGIJIIJJIJJJJJJHFDDDDDDDDDDDDDB> +@SRR2057595.88 +TCAGGTTGCCTAAGCATAGATAACCATATATCAGGGGGAGCTCCATGTTCTAGTCCTGCAAGCGCCTGGGCAATAA ++SRR2057595.88 +CCCFFFFFHHHHHJJJJJJIJJJJJIJJJJJJIJJIJJIJJJJJJJJJHIJJJJJJIIIHJIHHHFFDDDDEDDD@ +@SRR2057595.99 +TGAGGTTCATCGCTGCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGT ++SRR2057595.99 +B@CFFFFFFDHHHIHIIIJJIJJJJJIGEHGFHIJJGHIHADHIIJIJJJIJG From da414e72c60758895b16818309d6c147c288dd84 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 29 Jul 2024 09:55:17 +0200 Subject: [PATCH 10/10] Add star solo component (#62) * add star solo component * change arguments from camelCase to snake_case * get rid of multiple_sep * drop star_solo component and just add arguments to star_align_reads * Update src/star/star_align_reads/script.py Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --------- Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- CHANGELOG.md | 10 +- .../star_align_reads/argument_groups.yaml | 1034 +++++++++++++---- src/star/star_align_reads/config.vsh.yaml | 2 + src/star/star_align_reads/script.py | 20 +- src/star/star_align_reads/test.sh | 12 +- .../star_align_reads/utils/process_params.R | 54 +- src/star/star_genome_generate/config.vsh.yaml | 43 +- src/star/star_genome_generate/script.sh | 30 +- src/star/star_genome_generate/test.sh | 6 +- 9 files changed, 903 insertions(+), 308 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 665b587d..c4575cb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,14 @@ # biobox x.x.x -## NEW FEATURES +## BREAKING CHANGES + +* `star/star_align_reads`: Change all arguments from `--camelCase` to `--snake_case` (PR #62). + +* `star/star_genome_generate`: Change all arguments from `--camelCase` to `--snake_case` (PR #62). + +## NEW FUNCTIONALITY + +* `star/star_align_reads`: Add star solo related arguments (PR #62). * `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). diff --git a/src/star/star_align_reads/argument_groups.yaml b/src/star/star_align_reads/argument_groups.yaml index e6a1c874..7c804dd3 100644 --- a/src/star/star_align_reads/argument_groups.yaml +++ b/src/star/star_align_reads/argument_groups.yaml @@ -1,19 +1,23 @@ argument_groups: - name: Run Parameters arguments: - - name: --runRNGseed + - name: --run_rng_seed type: integer description: random number generator seed. + info: + orig_name: --runRNGseed example: 777 - name: Genome Parameters arguments: - - name: --genomeDir + - name: --genome_dir type: file description: path to the directory where genome files are stored (for --runMode alignReads) or will be generated (for --runMode generateGenome) + info: + orig_name: --genomeDir example: ./GenomeDir/ - required: yes - - name: --genomeLoad + required: true + - name: --genome_load type: string description: |- mode of shared memory usage for the genome files. Only used with --runMode alignReads. @@ -23,132 +27,162 @@ argument_groups: - LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs - Remove ... do not map anything, just remove loaded genome from memory - NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + info: + orig_name: --genomeLoad example: NoSharedMemory - - name: --genomeFastaFiles + - name: --genome_fasta_files type: file description: |- path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). - multiple: yes - multiple_sep: ; - - name: --genomeFileSizes + info: + orig_name: --genomeFastaFiles + multiple: true + - name: --genome_file_sizes type: integer description: genome files exact sizes in bytes. Typically, this should not be defined by the user. + info: + orig_name: --genomeFileSizes example: 0 - multiple: yes - multiple_sep: ; - - name: --genomeTransformOutput + multiple: true + - name: --genome_transform_output type: string description: |- which output to transform back to original genome - SAM ... SAM/BAM alignments - SJ ... splice junctions (SJ.out.tab) - - Quant ... quantifications (from --quantMode option) + - Quant ... quantifications (from --quant_mode option) - None ... no transformation of the output - multiple: yes - multiple_sep: ; - - name: --genomeChrSetMitochondrial + info: + orig_name: --genomeTransformOutput + multiple: true + - name: --genome_chr_set_mitochondrial type: string description: names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/ + info: + orig_name: --genomeChrSetMitochondrial example: - chrM - M - MT - multiple: yes - multiple_sep: ; + multiple: true - name: Splice Junctions Database arguments: - - name: --sjdbFileChrStartEnd + - name: --sjdb_file_chr_start_end type: string description: path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated. - multiple: yes - multiple_sep: ; - - name: --sjdbGTFfile + info: + orig_name: --sjdbFileChrStartEnd + multiple: true + - name: --sjdb_gtf_file type: file description: path to the GTF file with annotations - - name: --sjdbGTFchrPrefix + info: + orig_name: --sjdbGTFfile + - name: --sjdb_gtf_chr_prefix type: string description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes) - - name: --sjdbGTFfeatureExon + info: + orig_name: --sjdbGTFchrPrefix + - name: --sjdb_gtf_feature_exon type: string description: feature type in GTF file to be used as exons for building transcripts + info: + orig_name: --sjdbGTFfeatureExon example: exon - - name: --sjdbGTFtagExonParentTranscript + - name: --sjdb_gtf_tag_exon_parent_transcript type: string description: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + info: + orig_name: --sjdbGTFtagExonParentTranscript example: transcript_id - - name: --sjdbGTFtagExonParentGene + - name: --sjdb_gtf_tag_exon_parent_gene type: string description: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + info: + orig_name: --sjdbGTFtagExonParentGene example: gene_id - - name: --sjdbGTFtagExonParentGeneName + - name: --sjdb_gtf_tag_exon_parent_gene_name type: string description: GTF attribute name for parent gene name + info: + orig_name: --sjdbGTFtagExonParentGeneName example: gene_name - multiple: yes - multiple_sep: ; - - name: --sjdbGTFtagExonParentGeneType + multiple: true + - name: --sjdb_gtf_tag_exon_parent_gene_type type: string description: GTF attribute name for parent gene type + info: + orig_name: --sjdbGTFtagExonParentGeneType example: - gene_type - gene_biotype - multiple: yes - multiple_sep: ; - - name: --sjdbOverhang + multiple: true + - name: --sjdb_overhang type: integer description: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + info: + orig_name: --sjdbOverhang example: 100 - - name: --sjdbScore + - name: --sjdb_score type: integer description: extra alignment score for alignments that cross database junctions + info: + orig_name: --sjdbScore example: 2 - - name: --sjdbInsertSave + - name: --sjdb_insert_save type: string description: |- which files to save when sjdb junctions are inserted on the fly at the mapping step - Basic ... only small junction / transcript files - All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + info: + orig_name: --sjdbInsertSave example: Basic - name: Variation parameters arguments: - - name: --varVCFfile + - name: --var_vcf_file type: string description: path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1 + info: + orig_name: --varVCFfile - name: Read Parameters arguments: - - name: --readFilesType + - name: --read_files_type type: string description: |- format of input read files - Fastx ... FASTA or FASTQ - - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view - - SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + - SAM SE ... SAM or BAM single-end reads; for BAM use --read_files_command samtools view + - SAM PE ... SAM or BAM paired-end reads; for BAM use --read_files_command samtools view + info: + orig_name: --readFilesType example: Fastx - - name: --readFilesSAMattrKeep + - name: --read_files_sam_attr_keep type: string description: |- - for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + for --read_files_type SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL - All ... keep all tags - None ... do not keep any tags + info: + orig_name: --readFilesSAMattrKeep example: All - multiple: yes - multiple_sep: ; - - name: --readFilesManifest + multiple: true + - name: --read_files_manifest type: file description: |- path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: @@ -158,45 +192,57 @@ argument_groups: Spaces, but not tabs are allowed in file names. If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. - - name: --readFilesPrefix + info: + orig_name: --readFilesManifest + - name: --read_files_prefix type: string description: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn - - name: --readFilesCommand + info: + orig_name: --readFilesPrefix + - name: --read_files_command type: string description: |- command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. - multiple: yes - multiple_sep: ; - - name: --readMapNumber + info: + orig_name: --readFilesCommand + multiple: true + - name: --read_map_number type: integer description: |- number of reads to map from the beginning of the file -1: map all reads + info: + orig_name: --readMapNumber example: -1 - - name: --readMatesLengthsIn + - name: --read_mates_lengths_in type: string description: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + info: + orig_name: --readMatesLengthsIn example: NotEqual - - name: --readNameSeparator + - name: --read_name_separator type: string description: character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed) + info: + orig_name: --readNameSeparator example: / - multiple: yes - multiple_sep: ; - - name: --readQualityScoreBase + multiple: true + - name: --read_quality_score_base type: integer description: number to be subtracted from the ASCII code to get Phred quality score + info: + orig_name: --readQualityScoreBase example: 33 - name: Read Clipping arguments: - - name: --clipAdapterType + - name: --clip_adapter_type type: string description: |- adapter clipping type @@ -204,129 +250,161 @@ argument_groups: - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal - None ... no adapter clipping, all other clip* parameters are disregarded + info: + orig_name: --clipAdapterType example: Hamming - - name: --clip3pNbases + - name: --clip3p_nbases type: integer description: number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + info: + orig_name: --clip3pNbases example: 0 - multiple: yes - multiple_sep: ; - - name: --clip3pAdapterSeq + multiple: true + - name: --clip3p_adapter_seq type: string description: |- adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. - polyA ... polyA sequence with the length equal to read length - multiple: yes - multiple_sep: ; - - name: --clip3pAdapterMMp + info: + orig_name: --clip3pAdapterSeq + multiple: true + - name: --clip3p_adapter_mm_p type: double description: max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates. + info: + orig_name: --clip3pAdapterMMp example: 0.1 - multiple: yes - multiple_sep: ; - - name: --clip3pAfterAdapterNbases + multiple: true + - name: --clip3p_after_adapter_nbases type: integer description: number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + info: + orig_name: --clip3pAfterAdapterNbases example: 0 - multiple: yes - multiple_sep: ; - - name: --clip5pNbases + multiple: true + - name: --clip5p_nbases type: integer description: number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + info: + orig_name: --clip5pNbases example: 0 - multiple: yes - multiple_sep: ; + multiple: true - name: Limits arguments: - - name: --limitGenomeGenerateRAM + - name: --limit_genome_generate_ram type: long description: maximum available RAM (bytes) for genome generation + info: + orig_name: --limitGenomeGenerateRAM example: '31000000000' - - name: --limitIObufferSize + - name: --limit_io_buffer_size type: long description: max available buffers size (bytes) for input/output, per thread + info: + orig_name: --limitIObufferSize example: - 30000000 - 50000000 - multiple: yes - multiple_sep: ; - - name: --limitOutSAMoneReadBytes + multiple: true + - name: --limit_out_sam_one_read_bytes type: long description: 'max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax' + info: + orig_name: --limitOutSAMoneReadBytes example: 100000 - - name: --limitOutSJoneRead + - name: --limit_out_sj_one_read type: integer description: max number of junctions for one read (including all multi-mappers) + info: + orig_name: --limitOutSJoneRead example: 1000 - - name: --limitOutSJcollapsed + - name: --limit_out_sj_collapsed type: integer description: max number of collapsed junctions + info: + orig_name: --limitOutSJcollapsed example: 1000000 - - name: --limitBAMsortRAM + - name: --limit_bam_sort_ram type: long description: maximum available RAM (bytes) for sorting BAM. If =0, it will be - set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory + set to the genome index size. 0 value can only be used with --genome_load NoSharedMemory option. + info: + orig_name: --limitBAMsortRAM example: 0 - - name: --limitSjdbInsertNsj + - name: --limit_sjdb_insert_nsj type: integer description: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run + info: + orig_name: --limitSjdbInsertNsj example: 1000000 - - name: --limitNreadsSoft + - name: --limit_nreads_soft type: integer description: soft limit on the number of reads + info: + orig_name: --limitNreadsSoft example: -1 - name: 'Output: general' arguments: - - name: --outTmpKeep + - name: --out_tmp_keep type: string description: |- whether to keep the temporary files after STAR runs is finished - None ... remove all temporary files - All ... keep all files - - name: --outStd + info: + orig_name: --outTmpKeep + - name: --out_std type: string description: |- which output will be directed to stdout (standard out) - Log ... log messages - SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out - - BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted - - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate - - BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + - BAM_Unsorted ... alignments in BAM format, unsorted. Requires --out_sam_type BAM Unsorted + - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --out_sam_type BAM SortedByCoordinate + - BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quant_mode TranscriptomeSAM + info: + orig_name: --outStd example: Log - - name: --outReadsUnmapped + - name: --out_reads_unmapped type: string description: |- output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). - None ... no output - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 - - name: --outQSconversionAdd + info: + orig_name: --outReadsUnmapped + - name: --out_qs_conversion_add type: integer description: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + info: + orig_name: --outQSconversionAdd example: 0 - - name: --outMultimapperOrder + - name: --out_multimapper_order type: string description: |- order of multimapping alignments in the output files - Old_2.4 ... quasi-random order used before 2.5.0 - Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + info: + orig_name: --outMultimapperOrder example: Old_2.4 - name: 'Output: SAM and BAM' arguments: - - name: --outSAMtype + - name: --out_sam_type type: string description: |- type of SAM/BAM output @@ -337,11 +415,12 @@ argument_groups: - None ... no SAM/BAM output 2nd, 3rd: - Unsorted ... standard unsorted - - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limit_bam_sort_ram. + info: + orig_name: --outSAMtype example: SAM - multiple: yes - multiple_sep: ; - - name: --outSAMmode + multiple: true + - name: --out_sam_mode type: string description: |- mode of SAM output @@ -349,15 +428,19 @@ argument_groups: - None ... no SAM output - Full ... full SAM output - NoQS ... full SAM but without quality scores + info: + orig_name: --outSAMmode example: Full - - name: --outSAMstrandField + - name: --out_sam_strand_field type: string description: |- Cufflinks-like strand field flag - None ... not used - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. - - name: --outSAMattributes + info: + orig_name: --outSAMstrandField + - name: --out_sam_attributes type: string description: |- a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. @@ -368,27 +451,27 @@ argument_groups: - All ... NH HI AS nM NM MD jM jI MC ch ***Alignment: - NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. - - HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + - HI ... multiple alignment index, starts with --out_sam_attr_ih_start (=1 by default). Standard SAM tag. - AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. - nM ... number of mismatches. For PE reads, sum over two mates. - NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. - MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. - jI ... start and end of introns for all junctions (1-based). - - XS ... alignment strand according to --outSAMstrandField. + - XS ... alignment strand according to --out_sam_strand_field. - MC ... mate's CIGAR string. Standard SAM tag. - - ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + - ch ... marks all segment of all chimeric alingments for --chim_out_type WithinBAM output. - cN ... number of bases clipped from the read ends: 5' and 3' ***Variation: - vA ... variant allele - vG ... genomic coordinate of the variant overlapped by the read. - - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --wasp_output_mode SAMtag. - ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . ***STARsolo: - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. - GX GN ... gene ID and gene name for unique-gene reads. - gx gn ... gene IDs and gene names for unique- and multi-gene reads. - - CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + - CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --out_sam_type BAM SortedByCoordinate. - sM ... assessment of CB and UMI. - sS ... sequence of the entire barcode (CB,UMI,adapter). - sQ ... quality of the entire barcode. @@ -396,15 +479,18 @@ argument_groups: ***Unsupported/undocumented: - rB ... alignment block read/genomic coordinates. - vR ... read coordinate of the variant. + info: + orig_name: --outSAMattributes example: Standard - multiple: yes - multiple_sep: ; - - name: --outSAMattrIHstart + multiple: true + - name: --out_sam_attr_ih_start type: integer description: start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie. + info: + orig_name: --outSAMattrIHstart example: 1 - - name: --outSAMunmapped + - name: --out_sam_unmapped type: string description: |- output of unmapped reads in the SAM format @@ -414,112 +500,141 @@ argument_groups: - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) 2nd word: - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. - multiple: yes - multiple_sep: ; - - name: --outSAMorder + info: + orig_name: --outSAMunmapped + multiple: true + - name: --out_sam_order type: string description: |- type of sorting for the SAM output Paired: one mate after the other for all paired alignments PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + info: + orig_name: --outSAMorder example: Paired - - name: --outSAMprimaryFlag + - name: --out_sam_primary_flag type: string description: |- which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG - OneBestScore ... only one alignment with the best score is primary - AllBestScore ... all alignments with the best score are primary + info: + orig_name: --outSAMprimaryFlag example: OneBestScore - - name: --outSAMreadID + - name: --out_sam_read_id type: string description: |- read ID record type - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end - Number ... read number (index) in the FASTx file + info: + orig_name: --outSAMreadID example: Standard - - name: --outSAMmapqUnique + - name: --out_sam_mapq_unique type: integer description: '0 to 255: the MAPQ value for unique mappers' + info: + orig_name: --outSAMmapqUnique example: 255 - - name: --outSAMflagOR + - name: --out_sam_flag_or type: integer description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.' + info: + orig_name: --outSAMflagOR example: 0 - - name: --outSAMflagAND + - name: --out_sam_flag_and type: integer description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise.' + info: + orig_name: --outSAMflagAND example: 65535 - - name: --outSAMattrRGline + - name: --out_sam_attr_rg_line type: string description: |- - SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --out_sam_attr_rg_line ID:xxx CN:yy "DS:z z z". xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. - --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy - multiple: yes - multiple_sep: ; - - name: --outSAMheaderHD + --out_sam_attr_rg_line ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + info: + orig_name: --outSAMattrRGline + multiple: true + - name: --out_sam_header_hd type: string description: '@HD (header) line of the SAM header' - multiple: yes - multiple_sep: ; - - name: --outSAMheaderPG + info: + orig_name: --outSAMheaderHD + multiple: true + - name: --out_sam_header_pg type: string description: extra @PG (software) line of the SAM header (in addition to STAR) - multiple: yes - multiple_sep: ; - - name: --outSAMheaderCommentFile + info: + orig_name: --outSAMheaderPG + multiple: true + - name: --out_sam_header_comment_file type: string description: path to the file with @CO (comment) lines of the SAM header - - name: --outSAMfilter + info: + orig_name: --outSAMheaderCommentFile + - name: --out_sam_filter type: string description: |- filter the output into main SAM/BAM files - - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. - - KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. - multiple: yes - multiple_sep: ; - - name: --outSAMmultNmax + - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genome_fasta_files at the mapping stage. + - KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genome_fasta_files at the mapping stage. + info: + orig_name: --outSAMfilter + multiple: true + - name: --out_sam_mult_nmax type: integer description: |- max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first - - -1 ... all alignments (up to --outFilterMultimapNmax) will be output + - -1 ... all alignments (up to --out_filter_multimap_nmax) will be output + info: + orig_name: --outSAMmultNmax example: -1 - - name: --outSAMtlen + - name: --out_sam_tlen type: integer description: |- calculation method for the TLEN field in the SAM/BAM files - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + info: + orig_name: --outSAMtlen example: 1 - - name: --outBAMcompression + - name: --out_bam_compression type: integer description: -1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression + info: + orig_name: --outBAMcompression example: 1 - - name: --outBAMsortingThreadN + - name: --out_bam_sorting_thread_n type: integer description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).' + info: + orig_name: --outBAMsortingThreadN example: 0 - - name: --outBAMsortingBinsN + - name: --out_bam_sorting_bins_n type: integer description: '>0: number of genome bins for coordinate-sorting' + info: + orig_name: --outBAMsortingBinsN example: 50 - name: BAM processing arguments: - - name: --bamRemoveDuplicatesType + - name: --bam_remove_duplicates_type type: string description: |- mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only @@ -527,17 +642,21 @@ argument_groups: - - ... no duplicate removal/marking - UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. - - name: --bamRemoveDuplicatesMate2basesN + info: + orig_name: --bamRemoveDuplicatesType + - name: --bam_remove_duplicates_mate2bases_n type: integer description: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) + info: + orig_name: --bamRemoveDuplicatesMate2basesN example: 0 - name: Output Wiggle arguments: - - name: --outWigType + - name: --out_wig_type type: string description: |- - type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --out_sam_type BAM SortedByCoordinate . 1st word: - None ... no signal output @@ -546,85 +665,112 @@ argument_groups: 2nd word: - read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc - read2 ... signal from only 2nd read - multiple: yes - multiple_sep: ; - - name: --outWigStrand + info: + orig_name: --outWigType + multiple: true + - name: --out_wig_strand type: string description: |- strandedness of wiggle/bedGraph output - Stranded ... separate strands, str1 and str2 - Unstranded ... collapsed strands + info: + orig_name: --outWigStrand example: Stranded - - name: --outWigReferencesPrefix + - name: --out_wig_references_prefix type: string description: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references - - name: --outWigNorm + info: + orig_name: --outWigReferencesPrefix + - name: --out_wig_norm type: string description: |- type of normalization for the signal - RPM ... reads per million of mapped reads - None ... no normalization, "raw" counts + info: + orig_name: --outWigNorm example: RPM - name: Output Filtering arguments: - - name: --outFilterType + - name: --out_filter_type type: string description: |- type of filtering - Normal ... standard filtering using only current alignment - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + info: + orig_name: --outFilterType example: Normal - - name: --outFilterMultimapScoreRange + - name: --out_filter_multimap_score_range type: integer description: the score range below the maximum score for multimapping alignments + info: + orig_name: --outFilterMultimapScoreRange example: 1 - - name: --outFilterMultimapNmax + - name: --out_filter_multimap_nmax type: integer description: |- maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + info: + orig_name: --outFilterMultimapNmax example: 10 - - name: --outFilterMismatchNmax + - name: --out_filter_mismatch_nmax type: integer description: alignment will be output only if it has no more mismatches than this value. + info: + orig_name: --outFilterMismatchNmax example: 10 - - name: --outFilterMismatchNoverLmax + - name: --out_filter_mismatch_nover_lmax type: double description: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value. + info: + orig_name: --outFilterMismatchNoverLmax example: 0.3 - - name: --outFilterMismatchNoverReadLmax + - name: --out_filter_mismatch_nover_read_lmax type: double description: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. + info: + orig_name: --outFilterMismatchNoverReadLmax example: 1.0 - - name: --outFilterScoreMin + - name: --out_filter_score_min type: integer description: alignment will be output only if its score is higher than or equal to this value. + info: + orig_name: --outFilterScoreMin example: 0 - - name: --outFilterScoreMinOverLread + - name: --out_filter_score_min_over_lread type: double description: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads) + info: + orig_name: --outFilterScoreMinOverLread example: 0.66 - - name: --outFilterMatchNmin + - name: --out_filter_match_nmin type: integer description: alignment will be output only if the number of matched bases is higher than or equal to this value. + info: + orig_name: --outFilterMatchNmin example: 0 - - name: --outFilterMatchNminOverLread + - name: --out_filter_match_nmin_over_lread type: double description: sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads). + info: + orig_name: --outFilterMatchNminOverLread example: 0.66 - - name: --outFilterIntronMotifs + - name: --out_filter_intron_motifs type: string description: |- filter alignment using their motifs @@ -632,244 +778,316 @@ argument_groups: - None ... no filtering - RemoveNoncanonical ... filter out alignments that contain non-canonical junctions - RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. - - name: --outFilterIntronStrands + info: + orig_name: --outFilterIntronMotifs + - name: --out_filter_intron_strands type: string description: |- filter alignments - RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands - None ... no filtering + info: + orig_name: --outFilterIntronStrands example: RemoveInconsistentStrands - name: Output splice junctions (SJ.out.tab) arguments: - - name: --outSJtype + - name: --out_sj_type type: string description: |- type of splice junction output - Standard ... standard SJ.out.tab output - None ... no splice junction output + info: + orig_name: --outSJtype example: Standard - name: 'Output Filtering: Splice Junctions' arguments: - - name: --outSJfilterReads + - name: --out_sj_filter_reads type: string description: |- which reads to consider for collapsed splice junctions output - All ... all reads, unique- and multi-mappers - Unique ... uniquely mapping reads only + info: + orig_name: --outSJfilterReads example: All - - name: --outSJfilterOverhangMin + - name: --out_sj_filter_overhang_min type: integer description: |- minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif does not apply to annotated junctions + info: + orig_name: --outSJfilterOverhangMin example: - 30 - 12 - 12 - 12 - multiple: yes - multiple_sep: ; - - name: --outSJfilterCountUniqueMin + multiple: true + - name: --out_sj_filter_count_unique_min type: integer description: |- minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions + info: + orig_name: --outSJfilterCountUniqueMin example: - 3 - 1 - 1 - 1 - multiple: yes - multiple_sep: ; - - name: --outSJfilterCountTotalMin + multiple: true + - name: --out_sj_filter_count_total_min type: integer description: |- minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied does not apply to annotated junctions + info: + orig_name: --outSJfilterCountTotalMin example: - 3 - 1 - 1 - 1 - multiple: yes - multiple_sep: ; - - name: --outSJfilterDistToOtherSJmin + multiple: true + - name: --out_sj_filter_dist_to_other_sj_min type: integer description: |- minimum allowed distance to other junctions' donor/acceptor does not apply to annotated junctions + info: + orig_name: --outSJfilterDistToOtherSJmin example: - 10 - 0 - 5 - 10 - multiple: yes - multiple_sep: ; - - name: --outSJfilterIntronMaxVsReadN + multiple: true + - name: --out_sj_filter_intron_max_vs_read_n type: integer description: |- maximum gap allowed for junctions supported by 1,2,3,,,N reads i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax does not apply to annotated junctions + info: + orig_name: --outSJfilterIntronMaxVsReadN example: - 50000 - 100000 - 200000 - multiple: yes - multiple_sep: ; + multiple: true - name: Scoring arguments: - - name: --scoreGap + - name: --score_gap type: integer description: splice junction penalty (independent on intron motif) + info: + orig_name: --scoreGap example: 0 - - name: --scoreGapNoncan + - name: --score_gap_noncan type: integer description: non-canonical junction penalty (in addition to scoreGap) + info: + orig_name: --scoreGapNoncan example: -8 - - name: --scoreGapGCAG + - name: --score_gap_gcag type: integer description: GC/AG and CT/GC junction penalty (in addition to scoreGap) + info: + orig_name: --scoreGapGCAG example: -4 - - name: --scoreGapATAC + - name: --score_gap_atac type: integer description: AT/AC and GT/AT junction penalty (in addition to scoreGap) + info: + orig_name: --scoreGapATAC example: -8 - - name: --scoreGenomicLengthLog2scale + - name: --score_genomic_length_log2scale type: integer description: 'extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)' + info: + orig_name: --scoreGenomicLengthLog2scale example: 0 - - name: --scoreDelOpen + - name: --score_del_open type: integer description: deletion open penalty + info: + orig_name: --scoreDelOpen example: -2 - - name: --scoreDelBase + - name: --score_del_base type: integer description: deletion extension penalty per base (in addition to scoreDelOpen) + info: + orig_name: --scoreDelBase example: -2 - - name: --scoreInsOpen + - name: --score_ins_open type: integer description: insertion open penalty + info: + orig_name: --scoreInsOpen example: -2 - - name: --scoreInsBase + - name: --score_ins_base type: integer description: insertion extension penalty per base (in addition to scoreInsOpen) + info: + orig_name: --scoreInsBase example: -2 - - name: --scoreStitchSJshift + - name: --score_stitch_sj_shift type: integer description: maximum score reduction while searching for SJ boundaries in the stitching step + info: + orig_name: --scoreStitchSJshift example: 1 - name: Alignments and Seeding arguments: - - name: --seedSearchStartLmax + - name: --seed_search_start_lmax type: integer description: defines the search start point through the read - the read is split into pieces no longer than this value + info: + orig_name: --seedSearchStartLmax example: 50 - - name: --seedSearchStartLmaxOverLread + - name: --seed_search_start_lmax_over_lread type: double description: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + info: + orig_name: --seedSearchStartLmaxOverLread example: 1.0 - - name: --seedSearchLmax + - name: --seed_search_lmax type: integer description: defines the maximum length of the seeds, if =0 seed length is not limited + info: + orig_name: --seedSearchLmax example: 0 - - name: --seedMultimapNmax + - name: --seed_multimap_nmax type: integer description: only pieces that map fewer than this value are utilized in the stitching procedure + info: + orig_name: --seedMultimapNmax example: 10000 - - name: --seedPerReadNmax + - name: --seed_per_read_nmax type: integer description: max number of seeds per read + info: + orig_name: --seedPerReadNmax example: 1000 - - name: --seedPerWindowNmax + - name: --seed_per_window_nmax type: integer description: max number of seeds per window + info: + orig_name: --seedPerWindowNmax example: 50 - - name: --seedNoneLociPerWindow + - name: --seed_none_loci_per_window type: integer description: max number of one seed loci per window + info: + orig_name: --seedNoneLociPerWindow example: 10 - - name: --seedSplitMin + - name: --seed_split_min type: integer description: min length of the seed sequences split by Ns or mate gap + info: + orig_name: --seedSplitMin example: 12 - - name: --seedMapMin + - name: --seed_map_min type: integer description: min length of seeds to be mapped + info: + orig_name: --seedMapMin example: 5 - - name: --alignIntronMin + - name: --align_intron_min type: integer description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion + info: + orig_name: --alignIntronMin example: 21 - - name: --alignIntronMax + - name: --align_intron_max type: integer description: maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + info: + orig_name: --alignIntronMax example: 0 - - name: --alignMatesGapMax + - name: --align_mates_gap_max type: integer description: maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + info: + orig_name: --alignMatesGapMax example: 0 - - name: --alignSJoverhangMin + - name: --align_sj_overhang_min type: integer description: minimum overhang (i.e. block size) for spliced alignments + info: + orig_name: --alignSJoverhangMin example: 5 - - name: --alignSJstitchMismatchNmax + - name: --align_sj_stitch_mismatch_nmax type: integer description: |- maximum number of mismatches for stitching of the splice junctions (-1: no limit). (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + info: + orig_name: --alignSJstitchMismatchNmax example: - 0 - -1 - 0 - 0 - multiple: yes - multiple_sep: ; - - name: --alignSJDBoverhangMin + multiple: true + - name: --align_sjdb_overhang_min type: integer description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + info: + orig_name: --alignSJDBoverhangMin example: 3 - - name: --alignSplicedMateMapLmin + - name: --align_spliced_mate_map_lmin type: integer description: minimum mapped length for a read mate that is spliced + info: + orig_name: --alignSplicedMateMapLmin example: 0 - - name: --alignSplicedMateMapLminOverLmate + - name: --align_spliced_mate_map_lmin_over_lmate type: double description: alignSplicedMateMapLmin normalized to mate length + info: + orig_name: --alignSplicedMateMapLminOverLmate example: 0.66 - - name: --alignWindowsPerReadNmax + - name: --align_windows_per_read_nmax type: integer description: max number of windows per read + info: + orig_name: --alignWindowsPerReadNmax example: 10000 - - name: --alignTranscriptsPerWindowNmax + - name: --align_transcripts_per_window_nmax type: integer description: max number of transcripts per window + info: + orig_name: --alignTranscriptsPerWindowNmax example: 100 - - name: --alignTranscriptsPerReadNmax + - name: --align_transcripts_per_read_nmax type: integer description: max number of different alignments per read to consider + info: + orig_name: --alignTranscriptsPerReadNmax example: 10000 - - name: --alignEndsType + - name: --align_ends_type type: string description: |- type of read ends alignment @@ -878,8 +1096,10 @@ argument_groups: - EndToEnd ... force end-to-end read alignment, do not soft-clip - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + info: + orig_name: --alignEndsType example: Local - - name: --alignEndsProtrude + - name: --align_ends_protrude type: string description: |- allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate @@ -888,68 +1108,90 @@ argument_groups: 2nd word: string: - ConcordantPair ... report alignments with non-zero protrusion as concordant pairs - DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + info: + orig_name: --alignEndsProtrude example: 0 ConcordantPair - - name: --alignSoftClipAtReferenceEnds + - name: --align_soft_clip_at_reference_ends type: string description: |- allow the soft-clipping of the alignments past the end of the chromosomes - Yes ... allow - No ... prohibit, useful for compatibility with Cufflinks + info: + orig_name: --alignSoftClipAtReferenceEnds example: 'Yes' - - name: --alignInsertionFlush + - name: --align_insertion_flush type: string description: |- how to flush ambiguous insertion positions - None ... insertions are not flushed - Right ... insertions are flushed to the right + info: + orig_name: --alignInsertionFlush - name: Paired-End reads arguments: - - name: --peOverlapNbasesMin + - name: --pe_overlap_nbases_min type: integer description: minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the "merginf of overlapping mates" algorithm. + info: + orig_name: --peOverlapNbasesMin example: 0 - - name: --peOverlapMMp + - name: --pe_overlap_mm_p type: double description: maximum proportion of mismatched bases in the overlap area + info: + orig_name: --peOverlapMMp example: 0.01 - name: Windows, Anchors, Binning arguments: - - name: --winAnchorMultimapNmax + - name: --win_anchor_multimap_nmax type: integer description: max number of loci anchors are allowed to map to + info: + orig_name: --winAnchorMultimapNmax example: 50 - - name: --winBinNbits + - name: --win_bin_nbits type: integer description: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins. + info: + orig_name: --winBinNbits example: 16 - - name: --winAnchorDistNbins + - name: --win_anchor_dist_nbins type: integer description: max number of bins between two anchors that allows aggregation of anchors into one window + info: + orig_name: --winAnchorDistNbins example: 9 - - name: --winFlankNbins + - name: --win_flank_nbins type: integer description: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window + info: + orig_name: --winFlankNbins example: 4 - - name: --winReadCoverageRelativeMin + - name: --win_read_coverage_relative_min type: double description: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only. + info: + orig_name: --winReadCoverageRelativeMin example: 0.5 - - name: --winReadCoverageBasesMin + - name: --win_read_coverage_bases_min type: integer description: minimum number of bases covered by the seeds in a window , for STARlong algorithm only. + info: + orig_name: --winReadCoverageBasesMin example: 0 - name: Chimeric Alignments arguments: - - name: --chimOutType + - name: --chim_out_type type: string description: |- type of chimeric output @@ -959,83 +1201,109 @@ argument_groups: - WithinBAM ... output into main aligned BAM files (Aligned.*.bam) - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments + info: + orig_name: --chimOutType example: Junctions - multiple: yes - multiple_sep: ; - - name: --chimSegmentMin + multiple: true + - name: --chim_segment_min type: integer description: minimum length of chimeric segment length, if ==0, no chimeric output + info: + orig_name: --chimSegmentMin example: 0 - - name: --chimScoreMin + - name: --chim_score_min type: integer description: minimum total (summed) score of the chimeric segments + info: + orig_name: --chimScoreMin example: 0 - - name: --chimScoreDropMax + - name: --chim_score_drop_max type: integer description: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length + info: + orig_name: --chimScoreDropMax example: 20 - - name: --chimScoreSeparation + - name: --chim_score_separation type: integer description: minimum difference (separation) between the best chimeric score and the next one + info: + orig_name: --chimScoreSeparation example: 10 - - name: --chimScoreJunctionNonGTAG + - name: --chim_score_junction_non_gtag type: integer description: penalty for a non-GT/AG chimeric junction + info: + orig_name: --chimScoreJunctionNonGTAG example: -1 - - name: --chimJunctionOverhangMin + - name: --chim_junction_overhang_min type: integer description: minimum overhang for a chimeric junction + info: + orig_name: --chimJunctionOverhangMin example: 20 - - name: --chimSegmentReadGapMax + - name: --chim_segment_read_gap_max type: integer description: maximum gap in the read sequence between chimeric segments + info: + orig_name: --chimSegmentReadGapMax example: 0 - - name: --chimFilter + - name: --chim_filter type: string description: |- different filters for chimeric alignments - None ... no filtering - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + info: + orig_name: --chimFilter example: banGenomicN - multiple: yes - multiple_sep: ; - - name: --chimMainSegmentMultNmax + multiple: true + - name: --chim_main_segment_mult_nmax type: integer description: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments. + info: + orig_name: --chimMainSegmentMultNmax example: 10 - - name: --chimMultimapNmax + - name: --chim_multimap_nmax type: integer description: |- maximum number of chimeric multi-alignments - 0 ... use the old scheme for chimeric detection which only considered unique alignments + info: + orig_name: --chimMultimapNmax example: 0 - - name: --chimMultimapScoreRange + - name: --chim_multimap_score_range type: integer description: the score range for multi-mapping chimeras below the best chimeric - score. Only works with --chimMultimapNmax > 1 + score. Only works with --chim_multimap_nmax > 1 + info: + orig_name: --chimMultimapScoreRange example: 1 - - name: --chimNonchimScoreDropMin + - name: --chim_nonchim_score_drop_min type: integer description: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value + info: + orig_name: --chimNonchimScoreDropMin example: 20 - - name: --chimOutJunctionFormat + - name: --chim_out_junction_format type: integer description: |- formatting type for the Chimeric.out.junction file - 0 ... no comment lines/headers - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + info: + orig_name: --chimOutJunctionFormat example: 0 - name: Quantification of Annotations arguments: - - name: --quantMode + - name: --quant_mode type: string description: |- types of quantification requested @@ -1043,9 +1311,10 @@ argument_groups: - - ... none - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file - GeneCounts ... count reads per gene - multiple: yes - multiple_sep: ; - - name: --quantTranscriptomeBAMcompression + info: + orig_name: --quantMode + multiple: true + - name: --quant_transcriptome_bam_compression type: integer description: |- -2 to 10 transcriptome BAM compression level @@ -1054,8 +1323,10 @@ argument_groups: - -1 ... default compression (6?) - 0 ... no compression - 10 ... maximum compression + info: + orig_name: --quantTranscriptomeBAMcompression example: 1 - - name: --quantTranscriptomeSAMoutput + - name: --quant_transcriptome_sam_output type: string description: |- alignment filtering for TranscriptomeSAM output @@ -1063,26 +1334,301 @@ argument_groups: - BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM - BanSingleEnd ... prohibit single-end alignments, allow indels and softclips - BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels + info: + orig_name: --quantTranscriptomeSAMoutput example: BanSingleEnd_BanIndels_ExtendSoftclip - name: 2-pass Mapping arguments: - - name: --twopassMode + - name: --twopass_mode type: string description: |- 2-pass mapping mode. - None ... 1-pass mapping - Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly - - name: --twopass1readsN + info: + orig_name: --twopassMode + - name: --twopass1reads_n type: integer description: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step. + info: + orig_name: --twopass1readsN example: -1 - name: WASP parameters arguments: - - name: --waspOutputMode + - name: --wasp_output_mode type: string description: |- WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 . - SAMtag ... add WASP tags to the alignments that pass WASP filtering + info: + orig_name: --waspOutputMode +- name: STARsolo (single cell RNA-seq) parameters + arguments: + - name: --solo_type + type: string + description: |- + type of single-cell RNA-seq + + - CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + - CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + - CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --out_sam_type BAM Unsorted [and/or SortedByCoordinate] + - SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + info: + orig_name: --soloType + multiple: true + - name: --solo_cb_type + type: string + description: |- + cell barcode type + + Sequence: cell barcode is a sequence (standard option) + String: cell barcode is an arbitrary string + info: + orig_name: --soloCBtype + example: Sequence + - name: --solo_cb_whitelist + type: string + description: |- + file(s) with whitelist(s) of cell barcodes. Only --solo_type CB_UMI_Complex allows more than one whitelist file. + + - None ... no whitelist: all cell barcodes are allowed + info: + orig_name: --soloCBwhitelist + multiple: true + - name: --solo_cb_start + type: integer + description: cell barcode start base + info: + orig_name: --soloCBstart + example: 1 + - name: --solo_cb_len + type: integer + description: cell barcode length + info: + orig_name: --soloCBlen + example: 16 + - name: --solo_umi_start + type: integer + description: UMI start base + info: + orig_name: --soloUMIstart + example: 17 + - name: --solo_umi_len + type: integer + description: UMI length + info: + orig_name: --soloUMIlen + example: 10 + - name: --solo_barcode_read_length + type: integer + description: |- + length of the barcode read + + - 1 ... equal to sum of soloCBlen+soloUMIlen + - 0 ... not defined, do not check + info: + orig_name: --soloBarcodeReadLength + example: 1 + - name: --solo_barcode_mate + type: integer + description: |- + identifies which read mate contains the barcode (CB+UMI) sequence + + - 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + - 1 ... barcode sequence is a part of mate 1 + - 2 ... barcode sequence is a part of mate 2 + info: + orig_name: --soloBarcodeMate + example: 0 + - name: --solo_cb_position + type: string + description: |- + position of Cell Barcode(s) on the barcode read. + + Presently only works with --solo_type CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --solo_cb_position 0_0_2_-1 3_1_3_8 + info: + orig_name: --soloCBposition + multiple: true + - name: --solo_umi_position + type: string + description: |- + position of the UMI on the barcode read, same as soloCBposition + + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --solo_cb_position 3_9_3_14 + info: + orig_name: --soloUMIposition + - name: --solo_adapter_sequence + type: string + description: adapter sequence to anchor barcodes. Only one adapter sequence is + allowed. + info: + orig_name: --soloAdapterSequence + - name: --solo_adapter_mismatches_nmax + type: integer + description: maximum number of mismatches allowed in adapter sequence. + info: + orig_name: --soloAdapterMismatchesNmax + example: 1 + - name: --solo_cb_match_wl_type + type: string + description: |- + matching the Cell Barcodes to the WhiteList + + - Exact ... only exact matches allowed + - 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + - 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + - 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes. + - 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + - EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --solo_type CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. + info: + orig_name: --soloCBmatchWLtype + example: 1MM_multi + - name: --solo_input_sam_attr_barcode_seq + type: string + description: |- + when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + + For instance, for 10X CellRanger or STARsolo BAMs, use --solo_input_sam_attr_barcode_seq CR UR . + This parameter is required when running STARsolo with input from SAM. + info: + orig_name: --soloInputSAMattrBarcodeSeq + multiple: true + - name: --solo_input_sam_attr_barcode_qual + type: string + description: |- + when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + + For instance, for 10X CellRanger or STARsolo BAMs, use --solo_input_sam_attr_barcode_qual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + info: + orig_name: --soloInputSAMattrBarcodeQual + multiple: true + - name: --solo_strand + type: string + description: |- + strandedness of the solo libraries: + + - Unstranded ... no strand information + - Forward ... read strand same as the original RNA molecule + - Reverse ... read strand opposite to the original RNA molecule + info: + orig_name: --soloStrand + example: Forward + - name: --solo_features + type: string + description: |- + genomic features for which the UMI counts per Cell Barcode are collected + + - Gene ... genes: reads match the gene transcript + - SJ ... splice junctions: reported in SJ.out.tab + - GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + - GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + - GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + info: + orig_name: --soloFeatures + example: Gene + multiple: true + - name: --solo_multi_mappers + type: string + description: |- + counting method for reads mapping to multiple genes + + - Unique ... count only reads that map to unique genes + - Uniform ... uniformly distribute multi-genic UMIs to all genes + - Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + - PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + - EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + info: + orig_name: --soloMultiMappers + example: Unique + multiple: true + - name: --solo_umi_dedup + type: string + description: |- + type of UMI deduplication (collapsing) algorithm + + - 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + - 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + - 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + - Exact ... only exactly matching UMIs are collapsed. + - NoDedup ... no deduplication of UMIs, count all reads. + - 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + info: + orig_name: --soloUMIdedup + example: 1MM_All + multiple: true + - name: --solo_umi_filtering + type: string + description: |- + type of UMI filtering (for reads uniquely mapping to genes) + + - - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + - MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + - MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. + - MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 . + Only works with --solo_umi_dedup 1MM_CR + info: + orig_name: --soloUMIfiltering + multiple: true + - name: --solo_out_file_names + type: string + description: |- + file names for STARsolo output: + + file_name_prefix gene_names barcode_sequences cell_feature_count_matrix + info: + orig_name: --soloOutFileNames + example: + - Solo.out/ + - features.tsv + - barcodes.tsv + - matrix.mtx + multiple: true + - name: --solo_cell_filter + type: string + description: |- + cell filtering type and parameters + + - None ... do not output filtered cells + - TopCells ... only report top cells by UMI count, followed by the exact number of cells + - CellRanger2.2 ... simple filtering of CellRanger 2.2. + Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count + The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 + - EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN + The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 + info: + orig_name: --soloCellFilter + example: + - CellRanger2.2 + - '3000' + - '0.99' + - '10' + multiple: true + - name: --solo_out_format_features_gene_field3 + type: string + description: field 3 in the Gene features.tsv file. If "-", then no 3rd field + is output. + info: + orig_name: --soloOutFormatFeaturesGeneField3 + example: Gene Expression + multiple: true + - name: --solo_cell_read_stats + type: string + description: |- + Output reads statistics for each CB + + - Standard ... standard output + info: + orig_name: --soloCellReadStats diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml index bdc956d3..a9a845a1 100644 --- a/src/star/star_align_reads/config.vsh.yaml +++ b/src/star/star_align_reads/config.vsh.yaml @@ -118,6 +118,8 @@ engines: rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ apt-get --purge autoremove -y ${PACKAGES} && \ apt-get clean + - type: python + packages: [ pyyaml ] - type: docker run: | STAR --version | sed 's#\(.*\)#star: "\1"#' > /var/software_versions.txt diff --git a/src/star/star_align_reads/script.py b/src/star/star_align_reads/script.py index f3d64a57..4d9d046f 100644 --- a/src/star/star_align_reads/script.py +++ b/src/star/star_align_reads/script.py @@ -2,6 +2,7 @@ import subprocess import shutil from pathlib import Path +import yaml ## VIASH START par = { @@ -18,10 +19,20 @@ } meta = { "cpus": 8, - "temp_dir": "/tmp" + "temp_dir": "/tmp", + "config": "target/executable/star/star_align_reads/.config.vsh.yaml", } ## VIASH END +# read config +with open(meta["config"], 'r') as stream: + config = yaml.safe_load(stream) +all_arguments = { + arg["name"].lstrip('-'): arg + for argument_group in config["argument_groups"] + for arg in argument_group["arguments"] +} + ################################################## # check and process SE / PE R1 input files input_r1 = par["input"] @@ -87,8 +98,13 @@ cmd_args = [ "STAR" ] for name, value in par.items(): if value is not None: + if name in all_arguments: + arg_info = all_arguments[name].get("info", {}) + cli_name = arg_info.get("orig_name", f"--{name}") + else: + cli_name = f"--{name}" val_to_add = value if isinstance(value, list) else [value] - cmd_args.extend([f"--{name}"] + [str(x) for x in val_to_add]) + cmd_args.extend([cli_name] + [str(x) for x in val_to_add]) print("", flush=True) # run command diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh index bd78094d..46566ec0 100644 --- a/src/star/star_align_reads/test.sh +++ b/src/star/star_align_reads/test.sh @@ -88,14 +88,14 @@ cd star_align_reads_se echo "> Run star_align_reads on SE" "$meta_executable" \ --input "../reads_R1.fastq" \ - --genomeDir "../index/" \ + --genome_dir "../index/" \ --aligned_reads "output.sam" \ --log "log.txt" \ - --outReadsUnmapped "Fastx" \ + --out_reads_unmapped "Fastx" \ --unmapped "unmapped.sam" \ - --quantMode "TranscriptomeSAM;GeneCounts" \ + --quant_mode "TranscriptomeSAM;GeneCounts" \ --reads_per_gene "reads_per_gene.tsv" \ - --outSJtype Standard \ + --out_sj_type Standard \ --splice_junctions "splice_junctions.tsv" \ --reads_aligned_to_transcriptome "transcriptome_aligned.bam" \ ${meta_cpus:+---cpus $meta_cpus} @@ -143,10 +143,10 @@ echo ">> Run star_align_reads on PE" "$meta_executable" \ --input ../reads_R1.fastq \ --input_r2 ../reads_R2.fastq \ - --genomeDir ../index/ \ + --genome_dir ../index/ \ --aligned_reads output.bam \ --log log.txt \ - --outReadsUnmapped Fastx \ + --out_reads_unmapped Fastx \ --unmapped unmapped_r1.bam \ --unmapped_r2 unmapped_r2.bam \ ${meta_cpus:+---cpus $meta_cpus} diff --git a/src/star/star_align_reads/utils/process_params.R b/src/star/star_align_reads/utils/process_params.R index ccdc50b3..eee1db65 100644 --- a/src/star/star_align_reads/utils/process_params.R +++ b/src/star/star_align_reads/utils/process_params.R @@ -14,6 +14,14 @@ param_txt <- iconv(param_txt, "UTF-8", "ASCII//TRANSLIT") dev_begin <- grep("#####UnderDevelopment_begin", param_txt) dev_end <- grep("#####UnderDevelopment_end", param_txt) +camel_case_to_snake_case <- function(x) { + x %>% + str_replace_all("([A-Z][A-Z][A-Z]*)", "_\\1_") %>% + str_replace_all("([a-z])([A-Z])", "\\1_\\2") %>% + str_to_lower() %>% + str_replace_all("_$", "") +} + # strip development sections nondev_ix <- unlist(map2(c(1, dev_end + 1), c(dev_begin - 1, length(param_txt)), function(i, j) { if (i >= 1 && i < j) { @@ -128,9 +136,8 @@ out2 <- out %>% # remove arguments that are related to a different runmode filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) %>% filter(!grepl("--runMode", group_name) | grepl("--runMode alignReads", group_name)) %>% - filter(!grepl("STARsolo", group_name)) %>% mutate( - viash_arg = paste0("--", name), + viash_arg = paste0("--", camel_case_to_snake_case(name)), type_step1 = type %>% str_replace_all(".*(int, string|string|int|real|double)\\(?(s?).*", "\\1\\2"), viash_type = type_map[gsub("(int, string|string|int|real|double).*", "\\1", type_step1)], @@ -155,28 +162,41 @@ out2 <- out %>% group_name = gsub(" - .*", "", group_name), required = ifelse(name %in% required_args, TRUE, NA) ) -print(out2, n = 200) -out2 %>% mutate(i = row_number()) %>% - # filter(is.na(default_step1) != is.na(viash_default)) %>% - select(-group_name, -description) -out2 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) +# change references to argument names +out3 <- out2 +for (i in seq_len(nrow(out2))) { + orig_name <- paste0("--", out2$name[[i]]) + new_name <- out2$viash_arg[[i]] + out3$description <- str_replace_all(out3$description, orig_name, new_name) +} + +# sanity checks +out3 %>% select(name, viash_arg) %>% as.data.frame() +print(out3, n = 200) +out3 %>% + mutate(i = row_number()) %>% + select(-group_name, -description) +out3 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) -argument_groups <- map(unique(out2$group_name), function(group_name) { - args <- out2 %>% +# create argument groups +argument_groups <- map(unique(out3$group_name), function(group_name) { + args <- out3 %>% filter(group_name == !!group_name) %>% - pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, ...) { - li <- lst( + pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, name, ...) { + li <- list( name = viash_arg, type = viash_type, - description = description + description = description, + info = list( + orig_name = paste0("--", name) + ) ) if (all(!is.na(viash_default))) { li$example <- viash_default } if (!is.na(multiple) && multiple) { li$multiple <- multiple - li$multiple_sep <- ";" } if (!is.na(required) && required) { li$required <- required @@ -186,4 +206,10 @@ argument_groups <- map(unique(out2$group_name), function(group_name) { list(name = group_name, arguments = args) }) -yaml::write_yaml(list(argument_groups = argument_groups), yaml_file) +yaml::write_yaml( + list(argument_groups = argument_groups), + yaml_file, + handlers = list( + logical = yaml::verbatim_logical + ) +) diff --git a/src/star/star_genome_generate/config.vsh.yaml b/src/star/star_genome_generate/config.vsh.yaml index 60fa3839..71c58826 100644 --- a/src/star/star_genome_generate/config.vsh.yaml +++ b/src/star/star_genome_generate/config.vsh.yaml @@ -17,71 +17,68 @@ authors: argument_groups: - name: "Input" arguments: - - name: "--genomeFastaFiles" + - name: "--genome_fasta_files" type: file description: | Path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. required: true - multiple: yes - multiple_sep: ; - - name: "--sjdbGTFfile" + multiple: true + - name: "--sjdb_gtf_file" type: file description: Path to the GTF file with annotations - - name: --sjdbOverhang + - name: --sjdb_overhang type: integer description: Length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) example: 100 - - name: --sjdbGTFchrPrefix + - name: --sjdb_gtf_chr_prefix type: string description: Prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes) - - name: --sjdbGTFfeatureExon + - name: --sjdb_gtf_feature_exon type: string description: Feature type in GTF file to be used as exons for building transcripts example: exon - - name: --sjdbGTFtagExonParentTranscript + - name: --sjdb_gtf_tag_exon_parent_transcript type: string description: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) example: transcript_id - - name: --sjdbGTFtagExonParentGene + - name: --sjdb_gtf_tag_exon_parent_gene type: string description: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) example: gene_id - - name: --sjdbGTFtagExonParentGeneName + - name: --sjdb_gtf_tag_exon_parent_gene_name type: string description: GTF attribute name for parent gene name example: gene_name - multiple: yes - multiple_sep: ; - - name: --sjdbGTFtagExonParentGeneType + multiple: true + - name: --sjdb_gtf_tag_exon_parent_gene_type type: string description: GTF attribute name for parent gene type example: - gene_type - gene_biotype - multiple: yes - multiple_sep: ; - - name: --limitGenomeGenerateRAM + multiple: true + - name: --limit_genome_generate_ram type: long description: Maximum available RAM (bytes) for genome generation - example: '31000000000' - - name: --genomeSAindexNbases + example: 31000000000 + - name: --genome_sa_index_nbases type: integer description: Length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, this parameter must be scaled down to min(14, log2(GenomeLength)/2 - 1). example: 14 - - name: --genomeChrBinNbits + - name: --genome_chr_bin_nbits type: integer description: Defined as log2(chrBin), where chrBin is the size of the bins for genome storage. Each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). example: 18 - - name: --genomeSAsparseD + - name: --genome_sa_sparse_d type: integer min: 0 example: 1 description: Suffux array sparsity, i.e. distance between indices. Use bigger numbers to decrease needed RAM at the cost of mapping speed reduction. - - name: --genomeSuffixLengthMax + - name: --genome_suffix_length_max type: integer description: Maximum length of the suffixes, has to be longer than read length. Use -1 for infinite length. example: -1 - - name: --genomeTransformType + - name: --genome_transform_type type: string description: | Type of genome transformation @@ -89,7 +86,7 @@ argument_groups: Haploid ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele) Diploid ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome) example: None - - name: --genomeTransformVCF + - name: --genome_transform_vcf type: file description: path to VCF file for genome transformation diff --git a/src/star/star_genome_generate/script.sh b/src/star/star_genome_generate/script.sh index cb3b906c..fc232672 100644 --- a/src/star/star_genome_generate/script.sh +++ b/src/star/star_genome_generate/script.sh @@ -10,20 +10,20 @@ mkdir -p $par_index STAR \ --runMode genomeGenerate \ --genomeDir $par_index \ - --genomeFastaFiles $par_genomeFastaFiles \ + --genomeFastaFiles $par_genome_fasta_files \ ${meta_cpus:+--runThreadN "${meta_cpus}"} \ - ${par_sjdbGTFfile:+--sjdbGTFfile "${par_sjdbGTFfile}"} \ + ${par_sjdb_gtf_file:+--sjdbGTFfile "${par_sjdb_gtf_file}"} \ ${par_sjdbOverhang:+--sjdbOverhang "${par_sjdbOverhang}"} \ - ${par_genomeSAindexNbases:+--genomeSAindexNbases "${par_genomeSAindexNbases}"} \ - ${par_sjdbGTFchrPrefix:+--sjdbGTFchrPrefix "${par_sjdbGTFchrPrefix}"} \ - ${par_sjdbGTFfeatureExon:+--sjdbGTFfeatureExon "${par_sjdbGTFfeatureExon}"} \ - ${par_sjdbGTFtagExonParentTranscript:+--sjdbGTFtagExonParentTranscript "${par_sjdbGTFtagExonParentTranscript}"} \ - ${par_sjdbGTFtagExonParentGene:+--sjdbGTFtagExonParentGene "${par_sjdbGTFtagExonParentGene}"} \ - ${par_sjdbGTFtagExonParentGeneName:+--sjdbGTFtagExonParentGeneName "${par_sjdbGTFtagExonParentGeneName}"} \ - ${par_sjdbGTFtagExonParentGeneType:+--sjdbGTFtagExonParentGeneType "${sjdbGTFtagExonParentGeneType}"} \ - ${par_limitGenomeGenerateRAM:+--limitGenomeGenerateRAM "${par_limitGenomeGenerateRAM}"} \ - ${par_genomeChrBinNbits:+--genomeChrBinNbits "${par_genomeChrBinNbits}"} \ - ${par_genomeSAsparseD:+--genomeSAsparseD "${par_genomeSAsparseD}"} \ - ${par_genomeSuffixLengthMax:+--genomeSuffixLengthMax "${par_genomeSuffixLengthMax}"} \ - ${par_genomeTransformType:+--genomeTransformType "${par_genomeTransformType}"} \ - ${par_genomeTransformVCF:+--genomeTransformVCF "${par_genomeTransformVCF}"} \ + ${par_genome_sa_index_nbases:+--genomeSAindexNbases "${par_genome_sa_index_nbases}"} \ + ${par_sjdb_gtf_chr_prefix:+--sjdbGTFchrPrefix "${par_sjdb_gtf_chr_prefix}"} \ + ${par_sjdb_gtf_feature_exon:+--sjdbGTFfeatureExon "${par_sjdb_gtf_feature_exon}"} \ + ${par_sjdb_gtf_tag_exon_parent_transcript:+--sjdbGTFtag_exon_parent_transcript "${par_sjdb_gtf_tag_exon_parent_transcript}"} \ + ${par_sjdb_gtf_tag_exon_parent_gene:+--sjdbGTFtag_exon_parent_gene "${par_sjdb_gtf_tag_exon_parent_gene}"} \ + ${par_sjdb_gtf_tag_exon_parent_geneName:+--sjdbGTFtag_exon_parent_geneName "${par_sjdb_gtf_tag_exon_parent_geneName}"} \ + ${par_sjdb_gtf_tag_exon_parent_geneType:+--sjdbGTFtag_exon_parent_geneType "${sjdbGTFtag_exon_parent_geneType}"} \ + ${par_limit_genome_generate_ram:+--limitGenomeGenerateRAM "${par_limit_genome_generate_ram}"} \ + ${par_genome_chr_bin_nbits:+--genomeChrBinNbits "${par_genome_chr_bin_nbits}"} \ + ${par_genome_sa_sparse_d:+--genomeSAsparseD "${par_genome_sa_sparse_d}"} \ + ${par_genome_suffix_length_max:+--genomeSuffixLengthMax "${par_genome_suffix_length_max}"} \ + ${par_genome_transform_type:+--genomeTransformType "${par_genome_transform_type}"} \ + ${par_genome_transform_vcf:+--genomeTransformVCF "${par_genome_transform_vCF}"} \ diff --git a/src/star/star_genome_generate/test.sh b/src/star/star_genome_generate/test.sh index fd0e4775..681f3494 100644 --- a/src/star/star_genome_generate/test.sh +++ b/src/star/star_genome_generate/test.sh @@ -27,9 +27,9 @@ echo "> Generate index" "$meta_executable" \ ${meta_cpus:+---cpus $meta_cpus} \ --index "star_index/" \ - --genomeFastaFiles "genome.fasta" \ - --sjdbGTFfile "genes.gtf" \ - --genomeSAindexNbases 2 + --genome_fasta_files "genome.fasta" \ + --sjdb_gtf_file "genes.gtf" \ + --genome_sa_index_nbases 4 files=("Genome" "Log.out" "SA" "SAindex" "chrLength.txt" "chrName.txt" "chrNameLength.txt" "chrStart.txt" "exonGeTrInfo.tab" "exonInfo.tab" "geneInfo.tab" "genomeParameters.txt" "sjdbInfo.txt" "sjdbList.fromGTF.out.tab" "sjdbList.out.tab" "transcriptInfo.tab")