From 7d99065ecf66e6bc42b03f8ffcfcfc95ef2d2b72 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 17:46:44 +0200 Subject: [PATCH 1/7] `bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (#75) * `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline * add missing metadata * remove unicode * trigger * process comments * add authors * Apply suggestions from code review Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --------- Co-authored-by: Dorien <41797896+dorien-er@users.noreply.github.com> --- CHANGELOG.md | 6 + src/_authors/robrecht_cannoodt.yaml | 14 ++ src/_authors/weiwei_schultz.yaml | 5 + .../config.vsh.yaml | 143 ++++++++++++++++ .../bd_rhapsody_make_reference/help.txt | 66 +++++++ .../make_rhap_reference_2.2.1_nodocker.cwl | 115 +++++++++++++ .../bd_rhapsody_make_reference/script.py | 161 ++++++++++++++++++ .../bd_rhapsody_make_reference/test.sh | 68 ++++++++ .../test_data/reference_small.fa | 27 +++ .../test_data/reference_small.gtf | 8 + .../test_data/script.sh | 47 +++++ 11 files changed, 660 insertions(+) create mode 100644 src/_authors/robrecht_cannoodt.yaml create mode 100644 src/_authors/weiwei_schultz.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/help.txt create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/script.py create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test.sh create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf create mode 100644 src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 80b8b9f3..9cfacdbc 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -1,5 +1,11 @@ # biobox x.x.x +## NEW FEATURES + +* `bd_rhapsody`: + + - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). + ## BUG FIXES * `pear`: fix component not exiting with the correct exitcode when PEAR fails. diff --git a/src/_authors/robrecht_cannoodt.yaml b/src/_authors/robrecht_cannoodt.yaml new file mode 100644 index 00000000..d7c0f283 --- /dev/null +++ b/src/_authors/robrecht_cannoodt.yaml @@ -0,0 +1,14 @@ +name: Robrecht Cannoodt +info: + links: + email: robrecht@data-intuitive.com + github: rcannood + orcid: "0000-0003-3641-729X" + linkedin: robrechtcannoodt + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Science Engineer + - name: Open Problems + href: https://openproblems.bio + role: Core Member \ No newline at end of file diff --git a/src/_authors/weiwei_schultz.yaml b/src/_authors/weiwei_schultz.yaml new file mode 100644 index 00000000..324f9378 --- /dev/null +++ b/src/_authors/weiwei_schultz.yaml @@ -0,0 +1,5 @@ +name: Weiwei Schultz +info: + organizations: + - name: Janssen R&D US + role: Associate Director Data Sciences \ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml new file mode 100644 index 00000000..e596bf06 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml @@ -0,0 +1,143 @@ +name: bd_rhapsody_make_reference +namespace: bd_rhapsody +description: | + The Reference Files Generator creates an archive containing Genome Index + and Transcriptome annotation files needed for the BD Rhapsody Sequencing + Analysis Pipeline. The app takes as input one or more FASTA and GTF files + and produces a compressed archive in the form of a tar.gz file. 
The + archive contains: + + - STAR index + - Filtered GTF file +keywords: [genome, reference, index, align] +links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1/Extra_Utilities/ + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com/resources/extra_utilities.html#make-rhapsody-reference +license: Unknown +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --genome_fasta + required: true + description: Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + example: genome_sequence.fa.gz + multiple: true + info: + config_key: Genome_fasta + - type: file + name: --gtf + required: true + description: | + File path to the transcript annotation files in GTF or GTF.GZ format. The Sequence Analysis Pipeline requires the 'gene_name' or + 'gene_id' attribute to be set on each gene and exon feature. Gene and exon feature lines must have the same attribute, and exons + must have a corresponding gene with the same value. For TCR/BCR assays, the TCR or BCR gene segments must have the 'gene_type' or + 'gene_biotype' attribute set, and the value should begin with 'TR' or 'IG', respectively. + example: transcriptome_annotation.gtf.gz + multiple: true + info: + config_key: Gtf + - type: file + name: --extra_sequences + description: | + File path to additional sequences in FASTA format to use when building the STAR index. (e.g. transgenes or CRISPR guide barcodes). + GTF lines for these sequences will be automatically generated and combined with the main GTF. 
+ required: false + multiple: true + info: + config_key: Extra_sequences + - name: Outputs + arguments: + - type: file + name: --reference_archive + direction: output + required: true + description: | + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an + input in the BD Rhapsody Sequencing Analysis Pipeline. + example: star_index.tar.gz + - name: Arguments + arguments: + - type: string + name: --mitochondrial_contigs + description: | + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are + identified as 'nuclear fragments' in the ATACseq analysis pipeline. + required: false + multiple: true + default: [chrM, chrMT, M, MT] + info: + config_key: Mitochondrial_Contigs # must equal the CWL input id (case-sensitive) + - type: boolean_true + name: --filtering_off + description: | + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features + having the following attribute values are kept: + + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + info: + config_key: Filtering_off + - type: boolean_true + name: --wta_only_index + description: Build a WTA only index, otherwise builds a WTA + ATAC index. + info: + config_key: WTA_Only # must equal the CWL input id (case-sensitive) + - type: string + name: --extra_star_params + description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. 
+ example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + required: false + info: + config_key: Extra_STAR_params + +resources: + - type: python_script + path: script.py + - path: make_rhap_reference_2.2.1_nodocker.cwl + +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +requirements: + commands: [ "cwl-runner" ] + +engines: + - type: docker + image: bdgenomics/rhapsody:2.2.1 + setup: + - type: apt + packages: [procps] + - type: python + packages: [cwlref-runner, cwl-runner] + - type: docker + run: | + echo "bdgenomics/rhapsody: 2.2.1" > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt new file mode 100644 index 00000000..cd038b25 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt @@ -0,0 +1,66 @@ +```bash +cwl-runner src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl --help +``` + +usage: src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl + [-h] [--Archive_prefix ARCHIVE_PREFIX] + [--Extra_STAR_params EXTRA_STAR_PARAMS] + [--Extra_sequences EXTRA_SEQUENCES] [--Filtering_off] --Genome_fasta + GENOME_FASTA --Gtf GTF [--Maximum_threads MAXIMUM_THREADS] + [--Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS] [--WTA_Only] + [job_order] + +The Reference Files Generator creates an archive containing Genome Index and +Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing +Analysis Pipeline. The app takes as input one or more FASTA and GTF files and +produces a compressed archive in the form of a tar.gz file. 
The archive +contains:\n - STAR index\n - Filtered GTF file + +positional arguments: + job_order Job input json file + +options: + -h, --help show this help message and exit + --Archive_prefix ARCHIVE_PREFIX + A prefix for naming the compressed archive file + containing the Reference genome index and annotation + files. The default value is constructed based on the + input Reference files. + --Extra_STAR_params EXTRA_STAR_PARAMS + Additional parameters to pass to STAR when building + the genome index. Specify exactly like how you would + on the command line. Example: --limitGenomeGenerateRAM + 48000 --genomeSAindexNbases 11 + --Extra_sequences EXTRA_SEQUENCES + Additional sequences in FASTA format to use when + building the STAR index. (E.g. phiX genome) + --Filtering_off By default the input Transcript Annotation files are + filtered based on the gene_type/gene_biotype + attribute. Only features having the following + attribute values are are kept: - protein_coding - + lncRNA (lincRNA and antisense for Gencode < + v31/M22/Ensembl97) - IG_LV_gene - IG_V_gene - + IG_V_pseudogene - IG_D_gene - IG_J_gene - + IG_J_pseudogene - IG_C_gene - IG_C_pseudogene - + TR_V_gene - TR_V_pseudogene - TR_D_gene - TR_J_gene - + TR_J_pseudogene - TR_C_gene If you have already pre- + filtered the input Annotation files and/or wish to + turn-off the filtering, please set this option to + True. + --Genome_fasta GENOME_FASTA + Reference genome file in FASTA format. The BD + Rhapsody™ Sequencing Analysis Pipeline uses GRCh38 + for Human and GRCm39 for Mouse. + --Gtf GTF Transcript annotation files in GTF format. The BD + Rhapsody™ Sequencing Analysis Pipeline uses Gencode + v42 for Human and M31 for Mouse. + --Maximum_threads MAXIMUM_THREADS + The maximum number of threads to use in the pipeline. + By default, all available cores are used. + --Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS + Names of the Mitochondrial contigs in the provided + Reference Genome. 
Fragments originating from contigs + other than these are identified as 'nuclear fragments' + in the ATACseq analysis pipeline. + --WTA_Only Build a WTA only index, otherwise builds a WTA + ATAC + index. diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. phiX genome) + inputBinding: + prefix: --extra-sequences + shellQuote: false + Mitochondrial_Contigs: + type: string[]? 
+ default: ["chrM", "chrMT", "M", "MT"] + label: Mitochondrial Contig Names + doc: |- + Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline. + inputBinding: + prefix: --mitochondrial-contigs + shellQuote: false + Filtering_off: + type: boolean? + label: Turn off filtering + doc: |- + By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are are kept: + - protein_coding + - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97) + - IG_LV_gene + - IG_V_gene + - IG_V_pseudogene + - IG_D_gene + - IG_J_gene + - IG_J_pseudogene + - IG_C_gene + - IG_C_pseudogene + - TR_V_gene + - TR_V_pseudogene + - TR_D_gene + - TR_J_gene + - TR_J_pseudogene + - TR_C_gene + If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True. + inputBinding: + prefix: --filtering-off + shellQuote: false + WTA_Only: + type: boolean? + label: WTA only index + doc: Build a WTA only index, otherwise builds a WTA + ATAC index. + inputBinding: + prefix: --wta-only-index + shellQuote: false + Archive_prefix: + type: string? + label: Archive Prefix + doc: |- + A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files. + inputBinding: + prefix: --archive-prefix + shellQuote: false + Extra_STAR_params: + type: string? + label: Extra STAR Params + doc: |- + Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line. + Example: + --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11 + inputBinding: + prefix: --extra-star-params + shellQuote: true + + Maximum_threads: + type: int? 
+ label: Maximum Number of Threads + doc: |- + The maximum number of threads to use in the pipeline. By default, all available cores are used. + inputBinding: + prefix: --maximum-threads + shellQuote: false + +outputs: + + Archive: + type: File + doc: |- + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. + id: Reference_Archive + label: Reference Files Archive + outputBinding: + glob: '*.tar.gz' + diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py new file mode 100644 index 00000000..ca635508 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py @@ -0,0 +1,161 @@ +import os +import re +import subprocess +import tempfile +from typing import Any +import yaml +import shutil + +## VIASH START +par = { + "genome_fasta": [], + "gtf": [], + "extra_sequences": [], + "mitochondrial_contigs": ["chrM", "chrMT", "M", "MT"], + "filtering_off": False, + "wta_only_index": False, + "extra_star_params": None, + "reference_archive": "output.tar.gz", +} +meta = { + "config": "target/nextflow/reference/build_bdrhap_2_reference/.config.vsh.yaml", + "resources_dir": os.path.abspath("src/reference/build_bdrhap_2_reference"), + "temp_dir": os.getenv("VIASH_TEMP"), + "memory_mb": None, + "cpus": None +} +## VIASH END + +def clean_arg(argument): + argument["clean_name"] = re.sub("^-*", "", argument["name"]) + return argument + +def read_config(path: str) -> dict[str, Any]: + with open(path, "r") as f: + config = yaml.safe_load(f) + + config["all_arguments"] = [ + clean_arg(arg) + for grp in config["argument_groups"] + for arg in grp["arguments"] + ] + + return config + +def strip_margin(text: str) -> str: + return re.sub("(\n?)[ \t]*\|", "\\1", text) + +def process_params(par: dict[str, Any], config) -> str: + # check input parameters + assert par["genome_fasta"], "Pass at 
least one set of inputs to --genome_fasta." + assert par["gtf"], "Pass at least one set of inputs to --gtf." + assert par["reference_archive"].endswith(".tar.gz"), "Output reference_archive must end with .tar.gz." + + # make paths absolute + for argument in config["all_arguments"]: + if par[argument["clean_name"]] and argument["type"] == "file": + if isinstance(par[argument["clean_name"]], list): + par[argument["clean_name"]] = [ os.path.abspath(f) for f in par[argument["clean_name"]] ] + else: + par[argument["clean_name"]] = os.path.abspath(par[argument["clean_name"]]) + + return par + +def generate_config(par: dict[str, Any], meta, config) -> str: + content_list = [strip_margin(f"""\ + |#!/usr/bin/env cwl-runner + | + |""")] + + + config_key_value_pairs = [] + for argument in config["all_arguments"]: + config_key = (argument.get("info") or {}).get("config_key") + arg_type = argument["type"] + par_value = par[argument["clean_name"]] + if par_value and config_key: + config_key_value_pairs.append((config_key, arg_type, par_value)) + + if meta["cpus"]: + config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"])) + + # print(config_key_value_pairs) + + for config_key, arg_type, par_value in config_key_value_pairs: + if arg_type == "file": + str = strip_margin(f"""\ + |{config_key}: + |""") + if isinstance(par_value, list): + for file in par_value: + str += strip_margin(f"""\ + | - class: File + | location: "{file}" + |""") + else: + str += strip_margin(f"""\ + | class: File + | location: "{par_value}" + |""") + content_list.append(str) + else: + content_list.append(strip_margin(f"""\ + |{config_key}: {par_value} + |""")) + + ## Write config to file + return "".join(content_list) + +def get_cwl_file(meta: dict[str, Any]) -> str: + # create cwl file (if need be) + cwl_file=os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl") + + return cwl_file + +def main(par: dict[str, Any], meta: dict[str, Any]): + config = 
read_config(meta["config"]) + + # Preprocess params + par = process_params(par, config) + + # fetch cwl file + cwl_file = get_cwl_file(meta) + + # Create output dir if not exists + outdir = os.path.dirname(par["reference_archive"]) + if not os.path.exists(outdir): + os.makedirs(outdir) + + ## Run pipeline + with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"]) as temp_dir: + # Create params file + config_file = os.path.join(temp_dir, "config.yml") + config_content = generate_config(par, meta, config) + with open(config_file, "w") as f: + f.write(config_content) + + + cmd = [ + "cwl-runner", + "--no-container", + "--preserve-entire-environment", + "--outdir", + temp_dir, + cwl_file, + config_file + ] + + env = dict(os.environ) + env["TMPDIR"] = temp_dir + + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call( + cmd, + cwd=os.path.dirname(config_file), + env=env + ) + + shutil.move(os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"]) + +if __name__ == "__main__": + main(par, meta) diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh new file mode 100644 index 00000000..3637160a --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -e + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + # () will execute in a shubshell, could you use {;}? + [ ! 
-s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + # [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + # grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + # use if/then: an 'A && B' list returns non-zero when grep finds nothing, which would abort the script under 'set -e' + if grep -q "$2" "$1"; then echo "File '$1' contains '$2' but shouldn't"; exit 1; fi +} + +in_fa="$meta_resources_dir/test_data/reference_small.fa" +in_gtf="$meta_resources_dir/test_data/reference_small.gtf" + +echo "#############################################" +echo "> Simple run" + +mkdir simple_run +cd simple_run + +out_tar="myreference.tar.gz" + +echo "> Running $meta_name." +$meta_executable \ + --genome_fasta "$in_fa" \ + --gtf "$in_gtf" \ + --reference_archive "$out_tar" \ + --extra_star_params "--genomeSAindexNbases 6" \ + ---cpus 2 + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +assert_file_exists "$out_tar" +assert_file_not_empty "$out_tar" + +echo ">> Checking whether output contains the expected files" +tar -xvf "$out_tar" > /dev/null +assert_file_exists "BD_Rhapsody_Reference_Files/star_index/genomeParameters.txt" +assert_file_exists "BD_Rhapsody_Reference_Files/bwa-mem2_index/reference_small.ann" +assert_file_exists "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" +assert_file_exists "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" +assert_file_contains "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" "chr1.*HAVANA.*ENSG00000243485" +assert_file_contains "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" 'chrMT' + +cd .. + +echo "#############################################" + +echo "> Tests succeeded!" 
\ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa new file mode 100644 index 00000000..386d887c --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa @@ -0,0 +1,27 @@ +>chr1 1 +TGGGGAAGCAAGGCGGAGTTGGGCAGCTCGTGTTCAATGGGTAGAGTTTCAGGCTGGGGT +GATGGAAGGGTGCTGGAAATGAGTGGTAGTGATGGCGGCACAACAGTGTGAATCTACTTA +ATCCCACTGAACTGTATGCTGAAAAATGGTTTAGACGGTGAATTTTAGGTTATGTATGTT +TTACCACAATTTTTAAAAAGCTAGTGAAAAGCTGGTAAAAAGAAAGAAAAGAGGCTTTTT +TAAAAAGTTAAATATATAAAAAGAGCATCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCT +GGAATCCGTTGGCTTGCCTCCGGCATTTTTGGCCCTTGCCTTTTAGGGTTGCCAGATTAA +AAGACAGGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCGTAGCATAA +ATATGTCCCAAGCTTAGTTTGGGACATACTTATGCTAAAAAACATTATTGGTTGTTTATC +TGAGATTCAGAATTAAGCATTTTATATTTTATTTGCTGCCTCTGGCCACCCTACTCTCTT +CCTAACACTCTCTCCCTCTCCCAGTTTTGTCCGCCTTCCCTGCCTCCTCTTCTGGGGGAG +TTAGATCGAGTTGTAACAAGAACATGCCACTGTCTCGCTGGCTGCAGCGTGTGGTCCCCT +TACCAGAGGTAAAGAAGAGATGGATCTCCACTCATGTTGTAGACAGAATGTTTATGTCCT +CTCCAAATGCTTATGTTGAAACCCTAACCCCTAATGTGATGGTATGTGGAGATGGGCCTT +TGGTAGGTAATTACGGTTAGATGAGGTCATGGGGTGGGGCCCTCATTATAGATCTGGTAA +GAAAAGAGAGCATTGTCTCTGTGTCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTC +TCTATCTCATTTCTCTCTCTCTCGCTATCTCATTTTTCTCTCTCTCTCTTTCTCTCCTCT +GTCTTTTCCCACCAAGTGAGGATGCGAAGAGAAGGTGGCTGTCTGCAAACCAGGAAGAGA +GCCCTCACCGGGAACCCGTCCAGCTGCCACCTTGAACTTGGACTTCCAAGCCTCCAGAAC +TGTGAGGGATAAATGTATGATTTTAAAGTCGCCCAGTGTGTGGTATTTTGTTTTGACTAA +TACAACCTGAAAACATTTTCCCCTCACTCCACCTGAGCAATATCTGAGTGGCTTAAGGTA +CTCAGGACACAACAAAGGAGAAATGTCCCATGCACAAGGTGCACCCATGCCTGGGTAAAG +CAGCCTGGCACAGAGGGAAGCACACAGGCTCAGGGATCTGCTATTCATTCTTTGTGTGAC +CCTGGGCAAGCCATGAATGGAGCTTCAGTCACCCCATTTGTAATGGGATTTAATTGTGCT +TGCCCTGCCTCCTTTTGAGGGCTGTAGAGAAAAGATGTCAAAGTATTTTGTAATCTGGCT +GGGCGTGGTGGCTCATGCCTGTAATCCTAGCACTTTGGTAGGCTGACGCGAGAGGACTGC +T diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf 
b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf new file mode 100644 index 00000000..7ba83523 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf @@ -0,0 +1,8 @@ +chr1 HAVANA exon 565 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 2; exon_id "ENSE00001922571.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA exon 977 1098 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 3; exon_id "ENSE00001827679.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA transcript 268 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 268 668 . + . 
gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 1; exon_id "ENSE00001841699.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 977 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 2; exon_id "ENSE00001890064.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 ENSEMBL gene 367 504 . + . gene_id "ENSG00000284332.1"; gene_type "miRNA"; gene_name "MIR1302-2"; level 3; hgnc_id "HGNC:35294"; +chr1 ENSEMBL transcript 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; +chr1 ENSEMBL exon 367 504 . + . 
gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh new file mode 100644 index 00000000..8d468064 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +TMP_DIR=/tmp/bd_rhapsody_make_reference +OUT_DIR=src/bd_rhapsody/bd_rhapsody_make_reference/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! 
-f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +zcat "$ORIG_GTF" | \ + awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf From c2e340d92ea7f153d0c5c9de1cffbc6b88fc4124 Mon Sep 17 00:00:00 2001 From: emmarousseau Date: Wed, 17 Jul 2024 18:10:37 +0200 Subject: [PATCH 2/7] Remove multiple_sep (#78) * initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586bec0ac9e4312b016e29c3aa0bd53f292b2. * get rid of multiple_sep fields in configs * Fix coverage argument's format in config --- src/gffread/config.vsh.yaml | 5 +-- src/gffread/script.sh | 2 ++ src/gffread/test.sh | 2 +- src/samtools/samtools_stats/config.vsh.yaml | 40 ++++++++++----------- src/samtools/samtools_stats/script.sh | 3 ++ src/samtools/samtools_stats/test.sh | 2 +- 6 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index d2c41a87..7477a284 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -8,8 +8,6 @@ links: references: doi: 10.12688/f1000research.23297.2 license: MIT -requirements: - commands: [ gffread ] argument_groups: - name: Inputs arguments: @@ -52,7 +50,7 @@ argument_groups: required: true description: | Write the output records into . 
- default: output.gff + example: output.gff - name: --force_exons type: boolean_true description: | @@ -154,7 +152,6 @@ argument_groups: - name: --table type: string multiple: true - multiple_sep: "," description: | Output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in ; special pseudo-attributes (prefixed by @) are diff --git a/src/gffread/script.sh b/src/gffread/script.sh index 9c4a2b8f..cd4abf14 100644 --- a/src/gffread/script.sh +++ b/src/gffread/script.sh @@ -50,6 +50,8 @@ [[ "$par_expose_dups" == "false" ]] && unset par_expose_dups [[ "$par_cluster_only" == "false" ]] && unset par_cluster_only +# if par_table is not empty, replace ";" with "," +par_table=$(echo "$par_table" | tr ';' ',') $(which gffread) \ "$par_input" \ diff --git a/src/gffread/test.sh b/src/gffread/test.sh index 326fce50..ea23edcb 100755 --- a/src/gffread/test.sh +++ b/src/gffread/test.sh @@ -86,7 +86,7 @@ diff "$expected_output_dir/transcripts.fa" "$test_output_dir/transcripts.fa" || echo "> Test 4 - Generate table from GFF annotation file" "$meta_executable" \ - --table @id,@chr,@start,@end,@strand,@exons,Name,gene,product \ + --table "@id;@chr;@start;@end;@strand;@exons;Name;gene;product" \ --outfile "$test_output_dir/annotation.tbl" \ --input "$test_dir/sequence.gff3" diff --git a/src/samtools/samtools_stats/config.vsh.yaml b/src/samtools/samtools_stats/config.vsh.yaml index 0d8f57a4..ca630876 100644 --- a/src/samtools/samtools_stats/config.vsh.yaml +++ b/src/samtools/samtools_stats/config.vsh.yaml @@ -30,10 +30,10 @@ argument_groups: - name: --coverage alternatives: -c type: integer - description: | - Coverage distribution min,max,step [1,1000,1]. multiple: true - multiple_sep: ',' + description: | + Coverage distribution min;max;step. Default: [1, 1000, 1]. 
+ example: [1, 1000, 1] - name: --remove_dups alternatives: -d type: boolean_true @@ -48,25 +48,25 @@ argument_groups: alternatives: -f type: string description: | - Required flag, 0 for unset. See also `samtools flags`. - default: "0" + Required flag, 0 for unset. See also `samtools flags`. Default: `"0"`. + example: "0" - name: --filtering_flag alternatives: -F type: string description: | - Filtering flag, 0 for unset. See also `samtools flags`. - default: "0" + Filtering flag, 0 for unset. See also `samtools flags`. Default: `0`. + example: "0" - name: --GC_depth type: double description: | - The size of GC-depth bins (decreasing bin size increases memory requirement). - default: 20000.0 + The size of GC-depth bins (decreasing bin size increases memory requirement). Default: `20000`. + example: 20000.0 - name: --insert_size alternatives: -i type: integer description: | - Maximum insert size. - default: 8000 + Maximum insert size. Default: `8000`. + example: 8000 - name: --id alternatives: -I type: string @@ -76,14 +76,14 @@ argument_groups: alternatives: -l type: integer description: | - Include in the statistics only reads with the given read length. - default: -1 + Include in the statistics only reads with the given read length. Default: `-1`. + example: -1 - name: --most_inserts alternatives: -m type: double description: | - Report only the main part of inserts. - default: 0.99 + Report only the main part of inserts. Default: `0.99`. + example: 0.99 - name: --split_prefix alternatives: -P type: string @@ -93,8 +93,8 @@ argument_groups: alternatives: -q type: integer description: | - The BWA trimming parameter. - default: 0 + The BWA trimming parameter. Default: `0`. + example: 0 - name: --ref_seq alternatives: -r type: file @@ -124,8 +124,8 @@ argument_groups: alternatives: -g type: integer description: | - Only bases with coverage above this value will be included in the target percentage computation. 
- default: 0 + Only bases with coverage above this value will be included in the target percentage computation. Default: `0`. + example: 0 - name: --input_fmt_option type: string description: | @@ -141,7 +141,7 @@ argument_groups: type: file description: | Output file. - default: "out.txt" + example: "out.txt" required: true direction: output diff --git a/src/samtools/samtools_stats/script.sh b/src/samtools/samtools_stats/script.sh index 6e32e9a5..e3872fc6 100644 --- a/src/samtools/samtools_stats/script.sh +++ b/src/samtools/samtools_stats/script.sh @@ -10,6 +10,9 @@ set -e [[ "$par_sparse" == "false" ]] && unset par_sparse [[ "$par_remove_overlaps" == "false" ]] && unset par_remove_overlaps +# change the coverage input from X;X;X to X,X,X +par_coverage=$(echo "$par_coverage" | tr ';' ',') + samtools stats \ ${par_coverage:+-c "$par_coverage"} \ ${par_remove_dups:+-d} \ diff --git a/src/samtools/samtools_stats/test.sh b/src/samtools/samtools_stats/test.sh index 05d70d30..b515100e 100644 --- a/src/samtools/samtools_stats/test.sh +++ b/src/samtools/samtools_stats/test.sh @@ -17,7 +17,7 @@ echo ">>> Checking whether output is non-empty" [ ! -s "$test_dir/test.paired_end.sorted.txt" ] && echo "File 'test.paired_end.sorted.txt' is empty!" && exit 1 echo ">>> Checking whether output is correct" -# compare using diff, ignoring the line stating the command that was passed. +# compare using diff, ignoring the line stating the command that was passed. 
diff <(grep -v "^# The command" "$test_dir/test.paired_end.sorted.txt") \ <(grep -v "^# The command" "$test_dir/ref.paired_end.sorted.txt") || \ (echo "Output file ref.paired_end.sorted.txt does not match expected output" && exit 1) From 8e9abad885b27120a56a580ca7d961c64b96ad60 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 18:14:21 +0200 Subject: [PATCH 3/7] Update CONTRIBUTING.md (#82) * Update CONTRIBUTING.md * update ctb * clean up helper functions * update changelog * update changelog --- CHANGELOG.md | 28 +++- CONTRIBUTING.md | 151 +++++++++++------- .../bd_rhapsody_make_reference/test.sh | 5 +- src/cutadapt/test.sh | 14 +- src/star/star_align_reads/test.sh | 21 ++- 5 files changed, 130 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cfacdbc..2aad0cb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,19 +6,33 @@ - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). -## BUG FIXES +## MINOR CHANGES -* `pear`: fix component not exiting with the correct exitcode when PEAR fails. +* `busco` components: update BUSCO to `5.7.1` (PR #72). -* `cutadapt`: fix `--par_quality_cutoff_r2` argument. +## DOCUMENTATION -* `cutadapt`: demultiplexing is now disabled by default. It can be re-enabled by using `demultiplex_mode`. +* Extend the contributing guidelines (PR #82): -* `multiqc`: update multiple separator to `;` (PR #81). + - Update format to Viash 0.9. -## MINOR CHANGES + - Descriptions should be formatted in markdown. + + - Add defaults to descriptions, not as a default of the argument. + + - Explain parameter expansion. -* `busco` components: update BUSCO to `5.7.1`. + - Mention that the contents of the output of components in tests should be checked. + +## BUG FIXES + +* `pear`: fix component not exiting with the correct exitcode when PEAR fails (PR #70). + +* `cutadapt`: fix `--par_quality_cutoff_r2` argument (PR #69). 
+ +* `cutadapt`: demultiplexing is now disabled by default. It can be re-enabled by using `demultiplex_mode` (PR #69). + +* `multiqc`: update multiple separator to `;` (PR #81). # biobox 0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7393bc7e..cee4249a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -65,22 +65,21 @@ runners: Fill in the relevant metadata fields in the config. Here is an example of the metadata of an existing component. ```yaml -functionality: - name: arriba - description: Detect gene fusions from RNA-Seq data - keywords: [Gene fusion, RNA-Seq] - links: - homepage: https://arriba.readthedocs.io/en/latest/ - documentation: https://arriba.readthedocs.io/en/latest/ - repository: https://github.com/suhrig/arriba - issue_tracker: https://github.com/suhrig/arriba/issues - references: - doi: 10.1101/gr.257246.119 - bibtex: | - @article{ - ... a bibtex entry in case the doi is not available ... - } - license: MIT +name: arriba +description: Detect gene fusions from RNA-Seq data +keywords: [Gene fusion, RNA-Seq] +links: + homepage: https://arriba.readthedocs.io/en/latest/ + documentation: https://arriba.readthedocs.io/en/latest/ + repository: https://github.com/suhrig/arriba + issue_tracker: https://github.com/suhrig/arriba/issues +references: + doi: 10.1101/gr.257246.119 + bibtex: | + @article{ + ... a bibtex entry in case the doi is not available ... + } +license: MIT ``` ### Step 4: Find a suitable container @@ -162,7 +161,7 @@ argument_groups: type: file description: | File in SAM/BAM/CRAM format with main alignments as generated by STAR - (Aligned.out.sam). Arriba extracts candidate reads from this file. + (`Aligned.out.sam`). Arriba extracts candidate reads from this file. required: true example: Aligned.out.bam ``` @@ -175,7 +174,7 @@ Several notes: * Input arguments can have `multiple: true` to allow the user to specify multiple files. - +* The description should be formatted in markdown. 
### Step 8: Add arguments for the output files @@ -220,7 +219,7 @@ argument_groups: Note: -* Preferably, these outputs should not be directores but files. For example, if a tool outputs a directory `foo/` containing files `foo/bar.txt` and `foo/baz.txt`, there should be two output arguments `--bar` and `--baz` (as opposed to one output argument which outputs the whole `foo/` directory). +* Preferably, these outputs should not be directories but files. For example, if a tool outputs a directory `foo/` containing files `foo/bar.txt` and `foo/baz.txt`, there should be two output arguments `--bar` and `--baz` (as opposed to one output argument which outputs the whole `foo/` directory). ### Step 9: Add arguments for the other arguments @@ -230,6 +229,8 @@ Finally, add all other arguments to the config file. There are a few exceptions: * Arguments related to printing the information such as printing the version (`-v`, `--version`) or printing the help (`-h`, `--help`) should not be added to the config file. +* If the tool's help lists default values, do not set them as the argument's `default`; mention them in the description instead. Example: `description: Explanation of the parameter. Default: 10.` + ### Step 10: Add a Docker engine @@ -275,10 +276,13 @@ Next, we need to write a runner script that runs the tool with the input argumen ## VIASH START ## VIASH END +# unset flags +[[ "$par_option" == "false" ]] && unset par_option + xxx \ --input "$par_input" \ --output "$par_output" \ - $([ "$par_option" = "true" ] && echo "--option") + ${par_option:+--option} ``` When building a Viash component, Viash will automatically replace the `## VIASH START` and `## VIASH END` lines (and anything in between) with environment variables based on the arguments specified in the config.
@@ -291,6 +295,11 @@ As an example, this is what the Bash script for the `arriba` component looks lik ## VIASH START ## VIASH END +# unset flags +[[ "$par_skip_duplicate_marking" == "false" ]] && unset par_skip_duplicate_marking +[[ "$par_extra_information" == "false" ]] && unset par_extra_information +[[ "$par_fill_gaps" == "false" ]] && unset par_fill_gaps + arriba \ -x "$par_bam" \ -a "$par_genome" \ @@ -298,26 +307,30 @@ arriba \ -o "$par_fusions" \ ${par_known_fusions:+-k "${par_known_fusions}"} \ ${par_blacklist:+-b "${par_blacklist}"} \ - ${par_structural_variants:+-d "${par_structural_variants}"} \ - $([ "$par_skip_duplicate_marking" = "true" ] && echo "-u") \ - $([ "$par_extra_information" = "true" ] && echo "-X") \ - $([ "$par_fill_gaps" = "true" ] && echo "-I") + # ... + ${par_extra_information:+-X} \ + ${par_fill_gaps:+-I} ``` +Notes: -### Step 12: Create test script +* If your arguments can contain special characters (e.g. `$`), you can use the Bash `${parameter@Q}` quoting operator (documented under [Shell Parameter Expansion](https://www.gnu.org/software/bash/manual/html_node/Shell-Parameter-Expansion.html)) to make sure you can use the string as input. Example: `-x ${par_bam@Q}`. +* Optional arguments can be passed to the command conditionally using Bash [parameter expansion](https://www.gnu.org/software/bash/manual/html_node/Shell-Parameter-Expansion.html). For example: `${par_known_fusions:+-k ${par_known_fusions@Q}}` + +* If your tool allows for multiple inputs using a separator other than `;` (which is the default Viash multiple separator), you can substitute these values with a command like: `par_disable_filters=$(echo $par_disable_filters | tr ';' ',')`. + + +### Step 12: Create test script If the unit test requires test resources, these should be provided in the `test_resources` section of the component. ```yaml -functionality: - # ...
- test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data ``` Create a test script at `src/xxx/test.sh` that runs the component with the test data. This script should run the component (available with `$meta_executable`) with the test data and check if the output is as expected. The script should exit with a non-zero exit code if the output is not as expected. For example: @@ -325,48 +338,64 @@ Create a test script at `src/xxx/test.sh` that runs the component with the test ```bash #!/bin/bash +set -e + ## VIASH START ## VIASH END -echo "> Run xxx with test data" +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} +assert_file_contains_regex() { + grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains_regex() { + grep -q -E "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} +############################################# + +echo "> Run $meta_name with test data" "$meta_executable" \ - --input "$meta_resources_dir/test_data/input.txt" \ + --input "$meta_resources_dir/test_data/reads_R1.fastq" \ --output "output.txt" \ --option -echo ">> Checking output" -[ ! 
-f "output.txt" ] && echo "Output file output.txt does not exist" && exit 1 -``` +echo ">> Check if output exists" +assert_file_exists "output.txt" +echo ">> Check if output is empty" +assert_file_not_empty "output.txt" -For example, this is what the test script for the `arriba` component looks like: +echo ">> Check if output is correct" +assert_file_contains "output.txt" "some expected output" -```bash -#!/bin/bash +echo "> All tests succeeded!" +``` -## VIASH START -## VIASH END +Notes: -echo "> Run arriba with blacklist" -"$meta_executable" \ - --bam "$meta_resources_dir/test_data/A.bam" \ - --genome "$meta_resources_dir/test_data/genome.fasta" \ - --gene_annotation "$meta_resources_dir/test_data/annotation.gtf" \ - --blacklist "$meta_resources_dir/test_data/blacklist.tsv" \ - --fusions "fusions.tsv" \ - --fusions_discarded "fusions_discarded.tsv" \ - --interesting_contigs "1,2" - -echo ">> Checking output" -[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1 -[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1 +* Do always check the contents of the output file. If the output is not deterministic, you can use regular expressions to check the output. -echo ">> Check if output is empty" -[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1 -[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1 -``` +* If possible, generate your own test data instead of copying it from an external resource. -### Step 12: Create a `/var/software_versions.txt` file +### Step 13: Create a `/var/software_versions.txt` file For the sake of transparency and reproducibility, we require that the versions of the software used in the component are documented. 
@@ -378,6 +407,8 @@ engines: image: quay.io/biocontainers/xxx:0.1.0--py_0 setup: - type: docker + # note: /var/software_versions.txt should contain: + # arriba: "2.4.0" run: | echo "xxx: \"0.1.0\"" > /var/software_versions.txt ``` diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh index 3637160a..845c1739 100644 --- a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh @@ -11,21 +11,18 @@ assert_file_doesnt_exist() { [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - # () will execute in a shubshell, could you use {;}? [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - # [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - # grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - # grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } +############################################# in_fa="$meta_resources_dir/test_data/reference_small.fa" in_gtf="$meta_resources_dir/test_data/reference_small.gtf" diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh index 1d6d9c18..28248742 100644 --- a/src/cutadapt/test.sh +++ b/src/cutadapt/test.sh @@ -6,25 +6,25 @@ set -eo pipefail ############################################# # helper functions assert_file_exists() { - [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } } assert_file_doesnt_exist() { - [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) + [ ! 
-f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } - ############################################# + mkdir test_multiple_output cd test_multiple_output diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh index a15ea599..bd78094d 100644 --- a/src/star/star_align_reads/test.sh +++ b/src/star/star_align_reads/test.sh @@ -7,35 +7,34 @@ meta_executable="target/docker/star/star_align_reads/star_align_reads" meta_resources_dir="src/star/star_align_reads" ## VIASH END -######################################################################################### - +############################################# # helper functions assert_file_exists() { - [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } } assert_file_doesnt_exist() { - [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) + [ ! 
-s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } assert_file_contains_regex() { - grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains_regex() { - grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q -E "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } +############################################# -######################################################################################### echo "> Prepare test data" cat > reads_R1.fastq <<'EOF' From 13c5439a0c36f8a1bd3889e68d68ca85672daa62 Mon Sep 17 00:00:00 2001 From: Leila011 Date: Wed, 17 Jul 2024 18:15:08 +0200 Subject: [PATCH 4/7] Add agat convertspgff2gtf (#76) * Fill in the metadata * add help.txt * add test data * update help.txt * add arguments for input file, output file and other arguments * add a Docker engine * Write a runner script * correct --gtf_version choices * update description * update keywords * Create test script * Create a /var/software_versions.txt file * remove duplicated argument * update config * change name to agat_convert_sp_gff2gtf * update license * replace module name by $meta_name in test.sh * Add more info to --gtf_version description * remove extra \ * add additional test: check if the D column in the first line of the GFF was 
correctly converted into GTF format * update changelog * Markdown: add newline before listing * add test to check if the header contains the right GTF version * cleanup * fix formatting --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 3 + .../agat_convert_sp_gff2gtf/config.vsh.yaml | 90 ++++++++++++++++ src/agat/agat_convert_sp_gff2gtf/help.txt | 102 ++++++++++++++++++ src/agat/agat_convert_sp_gff2gtf/script.sh | 10 ++ src/agat/agat_convert_sp_gff2gtf/test.sh | 37 +++++++ .../test_data/0_test.gff | 36 +++++++ .../test_data/script.sh | 9 ++ 7 files changed, 287 insertions(+) create mode 100644 src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml create mode 100644 src/agat/agat_convert_sp_gff2gtf/help.txt create mode 100644 src/agat/agat_convert_sp_gff2gtf/script.sh create mode 100644 src/agat/agat_convert_sp_gff2gtf/test.sh create mode 100644 src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff create mode 100755 src/agat/agat_convert_sp_gff2gtf/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 2aad0cb8..8f56b22e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,6 +100,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `agat`: + - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76). + ## MINOR CHANGES * Uniformize component metadata (PR #23). diff --git a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml new file mode 100644 index 00000000..b788c7c7 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml @@ -0,0 +1,90 @@ +name: agat_convert_sp_gff2gtf +namespace: agat +description: | + The script aims to convert any GTF/GFF file into a proper GTF file. Full + information about the format can be found here: + https://agat.readthedocs.io/en/latest/gxf.html You can choose among 7 + different GTF types (1, 2, 2.1, 2.2, 2.5, 3 or relax). 
Depending on the + version selected the script will filter out the features that are not + accepted. For GTF2.5 and 3, every level1 feature (e.g nc_gene + pseudogene) will be converted into gene feature and every level2 feature + (e.g mRNA ncRNA) will be converted into transcript feature. Using the + "relax" option you will produce a GTF-like output keeping all original + feature types (3rd column). No modification will occur e.g. mRNA to + transcript. + + To be fully GTF compliant all features have a gene_id and a transcript_id + attribute. The gene_id is a unique identifier for the genomic source of + the transcript, which is used to group transcripts into genes. The + transcript_id is a unique identifier for the predicted transcript, which + is used to group features into transcripts. +keywords: [gene annotations, GTF conversion] +links: + homepage: https://github.com/NBISweden/AGAT + documentation: https://agat.readthedocs.io/ + issue_tracker: https://github.com/NBISweden/AGAT/issues + repository: https://github.com/NBISweden/AGAT +references: + doi: 10.5281/zenodo.3552717 +license: GPL-3.0 +argument_groups: + - name: Inputs + arguments: + - name: --gff + alternatives: [-i] + description: Input GFF/GTF file that will be read + type: file + required: true + direction: input + example: input.gff + - name: Outputs + arguments: + - name: --output + alternatives: [-o, --out, --outfile, --gtf] + description: Output GTF file. If no output file is specified, the output will be written to STDOUT. + type: file + direction: output + required: true + example: output.gtf + - name: Arguments + arguments: + - name: --gtf_version + description: | + Version of the GTF output (1,2,2.1,2.2,2.5,3 or relax). Default value from AGAT config file (relax for the default config). The script option has the higher priority. + + * relax: all feature types are accepted.
+ * GTF3 (9 feature types accepted): gene, transcript, exon, CDS, Selenocysteine, start_codon, stop_codon, three_prime_utr and five_prime_utr. + * GTF2.5 (8 feature types accepted): gene, transcript, exon, CDS, UTR, start_codon, stop_codon, Selenocysteine. + * GTF2.2 (9 feature types accepted): CDS, start_codon, stop_codon, 5UTR, 3UTR, inter, inter_CNS, intron_CNS and exon. + * GTF2.1 (6 feature types accepted): CDS, start_codon, stop_codon, exon, 5UTR, 3UTR. + * GTF2 (4 feature types accepted): CDS, start_codon, stop_codon, exon. + * GTF1 (5 feature types accepted): CDS, start_codon, stop_codon, exon, intron. + type: string + choices: [relax, "1", "2", "2.1", "2.2", "2.5", "3"] + required: false + example: "3" + - name: --config + alternatives: [-c] + description: | + Input agat config file. By default AGAT takes as input the agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently).
+ type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_sp_gff2gtf/help.txt b/src/agat/agat_convert_sp_gff2gtf/help.txt new file mode 100644 index 00000000..fdd45507 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/help.txt @@ -0,0 +1,102 @@ +```sh +agat_convert_sp_gff2gtf.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_convert_sp_gff2gtf.pl + +Description: + The script aims to convert any GTF/GFF file into a proper GTF file. Full + information about the format can be found here: + https://agat.readthedocs.io/en/latest/gxf.html You can choose among 7 + different GTF types (1, 2, 2.1, 2.2, 2.5, 3 or relax). Depending the + version selected the script will filter out the features that are not + accepted. For GTF2.5 and 3, every level1 feature (e.g nc_gene + pseudogene) will be converted into gene feature and every level2 feature + (e.g mRNA ncRNA) will be converted into transcript feature. Using the + "relax" option you will produce a GTF-like output keeping all original + feature types (3rd column). No modification will occur e.g. mRNA to + transcript. + + To be fully GTF compliant all feature have a gene_id and a transcript_id + attribute. 
The gene_id is unique identifier for the genomic source of + the transcript, which is used to group transcripts into genes. The + transcript_id is a unique identifier for the predicted transcript, which + is used to group features into transcripts. + +Usage: + agat_convert_sp_gff2gtf.pl --gff infile.gff [ -o outfile ] + agat_convert_sp_gff2gtf -h + +Options: + --gff, --gtf or -i + Input GFF/GTF file that will be read + + --gtf_version version of the GTF output (1,2,2.1,2.2,2.5,3 or relax). + Default value from AGAT config file (relax for the default config). The + script option has the higher priority. + relax: all feature types are accepted. + + GTF3 (9 feature types accepted): gene, transcript, exon, CDS, + Selenocysteine, start_codon, stop_codon, three_prime_utr and + five_prime_utr + + GTF2.5 (8 feature types accepted): gene, transcript, exon, CDS, + UTR, start_codon, stop_codon, Selenocysteine + + GTF2.2 (9 feature types accepted): CDS, start_codon, stop_codon, + 5UTR, 3UTR, inter, inter_CNS, intron_CNS and exon + + GTF2.1 (6 feature types accepted): CDS, start_codon, stop_codon, + exon, 5UTR, 3UTR + + GTF2 (4 feature types accepted): CDS, start_codon, stop_codon, + exon + + GTF1 (5 feature types accepted): CDS, start_codon, stop_codon, + exon, intron + + -o , --output , --out , --outfile or --gtf + Output GTF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. 
Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md + diff --git a/src/agat/agat_convert_sp_gff2gtf/script.sh b/src/agat/agat_convert_sp_gff2gtf/script.sh new file mode 100644 index 00000000..69d66739 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +agat_convert_sp_gff2gtf.pl \ + -i "$par_gff" \ + -o "$par_output" \ + ${par_gtf_version:+--gtf_version "${par_gtf_version}"} \ + ${par_config:+--config "${par_config}"} diff --git a/src/agat/agat_convert_sp_gff2gtf/test.sh b/src/agat/agat_convert_sp_gff2gtf/test.sh new file mode 100644 index 00000000..1e7cc142 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --gff "$test_dir/0_test.gff" \ + --output "output.gtf" + +echo ">> Checking output" +[ ! -f "output.gtf" ] && echo "Output file output.gtf does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! 
-s "output.gtf" ] && echo "Output file output.gtf is empty" && exit 1 + +echo ">> Check if the conversion resulted in the right GTF format" +idGFF=$(head -n 2 "$test_dir/0_test.gff" | grep -o 'ID=[^;]*' | cut -d '=' -f 2-) +expectedGTF="gene_id \"$idGFF\"; ID \"$idGFF\";" +extractedGTF=$(head -n 3 "output.gtf" | grep -o 'gene_id "[^"]*"; ID "[^"]*";') +[ "$extractedGTF" != "$expectedGTF" ] && echo "Output file output.gtf does not have the right format" && exit 1 + +rm output.gtf + +echo "> Run $meta_name with test data and GTF version 2.5" +"$meta_executable" \ + --gff "$test_dir/0_test.gff" \ + --output "output.gtf" \ + --gtf_version "2.5" + +echo ">> Check if the output file header display the right GTF version" +grep -q "##gtf-version 2.5" "output.gtf" +[ $? -ne 0 ] && echo "Output file output.gtf header does not display the right GTF version" && exit 1 + +echo "> Test successful" \ No newline at end of file diff --git a/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff b/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff new file mode 100644 index 00000000..fafe86ed --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff @@ -0,0 +1,36 @@ +##gff-version 3 +scaffold625 maker gene 337818 343277 . + . ID=CLUHARG00000005458;Name=TUBB3_2 +scaffold625 maker mRNA 337818 343277 . + . ID=CLUHART00000008717;Parent=CLUHARG00000005458 +scaffold625 maker exon 337818 337971 . + . ID=CLUHART00000008717:exon:1404;Parent=CLUHART00000008717 +scaffold625 maker exon 340733 340841 . + . ID=CLUHART00000008717:exon:1405;Parent=CLUHART00000008717 +scaffold625 maker exon 341518 341628 . + . ID=CLUHART00000008717:exon:1406;Parent=CLUHART00000008717 +scaffold625 maker exon 341964 343277 . + . ID=CLUHART00000008717:exon:1407;Parent=CLUHART00000008717 +scaffold625 maker CDS 337915 337971 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 340733 340841 . 
+ 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341518 341628 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341964 343033 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker five_prime_UTR 337818 337914 . + . ID=CLUHART00000008717:five_prime_utr;Parent=CLUHART00000008717 +scaffold625 maker three_prime_UTR 343034 343277 . + . ID=CLUHART00000008717:three_prime_utr;Parent=CLUHART00000008717 +scaffold789 maker gene 558184 564780 . + . ID=CLUHARG00000003852;Name=PF11_0240 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006146;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006146:exon:995;Parent=CLUHART00000006146 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006146:exon:996;Parent=CLUHART00000006146 +scaffold789 maker exon 564171 564235 . + . ID=CLUHART00000006146:exon:997;Parent=CLUHART00000006146 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006146:exon:998;Parent=CLUHART00000006146 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564171 564235 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006146:five_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006146:three_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006147;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006147:exon:997;Parent=CLUHART00000006147 +scaffold789 maker exon 561401 561519 . + . 
ID=CLUHART00000006147:exon:998;Parent=CLUHART00000006147 +scaffold789 maker exon 562057 562121 . + . ID=CLUHART00000006147:exon:999;Parent=CLUHART00000006147 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006147:exon:1000;Parent=CLUHART00000006147 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 562057 562121 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006147:five_prime_utr;Parent=CLUHART00000006147 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006147:three_prime_utr;Parent=CLUHART00000006147 diff --git a/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh b/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh new file mode 100755 index 00000000..e453e772 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# clone repo +if [ ! 
-d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/gff_syntax/in/0_test.gff src/agat/agat_convert_sp_gff2gtf/test_data From e615d2abb92e56cfc1e1ace9baa308ce10656f9f Mon Sep 17 00:00:00 2001 From: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> Date: Wed, 17 Jul 2024 19:44:21 +0200 Subject: [PATCH 5/7] Seqtk sample (#68) * tests added * tests extended * changelog entry added * reorganized seqtk namespace + added seqtk subseq config and script * added subseq help.txt * revert to seqtk sample only * remove subseq * updated tests, added tags * Update two_pass_mode Co-authored-by: Robrecht Cannoodt * author added to config --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 2 + src/_authors/jakub_majercik.yaml | 10 +++ src/seqtk/seqtk_sample/config.vsh.yaml | 57 ++++++++++++++ src/seqtk/seqtk_sample/help.txt | 9 +++ src/seqtk/seqtk_sample/script.sh | 11 +++ src/seqtk/seqtk_sample/test.sh | 104 +++++++++++++++++++++++++ src/seqtk/test_data/reads/a.1.fastq.gz | Bin 0 -> 100 bytes src/seqtk/test_data/reads/a.2.fastq.gz | Bin 0 -> 100 bytes src/seqtk/test_data/reads/a.fastq | 4 + src/seqtk/test_data/reads/a.fastq.gz | Bin 0 -> 44 bytes src/seqtk/test_data/reads/id.list | 1 + src/seqtk/test_data/script.sh | 9 +++ 12 files changed, 207 insertions(+) create mode 100644 src/_authors/jakub_majercik.yaml create mode 100644 src/seqtk/seqtk_sample/config.vsh.yaml create mode 100644 src/seqtk/seqtk_sample/help.txt create mode 100644 src/seqtk/seqtk_sample/script.sh create mode 100644 src/seqtk/seqtk_sample/test.sh create mode 100644 src/seqtk/test_data/reads/a.1.fastq.gz create mode 100644 src/seqtk/test_data/reads/a.2.fastq.gz create mode 100644 src/seqtk/test_data/reads/a.fastq create mode 100644 src/seqtk/test_data/reads/a.fastq.gz create mode 100644 src/seqtk/test_data/reads/id.list create mode 100755 
src/seqtk/test_data/script.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f56b22e..f6a8676f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,8 @@ * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). +* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68). + * `umitools`: - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54). diff --git a/src/_authors/jakub_majercik.yaml b/src/_authors/jakub_majercik.yaml new file mode 100644 index 00000000..3b75fffe --- /dev/null +++ b/src/_authors/jakub_majercik.yaml @@ -0,0 +1,10 @@ +name: Jakub Majercik +info: + links: + email: jakub@data-intuitive.com + github: jakubmajercik + linkedin: jakubmajercik + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatics Engineer \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml new file mode 100644 index 00000000..0cd369e7 --- /dev/null +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -0,0 +1,57 @@ +name: seqtk_sample +namespace: seqtk +description: Subsamples sequences from FASTA/Q files. +keywords: [sample, FASTA, FASTQ] +links: + repository: https://github.com/lh3/seqtk/tree/v1.4 +license: MIT +authors: + - __merge__: /src/_authors/jakub_majercik.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: The input FASTA/Q file. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + description: The output FASTA/Q file. + required: true + direction: output + + - name: Options + arguments: + - name: --seed + type: integer + description: Seed for random generator. + example: 42 + - name: --fraction_number + type: double + description: Fraction or number of sequences to sample. 
+ required: true + example: 0.1 + - name: --two_pass_mode + type: boolean_true + description: Twice as slow but with much reduced memory + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: ../test_data + +engines: + - type: docker + image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/help.txt b/src/seqtk/seqtk_sample/help.txt new file mode 100644 index 00000000..49f8001b --- /dev/null +++ b/src/seqtk/seqtk_sample/help.txt @@ -0,0 +1,7 @@ +``` +seqtk sample +``` +Usage: seqtk sample [-2] [-s seed=11] <in.fa> <frac>|<number> + +Options: -s INT RNG seed [11] + -2 2-pass mode: twice as slow but with much reduced memory \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/script.sh b/src/seqtk/seqtk_sample/script.sh new file mode 100644 index 00000000..01d981b3 --- /dev/null +++ b/src/seqtk/seqtk_sample/script.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# boolean_true flags arrive as the string "false" when not passed; unset it so the ${...:+-2} expansion below only emits -2 when the flag was really given +[[ "$par_two_pass_mode" == "false" ]] && unset par_two_pass_mode + +seqtk sample \ + ${par_two_pass_mode:+-2} \ + ${par_seed:+-s "$par_seed"} \ + "$par_input" \ + "$par_fraction_number" \ + > "$par_output" \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh new file mode 100644 index 00000000..cba5f613 --- /dev/null +++ b/src/seqtk/seqtk_sample/test.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +set -e + +## VIASH START +meta_executable="target/executable/seqtk/seqtk_sample" +meta_resources_dir="src/seqtk" +## VIASH END + +######################################################################################### +mkdir seqtk_sample_se +cd seqtk_sample_se + +echo "> Run seqtk_sample on fastq SE" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled.fastq" + +echo ">> Check if output exists" +if [ !
-f "sampled.fastq" ]; then + echo ">> sampled.fastq does not exist" + exit 1 +fi + +echo ">> Count number of samples" +num_samples=$(grep -c '^@' sampled.fastq) +if [ "$num_samples" -ne 3 ]; then + echo ">> sampled.fastq does not contain 3 samples" + exit 1 +fi + +######################################################################################### +cd .. +mkdir seqtk_sample_pe_number +cd seqtk_sample_pe_number + +echo ">> Run seqtk_sample on fastq.gz PE with number of reads" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled_1.fastq" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.2.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled_2.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled_1.fastq" ] || [ ! -f "sampled_2.fastq" ]; then + echo ">> One or both output files do not exist" + exit 1 +fi + +echo ">> Compare reads" +# Extract headers +headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# Compare headers +diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } + +echo ">> Count number of samples" +num_headers=$(echo "$headers1" | wc -l) +if [ "$num_headers" -ne 3 ]; then + echo ">> sampled_1.fastq does not contain 3 headers" + exit 1 +fi + +######################################################################################### +cd .. 
+mkdir seqtk_sample_pe_fraction +cd seqtk_sample_pe_fraction + +echo ">> Run seqtk_sample on fastq.gz PE with fraction of reads" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 0.5 \ + --output "sampled_1.fastq" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.2.fastq.gz" \ + --seed 42 \ + --fraction_number 0.5 \ + --output "sampled_2.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled_1.fastq" ] || [ ! -f "sampled_2.fastq" ]; then + echo ">> One or both output files do not exist" + exit 1 +fi + +echo ">> Compare reads" +# Extract headers +headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# Compare headers +diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } + diff --git a/src/seqtk/test_data/reads/a.1.fastq.gz b/src/seqtk/test_data/reads/a.1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..97a72ce5d48317556a145f93c32c87f0e9e5500f GIT binary patch literal 100 zcmV-q0Gt0GiwFRnrn+7N10Bw(6~jOf1wpPTJlJGMx7b%C&OZykT2i1C{p;n6 zLRM|nP{^ij8VcF9T|*&&2 literal 0 HcmV?d00001 diff --git a/src/seqtk/test_data/reads/a.2.fastq.gz b/src/seqtk/test_data/reads/a.2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..038bc976ac32e8f26be16949bf5632c7090e635b GIT binary patch literal 100 zcmV-q0Gt0GiwFRnrn+7N10Bw*5yUVM1wrm8Zt)RG{ Date: Wed, 17 Jul 2024 23:23:51 +0200 Subject: [PATCH 6/7] switch to viash actions for ci (#86) * switch to viash actions for ci * add changelog entry * ci force --- .github/workflows/test.yaml | 6 ++++-- CHANGELOG.md | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2591978f..30f98b03 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,9 +1,11 @@ -name: Component Testing +name: 
Test components on: pull_request: push: + branches: + - main jobs: test: - uses: viash-hub/toolbox/.github/workflows/test.yaml@main \ No newline at end of file + uses: viash-io/viash-actions/.github/workflows/test.yaml@v6 \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index f6a8676f..c9f8b222 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ * `busco` components: update BUSCO to `5.7.1` (PR #72). +* Update CI to reusable workflow in `viash-io/viash-actions` (PR #86). + ## DOCUMENTATION * Extend the contributing guidelines (PR #82): From e8b82b5d968524f495e80afa8092098408d66d1d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 17 Jul 2024 23:25:07 +0200 Subject: [PATCH 7/7] fix authorship (#88) * fix authorship * add author * add missing newlines * update changelog * update changelog --- CHANGELOG.md | 2 ++ src/_authors/angela_o_pisco.yaml | 14 ++++++++++++++ src/_authors/dorien_roosen.yaml | 10 ++++++++++ src/_authors/dries_schaumont.yaml | 11 +++++++++++ src/_authors/emma_rousseau.yaml | 10 ++++++++++ src/_authors/jakub_majercik.yaml | 2 +- src/_authors/kai_waldrant.yaml | 14 ++++++++++++++ src/_authors/leila_paquay.yaml | 10 ++++++++++ src/_authors/robrecht_cannoodt.yaml | 2 +- src/_authors/sai_nirmayi_yasa.yaml | 10 ++++++++++ src/_authors/toni_verbeiren.yaml | 9 +++++++++ src/_authors/weiwei_schultz.yaml | 2 +- src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml | 4 ++++ src/arriba/config.vsh.yaml | 3 +++ src/bcl_convert/config.vsh.yaml | 11 +++++++++++ src/bedtools/bedtools_getfasta/config.vsh.yaml | 3 +++ src/busco/busco_download_datasets/config.vsh.yaml | 3 +++ src/busco/busco_list_datasets/config.vsh.yaml | 3 +++ src/busco/busco_run/config.vsh.yaml | 3 +++ src/cutadapt/config.vsh.yaml | 3 +++ src/falco/config.vsh.yaml | 3 +++ src/fastp/config.vsh.yaml | 3 +++ src/featurecounts/config.vsh.yaml | 4 +++- src/gffread/config.vsh.yaml | 3 +++ src/lofreq/call/config.vsh.yaml | 3 +++ src/lofreq/indelqual/config.vsh.yaml | 3 
+++ src/multiqc/config.vsh.yaml | 4 +++- src/pear/config.vsh.yaml | 5 ++++- src/salmon/salmon_index/config.vsh.yaml | 4 +++- src/salmon/salmon_quant/config.vsh.yaml | 4 +++- src/samtools/samtools_collate/config.vsh.yaml | 4 +++- src/samtools/samtools_faidx/config.vsh.yaml | 4 +++- src/samtools/samtools_fasta/config.vsh.yaml | 4 +++- src/samtools/samtools_fastq/config.vsh.yaml | 4 +++- src/samtools/samtools_flagstat/config.vsh.yaml | 4 +++- src/samtools/samtools_idxstats/config.vsh.yaml | 4 +++- src/samtools/samtools_index/config.vsh.yaml | 4 +++- src/samtools/samtools_sort/config.vsh.yaml | 4 +++- src/samtools/samtools_stats/config.vsh.yaml | 4 +++- src/samtools/samtools_view/config.vsh.yaml | 4 +++- src/star/star_align_reads/config.vsh.yaml | 5 +++++ src/star/star_genome_generate/config.vsh.yaml | 4 +++- src/umi_tools/umi_tools_dedup/config.vsh.yaml | 4 +++- 43 files changed, 198 insertions(+), 20 deletions(-) create mode 100644 src/_authors/angela_o_pisco.yaml create mode 100644 src/_authors/dorien_roosen.yaml create mode 100644 src/_authors/dries_schaumont.yaml create mode 100644 src/_authors/emma_rousseau.yaml create mode 100644 src/_authors/kai_waldrant.yaml create mode 100644 src/_authors/leila_paquay.yaml create mode 100644 src/_authors/sai_nirmayi_yasa.yaml create mode 100644 src/_authors/toni_verbeiren.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index c9f8b222..4e6a0369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ - Mention that the contents of the output of components in tests should be checked. +* Add authorship to existing components (PR #88). + ## BUG FIXES * `pear`: fix component not exiting with the correct exitcode when PEAR fails (PR #70). 
diff --git a/src/_authors/angela_o_pisco.yaml b/src/_authors/angela_o_pisco.yaml new file mode 100644 index 00000000..1f0bf58f --- /dev/null +++ b/src/_authors/angela_o_pisco.yaml @@ -0,0 +1,14 @@ +name: Angela Oliveira Pisco +info: + role: Contributor + links: + github: aopisco + orcid: "0000-0003-0142-2355" + linkedin: aopisco + organizations: + - name: Insitro + href: https://insitro.com + role: Director of Computational Biology + - name: Open Problems + href: https://openproblems.bio + role: Core Member diff --git a/src/_authors/dorien_roosen.yaml b/src/_authors/dorien_roosen.yaml new file mode 100644 index 00000000..d67448d8 --- /dev/null +++ b/src/_authors/dorien_roosen.yaml @@ -0,0 +1,10 @@ +name: Dorien Roosen +info: + links: + email: dorien@data-intuitive.com + github: dorien-er + linkedin: dorien-roosen + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist diff --git a/src/_authors/dries_schaumont.yaml b/src/_authors/dries_schaumont.yaml new file mode 100644 index 00000000..b2678081 --- /dev/null +++ b/src/_authors/dries_schaumont.yaml @@ -0,0 +1,11 @@ +name: Dries Schaumont +info: + links: + email: dries@data-intuitive.com + github: DriesSchaumont + orcid: "0000-0002-4389-0440" + linkedin: dries-schaumont + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist diff --git a/src/_authors/emma_rousseau.yaml b/src/_authors/emma_rousseau.yaml new file mode 100644 index 00000000..1a9ac456 --- /dev/null +++ b/src/_authors/emma_rousseau.yaml @@ -0,0 +1,10 @@ +name: Emma Rousseau +info: + links: + email: emma@data-intuitive.com + github: emmarousseau + linkedin: emmarousseau1 + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician diff --git a/src/_authors/jakub_majercik.yaml b/src/_authors/jakub_majercik.yaml index 3b75fffe..c2a7867d 100644 --- a/src/_authors/jakub_majercik.yaml +++ b/src/_authors/jakub_majercik.yaml 
@@ -7,4 +7,4 @@ info: organizations: - name: Data Intuitive href: https://www.data-intuitive.com - role: Bioinformatics Engineer \ No newline at end of file + role: Bioinformatics Engineer diff --git a/src/_authors/kai_waldrant.yaml b/src/_authors/kai_waldrant.yaml new file mode 100644 index 00000000..a132c528 --- /dev/null +++ b/src/_authors/kai_waldrant.yaml @@ -0,0 +1,14 @@ +name: Kai Waldrant +info: + links: + email: kai@data-intuitive.com + github: KaiWaldrant + orcid: "0009-0003-8555-1361" + linkedin: kaiwaldrant + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician + - name: Open Problems + href: https://openproblems.bio + role: Contributor diff --git a/src/_authors/leila_paquay.yaml b/src/_authors/leila_paquay.yaml new file mode 100644 index 00000000..21aa532d --- /dev/null +++ b/src/_authors/leila_paquay.yaml @@ -0,0 +1,10 @@ +name: Leïla Paquay +info: + links: + email: leila@data-intuitive.com + github: Leila011 + linkedin: leilapaquay + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Software Developer diff --git a/src/_authors/robrecht_cannoodt.yaml b/src/_authors/robrecht_cannoodt.yaml index d7c0f283..c4c1bdec 100644 --- a/src/_authors/robrecht_cannoodt.yaml +++ b/src/_authors/robrecht_cannoodt.yaml @@ -11,4 +11,4 @@ info: role: Data Science Engineer - name: Open Problems href: https://openproblems.bio - role: Core Member \ No newline at end of file + role: Core Member diff --git a/src/_authors/sai_nirmayi_yasa.yaml b/src/_authors/sai_nirmayi_yasa.yaml new file mode 100644 index 00000000..9f560c58 --- /dev/null +++ b/src/_authors/sai_nirmayi_yasa.yaml @@ -0,0 +1,10 @@ +name: Sai Nirmayi Yasa +info: + links: + email: nirmayi@data-intuitive.com + github: sainirmayi + linkedin: sai-nirmayi-yasa + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Junior Bioinformatics Researcher diff --git a/src/_authors/toni_verbeiren.yaml 
b/src/_authors/toni_verbeiren.yaml new file mode 100644 index 00000000..2f2f851f --- /dev/null +++ b/src/_authors/toni_verbeiren.yaml @@ -0,0 +1,9 @@ +name: Toni Verbeiren +info: + links: + github: tverbeiren + linkedin: verbeiren + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist and CEO diff --git a/src/_authors/weiwei_schultz.yaml b/src/_authors/weiwei_schultz.yaml index 324f9378..e4945078 100644 --- a/src/_authors/weiwei_schultz.yaml +++ b/src/_authors/weiwei_schultz.yaml @@ -2,4 +2,4 @@ name: Weiwei Schultz info: organizations: - name: Janssen R&D US - role: Associate Director Data Sciences \ No newline at end of file + role: Associate Director Data Sciences diff --git a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml index b788c7c7..757cbd85 100644 --- a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml +++ b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml @@ -27,6 +27,10 @@ links: references: doi: 10.5281/zenodo.3552717 license: GPL-3.0 +authors: + - __merge__: /src/_authors/leila_paquay.yaml + roles: [ author, maintainer ] + argument_groups: - name: Inputs arguments: diff --git a/src/arriba/config.vsh.yaml b/src/arriba/config.vsh.yaml index 8d72d7eb..db5960cf 100644 --- a/src/arriba/config.vsh.yaml +++ b/src/arriba/config.vsh.yaml @@ -11,6 +11,9 @@ license: MIT requirements: cpus: 1 commands: [ arriba ] +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/bcl_convert/config.vsh.yaml b/src/bcl_convert/config.vsh.yaml index 657fb1f0..81103776 100644 --- a/src/bcl_convert/config.vsh.yaml +++ b/src/bcl_convert/config.vsh.yaml @@ -4,6 +4,17 @@ description: | Information about upgrading from bcl2fastq via [Upgrading from bcl2fastq to BCL Convert](https://emea.support.illumina.com/bulletins/2020/10/upgrading-from-bcl2fastq-to-bcl-convert.html) and [BCL Convert 
Compatible Products](https://support.illumina.com/sequencing/sequencing_software/bcl-convert/compatibility.html) +keywords: [demultiplex, fastq, bcl, illumina] +links: + homepage: https://support.illumina.com/sequencing/sequencing_software/bcl-convert.html + documentation: https://support.illumina.com/downloads/bcl-convert-user-guide.html +license: Proprietary +authors: + - __merge__: /src/_authors/toni_verbeiren.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author ] + argument_groups: - name: Input arguments arguments: diff --git a/src/bedtools/bedtools_getfasta/config.vsh.yaml b/src/bedtools/bedtools_getfasta/config.vsh.yaml index f1f49a87..fe160b20 100644 --- a/src/bedtools/bedtools_getfasta/config.vsh.yaml +++ b/src/bedtools/bedtools_getfasta/config.vsh.yaml @@ -10,6 +10,9 @@ references: license: GPL-2.0 requirements: commands: [bedtools] +authors: + - __merge__: /src/_authors/dries_schaumont.yaml + roles: [ author, maintainer ] argument_groups: - name: Input arguments diff --git a/src/busco/busco_download_datasets/config.vsh.yaml b/src/busco/busco_download_datasets/config.vsh.yaml index 5297af2e..cce3faa0 100644 --- a/src/busco/busco_download_datasets/config.vsh.yaml +++ b/src/busco/busco_download_datasets/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.1007/978-1-4939-9173-0_14 license: MIT +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/busco/busco_list_datasets/config.vsh.yaml b/src/busco/busco_list_datasets/config.vsh.yaml index cac34cc6..93fd0559 100644 --- a/src/busco/busco_list_datasets/config.vsh.yaml +++ b/src/busco/busco_list_datasets/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.1007/978-1-4939-9173-0_14 license: MIT +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: Outputs arguments: diff --git 
a/src/busco/busco_run/config.vsh.yaml b/src/busco/busco_run/config.vsh.yaml index 23ee95fb..435e9d2a 100644 --- a/src/busco/busco_run/config.vsh.yaml +++ b/src/busco/busco_run/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.1007/978-1-4939-9173-0_14 license: MIT +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml index b315d0ce..7e36a8e0 100644 --- a/src/cutadapt/config.vsh.yaml +++ b/src/cutadapt/config.vsh.yaml @@ -9,6 +9,9 @@ links: references: doi: 10.14806/ej.17.1.200 license: MIT +authors: + - __merge__: /src/_authors/toni_verbeiren.yaml + roles: [ author, maintainer ] argument_groups: #################################################################### - name: Specify Adapters for R1 diff --git a/src/falco/config.vsh.yaml b/src/falco/config.vsh.yaml index 4d9cf656..de9906ef 100644 --- a/src/falco/config.vsh.yaml +++ b/src/falco/config.vsh.yaml @@ -9,6 +9,9 @@ references: license: GPL-3.0 requirements: commands: [falco] +authors: + - __merge__: /src/_authors/toni_verbeiren.yaml + roles: [ author, maintainer ] # Notes: # - falco as arguments similar to -subsample and we update those to --subsample diff --git a/src/fastp/config.vsh.yaml b/src/fastp/config.vsh.yaml index b7d9062a..f1f8f1ed 100644 --- a/src/fastp/config.vsh.yaml +++ b/src/fastp/config.vsh.yaml @@ -26,6 +26,9 @@ links: references: doi: "10.1093/bioinformatics/bty560" license: MIT +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs description: | diff --git a/src/featurecounts/config.vsh.yaml b/src/featurecounts/config.vsh.yaml index 8697b1fe..e17d9ac0 100644 --- a/src/featurecounts/config.vsh.yaml +++ b/src/featurecounts/config.vsh.yaml @@ -11,7 +11,9 @@ references: license: GPL-3.0 requirements: commands: [ featureCounts ] - +authors: + - __merge__: 
/src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index 7477a284..bd985ffb 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -8,6 +8,9 @@ links: references: doi: 10.12688/f1000research.23297.2 license: MIT +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/lofreq/call/config.vsh.yaml b/src/lofreq/call/config.vsh.yaml index c547de9d..286a040a 100644 --- a/src/lofreq/call/config.vsh.yaml +++ b/src/lofreq/call/config.vsh.yaml @@ -17,6 +17,9 @@ references: license: "MIT" requirements: commands: [ lofreq ] +authors: + - __merge__: /src/_authors/kai_waldrant.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/lofreq/indelqual/config.vsh.yaml b/src/lofreq/indelqual/config.vsh.yaml index 0524458e..29696c81 100644 --- a/src/lofreq/indelqual/config.vsh.yaml +++ b/src/lofreq/indelqual/config.vsh.yaml @@ -18,6 +18,9 @@ references: license: "MIT" requirements: commands: [ lofreq ] +authors: + - __merge__: /src/_authors/kai_waldrant.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/multiqc/config.vsh.yaml b/src/multiqc/config.vsh.yaml index df5e38e1..ba305025 100644 --- a/src/multiqc/config.vsh.yaml +++ b/src/multiqc/config.vsh.yaml @@ -11,7 +11,9 @@ info: references: doi: 10.1093/bioinformatics/btw354 licence: GPL v3 or later - +authors: + - __merge__: /src/_authors/dorien_roosen.yaml + roles: [ author, maintainer ] argument_groups: - name: "Input" arguments: diff --git a/src/pear/config.vsh.yaml b/src/pear/config.vsh.yaml index d6dbe6c9..acae10cc 100644 --- a/src/pear/config.vsh.yaml +++ b/src/pear/config.vsh.yaml @@ -12,7 +12,10 @@ references: doi: 10.1093/bioinformatics/btt593 license: "CC-BY-NC-SA-3.0" requirements: - commands: [ 
pear , gzip ] + commands: [ pear, gzip ] +authors: + - __merge__: /src/_authors/kai_waldrant.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/salmon/salmon_index/config.vsh.yaml b/src/salmon/salmon_index/config.vsh.yaml index 41c1e05b..925c3000 100644 --- a/src/salmon/salmon_index/config.vsh.yaml +++ b/src/salmon/salmon_index/config.vsh.yaml @@ -12,7 +12,9 @@ references: license: GPL-3.0 requirements: commands: [ salmon ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/salmon/salmon_quant/config.vsh.yaml b/src/salmon/salmon_quant/config.vsh.yaml index b7e303f4..1f96f0c9 100644 --- a/src/salmon/salmon_quant/config.vsh.yaml +++ b/src/salmon/salmon_quant/config.vsh.yaml @@ -12,7 +12,9 @@ references: license: GPL-3.0 requirements: commands: [ salmon ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: Common input options arguments: diff --git a/src/samtools/samtools_collate/config.vsh.yaml b/src/samtools/samtools_collate/config.vsh.yaml index 669f4cdf..84a3195c 100644 --- a/src/samtools/samtools_collate/config.vsh.yaml +++ b/src/samtools/samtools_collate/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_faidx/config.vsh.yaml b/src/samtools/samtools_faidx/config.vsh.yaml index c1c9325d..937b0804 100644 --- a/src/samtools/samtools_faidx/config.vsh.yaml +++ b/src/samtools/samtools_faidx/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, 
maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_fasta/config.vsh.yaml b/src/samtools/samtools_fasta/config.vsh.yaml index 23517f6c..70ba72b9 100644 --- a/src/samtools/samtools_fasta/config.vsh.yaml +++ b/src/samtools/samtools_fasta/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_fastq/config.vsh.yaml b/src/samtools/samtools_fastq/config.vsh.yaml index cac7653b..09014ced 100644 --- a/src/samtools/samtools_fastq/config.vsh.yaml +++ b/src/samtools/samtools_fastq/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_flagstat/config.vsh.yaml b/src/samtools/samtools_flagstat/config.vsh.yaml index 9b4dfbe1..b30f1867 100644 --- a/src/samtools/samtools_flagstat/config.vsh.yaml +++ b/src/samtools/samtools_flagstat/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_idxstats/config.vsh.yaml b/src/samtools/samtools_idxstats/config.vsh.yaml index 30f21348..16e901d7 100644 --- a/src/samtools/samtools_idxstats/config.vsh.yaml +++ b/src/samtools/samtools_idxstats/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] 
argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_index/config.vsh.yaml b/src/samtools/samtools_index/config.vsh.yaml index 8c59a20e..4220c691 100644 --- a/src/samtools/samtools_index/config.vsh.yaml +++ b/src/samtools/samtools_index/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_sort/config.vsh.yaml b/src/samtools/samtools_sort/config.vsh.yaml index a78800da..e0776c2d 100644 --- a/src/samtools/samtools_sort/config.vsh.yaml +++ b/src/samtools/samtools_sort/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_stats/config.vsh.yaml b/src/samtools/samtools_stats/config.vsh.yaml index ca630876..b115b4df 100644 --- a/src/samtools/samtools_stats/config.vsh.yaml +++ b/src/samtools/samtools_stats/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff --git a/src/samtools/samtools_view/config.vsh.yaml b/src/samtools/samtools_view/config.vsh.yaml index 206b87ac..86dde146 100644 --- a/src/samtools/samtools_view/config.vsh.yaml +++ b/src/samtools/samtools_view/config.vsh.yaml @@ -9,7 +9,9 @@ links: references: doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] license: MIT/Expat - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: diff 
--git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml index eab65b35..bdc956d3 100644 --- a/src/star/star_align_reads/config.vsh.yaml +++ b/src/star/star_align_reads/config.vsh.yaml @@ -11,6 +11,11 @@ references: license: MIT requirements: commands: [ STAR, python, ps, zcat, bzcat ] +authors: + - __merge__: /src/_authors/angela_o_pisco.yaml + roles: [ author ] + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] # manually taking care of the main input and output arguments argument_groups: - name: Inputs diff --git a/src/star/star_genome_generate/config.vsh.yaml b/src/star/star_genome_generate/config.vsh.yaml index 3adaf7a2..60fa3839 100644 --- a/src/star/star_genome_generate/config.vsh.yaml +++ b/src/star/star_genome_generate/config.vsh.yaml @@ -11,7 +11,9 @@ references: license: MIT requirements: commands: [ STAR ] - +authors: + - __merge__: /src/_authors/sai_nirmayi_yasa.yaml + roles: [ author, maintainer ] argument_groups: - name: "Input" arguments: diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml index a02e70a1..e6953e6e 100644 --- a/src/umi_tools/umi_tools_dedup/config.vsh.yaml +++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml @@ -10,7 +10,9 @@ links: references: doi: 10.1101/gr.209601.116 license: MIT - +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs arguments: