From f6216553ba28458315d167ab3e5871062077b3cc Mon Sep 17 00:00:00 2001 From: Dorien <41797896+dorien-er@users.noreply.github.com> Date: Sun, 11 Feb 2024 21:56:36 +0100 Subject: [PATCH] add busco_download_datasets component (#19) * add script, test, config for downloading busco datasets * add changelog entry * fix typos * update changelog * rename busco to busco run * update changelog * fix names and namespaces --------- Co-authored-by: Robrecht Cannoodt --- CHANGELOG.md | 3 +- .../busco_download_datasets/config.vsh.yaml | 45 +++++++++++++++++++ src/busco/busco_download_datasets/script.sh | 14 ++++++ src/busco/busco_download_datasets/test.sh | 15 +++++++ src/busco/busco_list_datasets/config.vsh.yaml | 3 +- src/busco/busco_run/config.vsh.yaml | 5 ++- 6 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 src/busco/busco_download_datasets/config.vsh.yaml create mode 100644 src/busco/busco_download_datasets/script.sh create mode 100644 src/busco/busco_download_datasets/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d31a2da..ba44f99e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ * `busco`: - `busco/busco_run`: Assess genome assembly and annotation completeness with single copy orthologs (PR #6). - - `busco/busco_list_datasets`: Lists available busco datasets (PR #18) + - `busco/busco_list_datasets`: Lists available busco datasets (PR #18). + - `busco/busco_download_datasets`: Download busco datasets (PR #19). * `featurecounts`: Assign sequence reads to genomic features (PR #11). 
diff --git a/src/busco/busco_download_datasets/config.vsh.yaml b/src/busco/busco_download_datasets/config.vsh.yaml new file mode 100644 index 00000000..dc356f8a --- /dev/null +++ b/src/busco/busco_download_datasets/config.vsh.yaml @@ -0,0 +1,45 @@ +functionality: + name: busco_download_datasets + namespace: busco + description: Downloads available busco datasets + info: + keywords: [lineage datasets] + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco + reference: "10.1007/978-1-4939-9173-0_14" + licence: MIT + argument_groups: + - name: Inputs + arguments: + - name: --download + type: string + description: | + Download dataset. Possible values are a specific dataset name, "all", "prokaryota", "eukaryota", or "virus". + The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. + required: true + example: stramenopiles_odb10 + - name: Outputs + arguments: + - name: --download_path + direction: output + type: file + description: | + Local filepath for storing BUSCO dataset downloads + required: false + default: busco_downloads + example: busco_downloads + resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: test.sh +platforms: + - type: docker + image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 + setup: + - type: docker + run: | + busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt + - type: nextflow diff --git a/src/busco/busco_download_datasets/script.sh b/src/busco/busco_download_datasets/script.sh new file mode 100644 index 00000000..6010c01f --- /dev/null +++ b/src/busco/busco_download_datasets/script.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +## VIASH START +## VIASH END + + +if [ ! 
-d "$par_download_path" ]; then + mkdir -p "$par_download_path" +fi + +busco \ + --download_path "$par_download_path" \ + --download "$par_download" + diff --git a/src/busco/busco_download_datasets/test.sh b/src/busco/busco_download_datasets/test.sh new file mode 100644 index 00000000..c6baecea --- /dev/null +++ b/src/busco/busco_download_datasets/test.sh @@ -0,0 +1,15 @@ +echo "> Downloading busco stramenopiles_odb10 dataset" + +"$meta_executable" \ + --download stramenopiles_odb10 \ + --download_path downloads + +echo ">> Checking output" +[ ! -f "downloads/file_versions.tsv" ] && echo "file_versions.tsv does not exist" && exit 1 +[ ! -f "downloads/lineages/stramenopiles_odb10/dataset.cfg" ] && echo "dataset.cfg does not exist" && exit 1 + +echo ">> Checking if output is empty" +[ ! -s "downloads/file_versions.tsv" ] && echo "file_versions.tsv is empty" && exit 1 +[ ! -s "downloads/lineages/stramenopiles_odb10/dataset.cfg" ] && echo "dataset.cfg is empty" && exit 1 + +rm -r downloads \ No newline at end of file diff --git a/src/busco/busco_list_datasets/config.vsh.yaml b/src/busco/busco_list_datasets/config.vsh.yaml index 444e2a6d..df612fdc 100644 --- a/src/busco/busco_list_datasets/config.vsh.yaml +++ b/src/busco/busco_list_datasets/config.vsh.yaml @@ -1,5 +1,6 @@ functionality: - name: busco + name: busco_list_datasets + namespace: busco description: Lists the available busco datasets info: keywords: [lineage datasets] diff --git a/src/busco/busco_run/config.vsh.yaml b/src/busco/busco_run/config.vsh.yaml index 2297fc2d..0fdfea2e 100644 --- a/src/busco/busco_run/config.vsh.yaml +++ b/src/busco/busco_run/config.vsh.yaml @@ -1,5 +1,6 @@ functionality: name: busco_run + namespace: busco description: Assessment of genome assembly and annotation completeness with single copy orthologs info: keywords: [Genome assembly, quality control] @@ -37,7 +38,9 @@ functionality: Specify a BUSCO lineage dataset that is most closely related to the assembly or gene set being 
assessed. The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. When unsure, the "--auto_lineage" flag can be set to automatically find the optimal lineage path. - Requested datasets will automatically be downloaded if not already present in the download folder. + BUSCO will automatically download the requested dataset if it is not already present in the download folder. + You can optionally provide a path to a local dataset instead of a name, e.g. path/to/dataset. + Datasets can be downloaded using the busco/busco_download_datasets component. example: stramenopiles_odb10 - name: Outputs