Add busco (viash-hub#6)

* config file and test data * organize config in argument groups and add script * add busco help text * add tests * add examples to config * add version script * Update src/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/script.sh Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/script.sh Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update version * add script to obtain test data * add changelog entry * Delete src/busco/version.sh * update cpus input * Update src/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * fix version * Update src/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/script.sh Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/script.sh Co-authored-by: Robrecht Cannoodt <[email protected]> * move into separate module * merge * add outputs * update tests * remove download flags - to be a separate component * modify description of list dataset * fix tests * remove files new comp * remove defaults * Update src/busco/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/busco/script.sh Co-authored-by: Robrecht Cannoodt <[email protected]> * fix typo * remove unrequired params * remove unused vars * opt out of run stats by default * update tests * update test * remove directory level * add mkdir * enable copying from symlink * remove sleep command * Update src/busco/config.vsh.yaml Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/test.sh 
Co-authored-by: Robrecht Cannoodt <[email protected]> * Update src/busco/test.sh Co-authored-by: Robrecht Cannoodt <[email protected]> * add output tests * fix typo * add genome test data and script * fix typo * typo * use smaller genome --------- Co-authored-by: Robrecht Cannoodt <[email protected]>
emmarousseau · Jan 31, 2024 · 88f7b92 · 88f7b92
1 parent 3e0926c
commit 88f7b92
Show file tree

Hide file tree

Showing 8 changed files with 10,505 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ## NEW FEATURES
 
 * `arriba`: Detect gene fusions from RNA-seq data (PR #1).
+* `busco`: Assess genome assembly and annotation completeness with single copy orthologs (PR #6).
 
 * `fastp`: An ultra-fast all-in-one FASTQ preprocessor (PR #3).
 

diff --git a/src/busco/config.vsh.yaml b/src/busco/config.vsh.yaml
@@ -0,0 +1,209 @@
+functionality:
+  name: busco
+  description: >-
+    Assessment of genome assembly and annotation completeness with single copy orthologs
+  # Links to the upstream BUSCO project, plus its literature reference and licence.
+  info:
+    keywords:
+      - Genome assembly
+      - quality control
+    homepage: https://busco.ezlab.org/
+    documentation: https://busco.ezlab.org/busco_userguide.html
+    repository: https://gitlab.com/ezlab/busco
+    reference: '10.1007/978-1-4939-9173-0_14'
+    licence: MIT
+  argument_groups:
+    # Required sequence input and analysis mode, plus the optional lineage dataset.
+    - name: Inputs
+      arguments:
+        - name: --input
+          alternatives: ["-i"]
+          type: file
+          description: |
+            Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. Also possible to use a path to a directory containing multiple input files.
+          required: true
+          example: file.fasta
+        - name: --mode
+          alternatives: ["-m"]
+          type: string
+          # Both the long and the abbreviated spelling of each mode are accepted.
+          choices: [genome, geno, transcriptome, tran, proteins, prot]
+          required: true
+          description: |
+            Specify which BUSCO analysis mode to run. There are three valid modes:
+              - geno or genome, for genome assemblies (DNA)
+              - tran or transcriptome, for transcriptome assemblies (DNA)
+              - prot or proteins, for annotated gene sets (protein)
+          example: proteins
+        - name: --lineage_dataset
+          alternatives: ["-l"]
+          type: string
+          required: false
+          description: |
+            Specify a BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed.
+            The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running `busco --list-datasets` (which requires installing the tool).
+            When unsure, the "--auto_lineage" flag can be set to automatically find the optimal lineage path.
+            Requested datasets will automatically be downloaded if not already present in the download folder.
+          example: stramenopiles_odb10
+
+    # Every output below is optional (required: false) and declared with direction: output.
+    - name: Outputs
+      arguments:
+        - name: --short_summary_json
+          type: file
+          direction: output
+          required: false
+          description: |
+            Output file for short summary in JSON format.
+          example: short_summary.json
+        - name: --short_summary_txt
+          type: file
+          direction: output
+          required: false
+          description: |
+            Output file for short summary in TXT format.
+          example: short_summary.txt
+        - name: --full_table
+          type: file
+          direction: output
+          required: false
+          description: |
+            Full table output in TSV format.
+          example: full_table.tsv
+        - name: --missing_busco_list
+          type: file
+          direction: output
+          required: false
+          description: |
+            Missing list output in TSV format.
+          example: missing_busco_list.tsv
+        - name: --output_dir
+          type: file
+          direction: output
+          required: false
+          description: |
+            The full output directory, if so desired.
+          example: output_dir/
+
+    # Switches controlling overwriting, logging, restarting and compression of a run.
+    - name: Resource and Run Settings
+      arguments:
+        - name: --force
+          type: boolean_true
+          description: |
+            Force rewriting of existing files. Must be used when output files with the provided name already exist.
+        - name: --quiet
+          type: boolean_true
+          alternatives:
+            - "-q"
+          description: |
+            Disable the info logs, displays only errors.
+        - name: --restart
+          type: boolean_true
+          alternatives:
+            - "-r"
+          description: |
+            Continue a run that had already partially completed. Restarting skips calls to tools that have completed but performs all pre- and post-processing steps.
+        - name: --tar
+          type: boolean_true
+          description: |
+            Compress some subdirectories with many files to save space.
+
+    # Automatic lineage selection switches and the dataset version to use.
+    - name: Lineage Dataset Settings
+      arguments:
+        - name: --auto_lineage
+          type: boolean_true
+          description: |
+            Run auto-lineage pipeline to automatically determine BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed.
+        - name: --auto_lineage_euk
+          type: boolean_true
+          description: |
+            Run auto-placement just on eukaryota tree to find optimal lineage path.
+        - name: --auto_lineage_prok
+          type: boolean_true
+          description: |
+            Run auto_lineage just on prokaryota trees to find optimum lineage path.
+        - name: --datasets_version
+          type: string
+          required: false
+          description: |
+            Specify the version of BUSCO datasets
+          example: odb10
+
+    # Options that only apply when the Augustus gene predictor is used.
+    - name: Augustus Settings
+      arguments:
+        - name: --augustus
+          type: boolean_true
+          description: |
+            Use augustus gene predictor for eukaryote runs.
+        - name: --augustus_parameters
+          type: string
+          required: false
+          description: |
+            Additional parameters to be passed to Augustus (see Augustus documentation: https://github.com/Gaius-Augustus/Augustus/blob/master/docs/RUNNING-AUGUSTUS.md).
+            Parameters should be contained within a single string, without whitespace and separated by commas.
+          example: "--PARAM1=VALUE1,--PARAM2=VALUE2"
+        - name: --augustus_species
+          type: string
+          required: false
+          description: |
+            Specify a species for Augustus training.
+        - name: --long
+          type: boolean_true
+          description: |
+            Optimize Augustus self-training mode. This adds considerably to the run time, but can improve results for some non-model organisms.
+
+    # Assembly-statistics options handled by BBTools.
+    - name: BBTools Settings
+      arguments:
+        - name: --contig_break
+          type: integer
+          required: false
+          description: |
+            Number of contiguous Ns to signify a break between contigs in BBTools analysis.
+        - name: --scaffold_composition
+          type: boolean_true
+          description: |
+            Writes ACGTN content per scaffold to a file scaffold_composition.txt.
+
+    # Options that only take effect in pipelines that run BLAST; --limit lives
+    # here because its own description states it is BLAST-specific.
+    - name: BLAST Settings
+      arguments:
+        - name: --e_value
+          type: double
+          required: false
+          description: |
+            E-value cutoff for BLAST searches.
+        - name: --limit
+          type: integer
+          required: false
+          description: |
+            Number of candidate regions (contig or transcript) from the BLAST output to consider per BUSCO.
+            This option is only effective in pipelines using BLAST, i.e. the genome pipeline (see --augustus) or the prokaryota transcriptome pipeline.
+
+    # Selects the Miniprot gene predictor.
+    - name: Protein Gene Prediction Settings
+      arguments:
+        - name: --miniprot
+          type: boolean_true
+          description: |
+            Use Miniprot gene predictor.
+
+    # Extra arguments forwarded to the first and second Metaeuk runs respectively.
+    - name: MetaEuk Settings
+      arguments:
+        - name: --metaeuk_parameters
+          type: string
+          required: false
+          example: "--max-overlap=15,--min-exon-aa=15"
+          description: |
+            Pass additional arguments to Metaeuk for the first run (see Metaeuk documentation https://github.com/soedinglab/metaeuk).
+            All parameters should be contained within a single string with no white space, with each parameter separated by a comma.
+        - name: --metaeuk_rerun_parameters
+          type: string
+          required: false
+          example: "--max-overlap=15,--min-exon-aa=15"
+          description: |
+            Pass additional arguments to Metaeuk for the second run (see Metaeuk documentation https://github.com/soedinglab/metaeuk).
+            All parameters should be contained within a single string with no white space, with each parameter separated by a comma.
+
+  # The component's bash script.
+  resources:
+    - type: bash_script
+      path: script.sh
+  # Test script, plus the test_data path made available to it.
+  test_resources:
+    - type: bash_script
+      path: test.sh
+    - type: file
+      path: test_data
+platforms:
+  # Docker platform based on the Biocontainers BUSCO image pinned to tag 5.6.1--pyhdfd78af_0.
+  - type: docker
+    image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0
+    setup:
+      # Rewrites the `busco --version` output into a `busco: "<version>"` line in
+      # /var/software_versions.txt (assumes the tool prints `BUSCO <version>`).
+      - type: docker
+        run: |
+          busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt
+  # Nextflow platform; no overrides are given here.
+  - type: nextflow
diff --git a/src/busco/help.txt b/src/busco/help.txt
@@ -0,0 +1,60 @@
+```bash
+busco -h
+```
+
+Welcome to BUSCO 5.6.1: the Benchmarking Universal Single-Copy Ortholog assessment tool.
+For more detailed usage information, please review the README file provided with this distribution and the BUSCO user guide. Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
+
+optional arguments:
+  -i SEQUENCE_FILE, --in SEQUENCE_FILE
+                        Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. Also possible to use a path to a directory containing multiple input files.
+  -o OUTPUT, --out OUTPUT
+                        Give your analysis run a recognisable short name. Output folders and files will be labelled with this name. The path to the output folder is set with --out_path.
+  -m MODE, --mode MODE  Specify which BUSCO analysis mode to run.
+                        There are three valid modes:
+                        - geno or genome, for genome assemblies (DNA)
+                        - tran or transcriptome, for transcriptome assemblies (DNA)
+                        - prot or proteins, for annotated gene sets (protein)
+  -l LINEAGE, --lineage_dataset LINEAGE
+                        Specify the name of the BUSCO lineage to be used.
+  --augustus            Use augustus gene predictor for eukaryote runs
+  --augustus_parameters --PARAM1=VALUE1,--PARAM2=VALUE2
+                        Pass additional arguments to Augustus. All arguments should be contained within a single string with no white space, with each argument separated by a comma.
+  --augustus_species AUGUSTUS_SPECIES
+                        Specify a species for Augustus training.
+  --auto-lineage        Run auto-lineage to find optimum lineage path
+  --auto-lineage-euk    Run auto-placement just on eukaryote tree to find optimum lineage path
+  --auto-lineage-prok   Run auto-lineage just on non-eukaryote trees to find optimum lineage path
+  -c N, --cpu N         Specify the number (N=integer) of threads/cores to use.
+  --config CONFIG_FILE  Provide a config file
+  --contig_break n      Number of contiguous Ns to signify a break between contigs. Default is n=10.
+  --datasets_version DATASETS_VERSION
+                        Specify the version of BUSCO datasets, e.g. odb10
+  --download [dataset [dataset ...]]
+                        Download dataset. Possible values are a specific dataset name, "all", "prokaryota", "eukaryota", or "virus". If used together with other command line arguments, make sure to place this last.
+  --download_base_url DOWNLOAD_BASE_URL
+                        Set the url to the remote BUSCO dataset location
+  --download_path DOWNLOAD_PATH
+                        Specify local filepath for storing BUSCO dataset downloads
+  -e N, --evalue N      E-value cutoff for BLAST searches. Allowed formats, 0.001 or 1e-03 (Default: 1e-03)
+  -f, --force           Force rewriting of existing files. Must be used when output files with the provided name already exist.
+  -h, --help            Show this help message and exit
+  --limit N             How many candidate regions (contig or transcript) to consider per BUSCO (default: 3)
+  --list-datasets       Print the list of available BUSCO datasets
+  --long                Optimization Augustus self-training mode (Default: Off); adds considerably to the run time, but can improve results for some non-model organisms
+  --metaeuk_parameters "--PARAM1=VALUE1,--PARAM2=VALUE2"
+                        Pass additional arguments to Metaeuk for the first run. All arguments should be contained within a single string with no white space, with each argument separated by a comma.
+  --metaeuk_rerun_parameters "--PARAM1=VALUE1,--PARAM2=VALUE2"
+                        Pass additional arguments to Metaeuk for the second run. All arguments should be contained within a single string with no white space, with each argument separated by a comma.
+  --miniprot            Use miniprot gene predictor
+  --skip_bbtools        Skip BBTools for assembly statistics
+  --offline             To indicate that BUSCO cannot attempt to download files
+  --opt-out-run-stats   Opt out of data collection. Information on the data collected is available in the user guide.
+  --out_path OUTPUT_PATH
+                        Optional location for results folder, excluding results folder name. Default is current working directory.
+  -q, --quiet           Disable the info logs, displays only errors
+  -r, --restart         Continue a run that had already partially completed.
+  --scaffold_composition
+                        Writes ACGTN content per scaffold to a file scaffold_composition.txt
+  --tar                 Compress some subdirectories with many files to save space
+  -v, --version         Show this version and exit
diff --git a/src/busco/script.sh b/src/busco/script.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+## VIASH START
+## VIASH END
+
+
+[[ "$par_tar" == "false" ]] && unset par_tar
+[[ "$par_force" == "false" ]] && unset par_force
+[[ "$par_quiet" == "false" ]] && unset par_quiet
+[[ "$par_restart" == "false" ]] && unset par_restart
+[[ "$par_auto_lineage" == "false" ]] && unset par_auto_lineage
+[[ "$par_auto_lineage_euk" == "false" ]] && unset par_auto_lineage_euk
+[[ "$par_auto_lineage_prok" == "false" ]] && unset par_auto_lineage_prok
+[[ "$par_augustus" == "false" ]] && unset par_augustus
+[[ "$par_long" == "false" ]] && unset par_long
+[[ "$par_scaffold_composition" == "false" ]] && unset par_scaffold_composition
+[[ "$par_miniprot" == "false" ]] && unset par_miniprot
+
+tmp_dir=$(mktemp -d -p "$meta_temp_dir" busco_XXXXXXXXX)
+prefix=$(openssl rand -hex 8)
+
+busco \
+    --in "$par_input" \
+    --mode "$par_mode" \
+    --out "$prefix" \
+    --out_path "$tmp_dir" \
+    --opt-out-run-stats \
+    ${meta_cpus:+--cpu "${meta_cpus}"} \
+    ${par_lineage_dataset:+--lineage_dataset "$par_lineage_dataset"} \
+    ${par_augustus:+--augustus} \
+    ${par_augustus_parameters:+--augustus_parameters "$par_augustus_parameters"} \
+    ${par_augustus_species:+--augustus_species "$par_augustus_species"} \
+    ${par_auto_lineage:+--auto-lineage} \
+    ${par_auto_lineage_euk:+--auto-lineage-euk} \
+    ${par_auto_lineage_prok:+--auto-lineage-prok} \
+    ${par_contig_break:+--contig_break $par_contig_break} \
+    ${par_datasets_version:+--datasets_version "$par_datasets_version"} \
+    ${par_e_value:+--evalue "$par_e_value"} \
+    ${par_force:+--force} \
+    ${par_limit:+--limit "$par_limit"} \
+    ${par_long:+--long} \
+    ${par_metaeuk_parameters:+--metaeuk_parameters "$par_metaeuk_parameters"} \
+    ${par_metaeuk_rerun_parameters:+--metaeuk_rerun_parameters "$par_metaeuk_rerun_parameters"} \
+    ${par_miniprot:+--miniprot} \
+    ${par_quiet:+--quiet} \
+    ${par_restart:+--restart} \
+    ${par_scaffold_composition:+--scaffold_composition} \
+    ${par_tar:+--tar} \
+
+
+out_dir=$(find "$tmp_dir/$prefix" -maxdepth 1 -name 'run_*')
+
+if [[ -n "$par_short_summary_json" ]]; then
+    cp "$out_dir/short_summary.json" "$par_short_summary_json"
+fi
+if [[ -n "$par_short_summary_txt" ]]; then
+    cp "$out_dir/short_summary.txt" "$par_short_summary_txt"
+fi
+if [[ -n "$par_full_table" ]]; then
+    cp "$out_dir/full_table.tsv" "$par_full_table"
+fi
+if [[ -n "$par_missing_busco_list" ]]; then
+    cp "$out_dir/missing_busco_list.tsv" "$par_missing_busco_list"
+fi
+if [[ -n "$par_output_dir" ]]; then
+    if [[ -d "$par_output_dir" ]]; then
+        rm -r "$par_output_dir"
+    fi
+    cp -r -L "$out_dir" "$par_output_dir"
+fi
+