diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 6e1fc4b3..2591978f 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -3,114 +3,7 @@ name: Component Testing
 on:
   pull_request:
   push:
-    branches: [ '**' ]
 
 jobs:
-  run_ci_check_job:
-    runs-on: ubuntu-latest
-    outputs:
-      run_ci: ${{ steps.github_cli.outputs.check }}
-    steps:
-      - name: 'Check if branch has an existing pull request and the trigger was a push'
-        id: github_cli
-        run: |
-          pull_request=$(gh pr list -R ${{ github.repository }} -H ${{ github.ref_name }} --json url --state open --limit 1 | jq '.[0].url')
-          # If the branch has a PR and this run was triggered by a push event, do not run
-          if [[ "$pull_request" != "null" && "$GITHUB_REF_NAME" != "main" && "${{ github.event_name == 'push' }}" == "true" && "${{ !contains(github.event.head_commit.message, 'ci force') }}" == "true" ]]; then
-            echo "check=false" >> $GITHUB_OUTPUT
-          else
-            echo "check=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ github.token }}
-
-  # phase 1
-  list:
-    needs: run_ci_check_job
-    runs-on: ubuntu-latest
-    if: ${{ needs.run_ci_check_job.outputs.run_ci == 'true' }}
-
-    outputs:
-      matrix: ${{ steps.set_matrix.outputs.matrix }}
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Get head git commit message
-        id: get_head_commit_message
-        run: echo "HEAD_COMMIT_MESSAGE=$(git show -s --format=%s ${{ github.event.pull_request.head.sha || github.sha }})" >> "$GITHUB_OUTPUT"
-
-      - uses: viash-io/viash-actions/setup@v5
-
-      - name: Check that all configs can be parsed when there is no unicode support
-        run: |
-          LANG=C viash ns list > /dev/null
-
-      # see https://github.com/viash-io/viash/issues/654
-      # and https://github.com/viash-io/viash-actions/pull/27
-      # - name: Get changed files
-      #   id: changed-files
-      #   uses: tj-actions/changed-files@v42
-      #   with:
-      #     separator: ";"
-      #     diff_relative: true
-      # - id: ns_list
-      #   uses: viash-io/viash-actions/ns-list@v5
-      #   with:
-      #     platform: docker
-      #     format: json
-      #     query: ^(?!workflows)
-      # - id: ns_list_filtered
-      #   uses: viash-io/viash-actions/project/detect-changed-components@v5
-      #   with:
-      #     input_file: "${{ steps.ns_list.outputs.output_file }}"
-      # - id: set_matrix
-      #   run: |
-      #     echo "matrix=$(jq -c '[ .[] |
-      #       {
-      #         "name": (.functionality.namespace + "/" + .functionality.name),
-      #         "config": .info.config,
-      #         "dir": .info.config | capture("^(?<dir>.*\/)").dir
-      #       }
-      #     ]' ${{ contains(steps.get_head_commit_message.outputs.HEAD_COMMIT_MESSAGE, 'ci force') && steps.ns_list.outputs.output_file || steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT
-
-      - id: set_matrix
-        run: |
-          viash ns list --format json > ns_list.json
-          echo "matrix=$(jq -c '[ .[] |
-            {
-              "name": (.namespace + "/" + .name),
-              "config": .build_info.config,
-              "dir": .build_info.config | capture("^(?<dir>.*\/)").dir
-            }
-          ]' ns_list.json )" >> $GITHUB_OUTPUT
-
-  # phase 2
-  viash_test:
-    needs: list
-    if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }}
-    runs-on: ubuntu-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        component: ${{ fromJson(needs.list.outputs.matrix) }}
-
-    steps:
-      # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.'
-      - uses: data-intuitive/reclaim-the-bytes@v2
-
-      - uses: actions/checkout@v4
-
-      - uses: viash-io/viash-actions/setup@v5
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          viash test \
-            "${{ matrix.component.config }}" \
-            --cpus 2 \
-            --memory "6gb"
\ No newline at end of file
+  test:
+    uses: viash-hub/toolbox/.github/workflows/test.yaml@main
\ No newline at end of file
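
All of the CI logic above (PR-vs-push deduplication, building the component matrix, and the per-component `viash test` job) now lives in the shared toolbox workflow, so this repository only carries the two-line `test` job. For that call to resolve, the workflow on the toolbox side must declare a `workflow_call` trigger. A minimal sketch of what such a callee looks like, following the standard GitHub reusable-workflow pattern (illustrative only, not copied from viash-hub/toolbox):

  # .github/workflows/test.yaml in the called repository (sketch)
  name: Component Testing
  on:
    # Declaring workflow_call is what makes this workflow callable via
    # `uses: viash-hub/toolbox/.github/workflows/test.yaml@main`.
    workflow_call:
  jobs:
    viash_test:
      runs-on: ubuntu-latest
      steps:
        # Reusable workflows run in the caller's context, so this checks
        # out the repository that invoked the workflow.
        - uses: actions/checkout@v4
        - uses: viash-io/viash-actions/setup@v5
        - run: viash ns test
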
diff --git a/.gitignore b/.gitignore
index ca5262bc..2a64eaac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 
 # IDE ignores
 .idea/
+.vscode/
 
 # R specific ignores
 .Rhistory
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index df05379a..00000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-    "yaml.schemas": {
-        ".vscode/viash_config.yaml": "**.vsh.yaml",
-    },
-    "files.watcherExclude": {
-        "**/target": true,
-        ".github": true
-    }
-}
\ No newline at end of file
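
Deleting `.vscode/settings.json` (above) also drops the editor-side schema validation that mapped the `viash_config.yaml` JSON schema (deleted below) onto `**.vsh.yaml` files. Since `.vscode/` is now git-ignored, contributors who want to keep that validation can recreate the mapping in a local, untracked settings file. A sketch, assuming the schema is regenerated locally (the `.vscode/viash_schema.json` path is hypothetical, and `viash export json_schema` is only available in recent Viash releases, so check `viash export --help` first):

  // .vscode/settings.json (local, untracked)
  {
    "yaml.schemas": {
      // e.g. regenerated with: viash export json_schema --format json > .vscode/viash_schema.json
      ".vscode/viash_schema.json": "**.vsh.yaml"
    }
  }
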
- type: "array" - items: - $ref: "#/definitions/RepositoryWithName" - dependencies: - description: "Allows listing Viash components required by this Viash component" - type: "array" - items: - $ref: "#/definitions/Dependency" - namespace: - description: "Namespace this component is a part of. See the Namespaces guide\ - \ for more information on namespaces." - type: "string" - functionality: - description: "The functionality describes the behaviour of the script in terms\ - \ of arguments and resources.\nBy specifying a few restrictions (e.g. mandatory\ - \ arguments) and adding some descriptions, Viash will automatically generate\ - \ a stylish command-line interface for you.\n" - $ref: "#/definitions/Functionality" - runners: - description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ - \ - NextflowRunner\n" - type: "array" - items: - $ref: "#/definitions/Runner" - name: - description: "Name of the component and the filename of the executable when\ - \ built with `viash build`." - type: "string" - build_info: - $ref: "#/definitions/BuildInfo" - argument_groups: - description: "A grouping of the arguments, used to display the help message.\n\ - \n - `name: foo`, the name of the argument group. \n - `description: Description\ - \ of foo`, a description of the argument group. Multiline descriptions are\ - \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ - \n" - type: "array" - items: - $ref: "#/definitions/ArgumentGroup" - description: - description: "A description of the component. This will be displayed with\ - \ `--help`." - type: "string" - usage: - description: "A description on how to use the component. This will be displayed\ - \ with `--help` under the 'Usage:' section." - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - package_config: - description: "The package config content used during build." - $ref: "#/definitions/PackageConfig" - platforms: - description: "A list of platforms to generate target artifacts for.\n\n -\ - \ Native\n - Docker\n - Nextflow\n" - type: "array" - items: - $ref: "#/definitions/Platform" - version: - description: "Version of the component. This field will be used to version\ - \ the executable and the Docker container." - type: "string" - links: - description: "External links of the component." - $ref: "#/definitions/Links" - references: - description: "References to external resources related to the component." - $ref: "#/definitions/References" - engines: - description: "A list of engine environments to execute target artifacts in.\n\ - \n - NativeEngine\n - DockerEngine\n" - type: "array" - items: - $ref: "#/definitions/Engine" - resources: - description: "Resources are files that support the component. The first resource\ - \ should be a script that will be executed when the component is run. Additional\ - \ resources will be copied to the same directory.\n\nCommon properties:\n\ - \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ - \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ - \ The first resource cannot be of type `file`. When the type is not specified,\ - \ the default type is simply `file`.\n * dest: filename, the resulting name\ - \ of the resource. From within a script, the file can be accessed at `meta[\"\ - resources_dir\"] + \"/\" + dest`. 
-      keywords:
-        description: "The keywords of the components."
-        type: "array"
-        items:
-          type: "string"
-      test_resources:
-        description: "One or more scripts to be used to test the component behaviour when `viash test` is invoked. Additional files of type `file` will be made available only during testing. Each test script should expect no command-line inputs, be platform-independent, and return an exit code >0 when unexpected behaviour occurs during testing. See Unit Testing for more info."
-        type: "array"
-        items:
-          $ref: "#/definitions/Resource"
-    required:
-    - "name"
-    additionalProperties: false
-  PackageConfig:
-    description: "A Viash package configuration file. Its name should be `_viash.yaml`."
-    type: "object"
-    properties:
-      organization:
-        description: "The organization of the package."
-        type: "string"
-      name:
-        description: "The name of the package."
-        type: "string"
-      source:
-        description: "Which source directory to use for the `viash ns` commands."
-        type: "string"
-      description:
-        description: "A description of the package."
-        type: "string"
-      viash_version:
-        description: "Which version of Viash to use."
-        type: "string"
-      config_mods:
-        oneOf:
-        - description: "Which config mods to apply."
-          type: "string"
-        - type: "array"
-          items:
-            description: "Which config mods to apply."
-            type: "string"
-      info:
-        description: "Structured information. Can be any shape: a string, vector, map or even nested map."
-        type: "object"
-      license:
-        description: "The license of the package."
-        type: "string"
-      references:
-        description: "References to external resources related to the package."
-        $ref: "#/definitions/References"
-      authors:
-        description: "The authors of the package."
-        type: "array"
-        items:
-          $ref: "#/definitions/Author"
-      repositories:
-        description: "Common repository definitions for component dependencies."
-        type: "array"
-        items:
-          $ref: "#/definitions/RepositoryWithName"
-      keywords:
-        description: "The keywords of the package."
-        type: "array"
-        items:
-          type: "string"
-      target:
-        description: "Which target directory to use for `viash ns build`."
-        type: "string"
-      version:
-        description: "The version of the package."
-        type: "string"
-      links:
-        description: "External links of the package."
-        $ref: "#/definitions/Links"
-    required: []
-    additionalProperties: false
-  BuildInfo:
-    description: "Meta information fields filled in by Viash during build."
-    type: "object"
-    properties:
-      git_tag:
-        description: "Git tag."
-        type: "string"
-      git_remote:
-        description: "Git remote name."
-        type: "string"
-      viash_version:
-        description: "The Viash version that was used to build the component."
-        type: "string"
-      output:
-        description: "Folder path to the build artifacts."
-        type: "string"
-      git_commit:
-        description: "Git commit hash."
-        type: "string"
-      executable:
-        description: "Output folder with main executable path."
-        type: "string"
-      engine:
-        description: "The engine id used during build."
- type: "string" - runner: - description: "The runner id used during build." - type: "string" - config: - description: "Path to the config used during build." - type: "string" - required: - - "config" - additionalProperties: false - Functionality: - description: "The functionality-part of the config file describes the behaviour\ - \ of the script in terms of arguments and resources.\nBy specifying a few restrictions\ - \ (e.g. mandatory arguments) and adding some descriptions, Viash will automatically\ - \ generate a stylish command-line interface for you.\n" - type: "object" - properties: - organization: - description: "The organization of the package." - type: "string" - name: - description: "Name of the component and the filename of the executable when\ - \ built with `viash build`." - type: "string" - argument_groups: - description: "A grouping of the arguments, used to display the help message.\n\ - \n - `name: foo`, the name of the argument group. \n - `description: Description\ - \ of foo`, a description of the argument group. Multiline descriptions are\ - \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ - \n" - type: "array" - items: - $ref: "#/definitions/ArgumentGroup" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - license: - description: "The license of the package." - type: "string" - references: - description: "References to external resources related to the component." - $ref: "#/definitions/References" - authors: - description: "A list of authors. An author must at least have a name, but\ - \ can also have a list of roles, an e-mail address, and a map of custom\ - \ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\ - \ Description |\n|------|---------|-------------|\n| maintainer | mnt |\ - \ for the maintainer of the code. Ideally, exactly one maintainer is specified.\ - \ |\n| author | aut | for persons who have made substantial contributions\ - \ to the software. |\n| contributor | ctb| for persons who have made smaller\ - \ contributions (such as code patches).\n| datacontributor | dtc | for persons\ - \ or organisations that contributed data sets for the software\n| copyrightholder\ - \ | cph | for all copyright holders. This is a legal concept so should use\ - \ the legal name of an institution or corporate body.\n| funder | fnd |\ - \ for persons or organizations that furnished financial support for the\ - \ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ - \ is extremely comprehensive.\n" - type: "array" - items: - $ref: "#/definitions/Author" - status: - description: "Allows setting a component to active, deprecated or disabled." - $ref: "#/definitions/Status" - requirements: - description: "Computational requirements related to running the component.\ - \ \n`cpus` specifies the maximum number of (logical) cpus a component is\ - \ allowed to use., whereas\n`memory` specifies the maximum amount of memory\ - \ a component is allowed to allicate. Memory units must be\nin B, KB, MB,\ - \ GB, TB or PB." - $ref: "#/definitions/ComputationalRequirements" - repositories: - description: "(Pre-)defines repositories that can be used as repository in\ - \ dependencies.\nAllows reusing repository definitions in case it is used\ - \ in multiple dependencies." 
- type: "array" - items: - $ref: "#/definitions/RepositoryWithName" - test_resources: - description: "One or more scripts to be used to test the component behaviour\ - \ when `viash test` is invoked. Additional files of type `file` will be\ - \ made available only during testing. Each test script should expect no\ - \ command-line inputs, be platform-independent, and return an exit code\ - \ >0 when unexpected behaviour occurs during testing. See Unit Testing for\ - \ more info." - type: "array" - items: - $ref: "#/definitions/Resource" - dependencies: - description: "Allows listing Viash components required by this Viash component" - type: "array" - items: - $ref: "#/definitions/Dependency" - description: - description: "A description of the component. This will be displayed with\ - \ `--help`." - type: "string" - usage: - description: "A description on how to use the component. This will be displayed\ - \ with `--help` under the 'Usage:' section." - type: "string" - version: - description: "Version of the component. This field will be used to version\ - \ the executable and the Docker container." - type: "string" - links: - description: "External links of the component." - $ref: "#/definitions/Links" - resources: - description: "Resources are files that support the component. The first resource\ - \ should be a script that will be executed when the functionality is run.\ - \ Additional resources will be copied to the same directory.\n\nCommon properties:\n\ - \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ - \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ - \ The first resource cannot be of type `file`. When the type is not specified,\ - \ the default type is simply `file`.\n * dest: filename, the resulting name\ - \ of the resource. From within a script, the file can be accessed at `meta[\"\ - resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ - \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ - \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ - \ exclusive with `text`.\n * text: ...multiline text..., the content of\ - \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ - \ * is_executable: `true` / `false`, whether the resulting resource file\ - \ should be made executable.\n" - type: "array" - items: - $ref: "#/definitions/Resource" - keywords: - description: "The keywords of the components." - type: "array" - items: - type: "string" - namespace: - description: "Namespace this component is a part of. See the Namespaces guide\ - \ for more information on namespaces." - type: "string" - arguments: - description: "A list of arguments for this component. For each argument, a\ - \ type and a name must be specified. Depending on the type of argument,\ - \ different properties can be set. See these reference pages per type for\ - \ more information: \n\n - string\n - file\n - integer\n - double\n - boolean\n\ - \ - boolean_true\n - boolean_false\n" - type: "array" - items: - $ref: "#/definitions/Argument" - required: - - "name" - additionalProperties: false - Author: - description: "Author metadata." - type: "object" - properties: - name: - description: "Full name of the author, usually in the name of FirstName MiddleName\ - \ LastName." - type: "string" - email: - description: "E-mail of the author." - type: "string" - info: - description: "Structured information. 
-      roles:
-        oneOf:
-        - description: "Role of the author. Suggested items:\n\n* \"author\": Authors who have made substantial contributions to the component.\n* \"maintainer\": The maintainer of the component.\n* \"contributor\": Authors who have made smaller contributions (such as code patches etc.)."
-          type: "string"
-        - type: "array"
-          items:
-            description: "Role of the author. Suggested items:\n\n* \"author\": Authors who have made substantial contributions to the component.\n* \"maintainer\": The maintainer of the component.\n* \"contributor\": Authors who have made smaller contributions (such as code patches etc.)."
-            type: "string"
-    required:
-    - "name"
-    additionalProperties: false
-  ComputationalRequirements:
-    description: "Computational requirements related to running the component."
-    type: "object"
-    properties:
-      cpus:
-        description: "The maximum number of (logical) cpus a component is allowed to use."
-        type: "integer"
-      commands:
-        description: "A list of commands which should be present on the system for the script to function."
-        type: "array"
-        items:
-          type: "string"
-      memory:
-        description: "The maximum amount of memory a component is allowed to allocate. Unit must be one of B, KB, MB, GB, TB or PB."
-        type: "string"
-    required: []
-    additionalProperties: false
-  ArgumentGroup:
-    description: "A grouping of the arguments, used to display the help message."
-    type: "object"
-    properties:
-      name:
-        description: "The name of the argument group."
-        type: "string"
-      description:
-        description: "A description of the argument group. Multiline descriptions are supported."
-        type: "string"
-      arguments:
-        description: "List of arguments."
-        type: "array"
-        items:
-          $ref: "#/definitions/Argument"
-    required:
-    - "name"
-    - "arguments"
-    additionalProperties: false
-  Links:
-    description: "Links to external resources related to the component."
-    type: "object"
-    properties:
-      repository:
-        description: "Source repository url."
-        type: "string"
-      documentation:
-        description: "Documentation website url."
-        type: "string"
-      docker_registry:
-        description: "Docker registry url."
-        type: "string"
-      homepage:
-        description: "Homepage website url."
-        type: "string"
-      issue_tracker:
-        description: "Issue tracker url."
-        type: "string"
-    required: []
-    additionalProperties: false
-  References:
-    description: "References to external resources related to the component."
-    type: "object"
-    properties:
-      bibtex:
-        oneOf:
-        - description: "One or multiple BibTeX reference(s) of the component."
-          type: "string"
-        - type: "array"
-          items:
-            description: "One or multiple BibTeX reference(s) of the component."
-            type: "string"
-      doi:
-        oneOf:
-        - description: "One or multiple DOI reference(s) of the component."
-          type: "string"
-        - type: "array"
-          items:
-            description: "One or multiple DOI reference(s) of the component."
- type: "string" - additionalProperties: false - Runner: - oneOf: - - $ref: "#/definitions/ExecutableRunner" - - $ref: "#/definitions/NextflowRunner" - ExecutableRunner: - description: "Run code as an executable.\n\nThis runner is the default runner.\ - \ It will generate a bash script that can be run directly.\n\nThis runner is\ - \ also used for the native engine.\n\nThis runner is also used for the docker\ - \ engine.\n" - type: "object" - properties: - docker_setup_strategy: - description: "The Docker setup strategy to use when building a docker engine\ - \ enrivonment.\n\n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild`\ - \ / `build` / `b` | Always build the image from the dockerfile. This is\ - \ the default setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb`\ - \ | Always build the image from the dockerfile, with caching enabled.\n\ - | `ifneedbebuild` | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ - \ | Build the image with caching enabled if it does not exist locally, with\ - \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ - \ from [Docker Hub](https://hub.docker.com) or the specified docker registry.\n\ - | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ - \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ - \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ - \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ - \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ - \ If the image does not exist locally, pull the image. If the image does\ - \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ - \ not exist locally, pull the image. If the image does exist, build it with\ - \ caching enabled.\n| `push` | Push the container to [Docker Hub](https://hub.docker.com)\ - \ or the specified docker registry.\n| `pushifnotpresent` | Push the container\ - \ to [Docker Hub](https://hub.docker.com) or the specified docker registry\ - \ if the tag does not exist yet.\n| `donothing` / `meh` | Do not build or\ - \ pull anything.\n\n" - $ref: "#/definitions/DockerSetupStrategy" - workdir: - description: "The working directory when starting the engine. This doesn't\ - \ change the Dockerfile but gets added as a command-line argument at runtime." - type: "string" - docker_run_args: - oneOf: - - description: "Provide runtime arguments to Docker. See the documentation\ - \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ - \ more information." - type: "string" - - type: "array" - items: - description: "Provide runtime arguments to Docker. See the documentation\ - \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ - \ more information." - type: "string" - id: - description: "Name of the runner. As with all runners, you can give an runner\ - \ a different name. By specifying `id: foo`, you can target this executor\ - \ (only) by specifying `...` in any of the Viash commands." - type: "string" - port: - oneOf: - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "integer" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "string" - - description: "A list of enabled ports. 
-          type: "array"
-          items:
-            type: "integer"
-        - description: "A list of enabled ports. This doesn't change the Dockerfile but gets added as a command-line argument at runtime."
-          type: "array"
-          items:
-            type: "string"
-      type:
-        description: "Run code as an executable.\n\nThis runner is the default runner. It will generate a bash script that can be run directly.\n\nThis runner is also used for the native and docker engines."
-        const: "executable"
-    required:
-    - "type"
-    additionalProperties: false
-  NextflowRunner:
-    description: "Run a Viash component on a Nextflow backend engine."
-    type: "object"
-    properties:
-      auto:
-        description: "Automated processing flags which can be toggled on or off:\n\n| Flag | Description | Default |\n|---|---------|----|\n| `simplifyInput` | If `true`, an input tuple containing only a single File (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed to a map (i.e. `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n| `simplifyOutput` | If `true`, an output tuple containing a map with a File (e.g. `[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically transformed to a map (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `false` |\n| `transcript` | If `true`, the module's transcripts from `work/` are automatically published to `params.transcriptDir`. If not defined, `params.publishDir + \"/_transcripts\"` will be used. Will throw an error if neither are defined. | `false` |\n| `publish` | If `true`, the module's outputs are automatically published to `params.publishDir`. If equal to \"state\", also a `.state.yaml` file will be published in the publish dir. Will throw an error if `params.publishDir` is not defined. | `false` |"
-        $ref: "#/definitions/NextflowAuto"
-      directives:
-        description: "Directives are optional settings that affect the execution of the process. These mostly match up with the Nextflow counterparts."
-        $ref: "#/definitions/NextflowDirectives"
-      container:
-        description: "Specifies the Docker engine id to be used to run Nextflow."
-        type: "string"
-      config:
-        description: "Allows tweaking how the Nextflow Config file is generated."
-        $ref: "#/definitions/NextflowConfig"
-      debug:
-        description: "Whether or not to print debug messages."
-        type: "boolean"
-      id:
-        description: "Name of the runner. As with all runners, you can give a runner a different name. By specifying `id: foo`, you can target this runner (only) by specifying `...` in any of the Viash commands."
-        type: "string"
-      type:
-        description: "Run a Viash component on a Nextflow backend engine."
-        const: "nextflow"
-    required:
-    - "type"
-    additionalProperties: false
-  Engine:
-    oneOf:
-    - $ref: "#/definitions/DockerEngine"
-    - $ref: "#/definitions/NativeEngine"
-  NativeEngine:
-    description: "Running a Viash component on a native engine means that the script will be executed in your current environment.\nAny dependencies are assumed to have been installed by the user, so the native engine is meant for developers (who know what they're doing) or for simple bash scripts (which have no extra dependencies)."
-    type: "object"
-    properties:
-      id:
-        description: "Name of the engine. As with all engines, you can give an engine a different name. By specifying `id: foo`, you can target this engine (only) by specifying `...` in any of the Viash commands."
-        type: "string"
-      type:
-        description: "Running a Viash component on a native engine means that the script will be executed in your current environment.\nAny dependencies are assumed to have been installed by the user, so the native engine is meant for developers (who know what they're doing) or for simple bash scripts (which have no extra dependencies)."
-        const: "native"
-    required:
-    - "type"
-    additionalProperties: false
-  DockerEngine:
-    description: "Run a Viash component on a Docker backend engine.\nBy specifying which dependencies your component needs, users will be able to build a docker container from scratch using the setup flag, or pull it from a docker repository."
-    type: "object"
-    properties:
-      organization:
-        description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)."
-        type: "string"
-      registry:
-        description: "The URL to a [custom Docker registry](https://docs.docker.com/registry/)."
-        type: "string"
-      image:
-        description: "The base container to start from. You can also add the tag here if you wish."
-        type: "string"
-      tag:
-        description: "Specify a Docker image based on its tag."
-        type: "string"
-      target_image:
-        description: "If anything is specified in the setup section, running `---setup` will result in an image with the name of `<target_image>:<target_tag>`. If nothing is specified in the `setup` section, simply `image` will be used. Advanced usage only."
-        type: "string"
-      target_tag:
-        description: "The tag the resulting image gets. Advanced usage only."
-        type: "string"
-      namespace_separator:
-        description: "The separator between the namespace and the name of the component, used for determining the image name. Default: \"/\"."
-        type: "string"
-      id:
-        description: "Name of the engine. As with all engines, you can give an engine a different name. By specifying `id: foo`, you can target this engine (only) by specifying `...` in any of the Viash commands."
-        type: "string"
-      target_registry:
-        description: "The URL where the resulting image will be pushed to. Advanced usage only."
-        type: "string"
-      type:
-        description: "Run a Viash component on a Docker backend engine.\nBy specifying which dependencies your component needs, users will be able to build a docker container from scratch using the setup flag, or pull it from a docker repository."
-        const: "docker"
-      target_organization:
-        description: "The organization set in the resulting image. Advanced usage only."
-        type: "string"
-      setup:
-        description: "A list of requirements for installing the following types of packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n - Python\n - R\n - Ruby\n - yum\n\nThe order in which these dependencies are specified determines the order in which they will be installed."
-        type: "array"
-        items:
-          $ref: "#/definitions/Requirements"
-      cmd:
-        oneOf:
-        - description: "Set the default command being executed when running the Docker container."
-          type: "string"
-        - description: "Set the default command being executed when running the Docker container."
-          type: "array"
-          items:
-            type: "string"
-      target_image_source:
-        description: "The source of the target image. This is used for defining labels in the dockerfile."
- type: "string" - test_setup: - description: "Additional requirements specific for running unit tests." - type: "array" - items: - $ref: "#/definitions/Requirements" - entrypoint: - oneOf: - - description: "Override the entrypoint of the base container. Default set\ - \ `ENTRYPOINT []`." - type: "string" - - description: "Override the entrypoint of the base container. Default set\ - \ `ENTRYPOINT []`." - type: "array" - items: - type: "string" - required: - - "image" - - "type" - additionalProperties: false - Platform: - oneOf: - - $ref: "#/definitions/NativePlatform" - - $ref: "#/definitions/DockerPlatform" - - $ref: "#/definitions/NextflowPlatform" - NativePlatform: - description: "Running a Viash component on a native platform means that the script\ - \ will be executed in your current environment.\nAny dependencies are assumed\ - \ to have been installed by the user, so the native platform is meant for developers\ - \ (who know what they're doing) or for simple bash scripts (which have no extra\ - \ dependencies).\n" - type: "object" - properties: - id: - description: "As with all platforms, you can give a platform a different name.\ - \ By specifying `id: foo`, you can target this platform (only) by specifying\ - \ `-p foo` in any of the Viash commands." - type: "string" - type: - description: "Running a Viash component on a native platform means that the\ - \ script will be executed in your current environment.\nAny dependencies\ - \ are assumed to have been installed by the user, so the native platform\ - \ is meant for developers (who know what they're doing) or for simple bash\ - \ scripts (which have no extra dependencies).\n" - const: "native" - required: - - "type" - additionalProperties: false - DockerPlatform: - description: "Run a Viash component on a Docker backend platform.\nBy specifying\ - \ which dependencies your component needs, users will be able to build a docker\ - \ container from scratch using the setup flag, or pull it from a docker repository.\n" - type: "object" - properties: - organization: - description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)." - type: "string" - registry: - description: "The URL to the a [custom Docker registry](https://docs.docker.com/registry/)" - type: "string" - image: - description: "The base container to start from. You can also add the tag here\ - \ if you wish." - type: "string" - tag: - description: "Specify a Docker image based on its tag." - type: "string" - target_tag: - description: "The tag the resulting image gets. Advanced usage only." - type: "string" - run_args: - oneOf: - - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ - \ arguments." - type: "string" - - type: "array" - items: - description: "Add [docker run](https://docs.docker.com/engine/reference/run/)\ - \ arguments." - type: "string" - namespace_separator: - description: "The separator between the namespace and the name of the component,\ - \ used for determining the image name. Default: \"/\"." - type: "string" - resolve_volume: - description: "Enables or disables automatic volume mapping. Enabled when set\ - \ to `Automatic` or disabled when set to `Manual`. Default: `Automatic`." - $ref: "#/definitions/DockerResolveVolume" - cmd: - oneOf: - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "string" - - description: "Set the default command being executed when running the Docker\ - \ container." 
- type: "array" - items: - type: "string" - id: - description: "As with all platforms, you can give a platform a different name.\ - \ By specifying `id: foo`, you can target this platform (only) by specifying\ - \ `-p foo` in any of the Viash commands." - type: "string" - port: - oneOf: - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "string" - - type: "array" - items: - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "string" - target_registry: - description: "The URL where the resulting image will be pushed to. Advanced\ - \ usage only." - type: "string" - setup: - description: "A list of requirements for installing the following types of\ - \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ - \ - Python\n - R\n - Ruby\n - yum\n\nThe order in which these dependencies\ - \ are specified determines the order in which they will be installed.\n" - type: "array" - items: - $ref: "#/definitions/Requirements" - workdir: - description: "The working directory when starting the container. This doesn't\ - \ change the Dockerfile but gets added as a command-line argument at runtime." - type: "string" - target_image: - description: "If anything is specified in the setup section, running the `---setup`\ - \ will result in an image with the name of `:`. If\ - \ nothing is specified in the `setup` section, simply `image` will be used.\ - \ Advanced usage only." - type: "string" - target_image_source: - description: "The source of the target image. This is used for defining labels\ - \ in the dockerfile." - type: "string" - test_setup: - description: "Additional requirements specific for running unit tests." - type: "array" - items: - $ref: "#/definitions/Requirements" - entrypoint: - oneOf: - - description: "Override the entrypoint of the base container. Default set\ - \ `ENTRYPOINT []`." - type: "string" - - description: "Override the entrypoint of the base container. Default set\ - \ `ENTRYPOINT []`." - type: "array" - items: - type: "string" - setup_strategy: - description: "The Docker setup strategy to use when building a container.\n\ - \n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild` / `build`\ - \ / `b` | Always build the image from the dockerfile. This is the default\ - \ setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb` | Always\ - \ build the image from the dockerfile, with caching enabled.\n| `ifneedbebuild`\ - \ | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ - \ | Build the image with caching enabled if it does not exist locally, with\ - \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ - \ from [Docker Hub](https://hub.docker.com) or the specified docker registry.\n\ - | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ - \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ - \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ - \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ - \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ - \ If the image does not exist locally, pull the image. If the image does\ - \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ - \ not exist locally, pull the image. 
-      type:
-        description: "Run a Viash component on a Docker backend platform.\nBy specifying which dependencies your component needs, users will be able to build a docker container from scratch using the setup flag, or pull it from a docker repository."
-        const: "docker"
-      target_organization:
-        description: "The organization set in the resulting image. Advanced usage only."
-        type: "string"
-    required:
-    - "image"
-    - "type"
-    additionalProperties: false
-  NextflowPlatform:
-    description: "Platform for generating Nextflow VDSL3 modules."
-    type: "object"
-    properties:
-      auto:
-        description: "Automated processing flags which can be toggled on or off:\n\n| Flag | Description | Default |\n|---|---------|----|\n| `simplifyInput` | If `true`, an input tuple containing only a single File (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed to a map (i.e. `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n| `simplifyOutput` | If `true`, an output tuple containing a map with a File (e.g. `[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically transformed to a map (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `false` |\n| `transcript` | If `true`, the module's transcripts from `work/` are automatically published to `params.transcriptDir`. If not defined, `params.publishDir + \"/_transcripts\"` will be used. Will throw an error if neither are defined. | `false` |\n| `publish` | If `true`, the module's outputs are automatically published to `params.publishDir`. If equal to \"state\", also a `.state.yaml` file will be published in the publish dir. Will throw an error if `params.publishDir` is not defined. | `false` |"
-        $ref: "#/definitions/NextflowAuto"
-      directives:
-        description: "Directives are optional settings that affect the execution of the process. These mostly match up with the Nextflow counterparts."
-        $ref: "#/definitions/NextflowDirectives"
-      container:
-        description: "Specifies the Docker platform id to be used to run Nextflow."
-        type: "string"
-      config:
-        description: "Allows tweaking how the Nextflow Config file is generated."
-        $ref: "#/definitions/NextflowConfig"
-      debug:
-        description: "Whether or not to print debug messages."
-        type: "boolean"
-      id:
-        description: "Every platform can be given a specific id that can later be referred to explicitly when running or building the Viash component."
-        type: "string"
-      type:
-        description: "Platform for generating Nextflow VDSL3 modules."
- const: "nextflow" - required: - - "type" - additionalProperties: false - Requirements: - oneOf: - - $ref: "#/definitions/ApkRequirements" - - $ref: "#/definitions/AptRequirements" - - $ref: "#/definitions/DockerRequirements" - - $ref: "#/definitions/JavaScriptRequirements" - - $ref: "#/definitions/PythonRequirements" - - $ref: "#/definitions/RRequirements" - - $ref: "#/definitions/RubyRequirements" - - $ref: "#/definitions/YumRequirements" - ApkRequirements: - description: "Specify which apk packages should be available in order to run the\ - \ component." - type: "object" - properties: - type: - description: "Specify which apk packages should be available in order to run\ - \ the component." - const: "apk" - packages: - oneOf: - - description: "Specifies which packages to install." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install." - type: "string" - required: - - "type" - additionalProperties: false - AptRequirements: - description: "Specify which apt packages should be available in order to run the\ - \ component." - type: "object" - properties: - interactive: - description: "If `false`, the Debian frontend is set to non-interactive (recommended).\ - \ Default: false." - type: "boolean" - type: - description: "Specify which apt packages should be available in order to run\ - \ the component." - const: "apt" - packages: - oneOf: - - description: "Specifies which packages to install." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install." - type: "string" - required: - - "type" - additionalProperties: false - DockerRequirements: - description: "Specify which Docker commands should be run during setup." - type: "object" - properties: - run: - oneOf: - - description: "Specifies which `RUN` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - type: "array" - items: - description: "Specifies which `RUN` entries to add to the Dockerfile while\ - \ building it." - type: "string" - label: - oneOf: - - description: "Specifies which `LABEL` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - type: "array" - items: - description: "Specifies which `LABEL` entries to add to the Dockerfile\ - \ while building it." - type: "string" - build_args: - oneOf: - - description: "Specifies which `ARG` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - type: "array" - items: - description: "Specifies which `ARG` entries to add to the Dockerfile while\ - \ building it." - type: "string" - copy: - oneOf: - - description: "Specifies which `COPY` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - type: "array" - items: - description: "Specifies which `COPY` entries to add to the Dockerfile\ - \ while building it." - type: "string" - type: - description: "Specify which Docker commands should be run during setup." - const: "docker" - add: - oneOf: - - description: "Specifies which `ADD` entries to add to the Dockerfile while\ - \ building it." - type: "string" - - type: "array" - items: - description: "Specifies which `ADD` entries to add to the Dockerfile while\ - \ building it." - type: "string" - env: - oneOf: - - description: "Specifies which `ENV` entries to add to the Dockerfile while\ - \ building it. Unlike `ARG`, `ENV` entries are also accessible from inside\ - \ the container." 
- type: "string" - - type: "array" - items: - description: "Specifies which `ENV` entries to add to the Dockerfile while\ - \ building it. Unlike `ARG`, `ENV` entries are also accessible from\ - \ inside the container." - type: "string" - required: - - "type" - additionalProperties: false - JavaScriptRequirements: - description: "Specify which JavaScript packages should be available in order to\ - \ run the component." - type: "object" - properties: - github: - oneOf: - - description: "Specifies which packages to install from GitHub." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from GitHub." - type: "string" - url: - oneOf: - - description: "Specifies which packages to install using a generic URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a generic URI." - type: "string" - git: - oneOf: - - description: "Specifies which packages to install using a Git URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a Git URI." - type: "string" - npm: - oneOf: - - description: "Specifies which packages to install from npm." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from npm." - type: "string" - type: - description: "Specify which JavaScript packages should be available in order\ - \ to run the component." - const: "javascript" - packages: - oneOf: - - description: "Specifies which packages to install from npm." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from npm." - type: "string" - required: - - "type" - additionalProperties: false - PythonRequirements: - description: "Specify which Python packages should be available in order to run\ - \ the component." - type: "object" - properties: - github: - oneOf: - - description: "Specifies which packages to install from GitHub." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from GitHub." - type: "string" - gitlab: - oneOf: - - description: "Specifies which packages to install from GitLab." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from GitLab." - type: "string" - pip: - oneOf: - - description: "Specifies which packages to install from pip." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from pip." - type: "string" - pypi: - oneOf: - - description: "Specifies which packages to install from PyPI using pip." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from PyPI using pip." - type: "string" - git: - oneOf: - - description: "Specifies which packages to install using a Git URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a Git URI." - type: "string" - upgrade: - description: "Sets the `--upgrade` flag when set to true. Default: true." - type: "boolean" - packages: - oneOf: - - description: "Specifies which packages to install from pip." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from pip." - type: "string" - url: - oneOf: - - description: "Specifies which packages to install using a generic URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a generic URI." 
- type: "string" - svn: - oneOf: - - description: "Specifies which packages to install using an SVN URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using an SVN URI." - type: "string" - bazaar: - oneOf: - - description: "Specifies which packages to install using a Bazaar URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a Bazaar URI." - type: "string" - script: - oneOf: - - description: "Specifies a code block to run as part of the build." - type: "string" - - type: "array" - items: - description: "Specifies a code block to run as part of the build." - type: "string" - type: - description: "Specify which Python packages should be available in order to\ - \ run the component." - const: "python" - mercurial: - oneOf: - - description: "Specifies which packages to install using a Mercurial URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a Mercurial URI." - type: "string" - user: - description: "Sets the `--user` flag when set to true. Default: false." - type: "boolean" - required: - - "type" - additionalProperties: false - RRequirements: - description: "Specify which R packages should be available in order to run the\ - \ component." - type: "object" - properties: - bioc: - oneOf: - - description: "Specifies which packages to install from BioConductor." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from BioConductor." - type: "string" - github: - oneOf: - - description: "Specifies which packages to install from GitHub." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from GitHub." - type: "string" - gitlab: - oneOf: - - description: "Specifies which packages to install from GitLab." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from GitLab." - type: "string" - url: - oneOf: - - description: "Specifies which packages to install using a generic URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a generic URI." - type: "string" - bioc_force_install: - description: "Forces packages specified in `bioc` to be reinstalled, even\ - \ if they are already present in the container. Default: false." - type: "boolean" - git: - oneOf: - - description: "Specifies which packages to install using a Git URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using a Git URI." - type: "string" - cran: - oneOf: - - description: "Specifies which packages to install from CRAN." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from CRAN." - type: "string" - bitbucket: - oneOf: - - description: "Specifies which packages to install from Bitbucket." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from Bitbucket." - type: "string" - svn: - oneOf: - - description: "Specifies which packages to install using an SVN URI." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install using an SVN URI." - type: "string" - packages: - oneOf: - - description: "Specifies which packages to install from CRAN." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install from CRAN." 
- type: "string" - script: - oneOf: - - description: "Specifies a code block to run as part of the build." - type: "string" - - type: "array" - items: - description: "Specifies a code block to run as part of the build." - type: "string" - type: - description: "Specify which R packages should be available in order to run\ - \ the component." - const: "r" - required: - - "type" - additionalProperties: false - RubyRequirements: - description: "Specify which Ruby packages should be available in order to run\ - \ the component." - type: "object" - properties: - type: - description: "Specify which Ruby packages should be available in order to\ - \ run the component." - const: "ruby" - packages: - oneOf: - - description: "Specifies which packages to install." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install." - type: "string" - required: - - "type" - additionalProperties: false - YumRequirements: - description: "Specify which yum packages should be available in order to run the\ - \ component." - type: "object" - properties: - type: - description: "Specify which yum packages should be available in order to run\ - \ the component." - const: "yum" - packages: - oneOf: - - description: "Specifies which packages to install." - type: "string" - - type: "array" - items: - description: "Specifies which packages to install." - type: "string" - required: - - "type" - additionalProperties: false - Argument: - oneOf: - - $ref: "#/definitions/BooleanArgument" - - $ref: "#/definitions/BooleanTrueArgument" - - $ref: "#/definitions/BooleanFalseArgument" - - $ref: "#/definitions/DoubleArgument" - - $ref: "#/definitions/FileArgument" - - $ref: "#/definitions/IntegerArgument" - - $ref: "#/definitions/LongArgument" - - $ref: "#/definitions/StringArgument" - BooleanArgument: - description: "A `boolean` type argument has two possible values: `true` or `false`." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--trim`, `-t`\ - \ or `trim`. The number of dashes determines how values can be passed: \ - \ \n\n - `--trim` is a long option, which can be passed with `executable_name\ - \ --trim`\n - `-t` is a short option, which can be passed with `executable_name\ - \ -t`\n - `trim` is an argument, which can be passed with `executable_name\ - \ trim` \n" - type: "string" - direction: - $ref: "#/definitions/Direction" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - default: - oneOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "boolean" - - type: "array" - items: - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "boolean" - example: - oneOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "boolean" - - type: "array" - items: - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." 
- type: "boolean" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "A `boolean` type argument has two possible values: `true` or\ - \ `false`." - const: "boolean" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." - type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - BooleanTrueArgument: - description: "An argument of the `boolean_true` type acts like a `boolean` flag\ - \ with a default value of `false`. When called as an argument it sets the `boolean`\ - \ to `true`." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--silent`,\ - \ `-s` or `silent`. The number of dashes determines how values can be passed:\ - \ \n\n - `--silent` is a long option, which can be passed with `executable_name\ - \ --silent`\n - `-s` is a short option, which can be passed with `executable_name\ - \ -s`\n - `silent` is an argument, which can be passed with `executable_name\ - \ silent` \n" - type: "string" - direction: - $ref: "#/definitions/Direction" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - type: - description: "An argument of the `boolean_true` type acts like a `boolean`\ - \ flag with a default value of `false`. When called as an argument it sets\ - \ the `boolean` to `true`." - const: "boolean_true" - required: - - "name" - - "type" - additionalProperties: false - BooleanFalseArgument: - description: "An argument of the `boolean_false` type acts like an inverted `boolean`\ - \ flag with a default value of `true`. When called as an argument it sets the\ - \ `boolean` to `false`." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--no-log`,\ - \ `-n` or `no-log`. The number of dashes determines how values can be passed:\ - \ \n\n - `--no-log` is a long option, which can be passed with `executable_name\ - \ --no-log`\n - `-n` is a short option, which can be passed with `executable_name\ - \ -n`\n - `no-log` is an argument, which can be passed with `executable_name\ - \ no-log` \n" - type: "string" - direction: - $ref: "#/definitions/Direction" - info: - description: "Structured information. 
Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - type: - description: "An argument of the `boolean_false` type acts like an inverted\ - \ `boolean` flag with a default value of `true`. When called as an argument\ - \ it sets the `boolean` to `false`." - const: "boolean_false" - required: - - "name" - - "type" - additionalProperties: false - DoubleArgument: - description: "A `double` type argument has a numeric value with decimal points" - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - max: - description: "Maximum allowed value for this argument. If set and the provided\ - \ value is higher than the maximum, an error will be produced. Can be combined\ - \ with [`min`](#min) to clamp values." - $ref: "#/definitions/DoubleWithInf" - default: - oneOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - $ref: "#/definitions/DoubleWithInf" - - type: "array" - items: - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - $ref: "#/definitions/DoubleWithInf" - example: - oneOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - $ref: "#/definitions/DoubleWithInf" - - type: "array" - items: - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - $ref: "#/definitions/DoubleWithInf" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - min: - description: "Minimum allowed value for this argument. If set and the provided\ - \ value is lower than the minimum, an error will be produced. Can be combined\ - \ with [`max`](#max) to clamp values." - $ref: "#/definitions/DoubleWithInf" - direction: - $ref: "#/definitions/Direction" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." 
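To illustrate the three boolean flavours defined above, a hedged sketch of an `arguments` list (names and defaults invented):

```yaml
arguments:
  # plain boolean: accepts --sorted true / --sorted false
  - name: "--sorted"
    type: boolean
    default: true
  # boolean_true: defaults to false, the flag flips it to true
  - name: "--verbose"
    type: boolean_true
  # boolean_false: defaults to true, the flag flips it to false
  - name: "--no-log"
    type: boolean_false
```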
- type: "boolean" - type: - description: "A `double` type argument has a numeric value with decimal points" - const: "double" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." - type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - FileArgument: - description: "A `file` type argument has a string value that points to a file\ - \ or folder path." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - create_parent: - description: "If the output filename is a path and it does not exist, create\ - \ it before executing the script (only for `direction: output`)." - type: "boolean" - direction: - description: "Makes this argument an `input` or an `output`, as in does the\ - \ file/folder needs to be read or written. `input` by default." - $ref: "#/definitions/Direction" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - must_exist: - description: "Checks whether the file or folder exists. For input files, this\ - \ check will happen before the execution of the script, while for output\ - \ files the check will happen afterwards." - type: "boolean" - default: - oneOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "string" - - type: "array" - items: - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "string" - example: - oneOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "string" - - type: "array" - items: - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "string" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "A `file` type argument has a string value that points to a file\ - \ or folder path." - const: "file" - required: - description: "Make the value for this argument required. 
If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." - type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - IntegerArgument: - description: "An `integer` type argument has a numeric value without decimal points." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - choices: - description: "Limit the amount of valid values for this argument to those\ - \ set in this list. When set and a value not present in the list is provided,\ - \ an error will be produced." - type: "array" - items: - type: "integer" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - max: - description: "Maximum allowed value for this argument. If set and the provided\ - \ value is higher than the maximum, an error will be produced. Can be combined\ - \ with [`min`](#min) to clamp values." - type: "integer" - default: - oneOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "integer" - - type: "array" - items: - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "integer" - example: - oneOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "integer" - - type: "array" - items: - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "integer" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - min: - description: "Minimum allowed value for this argument. If set and the provided\ - \ value is lower than the minimum, an error will be produced. Can be combined\ - \ with [`max`](#max) to clamp values." - type: "integer" - direction: - $ref: "#/definitions/Direction" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "An `integer` type argument has a numeric value without decimal\ - \ points." - const: "integer" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." 
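The `file` type above carries the most options. A small sketch (paths and names hypothetical) combining `must_exist` and `create_parent`:

```yaml
arguments:
  - name: "--input"
    type: file
    required: true
    must_exist: true      # checked before the script runs
  - name: "--output"
    type: file
    direction: output
    create_parent: true   # create missing parent folders first
    must_exist: true      # for outputs, checked after the script ran
```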
- type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - LongArgument: - description: "An `long` type argument has a numeric value without decimal points." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - choices: - description: "Limit the amount of valid values for this argument to those\ - \ set in this list. When set and a value not present in the list is provided,\ - \ an error will be produced." - type: "array" - items: - type: "integer" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - max: - description: "Maximum allowed value for this argument. If set and the provided\ - \ value is higher than the maximum, an error will be produced. Can be combined\ - \ with [`min`](#min) to clamp values." - type: "integer" - default: - oneOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "integer" - - type: "array" - items: - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "integer" - example: - oneOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "integer" - - type: "array" - items: - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "integer" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - min: - description: "Minimum allowed value for this argument. If set and the provided\ - \ value is lower than the minimum, an error will be produced. Can be combined\ - \ with [`max`](#max) to clamp values." - type: "integer" - direction: - $ref: "#/definitions/Direction" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "An `long` type argument has a numeric value without decimal\ - \ points." - const: "long" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." 
- type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - StringArgument: - description: "A `string` type argument has a value made up of an ordered sequences\ - \ of characters, like \"Hello\" or \"I'm a string\"." - type: "object" - properties: - alternatives: - oneOf: - - description: "List of alternative format variations for this argument." - type: "string" - - type: "array" - items: - description: "List of alternative format variations for this argument." - type: "string" - name: - description: "The name of the argument. Can be in the formats `--foo`, `-f`\ - \ or `foo`. The number of dashes determines how values can be passed: \n\ - \n - `--foo` is a long option, which can be passed with `executable_name\ - \ --foo=value` or `executable_name --foo value`\n - `-f` is a short option,\ - \ which can be passed with `executable_name -f value`\n - `foo` is an argument,\ - \ which can be passed with `executable_name value` \n" - type: "string" - choices: - description: "Limit the amount of valid values for this argument to those\ - \ set in this list. When set and a value not present in the list is provided,\ - \ an error will be produced." - type: "array" - items: - type: "string" - direction: - $ref: "#/definitions/Direction" - info: - description: "Structured information. Can be any shape: a string, vector,\ - \ map or even nested map." - type: "object" - default: - oneOf: - - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "string" - - type: "array" - items: - description: "The default value when no argument value is provided. This\ - \ will not work if the [`required`](#required) property is enabled." - type: "string" - example: - oneOf: - - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "string" - - type: "array" - items: - description: "An example value for this argument. If no [`default`](#default)\ - \ property was specified, this will be used for that purpose." - type: "string" - description: - description: "A description of the argument. This will be displayed with `--help`." - type: "string" - multiple_sep: - description: "The delimiter character for providing [`multiple`](#multiple)\ - \ values. `:` by default." - type: "string" - multiple: - description: "Treat the argument value as an array. Arrays can be passed using\ - \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ - \ times `--foo 1 --foo 2`. You can use a custom delimiter by using the [`multiple_sep`](#multiple_sep)\ - \ property. `false` by default." - type: "boolean" - type: - description: "A `string` type argument has a value made up of an ordered sequences\ - \ of characters, like \"Hello\" or \"I'm a string\"." - const: "string" - required: - description: "Make the value for this argument required. If set to `true`,\ - \ an error will be produced if no value was provided. `false` by default." 
- type: "boolean" - required: - - "name" - - "type" - additionalProperties: false - Resource: - oneOf: - - $ref: "#/definitions/BashScript" - - $ref: "#/definitions/CSharpScript" - - $ref: "#/definitions/Executable" - - $ref: "#/definitions/JavaScriptScript" - - $ref: "#/definitions/NextflowScript" - - $ref: "#/definitions/PlainFile" - - $ref: "#/definitions/PythonScript" - - $ref: "#/definitions/RScript" - - $ref: "#/definitions/ScalaScript" - BashScript: - description: "An executable Bash script.\nWhen defined in resources, only the\ - \ first entry will be executed when running the built component or when running\ - \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ - \ during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable Bash script.\nWhen defined in resources, only\ - \ the first entry will be executed when running the built component or when\ - \ running `viash run`.\nWhen defined in test_resources, all entries will\ - \ be executed during `viash test`." - const: "bash_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - CSharpScript: - description: "An executable C# script.\nWhen defined in resources, only the first\ - \ entry will be executed when running the built component or when running `viash\ - \ run`.\nWhen defined in test_resources, all entries will be executed during\ - \ `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable C# script.\nWhen defined in resources, only the\ - \ first entry will be executed when running the built component or when\ - \ running `viash run`.\nWhen defined in test_resources, all entries will\ - \ be executed during `viash test`." - const: "csharp_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - Executable: - description: "An executable file." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." 
- type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable file." - const: "executable" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - JavaScriptScript: - description: "An executable JavaScript script.\nWhen defined in resources, only\ - \ the first entry will be executed when running the built component or when\ - \ running `viash run`.\nWhen defined in test_resources, all entries will be\ - \ executed during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable JavaScript script.\nWhen defined in resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in test_resources, all entries\ - \ will be executed during `viash test`." - const: "javascript_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - NextflowScript: - description: "A Nextflow script. Work in progress; added mainly for annotation\ - \ at the moment." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - entrypoint: - description: "The name of the workflow to be wrapped." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "A Nextflow script. Work in progress; added mainly for annotation\ - \ at the moment." - const: "nextflow_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "entrypoint" - - "type" - additionalProperties: false - PlainFile: - description: "A plain file. This can only be used as a supporting resource for\ - \ the main script or unit tests." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." 
- type: "boolean" - type: - description: "A plain file. This can only be used as a supporting resource\ - \ for the main script or unit tests." - const: "file" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: [] - additionalProperties: false - PythonScript: - description: "An executable Python script.\nWhen defined in resources, only the\ - \ first entry will be executed when running the built component or when running\ - \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ - \ during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable Python script.\nWhen defined in resources, only\ - \ the first entry will be executed when running the built component or when\ - \ running `viash run`.\nWhen defined in test_resources, all entries will\ - \ be executed during `viash test`." - const: "python_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - RScript: - description: "An executable R script.\nWhen defined in resources, only the first\ - \ entry will be executed when running the built component or when running `viash\ - \ run`.\nWhen defined in test_resources, all entries will be executed during\ - \ `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable R script.\nWhen defined in resources, only the\ - \ first entry will be executed when running the built component or when\ - \ running `viash run`.\nWhen defined in test_resources, all entries will\ - \ be executed during `viash test`." - const: "r_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - ScalaScript: - description: "An executable Scala script.\nWhen defined in resources, only the\ - \ first entry will be executed when running the built component or when running\ - \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ - \ during `viash test`." - type: "object" - properties: - path: - description: "The path of the input file. Can be a relative or an absolute\ - \ path, or a URI. 
Mutually exclusive with `text`." - type: "string" - text: - description: "The content of the resulting file specified as a string. Mutually\ - \ exclusive with `path`." - type: "string" - is_executable: - description: "Whether the resulting resource file should be made executable." - type: "boolean" - type: - description: "An executable Scala script.\nWhen defined in resources, only\ - \ the first entry will be executed when running the built component or when\ - \ running `viash run`.\nWhen defined in test_resources, all entries will\ - \ be executed during `viash test`." - const: "scala_script" - dest: - description: "Resulting filename of the resource. From within a script, the\ - \ file can be accessed at `meta[\"resources_dir\"] + \"/\" + dest`. If unspecified,\ - \ `dest` will be set to the basename of the `path` parameter." - type: "string" - required: - - "type" - additionalProperties: false - NextflowDirectives: - description: "Directives are optional settings that affect the execution of the\ - \ process.\n" - type: "object" - properties: - beforeScript: - description: "The `beforeScript` directive allows you to execute a custom\ - \ (Bash) snippet before the main process script is run. This may be useful\ - \ to initialise the underlying cluster environment or for other custom initialisation.\n\ - \nSee [`beforeScript`](https://www.nextflow.io/docs/latest/process.html#beforeScript).\n" - type: "string" - module: - oneOf: - - description: "Environment Modules is a package manager that allows you to\ - \ dynamically configure your execution environment and easily switch between\ - \ multiple versions of the same software tool.\n\nIf it is available in\ - \ your system you can use it with Nextflow in order to configure the processes\ - \ execution environment in your pipeline.\n\nIn a process definition you\ - \ can use the `module` directive to load a specific module version to\ - \ be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n" - type: "string" - - type: "array" - items: - description: "Environment Modules is a package manager that allows you\ - \ to dynamically configure your execution environment and easily switch\ - \ between multiple versions of the same software tool.\n\nIf it is available\ - \ in your system you can use it with Nextflow in order to configure\ - \ the processes execution environment in your pipeline.\n\nIn a process\ - \ definition you can use the `module` directive to load a specific module\ - \ version to be used in the process execution environment.\n\nSee [`module`](https://www.nextflow.io/docs/latest/process.html#module).\n" - type: "string" - queue: - oneOf: - - description: "The `queue` directive allows you to set the queue where jobs\ - \ are scheduled when using a grid based executor in your pipeline.\n\n\ - See [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n" - type: "string" - - type: "array" - items: - description: "The `queue` directive allows you to set the queue where\ - \ jobs are scheduled when using a grid based executor in your pipeline.\n\ - \nSee [`queue`](https://www.nextflow.io/docs/latest/process.html#queue).\n" - type: "string" - label: - oneOf: - - description: "The `label` directive allows the annotation of processes with\ - \ a mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n" - type: "string" - - type: "array" - items: - description: "The `label` directive allows the 
annotation of processes\ - \ with a mnemonic identifier of your choice.\n\nSee [`label`](https://www.nextflow.io/docs/latest/process.html#label).\n" - type: "string" - container: - oneOf: - - description: "The `container` directive allows you to execute the process\ - \ script in a Docker container.\n\nIt requires the Docker daemon to be\ - \ running on the machine where the pipeline is executed, i.e. the local machine\ - \ when using the local executor or the cluster nodes when the pipeline\ - \ is deployed through a grid executor.\n\nViash allows either\ - \ a string value or a map. In case a map is used, the allowed keys are:\ - \ `registry`, `image`, and `tag`. The `image` value must be specified.\n\ - \nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" - type: "object" - additionalProperties: - type: "string" - - description: "The `container` directive allows you to execute the process\ - \ script in a Docker container.\n\nIt requires the Docker daemon to be\ - \ running on the machine where the pipeline is executed, i.e. the local machine\ - \ when using the local executor or the cluster nodes when the pipeline\ - \ is deployed through a grid executor.\n\nViash allows either\ - \ a string value or a map. In case a map is used, the allowed keys are:\ - \ `registry`, `image`, and `tag`. The `image` value must be specified.\n\ - \nSee [`container`](https://www.nextflow.io/docs/latest/process.html#container).\n" - type: "string" - publishDir: - oneOf: - - oneOf: - - description: "The `publishDir` directive allows you to publish the process\ - \ output files to a specified folder.\n\nViash implements this directive\ - \ as a plain string or a map. The allowed keywords for the map are:\ - \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path`\ - \ key and value are required.\nThe allowed values for `mode` are: `symlink`,\ - \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" - type: "string" - - description: "The `publishDir` directive allows you to publish the process\ - \ output files to a specified folder.\n\nViash implements this directive\ - \ as a plain string or a map. The allowed keywords for the map are:\ - \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The `path`\ - \ key and value are required.\nThe allowed values for `mode` are: `symlink`,\ - \ `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" - type: "object" - additionalProperties: - type: "string" - - type: "array" - items: - oneOf: - - description: "The `publishDir` directive allows you to publish the process\ - \ output files to a specified folder.\n\nViash implements this directive\ - \ as a plain string or a map. The allowed keywords for the map are:\ - \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. The\ - \ `path` key and value are required.\nThe allowed values for `mode`\ - \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ - \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" - type: "string" - - description: "The `publishDir` directive allows you to publish the process\ - \ output files to a specified folder.\n\nViash implements this directive\ - \ as a plain string or a map. The allowed keywords for the map are:\ - \ `path`, `mode`, `overwrite`, `pattern`, `saveAs`, `enabled`. 
The\ - \ `path` key and value are required.\nThe allowed values for `mode`\ - \ are: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`.\n\ - \nSee [`publishDir`](https://www.nextflow.io/docs/latest/process.html#publishdir).\n" - type: "object" - additionalProperties: - type: "string" - maxForks: - oneOf: - - description: "The `maxForks` directive allows you to define the maximum\ - \ number of process instances that can be executed in parallel. By default\ - \ this value equals the number of CPU cores available minus 1.\n\ - \nIf you want to execute a process in a sequential manner, set this directive\ - \ to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n" - type: "string" - - description: "The `maxForks` directive allows you to define the maximum\ - \ number of process instances that can be executed in parallel. By default\ - \ this value equals the number of CPU cores available minus 1.\n\ - \nIf you want to execute a process in a sequential manner, set this directive\ - \ to one.\n\nSee [`maxForks`](https://www.nextflow.io/docs/latest/process.html#maxforks).\n" - type: "integer" - maxErrors: - oneOf: - - description: "The `maxErrors` directive allows you to specify the maximum\ - \ number of times a process can fail when using the `retry` error strategy.\ - \ By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n" - type: "string" - - description: "The `maxErrors` directive allows you to specify the maximum\ - \ number of times a process can fail when using the `retry` error strategy.\ - \ By default this directive is disabled.\n\nSee [`maxErrors`](https://www.nextflow.io/docs/latest/process.html#maxerrors).\n" - type: "integer" - cpus: - oneOf: - - description: "The `cpus` directive allows you to define the number of (logical)\ - \ CPU required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n" - type: "integer" - - description: "The `cpus` directive allows you to define the number of (logical)\ - \ CPU required by the process' task.\n\nSee [`cpus`](https://www.nextflow.io/docs/latest/process.html#cpus).\n" - type: "string" - accelerator: - description: "The `accelerator` directive allows you to specify the hardware\ - \ accelerator requirement for the task execution e.g. GPU processor.\n\n\ - Viash implements this directive as a map with accepted keywords: `type`,\ - \ `limit`, `request`, and `runtime`.\n\nSee [`accelerator`](https://www.nextflow.io/docs/latest/process.html#accelerator).\n" - type: "object" - additionalProperties: - type: "string" - time: - description: "The `time` directive allows you to define how long a process\ - \ is allowed to run.\n\nSee [`time`](https://www.nextflow.io/docs/latest/process.html#time).\n" - type: "string" - afterScript: - description: "The `afterScript` directive allows you to execute a custom (Bash)\ - \ snippet immediately after the main process has run. This may be useful\ - \ to clean up your staging area.\n\nSee [`afterScript`](https://www.nextflow.io/docs/latest/process.html#afterscript).\n" - type: "string" - executor: - description: "The `executor` defines the underlying system where processes\ - \ are executed. By default a process uses the executor defined globally\ - \ in the nextflow.config file.\n\nThe `executor` directive allows you to\ - \ configure what executor has to be used by the process, overriding the\ - \ default configuration. 
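A hedged sketch of how a few of the directives above could be set from a Viash config, assuming a Nextflow runner entry with a `directives` map (values invented):

```yaml
runners:
  - type: nextflow
    directives:
      label: [ midmem, midcpu ]   # mnemonic labels, see `label` above
      queue: long                 # queue of the grid-based executor
      maxForks: 10                # cap on parallel process instances
      container:                  # map form: registry/image/tag
        registry: ghcr.io
        image: myorg/mytool
        tag: "1.0"
```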
The following values can be used:\n\n| Name | Executor\ - \ |\n|------|----------|\n| awsbatch | The process is executed using the\ - \ AWS Batch service. | \n| azurebatch | The process is executed using the\ - \ Azure Batch service. | \n| condor | The process is executed using the\ - \ HTCondor job scheduler. | \n| google-lifesciences | The process is executed\ - \ using the Google Genomics Pipelines service. | \n| ignite | The process\ - \ is executed using the Apache Ignite cluster. | \n| k8s | The process is\ - \ executed using the Kubernetes cluster. | \n| local | The process is executed\ - \ in the computer where Nextflow is launched. | \n| lsf | The process is\ - \ executed using the Platform LSF job scheduler. | \n| moab | The process\ - \ is executed using the Moab job scheduler. | \n| nqsii | The process is\ - \ executed using the NQSII job scheduler. | \n| oge | Alias for the sge\ - \ executor. | \n| pbs | The process is executed using the PBS/Torque job\ - \ scheduler. | \n| pbspro | The process is executed using the PBS Pro job\ - \ scheduler. | \n| sge | The process is executed using the Sun Grid Engine\ - \ / Open Grid Engine. | \n| slurm | The process is executed using the SLURM\ - \ job scheduler. | \n| tes | The process is executed using the GA4GH TES\ - \ service. | \n| uge | Alias for the sge executor. |\n\nSee [`executor`](https://www.nextflow.io/docs/latest/process.html#executor).\n" - type: "string" - containerOptions: - oneOf: - - description: "The `containerOptions` directive allows you to specify any\ - \ container execution option supported by the underlying container engine\ - \ (ie. Docker, Singularity, etc). This can be useful to provide container\ - \ settings only for a specific process e.g. mount a custom path.\n\nSee\ - \ [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n" - type: "string" - - type: "array" - items: - description: "The `containerOptions` directive allows you to specify any\ - \ container execution option supported by the underlying container engine\ - \ (ie. Docker, Singularity, etc). This can be useful to provide container\ - \ settings only for a specific process e.g. 
mount a custom path.\n\n\ - See [`containerOptions`](https://www.nextflow.io/docs/latest/process.html#containeroptions).\n" - type: "string" - disk: - description: "The `disk` directive allows you to define how much local disk\ - \ storage the process is allowed to use.\n\nSee [`disk`](https://www.nextflow.io/docs/latest/process.html#disk).\n" - type: "string" - tag: - description: "The `tag` directive allows you to associate each process execution\ - \ with a custom label, so that it will be easier to identify them in the\ - \ log file or in the trace execution report.\n\nFor ease of use, the default\ - \ tag is set to \"$id\", which allows tracking the progression of the channel\ - \ events through the workflow more easily.\n\nSee [`tag`](https://www.nextflow.io/docs/latest/process.html#tag).\n" - type: "string" - conda: - oneOf: - - description: "The `conda` directive allows for the definition of the process\ - \ dependencies using the Conda package manager.\n\nNextflow automatically\ - \ sets up an environment for the given package names listed in the\ - \ `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n" - type: "string" - - type: "array" - items: - description: "The `conda` directive allows for the definition of the process\ - \ dependencies using the Conda package manager.\n\nNextflow automatically\ - \ sets up an environment for the given package names listed in the\ - \ `conda` directive.\n\nSee [`conda`](https://www.nextflow.io/docs/latest/process.html#conda).\n" - type: "string" - machineType: - description: "The `machineType` can be used to specify a predefined Google\ - \ Compute Platform machine type when running using the Google Life Sciences\ - \ executor.\n\nSee [`machineType`](https://www.nextflow.io/docs/latest/process.html#machinetype).\n" - type: "string" - stageInMode: - description: "The `stageInMode` directive defines how input files are staged-in\ - \ to the process work directory. The following values are allowed:\n\n|\ - \ Value | Description |\n|-------|-------------| \n| copy | Input files\ - \ are staged in the process work directory by creating a copy. | \n| link\ - \ | Input files are staged in the process work directory by creating a\ - \ (hard) link for each of them. | \n| symlink | Input files are staged in\ - \ the process work directory by creating a symbolic link with an absolute\ - \ path for each of them (default). | \n| rellink | Input files are staged\ - \ in the process work directory by creating a symbolic link with a relative\ - \ path for each of them. | \n\nSee [`stageInMode`](https://www.nextflow.io/docs/latest/process.html#stageinmode).\n" - type: "string" - cache: - oneOf: - - description: "The `cache` directive allows you to store the process results\ - \ to a local cache. When the cache is enabled and the pipeline is launched\ - \ with the resume option, any following attempt to execute the process,\ - \ along with the same inputs, will cause the process execution to be skipped,\ - \ producing the stored data as the actual results.\n\nThe caching feature\ - \ generates a unique key by indexing the process script and inputs. 
This\ - \ key is used to uniquely identify the outputs produced by the process\ - \ execution.\n\nThe `cache` is enabled by default; you can disable it\ - \ for a specific process by setting the cache directive to `false`.\n\n\ - Accepted values are: `true`, `false`, \"deep\", and \"lenient\".\n\nSee\ - \ [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" - type: "boolean" - - description: "The `cache` directive allows you to store the process results\ - \ to a local cache. When the cache is enabled and the pipeline is launched\ - \ with the resume option, any following attempt to execute the process,\ - \ along with the same inputs, will cause the process execution to be skipped,\ - \ producing the stored data as the actual results.\n\nThe caching feature\ - \ generates a unique key by indexing the process script and inputs. This\ - \ key is used to uniquely identify the outputs produced by the process\ - \ execution.\n\nThe `cache` is enabled by default; you can disable it\ - \ for a specific process by setting the cache directive to `false`.\n\n\ - Accepted values are: `true`, `false`, \"deep\", and \"lenient\".\n\nSee\ - \ [`cache`](https://www.nextflow.io/docs/latest/process.html#cache).\n" - type: "string" - pod: - oneOf: - - description: "The `pod` directive allows the definition of pod-specific\ - \ settings, such as environment variables, secrets and config maps when\ - \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" - type: "object" - additionalProperties: - type: "string" - - type: "array" - items: - description: "The `pod` directive allows the definition of pod-specific\ - \ settings, such as environment variables, secrets and config maps when\ - \ using the Kubernetes executor.\n\nSee [`pod`](https://www.nextflow.io/docs/latest/process.html#pod).\n" - type: "object" - additionalProperties: - type: "string" - penv: - description: "The `penv` directive allows you to define the parallel environment\ - \ to be used when submitting a parallel task to the SGE resource manager.\n\ - \nSee [`penv`](https://www.nextflow.io/docs/latest/process.html#penv).\n" - type: "string" - scratch: - oneOf: - - description: "The `scratch` directive allows you to execute the process\ - \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" - type: "boolean" - - description: "The `scratch` directive allows you to execute the process\ - \ in a temporary folder that is local to the execution node.\n\nSee [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch).\n" - type: "string" - storeDir: - description: "The `storeDir` directive allows you to define a directory that\ - \ is used as a permanent cache for your process results.\n\nSee [`storeDir`](https://www.nextflow.io/docs/latest/process.html#storeDir).\n" - type: "string" - maxRetries: - oneOf: - - description: "The `maxRetries` directive allows you to define the maximum\ - \ number of times a process instance can be re-submitted in case of failure.\ - \ This value is applied only when using the retry error strategy. 
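Likewise for the staging and caching directives above, a brief hedged sketch (same assumed `directives` map):

```yaml
runners:
  - type: nextflow
    directives:
      cache: lenient        # or true / false / "deep"
      scratch: true         # run in a node-local temporary folder
      stageInMode: copy     # stage inputs by copying instead of symlinking
```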
By default\ - \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" - type: "string" - - description: "The `maxRetries` directive allows you to define the maximum\ - \ number of times a process instance can be re-submitted in case of failure.\ - \ This value is applied only when using the retry error strategy. By default\ - \ only one retry is allowed.\n\nSee [`maxRetries`](https://www.nextflow.io/docs/latest/process.html#maxretries).\n" - type: "integer" - echo: - oneOf: - - description: "By default the stdout produced by the commands executed in\ - \ all processes is ignored. By setting the `echo` directive to true, you\ - \ can forward the process stdout to the current top running process stdout\ - \ file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n" - type: "boolean" - - description: "By default the stdout produced by the commands executed in\ - \ all processes is ignored. By setting the `echo` directive to true, you\ - \ can forward the process stdout to the current top running process stdout\ - \ file, showing it in the shell terminal.\n \nSee [`echo`](https://www.nextflow.io/docs/latest/process.html#echo).\n" - type: "string" - errorStrategy: - description: "The `errorStrategy` directive allows you to define how an error\ - \ condition is managed by the process. By default when an error status is\ - \ returned by the executed script, the process stops immediately. This in\ - \ turn forces the entire pipeline to terminate.\n\nTable of available error\ - \ strategies:\n| Name | Description |\n|------|----------|\n| `terminate` |\ - \ Terminates the execution as soon as an error condition is reported. Pending\ - \ jobs are killed (default) |\n| `finish` | Initiates an orderly pipeline\ - \ shutdown when an error condition is raised, waiting for the completion of\ - \ any submitted job. |\n| `ignore` | Ignores processes execution errors.\ - \ |\n| `retry` | Re-submit for execution a process returning an error condition.\ - \ |\n\nSee [`errorStrategy`](https://www.nextflow.io/docs/latest/process.html#errorstrategy).\n" - type: "string" - memory: - description: "The `memory` directive allows you to define how much memory\ - \ the process is allowed to use.\n\nSee [`memory`](https://www.nextflow.io/docs/latest/process.html#memory).\n" - type: "string" - stageOutMode: - description: "The `stageOutMode` directive defines how output files are staged-out\ - \ from the scratch directory to the process work directory. The following\ - \ values are allowed:\n\n| Value | Description |\n|-------|-------------|\ - \ \n| copy | Output files are copied from the scratch directory to the work\ - \ directory. | \n| move | Output files are moved from the scratch directory\ - \ to the work directory. | \n| rsync | Output files are copied from the\ - \ scratch directory to the work directory by using the rsync utility. |\n\ - \nSee [`stageOutMode`](https://www.nextflow.io/docs/latest/process.html#stageoutmode).\n" - type: "string" - required: [] - additionalProperties: false - NextflowAuto: - description: "Automated processing flags which can be toggled on or off." - type: "object" - properties: - simplifyInput: - description: "If `true`, an input tuple containing only a single File\ - \ (e.g. `[\"foo\", file(\"in.h5ad\")]`) is automatically transformed to\ - \ a map (i.e. 
`[\"foo\", [ input: file(\"in.h5ad\") ] ]`).\n\nDefault: `true`.\n" - type: "boolean" - simplifyOutput: - description: "If `true`, an output tuple containing a map with a File (e.g.\ - \ `[\"foo\", [ output: file(\"out.h5ad\") ] ]`) is automatically transformed\ - \ to a map (i.e. `[\"foo\", file(\"out.h5ad\")]`).\n\nDefault: `false`.\n" - type: "boolean" - publish: - oneOf: - - description: "If `true`, the module's outputs are automatically published\ - \ to `params.publishDir`.\nIf equal to \"state\", also a `.state.yaml`\ - \ file will be published in the publish dir.\nWill throw an error if `params.publishDir`\ - \ is not defined.\n\nDefault: `false`.\n" - type: "boolean" - - description: "If `true`, the module's outputs are automatically published\ - \ to `params.publishDir`.\nIf equal to \"state\", also a `.state.yaml`\ - \ file will be published in the publish dir.\nWill throw an error if `params.publishDir`\ - \ is not defined.\n\nDefault: `false`.\n" - type: "string" - transcript: - description: "If `true`, the module's transcripts from `work/` are automatically\ - \ published to `params.transcriptDir`.\nIf not defined, `params.publishDir\ - \ + \"/_transcripts\"` will be used.\nWill throw an error if neither are\ - \ defined.\n\nDefault: `false`.\n" - type: "boolean" - required: [] - additionalProperties: false - NextflowConfig: - description: "Allows tweaking how the Nextflow Config file is generated." - type: "object" - properties: - labels: - description: "A series of default labels to specify memory and cpu constraints.\n\ - \nThe default memory labels are defined as \"mem1gb\", \"mem2gb\", \"mem4gb\"\ - , ... upto \"mem512tb\" and follows powers of 2.\nThe default cpu labels\ - \ are defined as \"cpu1\", \"cpu2\", \"cpu5\", \"cpu10\", ... upto \"cpu1000\"\ - \ and follows a semi logarithmic scale (1, 2, 5 per decade).\n\nConceptually\ - \ it is possible for a Viash Config to overwrite the full labels parameter,\ - \ however likely it is more efficient to add additional labels\nin the Viash\ - \ Package with a config mod.\n" - type: "object" - additionalProperties: - type: "string" - script: - oneOf: - - description: "Includes a single string or list of strings into the nextflow.config\ - \ file.\nThis can be used to add custom profiles or include an additional\ - \ config file.\n" - type: "string" - - type: "array" - items: - description: "Includes a single string or list of strings into the nextflow.config\ - \ file.\nThis can be used to add custom profiles or include an additional\ - \ config file.\n" - type: "string" - required: [] - additionalProperties: false - Dependency: - description: "Specifies a Viash component (script or executable) that should be\ - \ made available for the code defined in the component.\nThe dependency components\ - \ are collected and copied to the output folder during the Viash build step.\n" - type: "object" - properties: - name: - description: "The full name of the dependency component. This should include\ - \ the namespace." - type: "string" - repository: - oneOf: - - description: "Specifies the repository location where the dependency component\ - \ can be found.\nThis must either be a full definition of the repository\ - \ or the name of a repository referenced as it is defined under repositories.\n\ - Additionally, the full definition can be specified as a single string\ - \ where all parameters such as repository type, url, branch or tag are\ - \ specified.\nOmitting the value sets the dependency as a local dependency,\ - \ ie. 
the dependency is available in the same namespace as the component.\n" - type: "string" - - description: "Specifies the repository location where the dependency component\ - \ can be found.\nThis must either be a full definition of the repository\ - \ or the name of a repository referenced as it is defined under repositories.\n\ - Additionally, the full definition can be specified as a single string\ - \ where all parameters such as repository type, url, branch or tag are\ - \ specified.\nOmitting the value sets the dependency as a local dependency,\ - \ ie. the dependency is available in the same namespace as the component.\n" - $ref: "#/definitions/Repository" - alias: - description: "An alternative name for the dependency component. This can include\ - \ a namespace if so needed." - type: "string" - required: - - "name" - additionalProperties: false - Repository: - oneOf: - - $ref: "#/definitions/LocalRepository" - - $ref: "#/definitions/GitRepository" - - $ref: "#/definitions/GithubRepository" - - $ref: "#/definitions/ViashhubRepository" - LocalRepository: - description: "Defines a locally present and available repository.\nThis can be\ - \ used to define components from the same code base as the current component.\n\ - Alternatively, this can be used to refer to a code repository present on the\ - \ local hard-drive instead of fetchable remotely, for example during development.\n" - type: "object" - properties: - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - type: - description: "Defines a locally present and available repository.\nThis can\ - \ be used to define components from the same code base as the current component.\n\ - Alternatively, this can be used to refer to a code repository present on\ - \ the local hard-drive instead of fetchable remotely, for example during\ - \ development.\n" - const: "local" - required: - - "type" - additionalProperties: false - GitRepository: - description: "A Git repository where remote dependency components can be found." - type: "object" - properties: - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - uri: - description: "The URI of the Git repository." - type: "string" - type: - description: "A Git repository where remote dependency components can be found." - const: "git" - required: - - "uri" - - "type" - additionalProperties: false - GithubRepository: - description: "A GitHub repository where remote dependency components can be found." - type: "object" - properties: - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - repo: - description: "The name of the GitHub repository." - type: "string" - type: - description: "A GitHub repository where remote dependency components can be\ - \ found." 
- const: "github" - required: - - "repo" - - "type" - additionalProperties: false - ViashhubRepository: - description: "A Viash-Hub repository where remote dependency components can be\ - \ found." - type: "object" - properties: - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - repo: - description: "The name of the Viash-Hub repository." - type: "string" - type: - description: "A Viash-Hub repository where remote dependency components can\ - \ be found." - const: "viashhub" - required: - - "repo" - - "type" - additionalProperties: false - RepositoryWithName: - oneOf: - - $ref: "#/definitions/LocalRepositoryWithName" - - $ref: "#/definitions/GitRepositoryWithName" - - $ref: "#/definitions/GithubRepositoryWithName" - - $ref: "#/definitions/ViashhubRepositoryWithName" - LocalRepositoryWithName: - description: "Defines a locally present and available repository.\nThis can be\ - \ used to define components from the same code base as the current component.\n\ - Alternatively, this can be used to refer to a code repository present on the\ - \ local hard-drive instead of fetchable remotely, for example during development.\n" - type: "object" - properties: - name: - description: "The identifier used to refer to this repository from dependencies." - type: "string" - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - type: - description: "Defines a locally present and available repository.\nThis can\ - \ be used to define components from the same code base as the current component.\n\ - Alternatively, this can be used to refer to a code repository present on\ - \ the local hard-drive instead of fetchable remotely, for example during\ - \ development.\n" - const: "localwithname" - required: - - "name" - - "type" - additionalProperties: false - GitRepositoryWithName: - description: "A Git repository where remote dependency components can be found." - type: "object" - properties: - name: - description: "The identifier used to refer to this repository from dependencies." - type: "string" - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - uri: - description: "The URI of the Git repository." - type: "string" - type: - description: "A Git repository where remote dependency components can be found." - const: "gitwithname" - required: - - "name" - - "uri" - - "type" - additionalProperties: false - GithubRepositoryWithName: - description: "A GitHub repository where remote dependency components can be found." - type: "object" - properties: - name: - description: "The identifier used to refer to this repository from dependencies." - type: "string" - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. 
Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - repo: - description: "The name of the GitHub repository." - type: "string" - type: - description: "A GitHub repository where remote dependency components can be\ - \ found." - const: "githubwithname" - required: - - "name" - - "repo" - - "type" - additionalProperties: false - ViashhubRepositoryWithName: - description: "A Viash-Hub repository where remote dependency components can be\ - \ found." - type: "object" - properties: - name: - description: "The identifier used to refer to this repository from dependencies." - type: "string" - path: - description: "Defines a subfolder of the repository to use as base to look\ - \ for the dependency components." - type: "string" - tag: - description: "Defines which version of the dependency component to use. Typically\ - \ this can be a specific tag, branch or commit hash." - type: "string" - repo: - description: "The name of the Viash-Hub repository." - type: "string" - type: - description: "A Viash-Hub repository where remote dependency components can\ - \ be found." - const: "viashhubwithname" - required: - - "name" - - "repo" - - "type" - additionalProperties: false - DockerSetupStrategy: - enum: - - "cb" - - "ifneedbepullelsecachedbuild" - - "donothing" - - "gentlepush" - - "alwayspullelsebuild" - - "build" - - "alwayspull" - - "alwaysbuild" - - "ifneedbebuild" - - "pullelsebuild" - - "p" - - "alwayspullelsecachedbuild" - - "pull" - - "maybepush" - - "ifneedbepullelsebuild" - - "cachedbuild" - - "pullelsecachedbuild" - - "push" - - "forcepush" - - "alwayspush" - - "b" - - "pushifnotpresent" - - "alwayscachedbuild" - - "meh" - - "ifneedbepull" - - "ifneedbecachedbuild" - $comment: "TODO add descriptions to different strategies" - description: "The Docker setup strategy to use when building a container." - Direction: - enum: - - "input" - - "output" - description: "Makes this argument an `input` or an `output`, as in does the file/folder\ - \ needs to be read or written. `input` by default." - Status: - enum: - - "enabled" - - "disabled" - - "deprecated" - description: "Allows setting a component to active, deprecated or disabled." - DockerResolveVolume: - enum: - - "manual" - - "automatic" - - "auto" - - "Manual" - - "Automatic" - - "Auto" - $comment: "TODO make fully case insensitive" - description: "Enables or disables automatic volume mapping. Enabled when set to\ - \ `Automatic` or disabled when set to `Manual`. Default: `Automatic`" - DoubleStrings: - enum: - - "+.inf" - - "+inf" - - "+infinity" - - "positiveinfinity" - - "positiveinf" - - "-.inf" - - "-inf" - - "-infinity" - - "negativeinfinity" - - "negativeinf" - - ".nan" - - "nan" - DoubleWithInf: - oneOf: - - type: "number" - - $ref: "#/definitions/DoubleStrings" -oneOf: -- $ref: "#/definitions/Config" diff --git a/CHANGELOG.md b/CHANGELOG.md index 8faaf0f2..f6a8676f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,40 @@ -# base unreleased +# biobox x.x.x + +## NEW FEATURES + +* `bd_rhapsody`: + + - `bd_rhapsody/bd_rhapsody_make_reference`: Create a reference for the BD Rhapsody pipeline (PR #75). + +## MINOR CHANGES + +* `busco` components: update BUSCO to `5.7.1` (PR #72). + +## DOCUMENTATION + +* Extend the contributing guidelines (PR #82): + + - Update format to Viash 0.9. + + - Descriptions should be formatted in markdown. + + - Add defaults to descriptions, not as a default of the argument. + + - Explain parameter expansion. 
+
+  - Mention that tests should check the contents of a component's output.
+
+## BUG FIXES
+
+* `pear`: fix component not exiting with the correct exit code when PEAR fails (PR #70).
+
+* `cutadapt`: fix `--par_quality_cutoff_r2` argument (PR #69).
+
+* `cutadapt`: demultiplexing is now disabled by default. It can be re-enabled by using `demultiplex_mode` (PR #69).
+
+* `multiqc`: update multiple separator to `;` (PR #81).
+
+# biobox 0.1.0

## BREAKING CHANGES

@@ -6,6 +42,7 @@
   Viash 0.9.0 in order to avoid issues with the current default separator `:`
   unintentionally splitting up certain file paths.
 
+
 ## NEW FEATURES
 
 * `arriba`: Detect gene fusions from RNA-seq data (PR #1).
@@ -17,6 +54,8 @@
   - `busco/busco_list_datasets`: Lists available busco datasets (PR #18).
   - `busco/busco_download_datasets`: Download busco datasets (PR #19).
 
+* `cutadapt`: Remove adapter sequences from high-throughput sequencing reads (PR #7).
+
 * `featurecounts`: Assign sequence reads to genomic features (PR #11).
 
 * `bgzip`: Add bgzip functionality to compress and decompress files (PR #13).
@@ -29,7 +68,9 @@
 * `multiqc`: Aggregate results from bioinformatics analyses across many samples into a single report (PR #42).
 
-* `star/star_align_reads`: Align reads to a reference genome (PR #22).
+* `star`:
+  - `star/star_align_reads`: Align reads to a reference genome (PR #22).
+  - `star/star_genome_generate`: Generate a genome index for STAR alignment (PR #58).
 
 * `gffread`: Validate, filter, convert and perform other operations on GFF files (PR #29).
 
@@ -47,13 +88,22 @@
   - `samtools/samtools_collate`: Shuffles and groups reads in SAM/BAM/CRAM files together by their names (PR #42).
   - `samtools/samtools_view`: Views and converts SAM/BAM/CRAM files (PR #48).
   - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52).
+  - `samtools/samtools_fasta`: Converts a SAM/BAM/CRAM file to FASTA (PR #53).
+
 * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43).
 
 * `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68).
 
+* `umitools`:
+  - `umitools_dedup`: Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read (PR #54).
+
+* `bedtools`:
+  - `bedtools_getfasta`: extract sequences from a FASTA file for each of the
+    intervals defined in a BED/GFF/VCF file (PR #59).
 
-## MAJOR CHANGES
+* `agat`:
+  - `agat_convert_sp_gff2gtf`: convert any GTF/GFF file into a proper GTF file (PR #76).
 
 ## MINOR CHANGES
 
@@ -63,8 +113,14 @@
 
 * Update to Viash 0.9.0-RC3 (PR #51).
 
+* Update to Viash 0.9.0-RC6 (PR #63).
+
+* Switch to viash-hub/toolbox actions (PR #64).
+
 ## DOCUMENTATION
 
+* Update README (PR #64).
+
 ## BUG FIXES
 
 * Add escaping character before leading hashtag in the description field of the config file (PR #50).
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7393bc7e..cee4249a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,22 +65,21 @@ runners:
 Fill in the relevant metadata fields in the config. Here is an example of the metadata of an existing component.
 
 ```yaml
-functionality:
-  name: arriba
-  description: Detect gene fusions from RNA-Seq data
-  keywords: [Gene fusion, RNA-Seq]
-  links:
-    homepage: https://arriba.readthedocs.io/en/latest/
-    documentation: https://arriba.readthedocs.io/en/latest/
-    repository: https://github.com/suhrig/arriba
-    issue_tracker: https://github.com/suhrig/arriba/issues
-  references:
-    doi: 10.1101/gr.257246.119
-    bibtex: |
-      @article{
-        ... 
a bibtex entry in case the doi is not available ... - } - license: MIT +name: arriba +description: Detect gene fusions from RNA-Seq data +keywords: [Gene fusion, RNA-Seq] +links: + homepage: https://arriba.readthedocs.io/en/latest/ + documentation: https://arriba.readthedocs.io/en/latest/ + repository: https://github.com/suhrig/arriba + issue_tracker: https://github.com/suhrig/arriba/issues +references: + doi: 10.1101/gr.257246.119 + bibtex: | + @article{ + ... a bibtex entry in case the doi is not available ... + } +license: MIT ``` ### Step 4: Find a suitable container @@ -162,7 +161,7 @@ argument_groups: type: file description: | File in SAM/BAM/CRAM format with main alignments as generated by STAR - (Aligned.out.sam). Arriba extracts candidate reads from this file. + (`Aligned.out.sam`). Arriba extracts candidate reads from this file. required: true example: Aligned.out.bam ``` @@ -175,7 +174,7 @@ Several notes: * Input arguments can have `multiple: true` to allow the user to specify multiple files. - +* The description should be formatted in markdown. ### Step 8: Add arguments for the output files @@ -220,7 +219,7 @@ argument_groups: Note: -* Preferably, these outputs should not be directores but files. For example, if a tool outputs a directory `foo/` containing files `foo/bar.txt` and `foo/baz.txt`, there should be two output arguments `--bar` and `--baz` (as opposed to one output argument which outputs the whole `foo/` directory). +* Preferably, these outputs should not be directories but files. For example, if a tool outputs a directory `foo/` containing files `foo/bar.txt` and `foo/baz.txt`, there should be two output arguments `--bar` and `--baz` (as opposed to one output argument which outputs the whole `foo/` directory). ### Step 9: Add arguments for the other arguments @@ -230,6 +229,8 @@ Finally, add all other arguments to the config file. There are a few exceptions: * Arguments related to printing the information such as printing the version (`-v`, `--version`) or printing the help (`-h`, `--help`) should not be added to the config file. +* If the help lists defaults, do not add them as defaults but to the description. Example: `description: . Default: 10.` + ### Step 10: Add a Docker engine @@ -275,10 +276,13 @@ Next, we need to write a runner script that runs the tool with the input argumen ## VIASH START ## VIASH END +# unset flags +[[ "$par_option" == "false" ]] && unset par_option + xxx \ --input "$par_input" \ --output "$par_output" \ - $([ "$par_option" = "true" ] && echo "--option") + ${par_option:+--option} ``` When building a Viash component, Viash will automatically replace the `## VIASH START` and `## VIASH END` lines (and anything in between) with environment variables based on the arguments specified in the config. 
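For illustration, after building a component with an `--input`, `--output` and `--option` argument, the generated block could look something like this (a hypothetical sketch, not literal Viash output; the exact variable values are filled in by Viash based on the config and the actual invocation):

```bash
## VIASH START
# illustrative values only; Viash generates these assignments automatically
par_input="path/to/input.txt"
par_output="output.txt"
par_option="true"
## VIASH END
```

This is why the runner script can refer to `$par_input` and friends without defining them itself.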
@@ -291,6 +295,11 @@ As an example, this is what the Bash script for the `arriba` component looks lik
 ## VIASH START
 ## VIASH END
 
+# unset flags
+[[ "$par_skip_duplicate_marking" == "false" ]] && unset par_skip_duplicate_marking
+[[ "$par_extra_information" == "false" ]] && unset par_extra_information
+[[ "$par_fill_gaps" == "false" ]] && unset par_fill_gaps
+
 arriba \
   -x "$par_bam" \
   -a "$par_genome" \
@@ -298,26 +307,30 @@ arriba \
   -o "$par_fusions" \
   ${par_known_fusions:+-k "${par_known_fusions}"} \
   ${par_blacklist:+-b "${par_blacklist}"} \
-  ${par_structural_variants:+-d "${par_structural_variants}"} \
-  $([ "$par_skip_duplicate_marking" = "true" ] && echo "-u") \
-  $([ "$par_extra_information" = "true" ] && echo "-X") \
-  $([ "$par_fill_gaps" = "true" ] && echo "-I")
+  # ...
+  ${par_extra_information:+-X} \
+  ${par_fill_gaps:+-I}
 ```
 
+Notes:
 
-### Step 12: Create test script
+* If your arguments can contain special characters (e.g. `$`), you can use Bash's `@Q` [parameter transformation](https://www.gnu.org/software/bash/manual/html_node/Shell-Parameter-Expansion.html) to quote the value so the string is passed through unmodified. Example: `-x ${par_bam@Q}`.
+* Optional arguments can be passed to the command conditionally using Bash [parameter expansion](https://www.gnu.org/software/bash/manual/html_node/Shell-Parameter-Expansion.html). For example: `${par_known_fusions:+-k ${par_known_fusions@Q}}`
+
+* If your tool allows for multiple inputs using a separator other than `;` (which is the default Viash multiple separator), you can substitute these values with a command like: `par_disable_filters=$(echo $par_disable_filters | tr ';' ',')`.
+
+
+### Step 12: Create test script
 
 If the unit test requires test resources, these should be provided in the `test_resources` section of the component.
 
 ```yaml
-functionality:
-  # ...
-  test_resources:
-    - type: bash_script
-      path: test.sh
-    - type: file
-      path: test_data
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - type: file
+    path: test_data
 ```
 
 Create a test script at `src/xxx/test.sh` that runs the component with the test data. This script should run the component (available with `$meta_executable`) with the test data and check if the output is as expected. The script should exit with a non-zero exit code if the output is not as expected. For example:
 
@@ -325,48 +338,64 @@ Create a test script at `src/xxx/test.sh` that runs the component with the test
 
 ```bash
 #!/bin/bash
 
+set -e
+
 ## VIASH START
 ## VIASH END
 
-echo "> Run xxx with test data"
+#############################################
+# helper functions
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+assert_file_doesnt_exist() {
+  [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; }
+}
+assert_file_empty() {
+  [ ! 
-s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; }
+}
+assert_file_not_empty() {
+  [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
+}
+assert_file_contains() {
+  grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+assert_file_not_contains() {
+  grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; }
+}
+assert_file_contains_regex() {
+  grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+assert_file_not_contains_regex() {
+  grep -q -E "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; }
+}
+#############################################
+
+echo "> Run $meta_name with test data"
 "$meta_executable" \
-  --input "$meta_resources_dir/test_data/input.txt" \
+  --input "$meta_resources_dir/test_data/reads_R1.fastq" \
   --output "output.txt" \
   --option
 
-echo ">> Checking output"
-[ ! -f "output.txt" ] && echo "Output file output.txt does not exist" && exit 1
-```
+echo ">> Check if output exists"
+assert_file_exists "output.txt"
 
+echo ">> Check if output is not empty"
+assert_file_not_empty "output.txt"
 
-For example, this is what the test script for the `arriba` component looks like:
+echo ">> Check if output is correct"
+assert_file_contains "output.txt" "some expected output"
 
-```bash
-#!/bin/bash
+echo "> All tests succeeded!"
+```
 
-## VIASH START
-## VIASH END
+Notes:
 
-echo "> Run arriba with blacklist"
-"$meta_executable" \
-  --bam "$meta_resources_dir/test_data/A.bam" \
-  --genome "$meta_resources_dir/test_data/genome.fasta" \
-  --gene_annotation "$meta_resources_dir/test_data/annotation.gtf" \
-  --blacklist "$meta_resources_dir/test_data/blacklist.tsv" \
-  --fusions "fusions.tsv" \
-  --fusions_discarded "fusions_discarded.tsv" \
-  --interesting_contigs "1,2"
-
-echo ">> Checking output"
-[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1
-[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1
+* Always check the contents of the output file. If the output is not deterministic, you can use regular expressions to check the output.
 
-echo ">> Check if output is empty"
-[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1
-[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1
-```
+* If possible, generate your own test data instead of copying it from an external resource.
 
-### Step 12: Create a `/var/software_versions.txt` file
+### Step 13: Create a `/var/software_versions.txt` file
 
 For the sake of transparency and reproducibility, we require that the versions of the software used in the component are documented.
 
@@ -378,6 +407,8 @@ engines:
     image: quay.io/biocontainers/xxx:0.1.0--py_0
     setup:
       - type: docker
+        # note: /var/software_versions.txt should contain:
+        # xxx: "0.1.0"
         run: |
           echo "xxx: \"0.1.0\"" > /var/software_versions.txt
 ```
diff --git a/README.md b/README.md
index ecf807ca..4b497dcd 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,24 @@
-# Base repository for reusable Viash components
-This repository is a collection of reproducible and reusable Viash
-components.
+# 🌱📦 biobox + +[![ViashHub](https://img.shields.io/badge/ViashHub-biobox-7a4baa.png)](https://web.viash-hub.com/packages/biobox) +[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2Fbiobox-blue.png)](https://github.com/viash-hub/biobox) +[![GitHub +License](https://img.shields.io/github/license/viash-hub/biobox.png)](https://github.com/viash-hub/biobox/blob/main/LICENSE) +[![GitHub +Issues](https://img.shields.io/github/issues/viash-hub/biobox.png)](https://github.com/viash-hub/biobox/issues) +[![Viash +version](https://img.shields.io/badge/Viash-v0.9.0--RC6-blue)](https://viash.io) + +A collection of bioinformatics tools for working with sequence data. ## Objectives - **Reusability**: Facilitating the use of components across various projects and contexts. -- **Reproducibility**: Guaranteeing that bioinformatics analyses can be - reliably replicated. +- **Reproducibility**: Ensuring that components are reproducible and can + be easily shared. - **Best Practices**: Adhering to established standards in software development and bioinformatics. @@ -43,18 +52,21 @@ contribute a component to this repository. 12. Create test script 13. Create a `/var/software_versions.txt` file -See the [CONTRIBUTING](CONTRIBUTING.md) file for more details. +See the +[CONTRIBUTING](https://github.com/viash-hub/biobox/blob/main/CONTRIBUTING.md) +file for more details. ## Support and Community For support, questions, or to join our community: - **Issues**: Submit questions or issues via the [GitHub issue - tracker](https://github.com/viash-hub/base/issues). + tracker](https://github.com/viash-hub/biobox/issues). - **Discussions**: Join our discussions via [GitHub - Discussions](https://github.com/viash-hub/base/discussions). + Discussions](https://github.com/viash-hub/biobox/discussions). ## License This repository is licensed under an MIT license. See the -[LICENSE](LICENSE) file for details. +[LICENSE](https://github.com/viash-hub/biobox/blob/main/LICENSE) file +for details. diff --git a/README.qmd b/README.qmd index 656cdac7..7d36430b 100644 --- a/README.qmd +++ b/README.qmd @@ -1,14 +1,25 @@ --- -title: Base repository for reusable Viash components format: gfm --- +```{r setup, include=FALSE} +project <- yaml::read_yaml("_viash.yaml") +license <- paste0(project$links$repository, "/blob/main/LICENSE") +contributing <- paste0(project$links$repository, "/blob/main/CONTRIBUTING.md") +``` +# 🌱📦 `r project$name` + +[![ViashHub](https://img.shields.io/badge/ViashHub-`r project$name`-7a4baa)](https://web.viash-hub.com/packages/`r project$name`) +[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2F`r project$name`-blue)](`r project$links$repository`) +[![GitHub License](https://img.shields.io/github/license/viash-hub/`r project$name`)](`r license`) +[![GitHub Issues](https://img.shields.io/github/issues/viash-hub/`r project$name`)](`r project$links$issue_tracker`) +[![Viash version](https://img.shields.io/badge/Viash-v`r gsub("-", "--", project$viash_version)`-blue)](https://viash.io) -This repository is a collection of reproducible and reusable Viash components. +`r project$description` ## Objectives - **Reusability**: Facilitating the use of components across various projects and contexts. -- **Reproducibility**: Guaranteeing that bioinformatics analyses can be reliably replicated. +- **Reproducibility**: Ensuring that components are reproducible and can be easily shared. - **Best Practices**: Adhering to established standards in software development and bioinformatics. 
## Contributing
 
@@ -37,15 +48,15 @@ knitr::asis_output(
 )
 ```
 
-See the [CONTRIBUTING](CONTRIBUTING.md) file for more details.
+See the [CONTRIBUTING](`r contributing`) file for more details.
 
 ## Support and Community
 
 For support, questions, or to join our community:
 
-- **Issues**: Submit questions or issues via the [GitHub issue tracker](https://github.com/viash-hub/base/issues).
-- **Discussions**: Join our discussions via [GitHub Discussions](https://github.com/viash-hub/base/discussions).
+- **Issues**: Submit questions or issues via the [GitHub issue tracker](`r project$links$issue_tracker`).
+- **Discussions**: Join our discussions via [GitHub Discussions](`r project$links$repository`/discussions).
 
 ## License
 
-This repository is licensed under an MIT license. See the [LICENSE](LICENSE) file for details.
+This repository is licensed under an MIT license. See the [LICENSE](`r license`) file for details.
diff --git a/_viash.yaml b/_viash.yaml
index 5b6cf3f7..9a240c24 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -2,12 +2,12 @@ name: biobox
 description: |
   A collection of bioinformatics tools for working with sequence data.
 license: MIT
-keywords: [bioinformatics, sequence, alignment, variant calling, dna, rna]
+keywords: [bioinformatics, modules, sequencing]
 links:
   issue_tracker: https://github.com/viash-hub/biobox/issues
-  repository: https://github.com/viash-hub/biobbox
+  repository: https://github.com/viash-hub/biobox
 
-viash_version: 0.9.0-RC3
+viash_version: 0.9.0-RC6
 
 config_mods: |
   .requirements.commands := ['ps']
diff --git a/src/_authors/robrecht_cannoodt.yaml b/src/_authors/robrecht_cannoodt.yaml
new file mode 100644
index 00000000..d7c0f283
--- /dev/null
+++ b/src/_authors/robrecht_cannoodt.yaml
@@ -0,0 +1,14 @@
+name: Robrecht Cannoodt
+info:
+  links:
+    email: robrecht@data-intuitive.com
+    github: rcannood
+    orcid: "0000-0003-3641-729X"
+    linkedin: robrechtcannoodt
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Science Engineer
+    - name: Open Problems
+      href: https://openproblems.bio
+      role: Core Member
\ No newline at end of file
diff --git a/src/_authors/weiwei_schultz.yaml b/src/_authors/weiwei_schultz.yaml
new file mode 100644
index 00000000..324f9378
--- /dev/null
+++ b/src/_authors/weiwei_schultz.yaml
@@ -0,0 +1,5 @@
+name: Weiwei Schultz
+info:
+  organizations:
+    - name: Janssen R&D US
+      role: Associate Director Data Sciences
\ No newline at end of file
diff --git a/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml
new file mode 100644
index 00000000..b788c7c7
--- /dev/null
+++ b/src/agat/agat_convert_sp_gff2gtf/config.vsh.yaml
@@ -0,0 +1,90 @@
+name: agat_convert_sp_gff2gtf
+namespace: agat
+description: |
+  The script aims to convert any GTF/GFF file into a proper GTF file. Full
+  information about the format can be found here:
+  https://agat.readthedocs.io/en/latest/gxf.html You can choose among 7
+  different GTF types (1, 2, 2.1, 2.2, 2.5, 3 or relax). Depending on the
+  version selected, the script will filter out the features that are not
+  accepted. For GTF2.5 and 3, every level1 feature (e.g. nc_gene,
+  pseudogene) will be converted into a gene feature and every level2 feature
+  (e.g. mRNA, ncRNA) will be converted into a transcript feature. Using the
+  "relax" option you will produce a GTF-like output keeping all original
+  feature types (3rd column). No modification (e.g. mRNA to
+  transcript) will occur. 
+
+  To be fully GTF compliant, all features must have a gene_id and a transcript_id
+  attribute. The gene_id is a unique identifier for the genomic source of
+  the transcript, which is used to group transcripts into genes. The
+  transcript_id is a unique identifier for the predicted transcript, which
+  is used to group features into transcripts.
+keywords: [gene annotations, GTF conversion]
+links:
+  homepage: https://github.com/NBISweden/AGAT
+  documentation: https://agat.readthedocs.io/
+  issue_tracker: https://github.com/NBISweden/AGAT/issues
+  repository: https://github.com/NBISweden/AGAT
+references:
+  doi: 10.5281/zenodo.3552717
+license: GPL-3.0
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --gff
+        alternatives: [-i]
+        description: Input GFF/GTF file that will be read
+        type: file
+        required: true
+        direction: input
+        example: input.gff
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: [-o, --out, --outfile, --gtf]
+        description: Output GTF file. If no output file is specified, the output will be written to STDOUT.
+        type: file
+        direction: output
+        required: true
+        example: output.gtf
+  - name: Arguments
+    arguments:
+      - name: --gtf_version
+        description: |
+          Version of the GTF output (1,2,2.1,2.2,2.5,3 or relax). Default value from AGAT config file (relax for the default config). The script option has the higher priority.
+
+          * relax: all feature types are accepted.
+          * GTF3 (9 feature types accepted): gene, transcript, exon, CDS, Selenocysteine, start_codon, stop_codon, three_prime_utr and five_prime_utr.
+          * GTF2.5 (8 feature types accepted): gene, transcript, exon, CDS, UTR, start_codon, stop_codon, Selenocysteine.
+          * GTF2.2 (9 feature types accepted): CDS, start_codon, stop_codon, 5UTR, 3UTR, inter, inter_CNS, intron_CNS and exon.
+          * GTF2.1 (6 feature types accepted): CDS, start_codon, stop_codon, exon, 5UTR, 3UTR.
+          * GTF2 (4 feature types accepted): CDS, start_codon, stop_codon, exon.
+          * GTF1 (5 feature types accepted): CDS, start_codon, stop_codon, exon, intron.
+        type: string
+        choices: [relax, "1", "2", "2.1", "2.2", "2.5", "3"]
+        required: false
+        example: "3"
+      - name: --config
+        alternatives: [-c]
+        description: |
+          Input agat config file. By default AGAT takes as input the agat_config.yaml file from the working directory if any, otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ type: file + required: false + example: custom_agat_config.yaml +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/agat:1.4.0--pl5321hdfd78af_0 + setup: + - type: docker + run: | + agat --version | sed 's/AGAT\s\(.*\)/agat: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/agat/agat_convert_sp_gff2gtf/help.txt b/src/agat/agat_convert_sp_gff2gtf/help.txt new file mode 100644 index 00000000..fdd45507 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/help.txt @@ -0,0 +1,102 @@ +```sh +agat_convert_sp_gff2gtf.pl --help +``` + ------------------------------------------------------------------------------ +| Another GFF Analysis Toolkit (AGAT) - Version: v1.4.0 | +| https://github.com/NBISweden/AGAT | +| National Bioinformatics Infrastructure Sweden (NBIS) - www.nbis.se | + ------------------------------------------------------------------------------ + + +Name: + agat_convert_sp_gff2gtf.pl + +Description: + The script aims to convert any GTF/GFF file into a proper GTF file. Full + information about the format can be found here: + https://agat.readthedocs.io/en/latest/gxf.html You can choose among 7 + different GTF types (1, 2, 2.1, 2.2, 2.5, 3 or relax). Depending the + version selected the script will filter out the features that are not + accepted. For GTF2.5 and 3, every level1 feature (e.g nc_gene + pseudogene) will be converted into gene feature and every level2 feature + (e.g mRNA ncRNA) will be converted into transcript feature. Using the + "relax" option you will produce a GTF-like output keeping all original + feature types (3rd column). No modification will occur e.g. mRNA to + transcript. + + To be fully GTF compliant all feature have a gene_id and a transcript_id + attribute. The gene_id is unique identifier for the genomic source of + the transcript, which is used to group transcripts into genes. The + transcript_id is a unique identifier for the predicted transcript, which + is used to group features into transcripts. + +Usage: + agat_convert_sp_gff2gtf.pl --gff infile.gff [ -o outfile ] + agat_convert_sp_gff2gtf -h + +Options: + --gff, --gtf or -i + Input GFF/GTF file that will be read + + --gtf_version version of the GTF output (1,2,2.1,2.2,2.5,3 or relax). + Default value from AGAT config file (relax for the default config). The + script option has the higher priority. + relax: all feature types are accepted. + + GTF3 (9 feature types accepted): gene, transcript, exon, CDS, + Selenocysteine, start_codon, stop_codon, three_prime_utr and + five_prime_utr + + GTF2.5 (8 feature types accepted): gene, transcript, exon, CDS, + UTR, start_codon, stop_codon, Selenocysteine + + GTF2.2 (9 feature types accepted): CDS, start_codon, stop_codon, + 5UTR, 3UTR, inter, inter_CNS, intron_CNS and exon + + GTF2.1 (6 feature types accepted): CDS, start_codon, stop_codon, + exon, 5UTR, 3UTR + + GTF2 (4 feature types accepted): CDS, start_codon, stop_codon, + exon + + GTF1 (5 feature types accepted): CDS, start_codon, stop_codon, + exon, intron + + -o , --output , --out , --outfile or --gtf + Output GTF file. If no output file is specified, the output will + be written to STDOUT. + + -c or --config + String - Input agat config file. 
By default AGAT takes as input + agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with + AGAT. To get the agat_config.yaml locally type: "agat config + --expose". The --config option gives you the possibility to use + your own AGAT config file (located elsewhere or named + differently). + + -h or --help + Display this helpful text. + +Feedback: + Did you find a bug?: + Do not hesitate to report bugs to help us keep track of the bugs and + their resolution. Please use the GitHub issue tracking system available + at this address: + + https://github.com/NBISweden/AGAT/issues + + Ensure that the bug was not already reported by searching under Issues. + If you're unable to find an (open) issue addressing the problem, open a new one. + Try as much as possible to include in the issue when relevant: + - a clear description, + - as much relevant information as possible, + - the command used, + - a data sample, + - an explanation of the expected behaviour that is not occurring. + + Do you want to contribute?: + You are very welcome, visit this address for the Contributing + guidelines: + https://github.com/NBISweden/AGAT/blob/master/CONTRIBUTING.md + diff --git a/src/agat/agat_convert_sp_gff2gtf/script.sh b/src/agat/agat_convert_sp_gff2gtf/script.sh new file mode 100644 index 00000000..69d66739 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/script.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +agat_convert_sp_gff2gtf.pl \ + -i "$par_gff" \ + -o "$par_output" \ + ${par_gtf_version:+--gtf_version "${par_gtf_version}"} \ + ${par_config:+--config "${par_config}"} diff --git a/src/agat/agat_convert_sp_gff2gtf/test.sh b/src/agat/agat_convert_sp_gff2gtf/test.sh new file mode 100644 index 00000000..1e7cc142 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +test_dir="${meta_resources_dir}/test_data" + +echo "> Run $meta_name with test data" +"$meta_executable" \ + --gff "$test_dir/0_test.gff" \ + --output "output.gtf" + +echo ">> Checking output" +[ ! -f "output.gtf" ] && echo "Output file output.gtf does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "output.gtf" ] && echo "Output file output.gtf is empty" && exit 1 + +echo ">> Check if the conversion resulted in the right GTF format" +idGFF=$(head -n 2 "$test_dir/0_test.gff" | grep -o 'ID=[^;]*' | cut -d '=' -f 2-) +expectedGTF="gene_id \"$idGFF\"; ID \"$idGFF\";" +extractedGTF=$(head -n 3 "output.gtf" | grep -o 'gene_id "[^"]*"; ID "[^"]*";') +[ "$extractedGTF" != "$expectedGTF" ] && echo "Output file output.gtf does not have the right format" && exit 1 + +rm output.gtf + +echo "> Run $meta_name with test data and GTF version 2.5" +"$meta_executable" \ + --gff "$test_dir/0_test.gff" \ + --output "output.gtf" \ + --gtf_version "2.5" + +echo ">> Check if the output file header display the right GTF version" +grep -q "##gtf-version 2.5" "output.gtf" +[ $? -ne 0 ] && echo "Output file output.gtf header does not display the right GTF version" && exit 1 + +echo "> Test successful" \ No newline at end of file diff --git a/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff b/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff new file mode 100644 index 00000000..fafe86ed --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test_data/0_test.gff @@ -0,0 +1,36 @@ +##gff-version 3 +scaffold625 maker gene 337818 343277 . + . 
ID=CLUHARG00000005458;Name=TUBB3_2 +scaffold625 maker mRNA 337818 343277 . + . ID=CLUHART00000008717;Parent=CLUHARG00000005458 +scaffold625 maker exon 337818 337971 . + . ID=CLUHART00000008717:exon:1404;Parent=CLUHART00000008717 +scaffold625 maker exon 340733 340841 . + . ID=CLUHART00000008717:exon:1405;Parent=CLUHART00000008717 +scaffold625 maker exon 341518 341628 . + . ID=CLUHART00000008717:exon:1406;Parent=CLUHART00000008717 +scaffold625 maker exon 341964 343277 . + . ID=CLUHART00000008717:exon:1407;Parent=CLUHART00000008717 +scaffold625 maker CDS 337915 337971 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 340733 340841 . + 0 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341518 341628 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker CDS 341964 343033 . + 2 ID=CLUHART00000008717:cds;Parent=CLUHART00000008717 +scaffold625 maker five_prime_UTR 337818 337914 . + . ID=CLUHART00000008717:five_prime_utr;Parent=CLUHART00000008717 +scaffold625 maker three_prime_UTR 343034 343277 . + . ID=CLUHART00000008717:three_prime_utr;Parent=CLUHART00000008717 +scaffold789 maker gene 558184 564780 . + . ID=CLUHARG00000003852;Name=PF11_0240 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006146;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006146:exon:995;Parent=CLUHART00000006146 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006146:exon:996;Parent=CLUHART00000006146 +scaffold789 maker exon 564171 564235 . + . ID=CLUHART00000006146:exon:997;Parent=CLUHART00000006146 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006146:exon:998;Parent=CLUHART00000006146 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564171 564235 . + 0 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006146:cds;Parent=CLUHART00000006146 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006146:five_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker three_prime_UTR 564589 564780 . + . ID=CLUHART00000006146:three_prime_utr;Parent=CLUHART00000006146 +scaffold789 maker mRNA 558184 564780 . + . ID=CLUHART00000006147;Parent=CLUHARG00000003852 +scaffold789 maker exon 558184 560123 . + . ID=CLUHART00000006147:exon:997;Parent=CLUHART00000006147 +scaffold789 maker exon 561401 561519 . + . ID=CLUHART00000006147:exon:998;Parent=CLUHART00000006147 +scaffold789 maker exon 562057 562121 . + . ID=CLUHART00000006147:exon:999;Parent=CLUHART00000006147 +scaffold789 maker exon 564372 564780 . + . ID=CLUHART00000006147:exon:1000;Parent=CLUHART00000006147 +scaffold789 maker CDS 558191 560123 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 561401 561519 . + 2 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 562057 562121 . + 0 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker CDS 564372 564588 . + 1 ID=CLUHART00000006147:cds;Parent=CLUHART00000006147 +scaffold789 maker five_prime_UTR 558184 558190 . + . ID=CLUHART00000006147:five_prime_utr;Parent=CLUHART00000006147 +scaffold789 maker three_prime_UTR 564589 564780 . + . 
ID=CLUHART00000006147:three_prime_utr;Parent=CLUHART00000006147 diff --git a/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh b/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh new file mode 100755 index 00000000..e453e772 --- /dev/null +++ b/src/agat/agat_convert_sp_gff2gtf/test_data/script.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# clone repo +if [ ! -d /tmp/agat_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/NBISweden/AGAT /tmp/agat_source +fi + +# copy test data +cp -r /tmp/agat_source/t/gff_syntax/in/0_test.gff src/agat/agat_convert_sp_gff2gtf/test_data diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml new file mode 100644 index 00000000..e596bf06 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/config.vsh.yaml @@ -0,0 +1,143 @@ +name: bd_rhapsody_make_reference +namespace: bd_rhapsody +description: | + The Reference Files Generator creates an archive containing Genome Index + and Transcriptome annotation files needed for the BD Rhapsody Sequencing + Analysis Pipeline. The app takes as input one or more FASTA and GTF files + and produces a compressed archive in the form of a tar.gz file. The + archive contains: + + - STAR index + - Filtered GTF file +keywords: [genome, reference, index, align] +links: + repository: https://bitbucket.org/CRSwDev/cwl/src/master/v2.2.1/Extra_Utilities/ + documentation: https://bd-rhapsody-bioinfo-docs.genomics.bd.com/resources/extra_utilities.html#make-rhapsody-reference +license: Unknown +authors: + - __merge__: /src/_authors/robrecht_cannoodt.yaml + roles: [ author, maintainer ] + - __merge__: /src/_authors/weiwei_schultz.yaml + roles: [ contributor ] + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --genome_fasta + required: true + description: Reference genome file in FASTA or FASTA.GZ format. The BD Rhapsody Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + example: genome_sequence.fa.gz + multiple: true + info: + config_key: Genome_fasta + - type: file + name: --gtf + required: true + description: | + File path to the transcript annotation files in GTF or GTF.GZ format. The Sequence Analysis Pipeline requires the 'gene_name' or + 'gene_id' attribute to be set on each gene and exon feature. Gene and exon feature lines must have the same attribute, and exons + must have a corresponding gene with the same value. For TCR/BCR assays, the TCR or BCR gene segments must have the 'gene_type' or + 'gene_biotype' attribute set, and the value should begin with 'TR' or 'IG', respectively. + example: transcriptome_annotation.gtf.gz + multiple: true + info: + config_key: Gtf + - type: file + name: --extra_sequences + description: | + File path to additional sequences in FASTA format to use when building the STAR index. (e.g. transgenes or CRISPR guide barcodes). + GTF lines for these sequences will be automatically generated and combined with the main GTF. + required: false + multiple: true + info: + config_key: Extra_sequences + - name: Outputs + arguments: + - type: file + name: --reference_archive + direction: output + required: true + description: | + A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an + input in the BD Rhapsody Sequencing Analysis Pipeline. 
+        example: star_index.tar.gz
+  - name: Arguments
+    arguments:
+      - type: string
+        name: --mitochondrial_contigs
+        description: |
+          Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are
+          identified as 'nuclear fragments' in the ATACseq analysis pipeline.
+        required: false
+        multiple: true
+        default: [chrM, chrMT, M, MT]
+        info:
+          config_key: Mitochondrial_contigs
+      - type: boolean_true
+        name: --filtering_off
+        description: |
+          By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features
+          having the following attribute values are kept:
+
+          - protein_coding
+          - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97)
+          - IG_LV_gene
+          - IG_V_gene
+          - IG_V_pseudogene
+          - IG_D_gene
+          - IG_J_gene
+          - IG_J_pseudogene
+          - IG_C_gene
+          - IG_C_pseudogene
+          - TR_V_gene
+          - TR_V_pseudogene
+          - TR_D_gene
+          - TR_J_gene
+          - TR_J_pseudogene
+          - TR_C_gene
+
+          If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True.
+        info:
+          config_key: Filtering_off
+      - type: boolean_true
+        name: --wta_only_index
+        description: Build a WTA only index, otherwise builds a WTA + ATAC index.
+        info:
+          config_key: Wta_Only
+      - type: string
+        name: --extra_star_params
+        description: Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line.
+        example: --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11
+        required: false
+        info:
+          config_key: Extra_STAR_params
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: make_rhap_reference_2.2.1_nodocker.cwl
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+requirements:
+  commands: [ "cwl-runner" ]
+
+engines:
+  - type: docker
+    image: bdgenomics/rhapsody:2.2.1
+    setup:
+      - type: apt
+        packages: [procps]
+      - type: python
+        packages: [cwlref-runner, cwl-runner]
+      - type: docker
+        run: |
+          echo "bdgenomics/rhapsody: 2.2.1" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt
new file mode 100644
index 00000000..cd038b25
--- /dev/null
+++ b/src/bd_rhapsody/bd_rhapsody_make_reference/help.txt
@@ -0,0 +1,66 @@
+```bash
+cwl-runner src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl --help
+```
+
+usage: src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl
+       [-h] [--Archive_prefix ARCHIVE_PREFIX]
+       [--Extra_STAR_params EXTRA_STAR_PARAMS]
+       [--Extra_sequences EXTRA_SEQUENCES] [--Filtering_off] --Genome_fasta
+       GENOME_FASTA --Gtf GTF [--Maximum_threads MAXIMUM_THREADS]
+       [--Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS] [--WTA_Only]
+       [job_order]
+
+The Reference Files Generator creates an archive containing Genome Index and
+Transcriptome annotation files needed for the BD Rhapsody™ Sequencing
+Analysis Pipeline. The app takes as input one or more FASTA and GTF files and
+produces a compressed archive in the form of a tar.gz file. The archive
+contains:\n - STAR index\n - Filtered GTF file
+
+positional arguments:
+  job_order             Job input json file
+
+options:
+  -h, --help            show this help message and exit
+  --Archive_prefix ARCHIVE_PREFIX
+                        A prefix for naming the compressed archive file
+                        containing the Reference genome index and annotation
+                        files. The default value is constructed based on the
The default value is constructed based on the + input Reference files. + --Extra_STAR_params EXTRA_STAR_PARAMS + Additional parameters to pass to STAR when building + the genome index. Specify exactly like how you would + on the command line. Example: --limitGenomeGenerateRAM + 48000 --genomeSAindexNbases 11 + --Extra_sequences EXTRA_SEQUENCES + Additional sequences in FASTA format to use when + building the STAR index. (E.g. phiX genome) + --Filtering_off By default the input Transcript Annotation files are + filtered based on the gene_type/gene_biotype + attribute. Only features having the following + attribute values are are kept: - protein_coding - + lncRNA (lincRNA and antisense for Gencode < + v31/M22/Ensembl97) - IG_LV_gene - IG_V_gene - + IG_V_pseudogene - IG_D_gene - IG_J_gene - + IG_J_pseudogene - IG_C_gene - IG_C_pseudogene - + TR_V_gene - TR_V_pseudogene - TR_D_gene - TR_J_gene - + TR_J_pseudogene - TR_C_gene If you have already pre- + filtered the input Annotation files and/or wish to + turn-off the filtering, please set this option to + True. + --Genome_fasta GENOME_FASTA + Reference genome file in FASTA format. The BD + Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 + for Human and GRCm39 for Mouse. + --Gtf GTF Transcript annotation files in GTF format. The BD + Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode + v42 for Human and M31 for Mouse. + --Maximum_threads MAXIMUM_THREADS + The maximum number of threads to use in the pipeline. + By default, all available cores are used. + --Mitochondrial_Contigs MITOCHONDRIAL_CONTIGS + Names of the Mitochondrial contigs in the provided + Reference Genome. Fragments originating from contigs + other than these are identified as 'nuclear fragments' + in the ATACseq analysis pipeline. + --WTA_Only Build a WTA only index, otherwise builds a WTA + ATAC + index. diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl new file mode 100644 index 00000000..fead2c02 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/make_rhap_reference_2.2.1_nodocker.cwl @@ -0,0 +1,115 @@ +requirements: + InlineJavascriptRequirement: {} +class: CommandLineTool +label: Reference Files Generator for BD Rhapsodyâ„¢ Sequencing Analysis Pipeline +cwlVersion: v1.2 +doc: >- + The Reference Files Generator creates an archive containing Genome Index and Transcriptome annotation files needed for the BD Rhapsodyâ„¢ Sequencing Analysis Pipeline. The app takes as input one or more FASTA and GTF files and produces a compressed archive in the form of a tar.gz file. The archive contains:\n - STAR index\n - Filtered GTF file + + +baseCommand: run_reference_generator.sh +inputs: + Genome_fasta: + type: File[] + label: Reference Genome + doc: |- + Reference genome file in FASTA format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses GRCh38 for Human and GRCm39 for Mouse. + inputBinding: + prefix: --reference-genome + shellQuote: false + Gtf: + type: File[] + label: Transcript Annotations + doc: |- + Transcript annotation files in GTF format. The BD Rhapsodyâ„¢ Sequencing Analysis Pipeline uses Gencode v42 for Human and M31 for Mouse. + inputBinding: + prefix: --gtf + shellQuote: false + Extra_sequences: + type: File[]? + label: Extra Sequences + doc: |- + Additional sequences in FASTA format to use when building the STAR index. (E.g. 
phiX genome)
+    inputBinding:
+      prefix: --extra-sequences
+      shellQuote: false
+  Mitochondrial_Contigs:
+    type: string[]?
+    default: ["chrM", "chrMT", "M", "MT"]
+    label: Mitochondrial Contig Names
+    doc: |-
+      Names of the Mitochondrial contigs in the provided Reference Genome. Fragments originating from contigs other than these are identified as 'nuclear fragments' in the ATACseq analysis pipeline.
+    inputBinding:
+      prefix: --mitochondrial-contigs
+      shellQuote: false
+  Filtering_off:
+    type: boolean?
+    label: Turn off filtering
+    doc: |-
+      By default the input Transcript Annotation files are filtered based on the gene_type/gene_biotype attribute. Only features having the following attribute values are kept:
+      - protein_coding
+      - lncRNA (lincRNA and antisense for Gencode < v31/M22/Ensembl97)
+      - IG_LV_gene
+      - IG_V_gene
+      - IG_V_pseudogene
+      - IG_D_gene
+      - IG_J_gene
+      - IG_J_pseudogene
+      - IG_C_gene
+      - IG_C_pseudogene
+      - TR_V_gene
+      - TR_V_pseudogene
+      - TR_D_gene
+      - TR_J_gene
+      - TR_J_pseudogene
+      - TR_C_gene
+      If you have already pre-filtered the input Annotation files and/or wish to turn-off the filtering, please set this option to True.
+    inputBinding:
+      prefix: --filtering-off
+      shellQuote: false
+  WTA_Only:
+    type: boolean?
+    label: WTA only index
+    doc: Build a WTA only index, otherwise builds a WTA + ATAC index.
+    inputBinding:
+      prefix: --wta-only-index
+      shellQuote: false
+  Archive_prefix:
+    type: string?
+    label: Archive Prefix
+    doc: |-
+      A prefix for naming the compressed archive file containing the Reference genome index and annotation files. The default value is constructed based on the input Reference files.
+    inputBinding:
+      prefix: --archive-prefix
+      shellQuote: false
+  Extra_STAR_params:
+    type: string?
+    label: Extra STAR Params
+    doc: |-
+      Additional parameters to pass to STAR when building the genome index. Specify exactly like how you would on the command line.
+      Example:
+      --limitGenomeGenerateRAM 48000 --genomeSAindexNbases 11
+    inputBinding:
+      prefix: --extra-star-params
+      shellQuote: true
+
+  Maximum_threads:
+    type: int?
+    label: Maximum Number of Threads
+    doc: |-
+      The maximum number of threads to use in the pipeline. By default, all available cores are used.
+    inputBinding:
+      prefix: --maximum-threads
+      shellQuote: false
+
+outputs:
+
+  Archive:
+    type: File
+    doc: |-
+      A Compressed archive containing the Reference Genome Index and annotation GTF files. This archive is meant to be used as an input in the BD Rhapsody™ Sequencing Analysis Pipeline. 
+    id: Reference_Archive
+    label: Reference Files Archive
+    outputBinding:
+      glob: '*.tar.gz'
+
diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/script.py b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py
new file mode 100644
index 00000000..ca635508
--- /dev/null
+++ b/src/bd_rhapsody/bd_rhapsody_make_reference/script.py
@@ -0,0 +1,161 @@
+import os
+import re
+import subprocess
+import tempfile
+from typing import Any
+import yaml
+import shutil
+
+## VIASH START
+par = {
+    "genome_fasta": [],
+    "gtf": [],
+    "extra_sequences": [],
+    "mitochondrial_contigs": ["chrM", "chrMT", "M", "MT"],
+    "filtering_off": False,
+    "wta_only_index": False,
+    "extra_star_params": None,
+    "reference_archive": "output.tar.gz",
+}
+meta = {
+    "config": "target/nextflow/reference/build_bdrhap_2_reference/.config.vsh.yaml",
+    "resources_dir": os.path.abspath("src/reference/build_bdrhap_2_reference"),
+    "temp_dir": os.getenv("VIASH_TEMP"),
+    "memory_mb": None,
+    "cpus": None
+}
+## VIASH END
+
+def clean_arg(argument):
+    argument["clean_name"] = re.sub("^-*", "", argument["name"])
+    return argument
+
+def read_config(path: str) -> dict[str, Any]:
+    with open(path, "r") as f:
+        config = yaml.safe_load(f)
+
+    config["all_arguments"] = [
+        clean_arg(arg)
+        for grp in config["argument_groups"]
+        for arg in grp["arguments"]
+    ]
+
+    return config
+
+def strip_margin(text: str) -> str:
+    return re.sub(r"(\n?)[ \t]*\|", r"\1", text)
+
+def process_params(par: dict[str, Any], config) -> dict[str, Any]:
+    # check input parameters
+    assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta."
+    assert par["gtf"], "Pass at least one set of inputs to --gtf."
+    assert par["reference_archive"].endswith(".tar.gz"), "Output reference_archive must end with .tar.gz."
+
+    # make paths absolute
+    for argument in config["all_arguments"]:
+        if par[argument["clean_name"]] and argument["type"] == "file":
+            if isinstance(par[argument["clean_name"]], list):
+                par[argument["clean_name"]] = [ os.path.abspath(f) for f in par[argument["clean_name"]] ]
+            else:
+                par[argument["clean_name"]] = os.path.abspath(par[argument["clean_name"]])
+
+    return par
+
+def generate_config(par: dict[str, Any], meta, config) -> str:
+    content_list = [strip_margin(f"""\
+        |#!/usr/bin/env cwl-runner
+        |
+        |""")]
+
+
+    config_key_value_pairs = []
+    for argument in config["all_arguments"]:
+        config_key = (argument.get("info") or {}).get("config_key")
+        arg_type = argument["type"]
+        par_value = par[argument["clean_name"]]
+        if par_value and config_key:
+            config_key_value_pairs.append((config_key, arg_type, par_value))
+
+    if meta["cpus"]:
+        config_key_value_pairs.append(("Maximum_threads", "integer", meta["cpus"]))
+
+    for config_key, arg_type, par_value in config_key_value_pairs:
+        if arg_type == "file":
+            entry = strip_margin(f"""\
+                |{config_key}:
+                |""")
+            if isinstance(par_value, list):
+                for file in par_value:
+                    entry += strip_margin(f"""\
+                        | - class: File
+                        |   location: "{file}"
+                        |""")
+            else:
+                entry += strip_margin(f"""\
+                    |  class: File
+                    |  location: "{par_value}"
+                    |""")
+            content_list.append(entry)
+        else:
+            content_list.append(strip_margin(f"""\
+                |{config_key}: {par_value}
+                |"""))
+
+    ## Write config to file
+    return "".join(content_list)
+
+def get_cwl_file(meta: dict[str, Any]) -> str:
+    # create cwl file (if need be)
+    cwl_file = os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl")
+
+    return cwl_file
+
+def main(par: dict[str, Any], meta: dict[str, Any]):
+    config = 
+def main(par: dict[str, Any], meta: dict[str, Any]):
+    config = read_config(meta["config"])
+
+    # Preprocess params
+    par = process_params(par, config)
+
+    # fetch cwl file
+    cwl_file = get_cwl_file(meta)
+
+    # Create the output dir if it does not exist yet (dirname may be empty
+    # when the output path has no directory component)
+    outdir = os.path.dirname(par["reference_archive"])
+    if outdir and not os.path.exists(outdir):
+        os.makedirs(outdir)
+
+    ## Run pipeline
+    with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"]) as temp_dir:
+        # Create params file
+        config_file = os.path.join(temp_dir, "config.yml")
+        config_content = generate_config(par, meta, config)
+        with open(config_file, "w") as f:
+            f.write(config_content)
+
+        cmd = [
+            "cwl-runner",
+            "--no-container",
+            "--preserve-entire-environment",
+            "--outdir",
+            temp_dir,
+            cwl_file,
+            config_file
+        ]
+
+        env = dict(os.environ)
+        env["TMPDIR"] = temp_dir
+
+        print("> " + " ".join(cmd), flush=True)
+        subprocess.check_call(
+            cmd,
+            cwd=os.path.dirname(config_file),
+            env=env
+        )
+
+        shutil.move(os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"])
+
+if __name__ == "__main__":
+    main(par, meta)
diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh
new file mode 100644
index 00000000..845c1739
--- /dev/null
+++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -e
+
+#############################################
+# helper functions
+assert_file_exists() {
+  [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; }
+}
+assert_file_doesnt_exist() {
+  [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; }
+}
+assert_file_empty() {
+  [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; }
+}
+assert_file_not_empty() {
+  [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; }
+}
+assert_file_contains() {
+  grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; }
+}
+assert_file_not_contains() {
+  grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; }
+}
+#############################################
+
+in_fa="$meta_resources_dir/test_data/reference_small.fa"
+in_gtf="$meta_resources_dir/test_data/reference_small.gtf"
+
+echo "#############################################"
+echo "> Simple run"
+
+mkdir simple_run
+cd simple_run
+
+out_tar="myreference.tar.gz"
+
+echo "> Running $meta_name."
+"$meta_executable" \
+  --genome_fasta "$in_fa" \
+  --gtf "$in_gtf" \
+  --reference_archive "$out_tar" \
+  --extra_star_params "--genomeSAindexNbases 6" \
+  ---cpus 2
+
+assert_file_exists "$out_tar"
+assert_file_not_empty "$out_tar"
+
+echo ">> Checking whether output contains the expected files"
+tar -xvf "$out_tar" > /dev/null
+assert_file_exists "BD_Rhapsody_Reference_Files/star_index/genomeParameters.txt"
+assert_file_exists "BD_Rhapsody_Reference_Files/bwa-mem2_index/reference_small.ann"
+assert_file_exists "BD_Rhapsody_Reference_Files/reference_small-processed.gtf"
+assert_file_exists "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt"
+assert_file_contains "BD_Rhapsody_Reference_Files/reference_small-processed.gtf" "chr1.*HAVANA.*ENSG00000243485"
+assert_file_contains "BD_Rhapsody_Reference_Files/mitochondrial_contigs.txt" 'chrMT'
+
+cd ..
+
+echo "#############################################"
+
+echo "> Tests succeeded!"
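+
+# `---cpus 2` in the run above is a viash runtime argument (note the triple
+# dash): it is handled by the viash-generated executable rather than passed
+# to the script, and surfaces in script.py as meta["cpus"], which is in turn
+# forwarded to the CWL pipeline as Maximum_threads.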
\ No newline at end of file diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa new file mode 100644 index 00000000..386d887c --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.fa @@ -0,0 +1,27 @@ +>chr1 1 +TGGGGAAGCAAGGCGGAGTTGGGCAGCTCGTGTTCAATGGGTAGAGTTTCAGGCTGGGGT +GATGGAAGGGTGCTGGAAATGAGTGGTAGTGATGGCGGCACAACAGTGTGAATCTACTTA +ATCCCACTGAACTGTATGCTGAAAAATGGTTTAGACGGTGAATTTTAGGTTATGTATGTT +TTACCACAATTTTTAAAAAGCTAGTGAAAAGCTGGTAAAAAGAAAGAAAAGAGGCTTTTT +TAAAAAGTTAAATATATAAAAAGAGCATCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCT +GGAATCCGTTGGCTTGCCTCCGGCATTTTTGGCCCTTGCCTTTTAGGGTTGCCAGATTAA +AAGACAGGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCGTAGCATAA +ATATGTCCCAAGCTTAGTTTGGGACATACTTATGCTAAAAAACATTATTGGTTGTTTATC +TGAGATTCAGAATTAAGCATTTTATATTTTATTTGCTGCCTCTGGCCACCCTACTCTCTT +CCTAACACTCTCTCCCTCTCCCAGTTTTGTCCGCCTTCCCTGCCTCCTCTTCTGGGGGAG +TTAGATCGAGTTGTAACAAGAACATGCCACTGTCTCGCTGGCTGCAGCGTGTGGTCCCCT +TACCAGAGGTAAAGAAGAGATGGATCTCCACTCATGTTGTAGACAGAATGTTTATGTCCT +CTCCAAATGCTTATGTTGAAACCCTAACCCCTAATGTGATGGTATGTGGAGATGGGCCTT +TGGTAGGTAATTACGGTTAGATGAGGTCATGGGGTGGGGCCCTCATTATAGATCTGGTAA +GAAAAGAGAGCATTGTCTCTGTGTCTCCCTCTCTCTCTCTCTCTCTCTCTCTCATTTCTC +TCTATCTCATTTCTCTCTCTCTCGCTATCTCATTTTTCTCTCTCTCTCTTTCTCTCCTCT +GTCTTTTCCCACCAAGTGAGGATGCGAAGAGAAGGTGGCTGTCTGCAAACCAGGAAGAGA +GCCCTCACCGGGAACCCGTCCAGCTGCCACCTTGAACTTGGACTTCCAAGCCTCCAGAAC +TGTGAGGGATAAATGTATGATTTTAAAGTCGCCCAGTGTGTGGTATTTTGTTTTGACTAA +TACAACCTGAAAACATTTTCCCCTCACTCCACCTGAGCAATATCTGAGTGGCTTAAGGTA +CTCAGGACACAACAAAGGAGAAATGTCCCATGCACAAGGTGCACCCATGCCTGGGTAAAG +CAGCCTGGCACAGAGGGAAGCACACAGGCTCAGGGATCTGCTATTCATTCTTTGTGTGAC +CCTGGGCAAGCCATGAATGGAGCTTCAGTCACCCCATTTGTAATGGGATTTAATTGTGCT +TGCCCTGCCTCCTTTTGAGGGCTGTAGAGAAAAGATGTCAAAGTATTTTGTAATCTGGCT +GGGCGTGGTGGCTCATGCCTGTAATCCTAGCACTTTGGTAGGCTGACGCGAGAGGACTGC +T diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf new file mode 100644 index 00000000..7ba83523 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/reference_small.gtf @@ -0,0 +1,8 @@ +chr1 HAVANA exon 565 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 2; exon_id "ENSE00001922571.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA exon 977 1098 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000473358.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-202"; exon_number 3; exon_id "ENSE00001827679.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "dotter_confirmed"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1"; +chr1 HAVANA transcript 268 1110 . + . 
gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 268 668 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 1; exon_id "ENSE00001841699.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 HAVANA exon 977 1110 . + . gene_id "ENSG00000243485.5"; transcript_id "ENST00000469289.1"; gene_type "lncRNA"; gene_name "MIR1302-2HG"; transcript_type "lncRNA"; transcript_name "MIR1302-2HG-201"; exon_number 2; exon_id "ENSE00001890064.1"; level 2; transcript_support_level "5"; hgnc_id "HGNC:52482"; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2"; +chr1 ENSEMBL gene 367 504 . + . gene_id "ENSG00000284332.1"; gene_type "miRNA"; gene_name "MIR1302-2"; level 3; hgnc_id "HGNC:35294"; +chr1 ENSEMBL transcript 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; +chr1 ENSEMBL exon 367 504 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; transcript_support_level "NA"; hgnc_id "HGNC:35294"; tag "basic"; tag "Ensembl_canonical"; diff --git a/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh new file mode 100644 index 00000000..8d468064 --- /dev/null +++ b/src/bd_rhapsody/bd_rhapsody_make_reference/test_data/script.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +TMP_DIR=/tmp/bd_rhapsody_make_reference +OUT_DIR=src/bd_rhapsody/bd_rhapsody_make_reference/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! 
-f $ORIG_GTF ]; then
+  wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \
+    -O $ORIG_GTF
+fi
+
+# create small reference
+START=30000
+END=31500
+CHR=chr1
+
+# subset to small region
+seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \
+  seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa
+
+zcat "$ORIG_GTF" | \
+  awk -v FS='\t' -v OFS='\t' "
+    \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END {
+      \$4 = \$4 - $START + 1;
+      \$5 = \$5 - $START + 1;
+      print;
+    }" > $OUT_DIR/reference_small.gtf
diff --git a/src/bedtools/bedtools_getfasta/config.vsh.yaml b/src/bedtools/bedtools_getfasta/config.vsh.yaml
new file mode 100644
index 00000000..f1f49a87
--- /dev/null
+++ b/src/bedtools/bedtools_getfasta/config.vsh.yaml
@@ -0,0 +1,103 @@
+name: bedtools_getfasta
+namespace: bedtools
+description: Extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file.
+keywords: [sequencing, fasta, BED, GFF, VCF]
+links:
+  documentation: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html
+  repository: https://github.com/arq5x/bedtools2
+references:
+  doi: 10.1093/bioinformatics/btq033
+license: GPL-2.0
+requirements:
+  commands: [bedtools]
+
+argument_groups:
+  - name: Input arguments
+    arguments:
+      - name: --input_fasta
+        type: file
+        description: |
+          FASTA file containing sequences for each interval specified in the input BED file.
+          The headers in the input FASTA file must exactly match the chromosome column in the BED file.
+      - name: "--input_bed"
+        type: file
+        description: |
+          BED file containing intervals to extract from the FASTA file.
+          BED files containing a single region require a newline character
+          at the end of the line, otherwise a blank output file is produced.
+      - name: --rna
+        type: boolean_true
+        description: |
+          The FASTA is RNA not DNA. Reverse complementation handled accordingly.
+
+  - name: Run arguments
+    arguments:
+      - name: "--strandedness"
+        type: boolean_true
+        alternatives: ["-s"]
+        description: |
+          Force strandedness. If the feature occupies the antisense strand, the output sequence will
+          be reverse complemented. By default strandedness is not taken into account.
+
+  - name: Output arguments
+    arguments:
+      - name: --output
+        alternatives: [-o]
+        required: true
+        type: file
+        direction: output
+        description: |
+          Output file where the output from the 'bedtools getfasta' command will
+          be written to.
+      - name: --tab
+        type: boolean_true
+        description: |
+          Report extracted sequences in a tab-delimited format instead of in FASTA format.
+      - name: --bed_out
+        type: boolean_true
+        description: |
+          Report extracted sequences in a tab-delimited BED format instead of in FASTA format.
+      - name: "--name"
+        type: boolean_true
+        description: |
+          Set the FASTA header for each extracted sequence to be the "name" and coordinate columns from the BED feature.
+      - name: "--name_only"
+        type: boolean_true
+        description: |
+          Set the FASTA header for each extracted sequence to be the "name" column from the BED feature.
+      - name: "--split"
+        type: boolean_true
+        description: |
+          When --input_bed is in BED12 format, create a separate fasta entry for each block in a BED12 record,
+          blocks being described in the 11th and 12th column of the BED.
+      - name: "--full_header"
+        type: boolean_true
+        description: |
+          Use full fasta header. By default, only the word before the first space or tab is used.
+
+# Arguments not taken into account:
+#
+# -fo [Specify an output file name. By default, output goes to stdout.]
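+#
+# Example invocation of the built component (illustration only; paths are
+# hypothetical, flags as declared above):
+#
+#   bedtools_getfasta \
+#     --input_fasta genome.fa \
+#     --input_bed regions.bed \
+#     --name_only \
+#     --output regions.fasta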
+#
+
+resources:
+  - type: bash_script
+    path: script.sh
+
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: debian:stable-slim
+    setup:
+      - type: apt
+        packages: [bedtools, procps]
+      - type: docker
+        run: |
+          echo "bedtools: \"$(bedtools --version | sed -n 's/^bedtools //p')\"" > /var/software_versions.txt
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/bedtools/bedtools_getfasta/script.sh b/src/bedtools/bedtools_getfasta/script.sh
new file mode 100644
index 00000000..8e88b318
--- /dev/null
+++ b/src/bedtools/bedtools_getfasta/script.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+unset_if_false=( par_rna par_strandedness par_tab par_bed_out par_name par_name_only par_split par_full_header )
+
+for par in ${unset_if_false[@]}; do
+  test_val="${!par}"
+  [[ "$test_val" == "false" ]] && unset $par
+done
+
+bedtools getfasta \
+  -fi "$par_input_fasta" \
+  -bed "$par_input_bed" \
+  ${par_rna:+-rna} \
+  ${par_name:+-name} \
+  ${par_name_only:+-nameOnly} \
+  ${par_tab:+-tab} \
+  ${par_bed_out:+-bedOut} \
+  ${par_strandedness:+-s} \
+  ${par_split:+-split} \
+  ${par_full_header:+-fullHeader} > "$par_output"
+
diff --git a/src/bedtools/bedtools_getfasta/test.sh b/src/bedtools/bedtools_getfasta/test.sh
new file mode 100644
index 00000000..a28e3a7e
--- /dev/null
+++ b/src/bedtools/bedtools_getfasta/test.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+TMPDIR=$(mktemp -d)
+function clean_up {
+  [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
+}
+trap clean_up EXIT
+
+# Create dummy test fasta file
+cat > "$TMPDIR/test.fa" <<'EOF'
+>chr1
+AAAAAAAACCCCCCCCCCCCCGCTACTGGGGGGGGGGGGGGGGGG
+EOF
+
+TAB="$(printf '\t')"
+
+# Create dummy bed file
+cat > "$TMPDIR/test.bed" <<EOF
+chr1${TAB}5${TAB}10${TAB}myseq
+EOF
+
+# Create expected fasta file
+cat > "$TMPDIR/expected.fasta" <<'EOF'
+>chr1:5-10
+AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --output "$TMPDIR/output.fasta"
+
+cmp --silent "$TMPDIR/output.fasta" "$TMPDIR/expected.fasta" || { echo "files are different:"; exit 1; }
+
+# Create expected fasta file for --name
+cat > "$TMPDIR/expected_with_name.fasta" <<'EOF'
+>myseq::chr1:5-10
+AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --name \
+  --output "$TMPDIR/output_with_name.fasta"
+
+cmp --silent "$TMPDIR/output_with_name.fasta" "$TMPDIR/expected_with_name.fasta" || { echo "Files when using --name are different."; exit 1; }
+
+# Create expected fasta file for --name_only
+cat > "$TMPDIR/expected_with_name_only.fasta" <<'EOF'
+>myseq
+AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --name_only \
+  --output "$TMPDIR/output_with_name_only.fasta"
+
+cmp --silent "$TMPDIR/output_with_name_only.fasta" "$TMPDIR/expected_with_name_only.fasta" || { echo "Files when using --name_only are different."; exit 1; }
+
+# Create expected tab-delimited file for --tab
+cat > "$TMPDIR/expected_tab.out" <<EOF
+chr1:5-10${TAB}AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --tab \
+  --output "$TMPDIR/output_tab.out"
+
+cmp --silent "$TMPDIR/output_tab.out" "$TMPDIR/expected_tab.out" || { echo "Files when using --tab are different."; exit 1; }
+
+# Create expected bed file for --bed_out
+cat > "$TMPDIR/expected.bed" <<EOF
+chr1${TAB}5${TAB}10${TAB}myseq${TAB}AAACC
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  --bed_out \
+  --output "$TMPDIR/output.bed"
+
+cmp --silent "$TMPDIR/output.bed" "$TMPDIR/expected.bed" || { echo "Files when using --bed_out are different."; exit 1; }
+
+# Create bed file with strand info
+cat > "$TMPDIR/test_strandedness.bed" <<EOF
+chr1${TAB}20${TAB}25${TAB}forward${TAB}0${TAB}+
+chr1${TAB}20${TAB}25${TAB}reverse${TAB}0${TAB}-
+EOF
+
+# Create expected fasta file for -s
+cat > "$TMPDIR/expected_strandedness.fasta" <<'EOF'
+>forward(+)
+CGCTA
+>reverse(-)
+TAGCG
+EOF
+
+"$meta_executable" \
+  --input_bed "$TMPDIR/test_strandedness.bed" \
+  --input_fasta "$TMPDIR/test.fa" \
+  -s \
+  --name_only \
+  --output "$TMPDIR/output_strandedness.fasta"
+
+cmp --silent "$TMPDIR/expected_strandedness.fasta" "$TMPDIR/output_strandedness.fasta" || { echo "Files when using -s are different."; exit 1; }
+
diff --git a/src/bgzip/config.vsh.yaml b/src/bgzip/config.vsh.yaml
deleted file mode 100644
index
26e31ae4..00000000 --- a/src/bgzip/config.vsh.yaml +++ /dev/null @@ -1,128 +0,0 @@ -name: bgzip -description: Block compression/decompression utility -links: - homepage: https://www.htslib.org/ - documentation: https://www.htslib.org/doc/bgzip.html - repository: https://github.com/samtools/htslib -references: - doi: 10.1093/gigascience/giab007 -license: MIT -requirements: - commands: [ bgzip ] -argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - direction: input - description: file to be compressed or decompressed - required: true - - name: Outputs - arguments: - - name: --output - type: file - direction: output - description: compressed or decompressed output - required: true - - name: --index_name - alternatives: -I - type: file - direction: output - description: name of BGZF index file [file.gz.gzi] - - name: Arguments - arguments: - - name: --offset - alternatives: -b - type: integer - description: decompress at virtual file pointer (0-based uncompressed offset) - - name: --decompress - alternatives: -d - type: boolean_true - description: decompress the input file - - name: --rebgzip - alternatives: -g - type: boolean_true - description: use an index file to bgzip a file - - name: --index - alternatives: -i - type: boolean_true - description: compress and create BGZF index - - name: --compress_level - alternatives: -l - type: integer - description: compression level to use when compressing; 0 to 9, or -1 for default [-1] - min: -1 - max: 9 - - name: --reindex - alternatives: -r - type: boolean_true - description: (re)index the output file - - name: --size - alternatives: -s - type: integer - description: decompress INT bytes (uncompressed size) - min: 0 - - name: --test - alternatives: -t - type: boolean_true - description: test integrity of compressed file - - name: --binary - type: boolean_true - description: Don't align blocks with text lines -resources: - - type: bash_script - text: | - [[ "$par_decompress" == "false" ]] && unset par_decompress - [[ "$par_rebgzip" == "false" ]] && unset par_rebgzip - [[ "$par_index" == "false" ]] && unset par_index - [[ "$par_reindex" == "false" ]] && unset par_reindex - [[ "$par_test" == "false" ]] && unset par_test - [[ "$par_binary" == "false" ]] && unset par_binary - bgzip -c \ - ${meta_cpus:+--threads "${meta_cpus}"} \ - ${par_offset:+-b "${par_offset}"} \ - ${par_decompress:+-d} \ - ${par_rebgzip:+-g} \ - ${par_index:+-i} \ - ${par_index_name:+-I "${par_index_name}"} \ - ${par_compress_level:+-l "${par_compress_level}"} \ - ${par_reindex:+-r} \ - ${par_size:+-s "${par_size}"} \ - ${par_test:+-t} \ - ${par_binary:+--binary} \ - "$par_input" > "$par_output" -test_resources: - - type: bash_script - text: | - set -e - - "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz" - - echo ">> Checking output of compressing" - [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1 - - "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress - - echo ">> Checking output of decompressing" - [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1 - - echo ">> Checking original and decompressed files are the same" - set +e - cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf" - [ $? 
-ne 0 ] && echo "files are different" && exit 1 - set -e - - echo "> Test successful" - - type: file - path: test_data - -engines: - - type: docker - image: quay.io/biocontainers/htslib:1.19--h81da01d_0 - setup: - - type: docker - run: | - bgzip -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/bgzip: "\1"/' > /var/software_versions.txt -runners: - - type: executable - - type: nextflow \ No newline at end of file diff --git a/src/bgzip/help.txt b/src/bgzip/help.txt deleted file mode 100644 index d4012efd..00000000 --- a/src/bgzip/help.txt +++ /dev/null @@ -1,22 +0,0 @@ -```bash -bgzip -h -``` - -Version: 1.19 -Usage: bgzip [OPTIONS] [FILE] ... -Options: - -b, --offset INT decompress at virtual file pointer (0-based uncompressed offset) - -c, --stdout write on standard output, keep original files unchanged - -d, --decompress decompress - -f, --force overwrite files without asking - -g, --rebgzip use an index file to bgzip a file - -h, --help give this help - -i, --index compress and create BGZF index - -I, --index-name FILE name of BGZF index file [file.gz.gzi] - -k, --keep don't delete input files during operation - -l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1] - -r, --reindex (re)index compressed file - -s, --size INT decompress INT bytes (uncompressed size) - -t, --test test integrity of compressed file - --binary Don't align blocks with text lines - -@, --threads INT number of compression threads to use [1] diff --git a/src/bgzip/test_data/script.sh b/src/bgzip/test_data/script.sh deleted file mode 100644 index c9114473..00000000 --- a/src/bgzip/test_data/script.sh +++ /dev/null @@ -1,10 +0,0 @@ -# bgzip test data - -# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/bgzip/test. - -if [ ! -d /tmp/snakemake-wrappers ]; then - git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers -fi - -cp -r /tmp/snakemake-wrappers/bio/bgzip/test/* src/bgzip/test_data - diff --git a/src/bgzip/test_data/test.vcf b/src/bgzip/test_data/test.vcf deleted file mode 100644 index 11b5400e..00000000 --- a/src/bgzip/test_data/test.vcf +++ /dev/null @@ -1,23 +0,0 @@ -##fileformat=VCFv4.0 -##fileDate=20090805 -##source=https://www.internationalgenome.org/wiki/Analysis/vcf4.0/ -##reference=1000GenomesPilot-NCBI36 -##phasing=partial -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 -20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. -20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 -20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 -20 1230237 . T . 
47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 -20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 diff --git a/src/busco/busco_download_datasets/config.vsh.yaml b/src/busco/busco_download_datasets/config.vsh.yaml index 04d76dd6..5297af2e 100644 --- a/src/busco/busco_download_datasets/config.vsh.yaml +++ b/src/busco/busco_download_datasets/config.vsh.yaml @@ -37,7 +37,7 @@ test_resources: path: test.sh engines: - type: docker - image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 + image: quay.io/biocontainers/busco:5.7.1--pyhdfd78af_0 setup: - type: docker run: | diff --git a/src/busco/busco_list_datasets/config.vsh.yaml b/src/busco/busco_list_datasets/config.vsh.yaml index 6ada7c84..cac34cc6 100644 --- a/src/busco/busco_list_datasets/config.vsh.yaml +++ b/src/busco/busco_list_datasets/config.vsh.yaml @@ -29,7 +29,7 @@ test_resources: path: test.sh engines: - type: docker - image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 + image: quay.io/biocontainers/busco:5.7.1--pyhdfd78af_0 setup: - type: docker run: | diff --git a/src/busco/busco_run/config.vsh.yaml b/src/busco/busco_run/config.vsh.yaml index d79f03f5..23ee95fb 100644 --- a/src/busco/busco_run/config.vsh.yaml +++ b/src/busco/busco_run/config.vsh.yaml @@ -181,6 +181,10 @@ argument_groups: - name: MetaEuk Settings arguments: + - name: --metaeuk + type: boolean_true + description: | + Use Metaeuk gene predictor. - name: --metaeuk_parameters type: string description: | @@ -204,7 +208,7 @@ test_resources: path: test_data engines: - type: docker - image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 + image: quay.io/biocontainers/busco:5.7.1--pyhdfd78af_0 setup: - type: docker run: | diff --git a/src/busco/busco_run/help.txt b/src/busco/busco_run/help.txt index 2cacec4d..6d83f9be 100644 --- a/src/busco/busco_run/help.txt +++ b/src/busco/busco_run/help.txt @@ -2,7 +2,9 @@ busco -h ``` -Welcome to BUSCO 5.6.1: the Benchmarking Universal Single-Copy Ortholog assessment tool. +usage: busco -i [SEQUENCE_FILE] -l [LINEAGE] -o [OUTPUT_NAME] -m [MODE] [OTHER OPTIONS] + +Welcome to BUSCO 5.7.1: the Benchmarking Universal Single-Copy Ortholog assessment tool. For more detailed usage information, please review the README file provided with this distribution and the BUSCO user guide. Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO optional arguments: @@ -18,7 +20,7 @@ optional arguments: -l LINEAGE, --lineage_dataset LINEAGE Specify the name of the BUSCO lineage to be used. --augustus Use augustus gene predictor for eukaryote runs - --augustus_parameters --PARAM1=VALUE1,--PARAM2=VALUE2 + --augustus_parameters "--PARAM1=VALUE1,--PARAM2=VALUE2" Pass additional arguments to Augustus. All arguments should be contained within a single string with no white space, with each argument separated by a comma. --augustus_species AUGUSTUS_SPECIES Specify a species for Augustus training. @@ -42,11 +44,12 @@ optional arguments: --limit N How many candidate regions (contig or transcript) to consider per BUSCO (default: 3) --list-datasets Print the list of available BUSCO datasets --long Optimization Augustus self-training mode (Default: Off); adds considerably to the run time, but can improve results for some non-model organisms + --metaeuk Use Metaeuk gene predictor --metaeuk_parameters "--PARAM1=VALUE1,--PARAM2=VALUE2" Pass additional arguments to Metaeuk for the first run. 
All arguments should be contained within a single string with no white space, with each argument separated by a comma. --metaeuk_rerun_parameters "--PARAM1=VALUE1,--PARAM2=VALUE2" Pass additional arguments to Metaeuk for the second run. All arguments should be contained within a single string with no white space, with each argument separated by a comma. - --miniprot Use miniprot gene predictor + --miniprot Use Miniprot gene predictor --skip_bbtools Skip BBTools for assembly statistics --offline To indicate that BUSCO cannot attempt to download files --opt-out-run-stats Opt out of data collection. Information on the data collected is available in the user guide. diff --git a/src/busco/busco_run/script.sh b/src/busco/busco_run/script.sh index 5b562f83..a0ef24de 100644 --- a/src/busco/busco_run/script.sh +++ b/src/busco/busco_run/script.sh @@ -39,6 +39,7 @@ busco \ ${par_force:+--force} \ ${par_limit:+--limit "$par_limit"} \ ${par_long:+--long} \ + ${par_metaeuk:+--metaeuk} \ ${par_metaeuk_parameters:+--metaeuk_parameters "$par_metaeuk_parameters"} \ ${par_metaeuk_rerun_parameters:+--metaeuk_rerun_parameters "$par_metaeuk_rerun_parameters"} \ ${par_miniprot:+--miniprot} \ diff --git a/src/cutadapt/config.vsh.yaml b/src/cutadapt/config.vsh.yaml new file mode 100644 index 00000000..b315d0ce --- /dev/null +++ b/src/cutadapt/config.vsh.yaml @@ -0,0 +1,481 @@ +name: cutadapt +description: | + Cutadapt removes adapter sequences from high-throughput sequencing reads. +keywords: [RNA-seq, scRNA-seq, high-throughput] +links: + homepage: https://cutadapt.readthedocs.io + documentation: https://cutadapt.readthedocs.io + repository: https://github.com/marcelm/cutadapt +references: + doi: 10.14806/ej.17.1.200 +license: MIT +argument_groups: + #################################################################### + - name: Specify Adapters for R1 + arguments: + - name: --adapter + alternatives: [-a] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + required: false + - name: --front + alternatives: [-g] + type: string + multiple: true + description: | + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + required: false + - name: --anywhere + alternatives: [-b] + type: string + multiple: true + description: | + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Specify Adapters using Fasta files for R1 + arguments: + - name: --adapter_fasta + type: file + multiple: true + description: | + Fasta file containing sequences of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. 
If a '$' character is appended ('anchoring'), the
+          adapter is only found if it is a suffix of the read.
+        required: false
+      - name: --front_fasta
+        type: file
+        description: |
+          Fasta file containing sequences of an adapter ligated to the 5' end (paired data:
+          of the first read). The adapter and any preceding bases
+          are trimmed. Partial matches at the 5' end are allowed. If
+          a '^' character is prepended ('anchoring'), the adapter is
+          only found if it is a prefix of the read.
+        required: false
+      - name: --anywhere_fasta
+        type: file
+        description: |
+          Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'
+          end (paired data: of the first read). Both types of
+          matches as described under -a and -g are allowed. If the
+          first base of the read is part of the match, the behavior
+          is as with -g, otherwise as with -a. This option is mostly
+          for rescuing failed library preparations - do not use if
+          you know which end your adapter was ligated to!
+        required: false
+
+  ####################################################################
+  - name: Specify Adapters for R2
+    arguments:
+      - name: --adapter_r2
+        alternatives: [-A]
+        type: string
+        multiple: true
+        description: |
+          Sequence of an adapter ligated to the 3' end (paired data:
+          of the second read). The adapter and subsequent bases are
+          trimmed. If a '$' character is appended ('anchoring'), the
+          adapter is only found if it is a suffix of the read.
+        required: false
+      - name: --front_r2
+        alternatives: [-G]
+        type: string
+        multiple: true
+        description: |
+          Sequence of an adapter ligated to the 5' end (paired data:
+          of the second read). The adapter and any preceding bases
+          are trimmed. Partial matches at the 5' end are allowed. If
+          a '^' character is prepended ('anchoring'), the adapter is
+          only found if it is a prefix of the read.
+        required: false
+      - name: --anywhere_r2
+        alternatives: [-B]
+        type: string
+        multiple: true
+        description: |
+          Sequence of an adapter that may be ligated to the 5' or 3'
+          end (paired data: of the second read). Both types of
+          matches as described under -a and -g are allowed. If the
+          first base of the read is part of the match, the behavior
+          is as with -g, otherwise as with -a. This option is mostly
+          for rescuing failed library preparations - do not use if
+          you know which end your adapter was ligated to!
+        required: false
+
+  ####################################################################
+  - name: Specify Adapters using Fasta files for R2
+    arguments:
+      - name: --adapter_r2_fasta
+        type: file
+        description: |
+          Fasta file containing sequences of an adapter ligated to the 3' end (paired data:
+          of the second read). The adapter and subsequent bases are
+          trimmed. If a '$' character is appended ('anchoring'), the
+          adapter is only found if it is a suffix of the read.
+        required: false
+      - name: --front_r2_fasta
+        type: file
+        description: |
+          Fasta file containing sequences of an adapter ligated to the 5' end (paired data:
+          of the second read). The adapter and any preceding bases
+          are trimmed. Partial matches at the 5' end are allowed. If
+          a '^' character is prepended ('anchoring'), the adapter is
+          only found if it is a prefix of the read.
+        required: false
+      - name: --anywhere_r2_fasta
+        type: file
+        description: |
+          Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'
+          end (paired data: of the second read). Both types of
+          matches as described under -a and -g are allowed.
If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + required: false + + #################################################################### + - name: Paired-end options + arguments: + - name: --pair_adapters + type: boolean_true + description: | + Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + - name: --pair_filter + type: string + choices: [any, both, first] + description: | + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + - name: --interleaved + type: boolean_true + description: | + Read and/or write interleaved paired-end reads. + + #################################################################### + - name: Input parameters + arguments: + - name: --input + type: file + required: true + description: | + Input fastq file for single-end reads or R1 for paired-end reads. + - name: --input_r2 + type: file + required: false + description: | + Input fastq file for R2 in the case of paired-end reads. + - name: --error_rate + alternatives: [-E, --errors] + type: double + description: | + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. Default: 0.1 (10%). + example: 0.1 + - name: --no_indels + type: boolean_false + description: | + Allow only mismatches in alignments. + + - name: --times + type: integer + alternatives: [-n] + description: | + Remove up to COUNT adapters from each read. Default: 1. + example: 1 + - name: --overlap + alternatives: [-O] + type: integer + description: | + Require MINLENGTH overlap between read and adapter for an + adapter to be found. The default is 3. + example: 3 + - name: --match_read_wildcards + type: boolean_true + description: | + Interpret IUPAC wildcards in reads. + - name: --no_match_adapter_wildcards + type: boolean_false + description: | + Do not interpret IUPAC wildcards in adapters. + - name: --action + type: string + choices: + - trim + - retain + - mask + - lowercase + - none + description: | + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. + The default is trim. + example: trim + - name: --revcomp + alternatives: [--rc] + type: boolean_true + description: | + Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. + + #################################################################### + - name: "Demultiplexing options" + arguments: + - name: "--demultiplex_mode" + type: string + choices: ["single", "unique_dual", "combinatorial_dual"] + required: false + description: | + Enable demultiplexing and set the mode for it. + With mode 'unique_dual', adapters from the first and second read are used, + and the indexes from the reads are only used in pairs. This implies + --pair_adapters. + Enabling mode 'combinatorial_dual' allows all combinations of the sets of indexes + on R1 and R2. It is necessary to write each read pair to an output + file depending on the adapters found on both R1 and R2. 
+        Mode 'single' uses indexes or barcodes located at the 5'
+        end of the R1 read.
+
+  ####################################################################
+  - name: Read modifications
+    arguments:
+      - name: --cut
+        alternatives: [-u]
+        type: integer
+        multiple: true
+        description: |
+          Remove LEN bases from each read (or R1 if paired; use --cut_r2
+          option for R2). If LEN is positive, remove bases from the
+          beginning. If LEN is negative, remove bases from the end.
+          Can be used twice if LENs have different signs. Applied
+          *before* adapter trimming.
+      - name: --cut_r2
+        type: integer
+        multiple: true
+        description: |
+          Remove LEN bases from R2. If LEN is positive, remove bases from the
+          beginning. If LEN is negative, remove bases from the end.
+          Can be used twice if LENs have different signs. Applied
+          *before* adapter trimming.
+      - name: --nextseq_trim
+        type: string
+        description: |
+          NextSeq-specific quality trimming (each read). Trims also
+          dark cycles appearing as high-quality G bases.
+      - name: --quality_cutoff
+        alternatives: [-q]
+        type: string
+        description: |
+          Trim low-quality bases from 5' and/or 3' ends of each read
+          before adapter removal. Applied to both reads if data is
+          paired. If one value is given, only the 3' end is trimmed.
+          If two comma-separated cutoffs are given, the 5' end is
+          trimmed with the first cutoff, the 3' end with the second.
+      - name: --quality_cutoff_r2
+        alternatives: [-Q]
+        type: string
+        description: |
+          Quality-trimming cutoff for R2. Default: same as for R1
+      - name: --quality_base
+        type: integer
+        description: |
+          Assume that quality values in FASTQ are encoded as
+          ascii(quality + N). This needs to be set to 64 for some
+          old Illumina FASTQ files. The default is 33.
+        example: 33
+      - name: --poly_a
+        type: boolean_true
+        description: Trim poly-A tails
+      - name: --length
+        alternatives: [-l]
+        type: integer
+        description: |
+          Shorten reads to LENGTH. Positive values remove bases at
+          the end while negative ones remove bases at the beginning.
+          This and the following modifications are applied after
+          adapter trimming.
+      - name: --trim_n
+        type: boolean_true
+        description: Trim N's on ends of reads.
+      - name: --length_tag
+        type: string
+        description: |
+          Search for TAG followed by a decimal number in the
+          description field of the read. Replace the decimal number
+          with the correct length of the trimmed read. For example,
+          use --length-tag 'length=' to correct fields like
+          'length=123'.
+        example: "length="
+      - name: --strip_suffix
+        type: string
+        description: |
+          Remove this suffix from read names if present. Can be
+          given multiple times.
+      - name: --prefix
+        alternatives: [-x]
+        type: string
+        description: |
+          Add this prefix to read names. Use {name} to insert the
+          name of the matching adapter.
+      - name: --suffix
+        alternatives: [-y]
+        type: string
+        description: |
+          Add this suffix to read names; can also include {name}
+      - name: --rename
+        type: string
+        description: |
+          Rename reads using TEMPLATE containing variables such as
+          {id}, {adapter_name} etc. (see documentation)
+      - name: --zero_cap
+        alternatives: [-z]
+        type: boolean_true
+        description: Change negative quality values to zero.
+
+  ####################################################################
+  - name: Filtering of processed reads
+    description: |
+      Filters are applied after above read modifications. Paired-end reads are
+      always discarded pairwise (see also --pair_filter).
+    arguments:
+      - name: --minimum_length
+        alternatives: [-m]
+        type: string
+        description: |
+          Discard reads shorter than LEN. Default is 0.
+          When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:).
+          If the colon syntax is not used, the same minimum length applies to both reads.
+          Also, one of the values can be omitted to impose no restrictions.
+          For example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored.
+        example: "0"
+      - name: --maximum_length
+        alternatives: [-M]
+        type: string
+        description: |
+          Discard reads longer than LEN. Default: no limit.
+          For paired reads, see the remark for --minimum_length.
+      - name: --max_n
+        type: string
+        description: |
+          Discard reads with more than COUNT 'N' bases. If COUNT is
+          a number between 0 and 1, it is interpreted as a fraction
+          of the read length.
+      - name: --max_expected_errors
+        alternatives: [--max_ee]
+        type: double
+        description: |
+          Discard reads whose expected number of errors (computed
+          from quality values) exceeds ERRORS.
+      - name: --max_average_error_rate
+        alternatives: [--max_aer]
+        type: double
+        description: |
+          As --max_expected_errors (see above), but divided by read
+          length to account for reads of varying length.
+      - name: --discard_trimmed
+        alternatives: [--discard]
+        type: boolean_true
+        description: |
+          Discard reads that contain an adapter. Use also -O to
+          avoid discarding too many randomly matching reads.
+      - name: --discard_untrimmed
+        alternatives: [--trimmed_only]
+        type: boolean_true
+        description: |
+          Discard reads that do not contain an adapter.
+      - name: --discard_casava
+        type: boolean_true
+        description: |
+          Discard reads that did not pass CASAVA filtering (header
+          has :Y:).
+
+  ####################################################################
+  - name: Output parameters
+    arguments:
+      - name: --report
+        type: string
+        choices: [full, minimal]
+        description: |
+          Which type of report to print: 'full' (default) or 'minimal'.
+        example: full
+      - name: --json
+        type: boolean_true
+        description: |
+          Write the report in JSON format to report.json in the
+          working directory.
+      - name: --output
+        type: file
+        description: |
+          Glob pattern for matching the expected output files.
+          Should include `$output_dir`.
+        example: "fastq/*_001.fast[a,q]"
+        direction: output
+        required: true
+        must_exist: true
+        multiple: true
+      - name: --fasta
+        type: boolean_true
+        description: |
+          Output FASTA even on FASTQ input.
+      - name: --info_file
+        type: boolean_true
+        description: |
+          Write information about each read and its adapter matches
+          into info.txt in the working directory.
+          See the documentation for the file format.
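+      # For example (illustration only, based on script.sh below): with
+      # --demultiplex_mode single the component writes files such as
+      # <output_dir>/<adapter_name>_001.fastq, so a matching glob would be
+      # --output "out_dir/*_001.fastq"; without demultiplexing it writes
+      # trimmed_001.fastq instead.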
+ # - name: -Z + # - name: --rest_file + # - name: --wildcard-file + # - name: --too_short_output + # - name: --too_long_output + # - name: --untrimmed_output + # - name: --untrimmed_paired_output + # - name: too_short_paired_output + # - name: too_long_paired_output + - name: Debug + arguments: + - type: boolean_true + name: --debug + description: Print debug information +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: python:3.12 + setup: + - type: python + pip: + - cutadapt + - type: docker + run: | + cutadapt --version | sed 's/\(.*\)/cutadapt: "\1"/' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/cutadapt/help.txt b/src/cutadapt/help.txt new file mode 100644 index 00000000..2280c3e2 --- /dev/null +++ b/src/cutadapt/help.txt @@ -0,0 +1,218 @@ +cutadapt version 4.6 + +Copyright (C) 2010 Marcel Martin and contributors + +Cutadapt removes adapter sequences from high-throughput sequencing reads. + +Usage: + cutadapt -a ADAPTER [options] [-o output.fastq] input.fastq + +For paired-end reads: + cutadapt -a ADAPT1 -A ADAPT2 [options] -o out1.fastq -p out2.fastq in1.fastq in2.fastq + +Replace "ADAPTER" with the actual sequence of your 3' adapter. IUPAC wildcard +characters are supported. All reads from input.fastq will be written to +output.fastq with the adapter sequence removed. Adapter matching is +error-tolerant. Multiple adapter sequences can be given (use further -a +options), but only the best-matching adapter will be removed. + +Input may also be in FASTA format. Compressed input and output is supported and +auto-detected from the file name (.gz, .xz, .bz2). Use the file name '-' for +standard input/output. Without the -o option, output is sent to standard output. + +Citation: + +Marcel Martin. Cutadapt removes adapter sequences from high-throughput +sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. +http://dx.doi.org/10.14806/ej.17.1.200 + +Run "cutadapt --help" to see all command-line options. +See https://cutadapt.readthedocs.io/ for full documentation. + +Options: + -h, --help Show this help message and exit + --version Show version number and exit + --debug Print debug log. Use twice to also print DP matrices + -j CORES, --cores CORES + Number of CPU cores to use. Use 0 to auto-detect. Default: + 1 + +Finding adapters: + Parameters -a, -g, -b specify adapters to be removed from each read (or from + R1 if data is paired-end. If specified multiple times, only the best matching + adapter is trimmed (but see the --times option). Use notation 'file:FILE' to + read adapter sequences from a FASTA file. + + -a ADAPTER, --adapter ADAPTER + Sequence of an adapter ligated to the 3' end (paired data: + of the first read). The adapter and subsequent bases are + trimmed. If a '$' character is appended ('anchoring'), the + adapter is only found if it is a suffix of the read. + -g ADAPTER, --front ADAPTER + Sequence of an adapter ligated to the 5' end (paired data: + of the first read). The adapter and any preceding bases + are trimmed. Partial matches at the 5' end are allowed. If + a '^' character is prepended ('anchoring'), the adapter is + only found if it is a prefix of the read. + -b ADAPTER, --anywhere ADAPTER + Sequence of an adapter that may be ligated to the 5' or 3' + end (paired data: of the first read). Both types of + matches as described under -a and -g are allowed. 
If the + first base of the read is part of the match, the behavior + is as with -g, otherwise as with -a. This option is mostly + for rescuing failed library preparations - do not use if + you know which end your adapter was ligated to! + -e E, --error-rate E, --errors E + Maximum allowed error rate (if 0 <= E < 1), or absolute + number of errors for full-length adapter match (if E is an + integer >= 1). Error rate = no. of errors divided by + length of matching region. Default: 0.1 (10%) + --no-indels Allow only mismatches in alignments. Default: allow both + mismatches and indels + -n COUNT, --times COUNT + Remove up to COUNT adapters from each read. Default: 1 + -O MINLENGTH, --overlap MINLENGTH + Require MINLENGTH overlap between read and adapter for an + adapter to be found. Default: 3 + --match-read-wildcards + Interpret IUPAC wildcards in reads. Default: False + -N, --no-match-adapter-wildcards + Do not interpret IUPAC wildcards in adapters. + --action {trim,retain,mask,lowercase,none} + What to do if a match was found. trim: trim adapter and + up- or downstream sequence; retain: trim, but retain + adapter; mask: replace with 'N' characters; lowercase: + convert to lowercase; none: leave unchanged. Default: trim + --rc, --revcomp Check both the read and its reverse complement for adapter + matches. If match is on reverse-complemented version, + output that one. Default: check only read + +Additional read modifications: + -u LEN, --cut LEN Remove LEN bases from each read (or R1 if paired; use -U + option for R2). If LEN is positive, remove bases from the + beginning. If LEN is negative, remove bases from the end. + Can be used twice if LENs have different signs. Applied + *before* adapter trimming. + --nextseq-trim 3'CUTOFF + NextSeq-specific quality trimming (each read). Trims also + dark cycles appearing as high-quality G bases. + -q [5'CUTOFF,]3'CUTOFF, --quality-cutoff [5'CUTOFF,]3'CUTOFF + Trim low-quality bases from 5' and/or 3' ends of each read + before adapter removal. Applied to both reads if data is + paired. If one value is given, only the 3' end is trimmed. + If two comma-separated cutoffs are given, the 5' end is + trimmed with the first cutoff, the 3' end with the second. + --quality-base N Assume that quality values in FASTQ are encoded as + ascii(quality + N). This needs to be set to 64 for some + old Illumina FASTQ files. Default: 33 + --poly-a Trim poly-A tails + --length LENGTH, -l LENGTH + Shorten reads to LENGTH. Positive values remove bases at + the end while negative ones remove bases at the beginning. + This and the following modifications are applied after + adapter trimming. + --trim-n Trim N's on ends of reads. + --length-tag TAG Search for TAG followed by a decimal number in the + description field of the read. Replace the decimal number + with the correct length of the trimmed read. For example, + use --length-tag 'length=' to correct fields like + 'length=123'. + --strip-suffix STRIP_SUFFIX + Remove this suffix from read names if present. Can be + given multiple times. + -x PREFIX, --prefix PREFIX + Add this prefix to read names. Use {name} to insert the + name of the matching adapter. + -y SUFFIX, --suffix SUFFIX + Add this suffix to read names; can also include {name} + --rename TEMPLATE Rename reads using TEMPLATE containing variables such as + {id}, {adapter_name} etc. (see documentation) + --zero-cap, -z Change negative quality values to zero. + +Filtering of processed reads: + Filters are applied after above read modifications. 
Paired-end reads are + always discarded pairwise (see also --pair-filter). + + -m LEN[:LEN2], --minimum-length LEN[:LEN2] + Discard reads shorter than LEN. Default: 0 + -M LEN[:LEN2], --maximum-length LEN[:LEN2] + Discard reads longer than LEN. Default: no limit + --max-n COUNT Discard reads with more than COUNT 'N' bases. If COUNT is + a number between 0 and 1, it is interpreted as a fraction + of the read length. + --max-expected-errors ERRORS, --max-ee ERRORS + Discard reads whose expected number of errors (computed + from quality values) exceeds ERRORS. + --max-average-error-rate ERROR_RATE, --max-aer ERROR_RATE + as --max-expected-errors (see above), but divided by + length to account for reads of varying length. + --discard-trimmed, --discard + Discard reads that contain an adapter. Use also -O to + avoid discarding too many randomly matching reads. + --discard-untrimmed, --trimmed-only + Discard reads that do not contain an adapter. + --discard-casava Discard reads that did not pass CASAVA filtering (header + has :Y:). + +Output: + --quiet Print only error messages. + --report {full,minimal} + Which type of report to print: 'full' or 'minimal'. + Default: full + --json FILE Dump report in JSON format to FILE + -o FILE, --output FILE + Write trimmed reads to FILE. FASTQ or FASTA format is + chosen depending on input. Summary report is sent to + standard output. Use '{name}' for demultiplexing (see + docs). Default: write to standard output + --fasta Output FASTA to standard output even on FASTQ input. + -Z Use compression level 1 for gzipped output files (faster, + but uses more space) + --info-file FILE Write information about each read and its adapter matches + into FILE. See the documentation for the file format. + -r FILE, --rest-file FILE + When the adapter matches in the middle of a read, write + the rest (after the adapter) to FILE. + --wildcard-file FILE When the adapter has N wildcard bases, write adapter bases + matching wildcard positions to FILE. (Inaccurate with + indels.) + --too-short-output FILE + Write reads that are too short (according to length + specified by -m) to FILE. Default: discard reads + --too-long-output FILE + Write reads that are too long (according to length + specified by -M) to FILE. Default: discard reads + --untrimmed-output FILE + Write reads that do not contain any adapter to FILE. + Default: output to same file as trimmed reads + +Paired-end options: + The -A/-G/-B/-U/-Q options work like their lowercase counterparts, but are + applied to R2 (second read in pair) + + -A ADAPTER 3' adapter to be removed from R2 + -G ADAPTER 5' adapter to be removed from R2 + -B ADAPTER 5'/3 adapter to be removed from R2 + -U LENGTH Remove LENGTH bases from R2 + -Q [5'CUTOFF,]3'CUTOFF + Quality-trimming cutoff for R2. Default: same as for R1 + -p FILE, --paired-output FILE + Write R2 to FILE. + --pair-adapters Treat adapters given with -a/-A etc. as pairs. Either both + or none are removed from each read pair. + --pair-filter {any,both,first} + Which of the reads in a paired-end read have to match the + filtering criterion in order for the pair to be filtered. + Default: any + --interleaved Read and/or write interleaved paired-end reads. + --untrimmed-paired-output FILE + Write second read in a pair to this FILE when no adapter + was found. Use with --untrimmed-output. Default: output to + same file as trimmed reads + --too-short-paired-output FILE + Write second read in a pair to this file if pair is too + short. 
+  --too-long-paired-output FILE
+                        Write second read in a pair to this file if pair is too
+                        long.
+
diff --git a/src/cutadapt/script.sh b/src/cutadapt/script.sh
new file mode 100644
index 00000000..20c92724
--- /dev/null
+++ b/src/cutadapt/script.sh
@@ -0,0 +1,258 @@
+#!/bin/bash
+
+## VIASH START
+par_adapter='AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;GGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
+par_input='src/cutadapt/test_data/se/a.fastq'
+par_report='full'
+par_json='false'
+par_fasta='false'
+par_info_file='false'
+par_debug='true'
+## VIASH END
+
+function debug {
+  [[ "$par_debug" == "true" ]] && echo "DEBUG: $@"
+}
+
+output_dir=$(dirname $par_output)
+[[ ! -d $output_dir ]] && mkdir -p $output_dir
+
+# Init
+###########################################################
+
+echo ">> Paired-end data or not?"
+
+mode=""
+if [[ -z $par_input_r2 ]]; then
+  mode="se"
+  echo "  Single end"
+  input="$par_input"
+else
+  echo "  Paired end"
+  mode="pe"
+  input="$par_input $par_input_r2"
+fi
+
+# Adapter arguments
+# - paired and single-end
+# - string and fasta
+###########################################################
+
+function add_flags {
+  local arg=$1
+  local flag=$2
+  local prefix=$3
+  [[ -z $prefix ]] && prefix=""
+
+  # This function should not be called if the input is empty
+  # but check for it just in case
+  if [[ -z $arg ]]; then
+    return
+  fi
+
+  local output=""
+  IFS=';' read -r -a array <<< "$arg"
+  for a in "${array[@]}"; do
+    output="$output $flag $prefix$a"
+  done
+  echo $output
+}
+
+debug ">> Parsing arguments dealing with adapters"
+adapter_args=$(echo \
+  ${par_adapter:+$(add_flags "$par_adapter" "--adapter")} \
+  ${par_adapter_fasta:+$(add_flags "$par_adapter_fasta" "--adapter" "file:")} \
+  ${par_front:+$(add_flags "$par_front" "--front")} \
+  ${par_front_fasta:+$(add_flags "$par_front_fasta" "--front" "file:")} \
+  ${par_anywhere:+$(add_flags "$par_anywhere" "--anywhere")} \
+  ${par_anywhere_fasta:+$(add_flags "$par_anywhere_fasta" "--anywhere" "file:")} \
+  ${par_adapter_r2:+$(add_flags "$par_adapter_r2" "-A")} \
+  ${par_adapter_r2_fasta:+$(add_flags "$par_adapter_r2_fasta" "-A" "file:")} \
+  ${par_front_r2:+$(add_flags "$par_front_r2" "-G")} \
+  ${par_front_r2_fasta:+$(add_flags "$par_front_r2_fasta" "-G" "file:")} \
+  ${par_anywhere_r2:+$(add_flags "$par_anywhere_r2" "-B")} \
+  ${par_anywhere_r2_fasta:+$(add_flags "$par_anywhere_r2_fasta" "-B" "file:")} \
+)
+
+debug "Arguments to cutadapt:"
+debug "$adapter_args"
+debug
+
+# Paired-end options
+###########################################################
+echo ">> Parsing arguments for paired-end reads"
+[[ "$par_pair_adapters" == "false" ]] && unset par_pair_adapters
+[[ "$par_interleaved" == "false" ]] && unset par_interleaved
+
+paired_args=$(echo \
+  ${par_pair_adapters:+--pair-adapters} \
+  ${par_pair_filter:+--pair-filter "${par_pair_filter}"} \
+  ${par_interleaved:+--interleaved}
+)
+debug "Arguments to cutadapt:"
+debug $paired_args
+debug
+
+# Input arguments
+###########################################################
+echo ">> Parsing input arguments"
+[[ "$par_no_indels" == "true" ]] && unset par_no_indels
+[[ "$par_match_read_wildcards" == "false" ]] && unset par_match_read_wildcards
+[[ "$par_no_match_adapter_wildcards" == "true" ]] && unset par_no_match_adapter_wildcards
+[[ "$par_revcomp" == "false" ]] && unset par_revcomp
+
+input_args=$(echo \
+  ${par_error_rate:+--error-rate "${par_error_rate}"} \
+  ${par_no_indels:+--no-indels} \
+  ${par_times:+--times "${par_times}"} \
"${par_overlap}"} \ + ${par_match_read_wildcards:+--match-read-wildcards} \ + ${par_no_match_adapter_wildcards:+--no-match-adapter-wildcards} \ + ${par_action:+--action "${par_action}"} \ + ${par_revcomp:+--revcomp} \ +) +debug "Arguments to cutadapt:" +debug $input_args +debug + +# Read modifications +########################################################### +echo ">> Parsing read modification arguments" +[[ "$par_poly_a" == "false" ]] && unset par_poly_a +[[ "$par_trim_n" == "false" ]] && unset par_trim_n +[[ "$par_zero_cap" == "false" ]] && unset par_zero_cap + +mod_args=$(echo \ + ${par_cut:+--cut "${par_cut}"} \ + ${par_cut_r2:+--cut_r2 "${par_cut_r2}"} \ + ${par_nextseq_trim:+--nextseq-trim "${par_nextseq_trim}"} \ + ${par_quality_cutoff:+--quality-cutoff "${par_quality_cutoff}"} \ + ${par_quality_cutoff_r2:+-Q "${par_quality_cutoff_r2}"} \ + ${par_quality_base:+--quality-base "${par_quality_base}"} \ + ${par_poly_a:+--poly-a} \ + ${par_length:+--length "${par_length}"} \ + ${par_trim_n:+--trim-n} \ + ${par_length_tag:+--length-tag "${par_length_tag}"} \ + ${par_strip_suffix:+--strip-suffix "${par_strip_suffix}"} \ + ${par_prefix:+--prefix "${par_prefix}"} \ + ${par_suffix:+--suffix "${par_suffix}"} \ + ${par_rename:+--rename "${par_rename}"} \ + ${par_zero_cap:+--zero-cap} \ +) +debug "Arguments to cutadapt:" +debug $mod_args +debug + +# Filtering of processed reads arguments +########################################################### +echo ">> Filtering of processed reads arguments" +[[ "$par_discard_trimmed" == "false" ]] && unset par_discard_trimmed +[[ "$par_discard_untrimmed" == "false" ]] && unset par_discard_untrimmed +[[ "$par_discard_casava" == "false" ]] && unset par_discard_casava + +# Parse and transform the minimum and maximum length arguments +[[ -z $par_minimum_length ]] + +filter_args=$(echo \ + ${par_minimum_length:+--minimum-length "${par_minimum_length}"} \ + ${par_maximum_length:+--maximum-length "${par_maximum_length}"} \ + ${par_max_n:+--max-n "${par_max_n}"} \ + ${par_max_expected_errors:+--max-expected-errors "${par_max_expected_errors}"} \ + ${par_max_average_error_rate:+--max-average-error-rate "${par_max_average_error_rate}"} \ + ${par_discard_trimmed:+--discard-trimmed} \ + ${par_discard_untrimmed:+--discard-untrimmed} \ + ${par_discard_casava:+--discard-casava} \ +) +debug "Arguments to cutadapt:" +debug $filter_args +debug + +# Optional output arguments +########################################################### +echo ">> Optional arguments" +[[ "$par_json" == "false" ]] && unset par_json +[[ "$par_fasta" == "false" ]] && unset par_fasta +[[ "$par_info_file" == "false" ]] && unset par_info_file + +optional_output_args=$(echo \ + ${par_report:+--report "${par_report}"} \ + ${par_json:+--json "report.json"} \ + ${par_fasta:+--fasta} \ + ${par_info_file:+--info-file "info.txt"} \ +) + +debug "Arguments to cutadapt:" +debug $optional_output_args +debug + +# Output arguments +# We write the output to a directory rather than +# individual files. +########################################################### + +if [[ -z $par_fasta ]]; then + ext="fastq" +else + ext="fasta" +fi + +demultiplex_mode="$par_demultiplex_mode" +if [[ $mode == "se" ]]; then + if [[ "$demultiplex_mode" == "unique_dual" ]] || [[ "$demultiplex_mode" == "combinatorial_dual" ]]; then + echo "Demultiplexing dual indexes is not possible with single-end data." + exit 1 + fi + prefix="trimmed_" + if [[ ! 
-z "$demultiplex_mode" ]]; then + prefix="{name}_" + fi + output_args=$(echo \ + --output "$output_dir/${prefix}001.$ext" \ + ) +else + demultiplex_indicator_r1='{name}_' + demultiplex_indicator_r2=$demultiplex_indicator_r1 + if [[ "$demultiplex_mode" == "combinatorial_dual" ]]; then + demultiplex_indicator_r1='{name1}_{name2}_' + demultiplex_indicator_r2='{name1}_{name2}_' + fi + prefix_r1="trimmed_" + prefix_r2="trimmed_" + if [[ ! -z "$demultiplex_mode" ]]; then + prefix_r1=$demultiplex_indicator_r1 + prefix_r2=$demultiplex_indicator_r2 + fi + output_args=$(echo \ + --output "$output_dir/${prefix_r1}R1_001.$ext" \ + --paired-output "$output_dir/${prefix_r2}R2_001.$ext" \ + ) +fi + +debug "Arguments to cutadapt:" +debug $output_args +debug + +# Full CLI +# Set the --cores argument to 0 unless meta_cpus is set +########################################################### +echo ">> Running cutadapt" +par_cpus=0 +[[ ! -z $meta_cpus ]] && par_cpus=$meta_cpus + +cli=$(echo \ + $input \ + $adapter_args \ + $paired_args \ + $input_args \ + $mod_args \ + $filter_args \ + $optional_output_args \ + $output_args \ + --cores $par_cpus +) + +debug ">> Full CLI to be run:" +debug cutadapt $cli | sed -e 's/--/\r\n --/g' +debug + +cutadapt $cli diff --git a/src/cutadapt/test.sh b/src/cutadapt/test.sh new file mode 100644 index 00000000..28248742 --- /dev/null +++ b/src/cutadapt/test.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +set -e +set -eo pipefail + +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} +assert_file_empty() { + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } +} +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} +assert_file_not_contains() { + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } +} +############################################# + +mkdir test_multiple_output +cd test_multiple_output + +echo "#############################################" +echo "> Run cutadapt with multiple outputs" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test/*.fasta" \ + --adapter ADAPTER \ + --input example.fa \ + --fasta \ + --demultiplex_mode single \ + --no_match_adapter_wildcards \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test/1_001.fasta" +assert_file_exists "out_test/unknown_001.fasta" + +cd .. 
+echo + +############################################# +mkdir test_simple_single_end +cd test_simple_single_end + +echo "#############################################" +echo "> Run cutadapt on single-end data" + +cat > example.fa <<'EOF' +>read1 +MYSEQUENCEADAPTER +>read2 +MYSEQUENCEADAP +>read3 +MYSEQUENCEADAPTERSOMETHINGELSE +>read4 +MYSEQUENCEADABTER +>read5 +MYSEQUENCEADAPTR +>read6 +MYSEQUENCEADAPPTER +>read7 +ADAPTERMYSEQUENCE +>read8 +PTERMYSEQUENCE +>read9 +SOMETHINGADAPTERMYSEQUENCE +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test1/*.fasta" \ + --adapter ADAPTER \ + --input example.fa \ + --demultiplex_mode single \ + --fasta \ + --no_match_adapter_wildcards \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test1/1_001.fasta" +assert_file_exists "out_test1/unknown_001.fasta" + +echo ">> Check if output is empty" +assert_file_not_empty "report.json" +assert_file_not_empty "out_test1/1_001.fasta" +assert_file_not_empty "out_test1/unknown_001.fasta" + +echo ">> Check contents" +for i in 1 2 3 7 9; do + assert_file_contains "out_test1/1_001.fasta" ">read$i" +done +for i in 4 5 6 8; do + assert_file_contains "out_test1/unknown_001.fasta" ">read$i" +done + +cd .. +echo + +############################################# +mkdir test_multiple_single_end +cd test_multiple_single_end + +echo "#############################################" +echo "> Run with a combination of inputs" + +cat > example.fa <<'EOF' +>read1 +ACGTACGTACGTAAAAA +>read2 +ACGTACGTACGTCCCCC +>read3 +ACGTACGTACGTGGGGG +>read4 +ACGTACGTACGTTTTTT +EOF + +cat > adapters1.fasta <<'EOF' +>adapter1 +CCCCC +EOF + +cat > adapters2.fasta <<'EOF' +>adapter2 +GGGGG +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test2/*.fasta" \ + --adapter AAAAA \ + --adapter_fasta adapters1.fasta \ + --adapter_fasta adapters2.fasta \ + --demultiplex_mode single \ + --input example.fa \ + --fasta \ + --json + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test2/1_001.fasta" +assert_file_exists "out_test2/adapter1_001.fasta" +assert_file_exists "out_test2/adapter2_001.fasta" +assert_file_exists "out_test2/unknown_001.fasta" + +echo ">> Check if output is empty" +assert_file_not_empty "report.json" +assert_file_not_empty "out_test2/1_001.fasta" +assert_file_not_empty "out_test2/adapter1_001.fasta" +assert_file_not_empty "out_test2/adapter2_001.fasta" +assert_file_not_empty "out_test2/unknown_001.fasta" + +echo ">> Check contents" +assert_file_contains "out_test2/1_001.fasta" ">read1" +assert_file_contains "out_test2/adapter1_001.fasta" ">read2" +assert_file_contains "out_test2/adapter2_001.fasta" ">read3" +assert_file_contains "out_test2/unknown_001.fasta" ">read4" + +cd .. 
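The paired-end test below depends on the output naming scheme implemented at the end of src/cutadapt/script.sh; a condensed sketch of that logic follows (directory, extension, and mode values are illustrative):

```bash
#!/bin/bash
# Condensed sketch of the paired-end output naming in src/cutadapt/script.sh.
# When demultiplexing, cutadapt itself replaces '{name}' (or '{name1}'/'{name2}'
# for combinatorial dual indexes) with the matched adapter name.
output_dir="out_test3"          # illustrative
ext="fastq"                     # illustrative
demultiplex_mode="unique_dual"  # as in the test below

prefix_r1="trimmed_"
prefix_r2="trimmed_"
if [[ -n "$demultiplex_mode" ]]; then
  if [[ "$demultiplex_mode" == "combinatorial_dual" ]]; then
    prefix_r1='{name1}_{name2}_'
    prefix_r2='{name1}_{name2}_'
  else
    prefix_r1='{name}_'
    prefix_r2='{name}_'
  fi
fi
echo --output "$output_dir/${prefix_r1}R1_001.$ext" \
     --paired-output "$output_dir/${prefix_r2}R2_001.$ext"
# unique_dual -> out_test3/{name}_R1_001.fastq and out_test3/{name}_R2_001.fastq,
# which cutadapt expands to e.g. out_test3/1_R1_001.fastq and
# out_test3/unknown_R1_001.fastq
```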
+echo + +############################################# +mkdir test_simple_paired_end +cd test_simple_paired_end + +echo "#############################################" +echo "> Run cutadapt on paired-end data" + +cat > example_R1.fastq <<'EOF' +@read1 +ACGTACGTACGTAAAAA ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTCCCCC ++ +IIIIIIIIIIIIIIIII +EOF + +cat > example_R2.fastq <<'EOF' +@read1 +ACGTACGTACGTGGGGG ++ +IIIIIIIIIIIIIIIII +@read2 +ACGTACGTACGTTTTTT ++ +IIIIIIIIIIIIIIIII +EOF + +"$meta_executable" \ + --report minimal \ + --output "out_test3/*.fastq" \ + --adapter AAAAA \ + --adapter_r2 GGGGG \ + --input example_R1.fastq \ + --input_r2 example_R2.fastq \ + --quality_cutoff 20 \ + --demultiplex_mode unique_dual \ + --json \ + ---cpus 1 + +echo ">> Checking output" +assert_file_exists "report.json" +assert_file_exists "out_test3/1_R1_001.fastq" +assert_file_exists "out_test3/1_R2_001.fastq" +assert_file_exists "out_test3/unknown_R1_001.fastq" +assert_file_exists "out_test3/unknown_R2_001.fastq" + +echo ">> Check if output is empty" +assert_file_not_empty "report.json" +assert_file_not_empty "out_test3/1_R1_001.fastq" +assert_file_not_empty "out_test3/1_R2_001.fastq" +assert_file_not_empty "out_test3/unknown_R1_001.fastq" + +echo ">> Check contents" +assert_file_contains "out_test3/1_R1_001.fastq" "@read1" +assert_file_contains "out_test3/1_R2_001.fastq" "@read1" +assert_file_contains "out_test3/unknown_R1_001.fastq" "@read2" +assert_file_contains "out_test3/unknown_R2_001.fastq" "@read2" + +cd .. +echo + +############################################# + +echo "#############################################" +echo "> Test successful" + diff --git a/src/gffread/config.vsh.yaml b/src/gffread/config.vsh.yaml index d2c41a87..7477a284 100644 --- a/src/gffread/config.vsh.yaml +++ b/src/gffread/config.vsh.yaml @@ -8,8 +8,6 @@ links: references: doi: 10.12688/f1000research.23297.2 license: MIT -requirements: - commands: [ gffread ] argument_groups: - name: Inputs arguments: @@ -52,7 +50,7 @@ argument_groups: required: true description: | Write the output records into . 
- default: output.gff + example: output.gff - name: --force_exons type: boolean_true description: | @@ -154,7 +152,6 @@ argument_groups: - name: --table type: string multiple: true - multiple_sep: "," description: | Output a simple tab delimited format instead of GFF, with columns having the values of GFF attributes given in ; special pseudo-attributes (prefixed by @) are diff --git a/src/gffread/script.sh b/src/gffread/script.sh index 9c4a2b8f..cd4abf14 100644 --- a/src/gffread/script.sh +++ b/src/gffread/script.sh @@ -50,6 +50,8 @@ [[ "$par_expose_dups" == "false" ]] && unset par_expose_dups [[ "$par_cluster_only" == "false" ]] && unset par_cluster_only +# if par_table is not empty, replace ";" with "," +par_table=$(echo "$par_table" | tr ';' ',') $(which gffread) \ "$par_input" \ diff --git a/src/gffread/test.sh b/src/gffread/test.sh index 326fce50..ea23edcb 100755 --- a/src/gffread/test.sh +++ b/src/gffread/test.sh @@ -86,7 +86,7 @@ diff "$expected_output_dir/transcripts.fa" "$test_output_dir/transcripts.fa" || echo "> Test 4 - Generate table from GFF annotation file" "$meta_executable" \ - --table @id,@chr,@start,@end,@strand,@exons,Name,gene,product \ + --table "@id;@chr;@start;@end;@strand;@exons;Name;gene;product" \ --outfile "$test_output_dir/annotation.tbl" \ --input "$test_dir/sequence.gff3" diff --git a/src/multiqc/config.vsh.yaml b/src/multiqc/config.vsh.yaml index 0a3a784b..df5e38e1 100644 --- a/src/multiqc/config.vsh.yaml +++ b/src/multiqc/config.vsh.yaml @@ -54,25 +54,21 @@ argument_groups: - name: "--include_modules" type: string multiple: true - multiple_sep: "," - example: fastqc,cutadapt + example: [fastqc, cutadapt] description: Use only these modules - name: "--exclude_modules" type: string multiple: true - multiple_sep: "," - example: fastqc,cutadapt + example: [fastqc, cutadapt] description: Do not use these modules - name: "--ignore_analysis" type: string multiple: true - multiple_sep: "," - example: run_one/*,run_two/* + example: [run_one/*, run_two/*] - name: "--ignore_samples" type: string multiple: true - multiple_sep: "," - example: sample_1*,sample_3* + example: [sample_1*, sample_3*] - name: "--ignore_symlinks" type: boolean_true description: Ignore symlinked directories and files diff --git a/src/multiqc/script.sh b/src/multiqc/script.sh index 6353eb11..ad8c1c0c 100755 --- a/src/multiqc/script.sh +++ b/src/multiqc/script.sh @@ -38,7 +38,7 @@ IFS=";" read -ra inputs <<< $par_input if [[ -n "$par_include_modules" ]]; then include_modules="" - IFS="," read -ra incl_modules <<< $par_include_modules + IFS=";" read -ra incl_modules <<< $par_include_modules for i in "${incl_modules[@]}"; do include_modules+="--module $i " done @@ -47,7 +47,7 @@ fi if [[ -n "$par_exclude_modules" ]]; then exclude_modules="" - IFS="," read -ra excl_modules <<< $par_exclude_modules + IFS=";" read -ra excl_modules <<< $par_exclude_modules for i in "${excl_modules[@]}"; do exclude_modules+="--exclude $i " done @@ -56,7 +56,7 @@ fi if [[ -n "$par_ignore_analysis" ]]; then ignore="" - IFS="," read -ra ignore_analysis <<< $par_ignore_analysis + IFS=";" read -ra ignore_analysis <<< $par_ignore_analysis for i in "${ignore_analysis[@]}"; do ignore+="--ignore $i " done @@ -65,7 +65,7 @@ fi if [[ -n "$par_ignore_samples" ]]; then ignore_samples="" - IFS="," read -ra ign_samples <<< $par_ignore_samples + IFS=";" read -ra ign_samples <<< $par_ignore_samples for i in "${ign_samples[@]}"; do ignore_samples+="--ignore-samples $i " done diff --git a/src/pear/script.sh b/src/pear/script.sh index
f7d6a28f..9eff147b 100644 --- a/src/pear/script.sh +++ b/src/pear/script.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -eo pipefail + ## VIASH START ## VIASH END diff --git a/src/salmon/salmon_index/config.vsh.yaml b/src/salmon/salmon_index/config.vsh.yaml index f24cd3a9..41c1e05b 100644 --- a/src/salmon/salmon_index/config.vsh.yaml +++ b/src/salmon/salmon_index/config.vsh.yaml @@ -19,7 +19,7 @@ argument_groups: - name: --genome type: file description: | - Genome of the organism to prepare the set of decoy sequences. Required to build decoy-aware transccriptome. + Genome of the organism to prepare the set of decoy sequences. Required to build decoy-aware transcriptome. required: false example: genome.fasta - name: --transcripts @@ -110,4 +110,4 @@ engines: salmon index -v 2>&1 | sed 's/salmon \([0-9.]*\)/salmon: \1/' > /var/software_versions.txt runners: - type: executable - - type: nextflow \ No newline at end of file + - type: nextflow diff --git a/src/salmon/salmon_quant/config.vsh.yaml b/src/salmon/salmon_quant/config.vsh.yaml index 47d72665..b7e303f4 100644 --- a/src/salmon/salmon_quant/config.vsh.yaml +++ b/src/salmon/salmon_quant/config.vsh.yaml @@ -42,7 +42,7 @@ argument_groups: type: file description: | Salmon index. - required: true + required: false example: transcriptome_index - name: --unmated_reads alternatives: ["-r"] @@ -320,12 +320,15 @@ argument_groups: example: 0.00001 - name: --write_mappings alternatives: ["-z"] - type: file - direction: output + type: boolean_true description: | If this option is provided, then the selective-alignment results will be written out in SAM-compatible format. By default, output will be directed to stdout, but an alternative file name can be provided instead. + - name: --mapping_sam + type: file + description: Path to file that should output the selective-alignment results in SAM-compatible format. 
This option must be provided when using --write_mappings. required: false - example: mappings.sam + direction: output + example: mappings.sam - name: --write_qualities type: boolean_true description: | diff --git a/src/salmon/salmon_quant/script.sh b/src/salmon/salmon_quant/script.sh index ace79711..4c9f69d5 100644 --- a/src/salmon/salmon_quant/script.sh +++ b/src/salmon/salmon_quant/script.sh @@ -21,6 +21,7 @@ set -e [[ "$par_softclip_overhangs" == "false" ]] && unset par_softclip_overhangs [[ "$par_full_length_alignment" == "false" ]] && unset par_full_length_alignment [[ "$par_hard_filter" == "false" ]] && unset par_hard_filter +[[ "$par_write_mappings" == "false" ]] && unset par_write_mappings [[ "$par_write_qualities" == "false" ]] && unset par_write_qualities [[ "$par_alternative_init_mode" == "false" ]] && unset par_alternative_init_mode [[ "$par_skip_quant" == "false" ]] && unset par_skip_quant @@ -96,7 +97,7 @@ salmon quant \ ${par_full_length_alignment:+--fullLengthAlignment} \ ${par_hard_filter:+--hardFilter} \ ${par_min_aln_prob:+--minAlnProb "${par_min_aln_prob}"} \ - ${par_write_mappings:+-z "${par_write_mappings}"} \ + ${par_write_mappings:+--writeMappings="${par_mapping_sam}"} \ ${par_write_qualities:+--writeQualities} \ ${par_hit_filter_policy:+--hitFilterPolicy "${par_hit_filter_policy}"} \ ${par_alternative_init_mode:+--alternativeInitMode} \ diff --git a/src/samtools/samtools_fasta/config.vsh.yaml b/src/samtools/samtools_fasta/config.vsh.yaml new file mode 100644 index 00000000..23517f6c --- /dev/null +++ b/src/samtools/samtools_fasta/config.vsh.yaml @@ -0,0 +1,191 @@ +name: samtools_fasta +namespace: samtools +description: Converts a SAM, BAM or CRAM to FASTA format. +keywords: [fasta, bam, sam, cram] +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/samtools-fasta.html + repository: https://github.com/samtools/samtools +references: + doi: [10.1093/bioinformatics/btp352, 10.1093/gigascience/giab008] +license: MIT/Expat + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: input SAM/BAM/CRAM file + required: true + - name: Outputs + arguments: + - name: --output + type: file + description: output FASTA file + required: true + direction: output + - name: Options + arguments: + - name: --no_suffix + alternatives: -n + type: boolean_true + description: | + By default, either '/1' or '/2' is added to the end of read names where the corresponding + READ1 or READ2 FLAG bit is set. Using -n causes read names to be left as they are. + - name: --suffix + alternatives: -N + type: boolean_true + description: | + Always add either '/1' or '/2' to the end of read names even when put into different files. + - name: --use_oq + alternatives: -O + type: boolean_true + description: | + Use quality values from OQ tags in preference to standard quality string if available. + - name: --singleton + alternatives: -s + type: file + description: write singleton reads to FILE. + - name: --copy_tags + alternatives: -t + type: boolean_true + description: | + Copy RG, BC and QT tags to the FASTA header line, if they exist. + - name: --copy_tags_list + alternatives: -T + type: string + description: | + Specify a comma-separated list of tags to copy to the FASTA header line, if they exist. + TAGLIST can be blank or `*` to indicate all tags should be copied to the output. If using `*`, + be careful to quote it to avoid unwanted shell expansion.
+ - name: --read1 + alternatives: -1 + type: file + description: | + Write reads with the READ1 FLAG set (and READ2 not set) to FILE instead of outputting them. + If the -s option is used, only paired reads will be written to this file. + direction: output + - name: --read2 + alternatives: -2 + type: file + description: | + Write reads with the READ2 FLAG set (and READ1 not set) to FILE instead of outputting them. + If the -s option is used, only paired reads will be written to this file. + direction: output + - name: --output_reads + alternatives: -o + type: file + description: | + Write reads with either READ1 FLAG or READ2 flag set to FILE instead of outputting them to stdout. + This is equivalent to -1 FILE -2 FILE. + direction: output + - name: --output_reads_both + alternatives: -0 + type: file + description: | + Write reads where the READ1 and READ2 FLAG bits set are either both set or both unset to FILE + instead of outputting them. + direction: output + - name: --filter_flags + alternatives: -f + type: integer + description: | + Only output alignments with all bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with '0' + (i.e. /^0[0-7]+/). Default: `0`. + example: 0 + - name: --excl_flags + alternatives: -F + type: string + description: | + Do not output alignments with any bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with '0' + (i.e. /^0[0-7]+/). This defaults to 0x900 representing filtering of secondary and + supplementary alignments. Default: `0x900`. + example: "0x900" + - name: --incl_flags + alternatives: --rf + type: string + description: | + Only output alignments with any bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with '0' + (i.e. /^0[0-7]+/), as a decimal number not beginning with '0' or as a comma-separated list of + flag names. Default: `0`. + example: 0 + - name: --excl_flags_all + alternatives: -G + type: integer + description: | + Only EXCLUDE reads with all of the bits set in INT present in the FLAG field. INT can be specified + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with '0' (i.e. /^0[0-7]+/). + Default: `0`. + example: 0 + - name: --aux_tag + alternatives: -d + type: string + description: | + Only output alignments containing an auxiliary tag matching both TAG and VAL. If VAL is omitted + then any value is accepted. The tag types supported are i, f, Z, A and H. "B" arrays are not + supported. This is comparable to the method used in samtools view --tag. The option may be specified + multiple times and is equivalent to using the --aux_tag_file option. + - name: --aux_tag_file + alternatives: -D + type: string + description: | + Only output alignments containing an auxiliary tag matching TAG and having a value listed in FILE. + The format of the file is one line per value. This is equivalent to specifying --aux_tag multiple times. + - name: --casava + alternatives: -i + type: boolean_true + description: add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG) + - name: --compression + alternatives: -c + type: integer + description: set compression level when writing gz or bgzf fasta files. + example: 0 + - name: --index1 + alternatives: --i1 + type: file + description: write first index reads to FILE. 
+ - name: --index2 + alternatives: --i2 + type: file + description: write second index reads to FILE. + - name: --barcode_tag + type: string + description: | + Auxiliary tag to find index reads in. Default: `BC`. + example: "BC" + - name: --quality_tag + type: string + description: | + Auxiliary tag to find index quality in. Default: `QT`. + example: "QT" + - name: --index_format + type: string + description: | + string to describe how to parse the barcode and quality tags. For example: + * `i14i8`: the first 14 characters are index 1, the next 8 characters are index 2. + * `n8i14`: ignore the first 8 characters, and use the next 14 characters for index 1. + If the tag contains a separator, then the numeric part can be replaced with`*` to mean + 'read until the separator or end of tag', for example: `n*i*`. + +resources: + - type: bash_script + path: ../samtools_fastq/script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/samtools:1.19.2--h50ea8bc_1 + setup: + - type: docker + run: | + samtools --version 2>&1 | grep -E '^(samtools|Using htslib)' | \ + sed 's#Using ##;s# \([0-9\.]*\)$#: \1#' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow diff --git a/src/samtools/samtools_fasta/help.txt b/src/samtools/samtools_fasta/help.txt new file mode 100644 index 00000000..39ed0d00 --- /dev/null +++ b/src/samtools/samtools_fasta/help.txt @@ -0,0 +1,80 @@ +``` +samtools fastq +``` + +Usage: samtools fastq [options...] + +Description: +Converts a SAM, BAM or CRAM to FASTQ format. + +Options: + -0 FILE write reads designated READ_OTHER to FILE + -1 FILE write reads designated READ1 to FILE + -2 FILE write reads designated READ2 to FILE + -o FILE write reads designated READ1 or READ2 to FILE + note: if a singleton file is specified with -s, only + paired reads will be written to the -1 and -2 files. + -d, --tag TAG[:VAL] + only include reads containing TAG, optionally with value VAL + -f, --require-flags INT + only include reads with all of the FLAGs in INT present [0] + -F, --excl[ude]-flags INT + only include reads with none of the FLAGs in INT present [0x900] + --rf, --incl[ude]-flags INT + only include reads with any of the FLAGs in INT present [0] + -G INT only EXCLUDE reads with all of the FLAGs in INT present [0] + -n don't append /1 and /2 to the read name + -N always append /1 and /2 to the read name + -O output quality in the OQ tag if present + -s FILE write singleton reads designated READ1 or READ2 to FILE + -t copy RG, BC and QT tags to the FASTQ header line + -T TAGLIST copy arbitrary tags to the FASTQ header line, '*' for all + -v INT default quality score if not given in file [1] + -i add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG) + -c INT compression level [0..9] to use when writing bgzf files [1] + --i1 FILE write first index reads to FILE + --i2 FILE write second index reads to FILE + --barcode-tag TAG + Barcode tag [BC] + --quality-tag TAG + Quality tag [QT] + --index-format STR + How to parse barcode and quality tags + + --input-fmt-option OPT[=VAL] + Specify a single input file format option in the form + of OPTION or OPTION=VALUE + --reference FILE + Reference sequence FASTA FILE [null] + -@, --threads INT + Number of additional threads to use [0] + --verbosity INT + Set level of verbosity + +The files will be automatically compressed if the file names have a .gz +or .bgzf extension. The input to this program must be collated by name. 
+Run 'samtools collate' or 'samtools sort -n' to achieve this. + +Reads are designated READ1 if FLAG READ1 is set and READ2 is not set. +Reads are designated READ2 if FLAG READ1 is not set and READ2 is set. +Otherwise reads are designated READ_OTHER (both flags set or both flags unset). +Run 'samtools flags' for more information on flag codes and meanings. + +The index-format string describes how to parse the barcode and quality tags. +It is made up of 'i' or 'n' followed by a length or '*'. For example: + i14i8 The first 14 characters are index 1, the next 8 are index 2 + n8i14 Ignore the first 8 characters, and use the next 14 for index 1 + +If the tag contains a separator, then the numeric part can be replaced with +'*' to mean 'read until the separator or end of tag', for example: + i*i* Break the tag at the separator into index 1 and index 2 + n*i* Ignore the left part of the tag until the separator, + then use the second part of the tag as index 1 + +Examples: +To get just the paired reads in separate files, use: + samtools fastq -1 pair1.fq -2 pair2.fq -0 /dev/null -s /dev/null -n in.bam + +To get all non-supplementary/secondary reads in a single file, redirect +the output: + samtools fastq in.bam > all_reads.fq \ No newline at end of file diff --git a/src/samtools/samtools_fasta/test.sh b/src/samtools/samtools_fasta/test.sh new file mode 100644 index 00000000..687965ae --- /dev/null +++ b/src/samtools/samtools_fasta/test.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +set -eo pipefail + +test_dir="${meta_resources_dir}/test_data" +out_dir="${meta_resources_dir}/out_data" +mkdir -p "$out_dir" + +############################################################################################ + +echo ">>> Test 1: Convert all reads from a bam file to fasta format" +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --output "$out_dir/a.fa" + +echo ">>> Check if output file exists" +[ ! -f "$out_dir/a.fa" ] && echo "Output file a.fa does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! -s "$out_dir/a.fa" ] && echo "Output file a.fa is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff "$out_dir/a.fa" "$test_dir/a.fa" || + (echo "Output file a.fa does not match expected output" && exit 1) + +rm "$out_dir/a.fa" + +############################################################################################ + +echo ">>> Test 2: Convert all reads from a sam file to fasta format" +"$meta_executable" \ + --input "$test_dir/a.sam" \ + --output "$out_dir/a.fa" + +echo ">>> Check if output file exists" +[ ! -f "$out_dir/a.fa" ] && echo "Output file a.fa does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! -s "$out_dir/a.fa" ] && echo "Output file a.fa is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff "$out_dir/a.fa" "$test_dir/a.fa" || + (echo "Output file a.fa does not match expected output" && exit 1) + +rm "$out_dir/a.fa" + +############################################################################################ + +echo ">>> Test 3: Output reads from bam file to separate files" + +"$meta_executable" \ + --input "$test_dir/a.bam" \ + --read1 "$out_dir/a.1.fa" \ + --read2 "$out_dir/a.2.fa" \ + --output "$out_dir/a.fa" + +echo ">>> Check if output files exist" +[ ! -f "$out_dir/a.1.fa" ] && echo "Output file a.1.fa does not exist" && exit 1 +[ ! -f "$out_dir/a.2.fa" ] && echo "Output file a.2.fa does not exist" && exit 1 +[ ! -f "$out_dir/a.fa" ] && echo "Output file a.fa does not exist" && exit 1 + +echo ">>> Check if output files are empty" +[ !
-s "$out_dir/a.1.fa" ] && echo "Output file a.1.fa is empty" && exit 1 +[ ! -s "$out_dir/a.2.fa" ] && echo "Output file a.2.fa is empty" && exit 1 +# output should be empty since input has no singleton reads + +echo ">>> Check if output files match expected output" +diff "$out_dir/a.1.fa" "$test_dir/a.1.fa" || + (echo "Output file a.1.fa does not match expected output" && exit 1) +diff "$out_dir/a.2.fa" "$test_dir/a.2.fa" || + (echo "Output file a.2.fa does not match expected output" && exit 1) + +rm "$out_dir/a.1.fa" "$out_dir/a.2.fa" "$out_dir/a.fa" + +############################################################################################ + +echo ">>> Test 4: Output only forward reads from bam file to fasta format" + +"$meta_executable" \ + --input "$test_dir/a.sam" \ + --excl_flags "0x80" \ + --output "$out_dir/half.fa" + +echo ">>> Check if output file exists" +[ ! -f "$out_dir/half.fa" ] && echo "Output file half.fa does not exist" && exit 1 + +echo ">>> Check if output is empty" +[ ! -s "$out_dir/half.fa" ] && echo "Output file half.fa is empty" && exit 1 + +echo ">>> Check if output matches expected output" +diff "$out_dir/half.fa" "$test_dir/half.fa" || + (echo "Output file half.fa does not match expected output" && exit 1) + +rm "$out_dir/half.fa" + +############################################################################################ + +echo "All tests succeeded!" +exit 0 \ No newline at end of file diff --git a/src/samtools/samtools_fasta/test_data/a.1.fa b/src/samtools/samtools_fasta/test_data/a.1.fa new file mode 100644 index 00000000..2c9fdbe5 --- /dev/null +++ b/src/samtools/samtools_fasta/test_data/a.1.fa @@ -0,0 +1,6 @@ +>a1 +AAAAAAAAAA +>b1 +AAAAAAAAAA +>c1 +AAAAAAAAAA diff --git a/src/samtools/samtools_fasta/test_data/a.2.fa b/src/samtools/samtools_fasta/test_data/a.2.fa new file mode 100644 index 00000000..2c9fdbe5 --- /dev/null +++ b/src/samtools/samtools_fasta/test_data/a.2.fa @@ -0,0 +1,6 @@ +>a1 +AAAAAAAAAA +>b1 +AAAAAAAAAA +>c1 +AAAAAAAAAA diff --git a/src/samtools/samtools_fasta/test_data/a.bam b/src/samtools/samtools_fasta/test_data/a.bam new file mode 100644 index 00000000..dba1268a Binary files /dev/null and b/src/samtools/samtools_fasta/test_data/a.bam differ diff --git a/src/samtools/samtools_fasta/test_data/a.fa b/src/samtools/samtools_fasta/test_data/a.fa new file mode 100644 index 00000000..693cd395 --- /dev/null +++ b/src/samtools/samtools_fasta/test_data/a.fa @@ -0,0 +1,12 @@ +>a1/1 +AAAAAAAAAA +>b1/1 +AAAAAAAAAA +>c1/1 +AAAAAAAAAA +>a1/2 +AAAAAAAAAA +>b1/2 +AAAAAAAAAA +>c1/2 +AAAAAAAAAA diff --git a/src/samtools/samtools_fasta/test_data/a.sam b/src/samtools/samtools_fasta/test_data/a.sam new file mode 100644 index 00000000..aa8c77b3 --- /dev/null +++ b/src/samtools/samtools_fasta/test_data/a.sam @@ -0,0 +1,7 @@ +@SQ SN:xx LN:20 +a1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +b1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +c1 99 xx 1 1 10M = 11 20 AAAAAAAAAA ********** +a1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +b1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** +c1 147 xx 11 1 10M = 1 -20 TTTTTTTTTT ********** diff --git a/src/samtools/samtools_fasta/test_data/half.fa b/src/samtools/samtools_fasta/test_data/half.fa new file mode 100644 index 00000000..36cd438c --- /dev/null +++ b/src/samtools/samtools_fasta/test_data/half.fa @@ -0,0 +1,6 @@ +>a1/1 +AAAAAAAAAA +>b1/1 +AAAAAAAAAA +>c1/1 +AAAAAAAAAA diff --git a/src/samtools/samtools_fasta/test_data/script.sh b/src/samtools/samtools_fasta/test_data/script.sh new file mode 100755 index 
00000000..b59bc1bd --- /dev/null +++ b/src/samtools/samtools_fasta/test_data/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# download test data from snakemake wrapper +if [ ! -d /tmp/fastq_source ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers.git /tmp/fastq_source +fi + +cp -r /tmp/fastq_source/bio/samtools/fastx/test/*.sam src/samtools/samtools_fastq/test_data/ +cp -r /tmp/fastq_source/bio/samtools/fastq/interleaved/test/mapped/*.bam src/samtools/samtools_fastq/test_data/ +cp -r /tmp/fastq_source/bio/samtools/fastq/interleaved/test/reads/*.fq src/samtools/samtools_fastq/test_data/ +cp -r /tmp/fastq_source/bio/samtools/fastq/separate/test/reads/*.fq src/samtools/samtools_fastq/test_data/ \ No newline at end of file diff --git a/src/samtools/samtools_fastq/config.vsh.yaml b/src/samtools/samtools_fastq/config.vsh.yaml index 39e926f0..cac7653b 100644 --- a/src/samtools/samtools_fastq/config.vsh.yaml +++ b/src/samtools/samtools_fastq/config.vsh.yaml @@ -56,7 +56,7 @@ argument_groups: type: string description: | Specify a comma-separated list of tags to copy to the FASTQ header line, if they exist. - TAGLIST can be blank or * to indicate all tags should be copied to the output. If using *, + TAGLIST can be blank or `*` to indicate all tags should be copied to the output. If using `*`, be careful to quote it to avoid unwanted shell expansion. - name: --read1 alternatives: -1 @@ -91,35 +91,35 @@ type: integer description: | Only output alignments with all bits set in INT present in the FLAG field. INT can be specified - in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' - (i.e. /^0[0-7]+/). - default: 0 + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with '0' + (i.e. /^0[0-7]+/). Default: `0`. + example: 0 - name: --excl_flags alternatives: -F type: string description: | Do not output alignments with any bits set in INT present in the FLAG field. INT can be specified - in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with '0' (i.e. /^0[0-7]+/). This defaults to 0x900 representing filtering of secondary and - supplementary alignments. - default: 0x900 + supplementary alignments. Default: `0x900`. + example: "0x900" - name: --incl_flags alternatives: --rf type: string description: | Only output alignments with any bits set in INT present in the FLAG field. INT can be specified - in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with `0' + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/), in octal by beginning with '0' (i.e. /^0[0-7]+/), as a decimal number not beginning with '0' or as a comma-separated list of - flag names. - default: 0 + flag names. Default: `0`. + example: 0 - name: --excl_flags_all alternatives: -G type: integer description: | Only EXCLUDE reads with all of the bits set in INT present in the FLAG field. INT can be specified - in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' - (i.e. /^0[0-7]+/). - default: 0 + in hex by beginning with '0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with '0' (i.e. /^0[0-7]+/). + Default: `0`.
+ example: 0 - name: --aux_tag alternatives: -d type: string @@ -137,12 +137,13 @@ argument_groups: - name: --casava alternatives: -i type: boolean_true - description: add Illumina Casava 1.8 format entry to header (eg 1:N:0:ATCACG) + description: | + Add Illumina Casava 1.8 format entry to header, for example: `1:N:0:ATCACG`. - name: --compression alternatives: -c type: integer description: set compression level when writing gz or bgzf fastq files. - default: 0 + example: 0 - name: --index1 alternatives: --i1 type: file @@ -153,20 +154,22 @@ argument_groups: description: write second index reads to FILE. - name: --barcode_tag type: string - description: Auxiliary tag to find index reads in. - default: BC + description: | + Auxiliary tag to find index reads in. Default: `BC`. + example: "BC" - name: --quality_tag type: string - description: Auxiliary tag to find index quality in. - default: QT + description: | + Auxiliary tag to find index quality in. Default: `QT`. + example: QT - name: --index_format type: string description: | string to describe how to parse the barcode and quality tags. For example: - [i14i8]: the first 14 characters are index 1, the next 8 characters are index 2. - [n8i14]: ignore the first 8 characters, and use the next 14 characters for index 1. + * `i14i8`: the first 14 characters are index 1, the next 8 characters are index 2. + * `n8i14`: ignore the first 8 characters, and use the next 14 characters for index 1. If the tag contains a separator, then the numeric part can be replaced with '*' to mean - 'read until the separator or end of tag', for example: [n*i*]. + 'read until the separator or end of tag', for example: `n*i*`. resources: - type: bash_script diff --git a/src/samtools/samtools_fastq/script.sh b/src/samtools/samtools_fastq/script.sh index 367432f9..0cad9cfe 100644 --- a/src/samtools/samtools_fastq/script.sh +++ b/src/samtools/samtools_fastq/script.sh @@ -11,7 +11,14 @@ set -e [[ "$par_copy_tags" == "false" ]] && unset par_copy_tags [[ "$par_casava" == "false" ]] && unset par_casava -samtools fastq \ +if [[ "$meta_name" == "samtools_fasta" ]]; then + subcommand=fasta +elif [[ "$meta_name" == "samtools_fastq" ]]; then + subcommand=fastq +else + echo "Unrecognized component name" && exit 1 +fi +samtools "$subcommand" \ ${par_no_suffix:+-n} \ ${par_suffix:+-N} \ ${par_use_oq:+-O} \ diff --git a/src/samtools/samtools_stats/config.vsh.yaml b/src/samtools/samtools_stats/config.vsh.yaml index 0d8f57a4..ca630876 100644 --- a/src/samtools/samtools_stats/config.vsh.yaml +++ b/src/samtools/samtools_stats/config.vsh.yaml @@ -30,10 +30,10 @@ argument_groups: - name: --coverage alternatives: -c type: integer - description: | - Coverage distribution min,max,step [1,1000,1]. multiple: true - multiple_sep: ',' + description: | + Coverage distribution min;max;step. Default: [1, 1000, 1]. + example: [1, 1000, 1] - name: --remove_dups alternatives: -d type: boolean_true @@ -48,25 +48,25 @@ argument_groups: alternatives: -f type: string description: | - Required flag, 0 for unset. See also `samtools flags`. - default: "0" + Required flag, 0 for unset. See also `samtools flags`. Default: `"0"`. + example: "0" - name: --filtering_flag alternatives: -F type: string description: | - Filtering flag, 0 for unset. See also `samtools flags`. - default: "0" + Filtering flag, 0 for unset. See also `samtools flags`. Default: `0`. + example: "0" - name: --GC_depth type: double description: | - The size of GC-depth bins (decreasing bin size increases memory requirement). 
- default: 20000.0 + The size of GC-depth bins (decreasing bin size increases memory requirement). Default: `20000.0`. + example: 20000.0 - name: --insert_size alternatives: -i type: integer description: | - Maximum insert size. - default: 8000 + Maximum insert size. Default: `8000`. + example: 8000 - name: --id alternatives: -I type: string @@ -76,14 +76,14 @@ alternatives: -l type: integer description: | - Include in the statistics only reads with the given read length. - default: -1 + Include in the statistics only reads with the given read length. Default: `-1`. + example: -1 - name: --most_inserts alternatives: -m type: double description: | - Report only the main part of inserts. - default: 0.99 + Report only the main part of inserts. Default: `0.99`. + example: 0.99 - name: --split_prefix alternatives: -P type: string @@ -93,8 +93,8 @@ alternatives: -q type: integer description: | - The BWA trimming parameter. - default: 0 + The BWA trimming parameter. Default: `0`. + example: 0 - name: --ref_seq alternatives: -r type: file @@ -124,8 +124,8 @@ alternatives: -g type: integer description: | - Only bases with coverage above this value will be included in the target percentage computation. - default: 0 + Only bases with coverage above this value will be included in the target percentage computation. Default: `0`. + example: 0 - name: --input_fmt_option type: string description: | @@ -141,7 +141,7 @@ type: file description: | Output file. - default: "out.txt" + example: "out.txt" required: true direction: output diff --git a/src/samtools/samtools_stats/script.sh b/src/samtools/samtools_stats/script.sh index 6e32e9a5..e3872fc6 100644 --- a/src/samtools/samtools_stats/script.sh +++ b/src/samtools/samtools_stats/script.sh @@ -10,6 +10,9 @@ set -e [[ "$par_sparse" == "false" ]] && unset par_sparse [[ "$par_remove_overlaps" == "false" ]] && unset par_remove_overlaps +# change the coverage input from X;X;X to X,X,X +par_coverage=$(echo "$par_coverage" | tr ';' ',') + samtools stats \ ${par_coverage:+-c "$par_coverage"} \ ${par_remove_dups:+-d} \ diff --git a/src/samtools/samtools_stats/test.sh b/src/samtools/samtools_stats/test.sh index 05d70d30..b515100e 100644 --- a/src/samtools/samtools_stats/test.sh +++ b/src/samtools/samtools_stats/test.sh @@ -17,7 +17,7 @@ echo ">>> Checking whether output is non-empty" [ ! -s "$test_dir/test.paired_end.sorted.txt" ] && echo "File 'test.paired_end.sorted.txt' is empty!" && exit 1 echo ">>> Checking whether output is correct" -# compare using diff, ignoring the line stating the command that was passed. +# compare using diff, ignoring the line stating the command that was passed. diff <(grep -v "^# The command" "$test_dir/test.paired_end.sorted.txt") \ <(grep -v "^# The command" "$test_dir/ref.paired_end.sorted.txt") || \ (echo "Output file ref.paired_end.sorted.txt does not match expected output" && exit 1) diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml index 8fdd5256..eab65b35 100644 --- a/src/star/star_align_reads/config.vsh.yaml +++ b/src/star/star_align_reads/config.vsh.yaml @@ -72,6 +72,12 @@ argument_groups: description: The output file containing the splice junctions. direction: output example: splice_junctions.tsv + - type: file + name: --reads_aligned_to_transcriptome + required: false + description: The output file containing the alignments to the transcriptome in BAM format.
This file is generated when --quantMode is set to TranscriptomeSAM. + direction: output + example: transcriptome_aligned.bam # other arguments are defined in a separate file __merge__: argument_groups.yaml resources: diff --git a/src/star/star_align_reads/script.py b/src/star/star_align_reads/script.py index 2bde8798..f3d64a57 100644 --- a/src/star/star_align_reads/script.py +++ b/src/star/star_align_reads/script.py @@ -58,7 +58,8 @@ "log": "Log.final.out", "splice_junctions": "SJ.out.tab", "unmapped": "Unmapped.out.mate1", - "unmapped_r2": "Unmapped.out.mate2" + "unmapped_r2": "Unmapped.out.mate2", + "reads_aligned_to_transcriptome": "Aligned.toTranscriptome.out.bam" } output_paths = {name: par[name] for name in expected_outputs.keys()} for name in expected_outputs.keys(): diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh index 374b9014..bd78094d 100644 --- a/src/star/star_align_reads/test.sh +++ b/src/star/star_align_reads/test.sh @@ -7,35 +7,34 @@ meta_executable="target/docker/star/star_align_reads/star_align_reads" meta_resources_dir="src/star/star_align_reads" ## VIASH END -######################################################################################### - +############################################# # helper functions assert_file_exists() { - [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } } assert_file_doesnt_exist() { - [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } } assert_file_empty() { - [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) + [ ! -s "$1" ] || { echo "File '$1' is not empty but should be" && exit 1; } } assert_file_not_empty() { - [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } } assert_file_contains() { - grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains() { - grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } assert_file_contains_regex() { - grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) + grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } } assert_file_not_contains_regex() { - grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) + grep -q -E "$2" "$1" && { echo "File '$1' contains '$2' but shouldn't" && exit 1; } } +############################################# -######################################################################################### echo "> Prepare test data" cat > reads_R1.fastq <<'EOF' @@ -98,6 +97,7 @@ echo "> Run star_align_reads on SE" --reads_per_gene "reads_per_gene.tsv" \ --outSJtype Standard \ --splice_junctions "splice_junctions.tsv" \ + --reads_aligned_to_transcriptome "transcriptome_aligned.bam" \ ${meta_cpus:+---cpus $meta_cpus} # TODO: Test data doesn't contain any chimeric reads yet @@ -111,6 +111,7 @@ assert_file_exists "reads_per_gene.tsv" # assert_file_exists "chimeric_junctions.tsv" assert_file_exists "splice_junctions.tsv" assert_file_exists "unmapped.sam" +assert_file_exists "transcriptome_aligned.bam" echo ">> Check if output contents are not empty" assert_file_not_empty "output.sam" 
@@ -119,6 +120,7 @@ assert_file_not_empty "reads_per_gene.tsv" # assert_file_not_empty "chimeric_junctions.tsv" # assert_file_not_empty "splice_junctions.tsv" # TODO: test data doesn't contain any splice junctions yet assert_file_not_empty "unmapped.sam" +assert_file_not_empty "transcriptome_aligned.bam" echo ">> Check if output contents are correct" assert_file_contains "log.txt" "Number of input reads \\| 2" diff --git a/src/star/star_genome_generate/config.vsh.yaml b/src/star/star_genome_generate/config.vsh.yaml new file mode 100644 index 00000000..3adaf7a2 --- /dev/null +++ b/src/star/star_genome_generate/config.vsh.yaml @@ -0,0 +1,139 @@ +name: star_genome_generate +namespace: star +description: | + Create index for STAR +keywords: [genome, index, align] +links: + repository: https://github.com/alexdobin/STAR + documentation: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf +references: + doi: 10.1093/bioinformatics/bts635 +license: MIT +requirements: + commands: [ STAR ] + +argument_groups: +- name: "Input" + arguments: + - name: "--genomeFastaFiles" + type: file + description: | + Path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + required: true + multiple: yes + multiple_sep: ; + - name: "--sjdbGTFfile" + type: file + description: Path to the GTF file with annotations + - name: --sjdbOverhang + type: integer + description: Length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + example: 100 + - name: --sjdbGTFchrPrefix + type: string + description: Prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSEMBL annotations with UCSC genomes) + - name: --sjdbGTFfeatureExon + type: string + description: Feature type in GTF file to be used as exons for building transcripts + example: exon + - name: --sjdbGTFtagExonParentTranscript + type: string + description: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + example: transcript_id + - name: --sjdbGTFtagExonParentGene + type: string + description: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + example: gene_id + - name: --sjdbGTFtagExonParentGeneName + type: string + description: GTF attribute name for parent gene name + example: gene_name + multiple: yes + multiple_sep: ; + - name: --sjdbGTFtagExonParentGeneType + type: string + description: GTF attribute name for parent gene type + example: + - gene_type + - gene_biotype + multiple: yes + multiple_sep: ; + - name: --limitGenomeGenerateRAM + type: long + description: Maximum available RAM (bytes) for genome generation + example: '31000000000' + - name: --genomeSAindexNbases + type: integer + description: Length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, this parameter must be scaled down to min(14, log2(GenomeLength)/2 - 1). + example: 14 + - name: --genomeChrBinNbits + type: integer + description: Defined as log2(chrBin), where chrBin is the size of the bins for genome storage. Each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). + example: 18 + - name: --genomeSAsparseD + type: integer + min: 0 + example: 1 + description: Suffix array sparsity, i.e. distance between indices.
Use bigger numbers to decrease needed RAM at the cost of mapping speed reduction. + - name: --genomeSuffixLengthMax + type: integer + description: Maximum length of the suffixes, has to be longer than read length. Use -1 for infinite length. + example: -1 + - name: --genomeTransformType + type: string + description: | + Type of genome transformation + None ... no transformation + Haploid ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele) + Diploid ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome) + example: None + - name: --genomeTransformVCF + type: file + description: path to VCF file for genome transformation + +- name: "Output" + arguments: + - name: "--index" + type: file + direction: output + description: STAR index directory. + default: STAR_index + required: true + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: +- type: docker + image: ubuntu:22.04 + setup: + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.11b + - PACKAGES gcc g++ make wget zlib1g-dev unzip xxd + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + - type: docker + run: | + STAR --version | sed 's#\(.*\)#star: "\1"#' > /var/software_versions.txt + +runners: + - type: executable + - type: nextflow diff --git a/src/star/star_genome_generate/help.txt b/src/star/star_genome_generate/help.txt new file mode 100644 index 00000000..940f639d --- /dev/null +++ b/src/star/star_genome_generate/help.txt @@ -0,0 +1,927 @@ +Usage: STAR [options]... --genomeDir /path/to/genome/index/ --readFilesIn R1.fq R2.fq +Spliced Transcripts Alignment to a Reference (c) Alexander Dobin, 2009-2022 + +STAR version=2.7.11b +STAR compilation time,server,dir=2024-02-11T19:36:26+00:00 :/tmp/STAR-2.7.11b/source +For more details see: + + +### versions +versionGenome 2.7.4a + string: earliest genome index version compatible with this STAR release. Please do not change this value! + +### Parameter Files +parametersFiles - + string: name of a user-defined parameters file, "-": none. Can only be defined on the command line. + +### System +sysShell - + string: path to the shell binary, preferably bash, e.g. /bin/bash. + - ... the default shell is executed, typically /bin/sh. This was reported to fail on some Ubuntu systems - then you need to specify path to bash. + +### Run Parameters +runMode alignReads + string: type of the run. + alignReads ... map reads + genomeGenerate ... generate genome files + inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options. + liftOver ... lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles. + soloCellFiltering ... 
STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix + +runThreadN 1 + int: number of threads to run STAR + +runDirPerm User_RWX + string: permissions for the directories created at the run-time. + User_RWX ... user-read/write/execute + All_RWX ... all-read/write/execute (same as chmod 777) + +runRNGseed 777 + int: random number generator seed. + + +### Genome Parameters +genomeDir ./GenomeDir/ + string: path to the directory where genome files are stored (for --runMode alignReads) or will be generated (for --runMode generateGenome) + +genomeLoad NoSharedMemory + string: mode of shared memory usage for the genome files. Only used with --runMode alignReads. + LoadAndKeep ... load genome into shared and keep it in memory after run + LoadAndRemove ... load genome into shared but remove it after run + LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + Remove ... do not map anything, just remove loaded genome from memory + NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + +genomeFastaFiles - + string(s): path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + +genomeChainFiles - + string: chain files for genomic liftover. Only used with --runMode liftOver . + +genomeFileSizes 0 + uint(s)>0: genome files exact sizes in bytes. Typically, this should not be defined by the user. + +genomeTransformOutput None + string(s): which output to transform back to original genome + SAM ... SAM/BAM alignments + SJ ... splice junctions (SJ.out.tab) + Quant ... quantifications (from --quantMode option) + None ... no transformation of the output + +genomeChrSetMitochondrial chrM M MT + string(s): names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/ + +### Genome Indexing Parameters - only used with --runMode genomeGenerate +genomeChrBinNbits 18 + int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). + +genomeSAindexNbases 14 + int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1). + +genomeSAsparseD 1 + int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + +genomeSuffixLengthMax -1 + int: maximum length of the suffixes, has to be longer than read length. -1 = infinite. + +genomeTransformType None + string: type of genome transformation + None ... no transformation + Haploid ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele) + Diploid ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. 
personal genome) + +genomeTransformVCF - + string: path to VCF file for genome transformation + + + +#####UnderDevelopment_begin : not supported - do not use +genomeType Full + string: type of genome to generate + Full ... full (normal) genome + Transcriptome ... genome consists of transcript sequences + SuperTransriptome ... genome consists of superTranscript sequences +#####UnderDevelopment_end + +# DEPRECATED: please use --genomeTransformVCF and --genomeTransformType options instead. +#genomeConsensusFile - +# string: VCF file with consensus SNPs (i.e. alternative allele is the major (AF>0.5) allele) +# DEPRECATED + + + +### Splice Junctions Database +sjdbFileChrStartEnd - + string(s): path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated. + +sjdbGTFfile - + string: path to the GTF file with annotations + +sjdbGTFchrPrefix - + string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes) + +sjdbGTFfeatureExon exon + string: feature type in GTF file to be used as exons for building transcripts + +sjdbGTFtagExonParentTranscript transcript_id + string: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + +sjdbGTFtagExonParentGene gene_id + string: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + +sjdbGTFtagExonParentGeneName gene_name + string(s): GTF attribute name for parent gene name + +sjdbGTFtagExonParentGeneType gene_type gene_biotype + string(s): GTF attribute name for parent gene type + +sjdbOverhang 100 + int>0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + +sjdbScore 2 + int: extra alignment score for alignments that cross database junctions + +sjdbInsertSave Basic + string: which files to save when sjdb junctions are inserted on the fly at the mapping step + Basic ... only small junction / transcript files + All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + +### Variation parameters +varVCFfile - + string: path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1 + +### Input Files +inputBAMfile - + string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + +### Read Parameters +readFilesType Fastx + string: format of input read files + Fastx ... FASTA or FASTQ + SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + +readFilesSAMattrKeep All + string(s): for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + All ... keep all tags + None ... do not keep any tags + +readFilesIn Read1 Read2 + string(s): paths to files that contain input read1 (and, if needed, read2) + +readFilesManifest - + string: path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. 
+ If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + +readFilesPrefix - + string: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn + +readFilesCommand - + string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +readMapNumber -1 + int: number of reads to map from the beginning of the file + -1: map all reads + +readMatesLengthsIn NotEqual + string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +readNameSeparator / + string(s): character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed) + +readQualityScoreBase 33 + int>=0: number to be subtracted from the ASCII code to get Phred quality score + +### Read Clipping + +clipAdapterType Hamming + string: adapter clipping type + Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Šošić: https://github.com/Martinsos/opal + None ... no adapter clipping, all other clip* parameters are disregarded + +clip3pNbases 0 + int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq - + string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + polyA ... polyA sequence with the length equal to read length + +clip3pAdapterMMp 0.1 + double(s): max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAfterAdapterNbases 0 + int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + +clip5pNbases 0 + int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +#####UnderDevelopment_begin : not supported - do not use +clip5pAdapterSeq - + string(s): adapter sequences to clip from 5p of each mate, separated by space. + +clip5pAdapterMMp 0.1 + double(s): max proportion of mismatches for 5p adapter clipping for each mate, separated by space + +clip5pAfterAdapterNbases 0 + int(s): number of bases to clip from 5p of each mate after the adapter clipping, separated by space. +#####UnderDevelopment_end + +### Limits +limitGenomeGenerateRAM 31000000000 + int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 30000000 50000000 + int(s)>0: max available buffers size (bytes) for input/output, per thread + +limitOutSAMoneReadBytes 100000 + int>0: max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax + +limitOutSJoneRead 1000 + int>0: max number of junctions for one read (including all multi-mappers) + +limitOutSJcollapsed 1000000 + int>0: max number of collapsed junctions + +limitBAMsortRAM 0 + int>=0: maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option. 
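As a quick cross-check of the --genomeSAindexNbases scaling rule quoted above, a minimal bash sketch (genome.fasta is a placeholder path, not part of this changeset). For the 200 bp test genome used in test.sh below, this yields 2, which matches the value the test passes:

# Count genome bases (FASTA headers excluded), then apply min(14, log2(L)/2 - 1).
genome_length=$(grep -v '^>' genome.fasta | tr -d '\n' | wc -c)
awk -v len="$genome_length" 'BEGIN {
  v = int(log(len) / log(2) / 2 - 1)    # log2(len)/2 - 1, truncated
  if (v < 14) print "--genomeSAindexNbases " v
  else        print "--genomeSAindexNbases 14"
}'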
+ +limitSjdbInsertNsj 1000000 + int>=0: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run + +limitNreadsSoft -1 + int: soft limit on the number of reads + +### Output: general +outFileNamePrefix ./ + string: output files name prefix (including full or relative path). Can only be defined on the command line. + +outTmpDir - + string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! + - ... the temp directory will default to outFileNamePrefix_STARtmp + +outTmpKeep None + string: whether to keep the temporary files after STAR runs is finished + None ... remove all temporary files + All ... keep all files + +outStd Log + string: which output will be directed to stdout (standard out) + Log ... log messages + SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped None + string: output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + None ... no output + Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + +outQSconversionAdd 0 + int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + +outMultimapperOrder Old_2.4 + string: order of multimapping alignments in the output files + Old_2.4 ... quasi-random order used before 2.5.0 + Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + +### Output: SAM and BAM +outSAMtype SAM + strings: type of SAM/BAM output + 1st word: + BAM ... output BAM without sorting + SAM ... output SAM without sorting + None ... no SAM/BAM output + 2nd, 3rd: + Unsorted ... standard unsorted + SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + +outSAMmode Full + string: mode of SAM output + None ... no SAM output + Full ... full SAM output + NoQS ... full SAM but without quality scores + +outSAMstrandField None + string: Cufflinks-like strand field flag + None ... not used + intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + +outSAMattributes Standard + string(s): a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + ***Presets: + None ... no attributes + Standard ... NH HI AS nM + All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + nM ... number of mismatches. 
For PE reads, sum over two mates. + NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + jI ... start and end of introns for all junctions (1-based). + XS ... alignment strand according to --outSAMstrandField. + MC ... mate's CIGAR string. Standard SAM tag. + ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + vA ... variant allele + vG ... genomic coordinate of the variant overlapped by the read. + vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + ***STARsolo: + CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + GX GN ... gene ID and gene name for unique-gene reads. + gx gn ... gene IDs and gene names for unique- and multi-gene reads. + CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + sM ... assessment of CB and UMI. + sS ... sequence of the entire barcode (CB,UMI,adapter). + sQ ... quality of the entire barcode. + sF ... type of feature overlap and number of features for each alignment + ***Unsupported/undocumented: + rB ... alignment block read/genomic coordinates. + vR ... read coordinate of the variant. + +outSAMattrIHstart 1 + int>=0: start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie. + +outSAMunmapped None + string(s): output of unmapped reads in the SAM format + 1st word: + None ... no output + Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + +outSAMorder Paired + string: type of sorting for the SAM output + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +outSAMprimaryFlag OneBestScore + string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + OneBestScore ... only one alignment with the best score is primary + AllBestScore ... all alignments with the best score are primary + +outSAMreadID Standard + string: read ID record type + Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + Number ... read number (index) in the FASTx file + +outSAMmapqUnique 255 + int: 0 to 255: the MAPQ value for unique mappers + +outSAMflagOR 0 + int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise. 
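To make the order of operations concrete for this mask and the --outSAMflagAND mask described next (AND is applied first, then OR), a one-line bash check with made-up values:

# FLAG' = (FLAG & outSAMflagAND) | outSAMflagOR; e.g. forcing the 0x400 duplicate bit:
flag=99; and_mask=65535; or_mask=1024
echo $(( (flag & and_mask) | or_mask ))   # prints 1123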
+ +outSAMflagAND 65535 + int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise. + +outSAMattrRGline - + string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + +outSAMheaderHD - + strings: @HD (header) line of the SAM header + +outSAMheaderPG - + strings: extra @PG (software) line of the SAM header (in addition to STAR) + +outSAMheaderCommentFile - + string: path to the file with @CO (comment) lines of the SAM header + +outSAMfilter None + string(s): filter the output into main SAM/BAM files + KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + + +outSAMmultNmax -1 + int: max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + -1 ... all alignments (up to --outFilterMultimapNmax) will be output + +outSAMtlen 1 + int: calculation method for the TLEN field in the SAM/BAM files + 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + +outBAMcompression 1 + int: -1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression + +outBAMsortingThreadN 0 + int: >=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN). + +outBAMsortingBinsN 50 + int: >0: number of genome bins for coordinate-sorting + +### BAM processing +bamRemoveDuplicatesType - + string: mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + - ... no duplicate removal/marking + UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + +bamRemoveDuplicatesMate2basesN 0 + int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) + +### Output Wiggle +outWigType None + string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + 1st word: + None ... no signal output + bedGraph ... bedGraph format + wiggle ... wiggle format + 2nd word: + read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + read2 ... signal from only 2nd read + +outWigStrand Stranded + string: strandedness of wiggle/bedGraph output + Stranded ... separate strands, str1 and str2 + Unstranded ... 
collapsed strands + +outWigReferencesPrefix - + string: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references + +outWigNorm RPM + string: type of normalization for the signal + RPM ... reads per million of mapped reads + None ... no normalization, "raw" counts + +### Output Filtering +outFilterType Normal + string: type of filtering + Normal ... standard filtering using only current alignment + BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + +outFilterMultimapScoreRange 1 + int: the score range below the maximum score for multimapping alignments + +outFilterMultimapNmax 10 + int: maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + +outFilterMismatchNmax 10 + int: alignment will be output only if it has no more mismatches than this value. + +outFilterMismatchNoverLmax 0.3 + real: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value. + +outFilterMismatchNoverReadLmax 1.0 + real: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. + + +outFilterScoreMin 0 + int: alignment will be output only if its score is higher than or equal to this value. + +outFilterScoreMinOverLread 0.66 + real: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads) + +outFilterMatchNmin 0 + int: alignment will be output only if the number of matched bases is higher than or equal to this value. + +outFilterMatchNminOverLread 0.66 + real: sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads). + +outFilterIntronMotifs None + string: filter alignment using their motifs + None ... no filtering + RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + +outFilterIntronStrands RemoveInconsistentStrands + string: filter alignments + RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + None ... no filtering + +### Output splice junctions (SJ.out.tab) +outSJtype Standard + string: type of splice junction output + Standard ... standard SJ.out.tab output + None ... no splice junction output + +### Output Filtering: Splice Junctions +outSJfilterReads All + string: which reads to consider for collapsed splice junctions output + All ... all reads, unique- and multi-mappers + Unique ... uniquely mapping reads only + +outSJfilterOverhangMin 30 12 12 12 + 4 integers: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + does not apply to annotated junctions + +outSJfilterCountUniqueMin 3 1 1 1 + 4 integers: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. 
-1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterCountTotalMin 3 1 1 1 + 4 integers: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 10 0 5 10 + 4 integers>=0: minimum allowed distance to other junctions' donor/acceptor + does not apply to annotated junctions + +outSJfilterIntronMaxVsReadN 50000 100000 200000 + N integers>=0: maximum gap allowed for junctions supported by 1,2,3,,,N reads + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + +### Scoring +scoreGap 0 + int: splice junction penalty (independent on intron motif) + +scoreGapNoncan -8 + int: non-canonical junction penalty (in addition to scoreGap) + +scoreGapGCAG -4 + int: GC/AG and CT/GC junction penalty (in addition to scoreGap) + +scoreGapATAC -8 + int: AT/AC and GT/AT junction penalty (in addition to scoreGap) + +scoreGenomicLengthLog2scale -0.25 + int: extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength) + +scoreDelOpen -2 + int: deletion open penalty + +scoreDelBase -2 + int: deletion extension penalty per base (in addition to scoreDelOpen) + +scoreInsOpen -2 + int: insertion open penalty + +scoreInsBase -2 + int: insertion extension penalty per base (in addition to scoreInsOpen) + +scoreStitchSJshift 1 + int: maximum score reduction while searching for SJ boundaries in the stitching step + + +### Alignments and Seeding + +seedSearchStartLmax 50 + int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +seedSearchStartLmaxOverLread 1.0 + real: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +seedSearchLmax 0 + int>=0: defines the maximum length of the seeds, if =0 seed length is not limited + +seedMultimapNmax 10000 + int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +seedPerReadNmax 1000 + int>0: max number of seeds per read + +seedPerWindowNmax 50 + int>0: max number of seeds per window + +seedNoneLociPerWindow 10 + int>0: max number of one seed loci per window + +seedSplitMin 12 + int>0: min length of the seed sequences split by Ns or mate gap + +seedMapMin 5 + int>0: min length of seeds to be mapped + +alignIntronMin 21 + int: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion + +alignIntronMax 0 + int: maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignMatesGapMax 0 + int: maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignSJoverhangMin 5 + int>0: minimum overhang (i.e. block size) for spliced alignments + +alignSJstitchMismatchNmax 0 -1 0 0 + 4*int>=0: maximum number of mismatches for stitching of the splice junctions (-1: no limit). 
+ (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + +alignSJDBoverhangMin 3 + int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + +alignSplicedMateMapLmin 0 + int>0: minimum mapped length for a read mate that is spliced + +alignSplicedMateMapLminOverLmate 0.66 + real>0: alignSplicedMateMapLmin normalized to mate length + +alignWindowsPerReadNmax 10000 + int>0: max number of windows per read + +alignTranscriptsPerWindowNmax 100 + int>0: max number of transcripts per window + +alignTranscriptsPerReadNmax 10000 + int>0: max number of different alignments per read to consider + +alignEndsType Local + string: type of read ends alignment + Local ... standard local alignment with soft-clipping allowed + EndToEnd ... force end-to-end read alignment, do not soft-clip + Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + +alignEndsProtrude 0 ConcordantPair + int, string: allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + +alignSoftClipAtReferenceEnds Yes + string: allow the soft-clipping of the alignments past the end of the chromosomes + Yes ... allow + No ... prohibit, useful for compatibility with Cufflinks + +alignInsertionFlush None + string: how to flush ambiguous insertion positions + None ... insertions are not flushed + Right ... insertions are flushed to the right + +### Paired-End reads +peOverlapNbasesMin 0 + int>=0: minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the "merginf of overlapping mates" algorithm. + +peOverlapMMp 0.01 + real, >=0 & <1: maximum proportion of mismatched bases in the overlap area + +### Windows, Anchors, Binning + +winAnchorMultimapNmax 50 + int>0: max number of loci anchors are allowed to map to + +winBinNbits 16 + int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins. + +winAnchorDistNbins 9 + int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +winFlankNbins 4 + int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window + +winReadCoverageRelativeMin 0.5 + real>=0: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only. + +winReadCoverageBasesMin 0 + int>0: minimum number of bases covered by the seeds in a window , for STARlong algorithm only. + +### Chimeric Alignments +chimOutType Junctions + string(s): type of chimeric output + Junctions ... Chimeric.out.junction + SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + WithinBAM SoftClip ... 
soft-clipping in the CIGAR for supplemental chimeric alignments + +chimSegmentMin 0 + int>=0: minimum length of chimeric segment length, if ==0, no chimeric output + +chimScoreMin 0 + int>=0: minimum total (summed) score of the chimeric segments + +chimScoreDropMax 20 + int>=0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length + +chimScoreSeparation 10 + int>=0: minimum difference (separation) between the best chimeric score and the next one + +chimScoreJunctionNonGTAG -1 + int: penalty for a non-GT/AG chimeric junction + +chimJunctionOverhangMin 20 + int>=0: minimum overhang for a chimeric junction + +chimSegmentReadGapMax 0 + int>=0: maximum gap in the read sequence between chimeric segments + +chimFilter banGenomicN + string(s): different filters for chimeric alignments + None ... no filtering + banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + +chimMainSegmentMultNmax 10 + int>=1: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments. + +chimMultimapNmax 0 + int>=0: maximum number of chimeric multi-alignments + 0 ... use the old scheme for chimeric detection which only considered unique alignments + +chimMultimapScoreRange 1 + int>=0: the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1 + +chimNonchimScoreDropMin 20 + int>=0: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value + +chimOutJunctionFormat 0 + int: formatting type for the Chimeric.out.junction file + 0 ... no comment lines/headers + 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + +### Quantification of Annotations +quantMode - + string(s): types of quantification requested + - ... none + TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + GeneCounts ... count reads per gene + +quantTranscriptomeBAMcompression 1 + int: -2 to 10 transcriptome BAM compression level + -2 ... no BAM output + -1 ... default compression (6?) + 0 ... no compression + 10 ... maximum compression + +quantTranscriptomeSAMoutput BanSingleEnd_BanIndels_ExtendSoftclip + string: alignment filtering for TranscriptomeSAM output + BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM + BanSingleEnd ... prohibit single-end alignments, allow indels and softclips + BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels + + +### 2-pass Mapping +twopassMode None + string: 2-pass mapping mode. + None ... 1-pass mapping + Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + +twopass1readsN -1 + int: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step. + + +### WASP parameters +waspOutputMode None + string: WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061–1063 (2015), https://www.nature.com/articles/nmeth.3582 . + SAMtag ... 
add WASP tags to the alignments that pass WASP filtering + +### STARsolo (single cell RNA-seq) parameters +soloType None + string(s): type of single-cell RNA-seq + CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate] + SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + +soloCBtype Sequence + string: cell barcode type + Sequence: cell barcode is a sequence (standard option) + String: cell barcode is an arbitrary string + +soloCBwhitelist - + string(s): file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file. + None ... no whitelist: all cell barcodes are allowed + +soloCBstart 1 + int>0: cell barcode start base + +soloCBlen 16 + int>0: cell barcode length + +soloUMIstart 17 + int>0: UMI start base + +soloUMIlen 10 + int>0: UMI length + +soloBarcodeReadLength 1 + int: length of the barcode read + 1 ... equal to sum of soloCBlen+soloUMIlen + 0 ... not defined, do not check + +soloBarcodeMate 0 + int: identifies which read mate contains the barcode (CB+UMI) sequence + 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + 1 ... barcode sequence is a part of mate 1 + 2 ... barcode sequence is a part of mate 2 + +soloCBposition - + strings(s): position of Cell Barcode(s) on the barcode read. + Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 0_0_2_-1 3_1_3_8 + +soloUMIposition - + string: position of the UMI on the barcode read, same as soloCBposition + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 3_9_3_14 + +soloAdapterSequence - + string: adapter sequence to anchor barcodes. Only one adapter sequence is allowed. + +soloAdapterMismatchesNmax 1 + int>0: maximum number of mismatches allowed in adapter sequence. + +soloCBmatchWLtype 1MM_multi + string: matching the Cell Barcodes to the WhiteList + Exact ... only exact matches allowed + 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes. + 1MM_multi_Nbase_pseudocounts ... 
same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. + +soloInputSAMattrBarcodeSeq - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR . + This parameter is required when running STARsolo with input from SAM. + +soloInputSAMattrBarcodeQual - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + +soloStrand Forward + string: strandedness of the solo libraries: + Unstranded ... no strand information + Forward ... read strand same as the original RNA molecule + Reverse ... read strand opposite to the original RNA molecule + +soloFeatures Gene + string(s): genomic features for which the UMI counts per Cell Barcode are collected + Gene ... genes: reads match the gene transcript + SJ ... splice junctions: reported in SJ.out.tab + GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + +#####UnderDevelopment_begin : not supported - do not use + Transcript3p ... quantification of transcript for 3' protocols +#####UnderDevelopment_end + +soloMultiMappers Unique + string(s): counting method for reads mapping to multiple genes + Unique ... count only reads that map to unique genes + Uniform ... uniformly distribute multi-genic UMIs to all genes + Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + +soloUMIdedup 1MM_All + string(s): type of UMI deduplication (collapsing) algorithm + 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + Exact ... only exactly matching UMIs are collapsed. + NoDedup ... no deduplication of UMIs, count all reads. + 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + +soloUMIfiltering - + string(s): type of UMI filtering (for reads uniquely mapping to genes) + - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. 
+ MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 .
+ Only works with --soloUMIdedup 1MM_CR
+
+soloOutFileNames Solo.out/ features.tsv barcodes.tsv matrix.mtx
+ string(s): file names for STARsolo output:
+ file_name_prefix gene_names barcode_sequences cell_feature_count_matrix
+
+soloCellFilter CellRanger2.2 3000 0.99 10
+ string(s): cell filtering type and parameters
+ None ... do not output filtered cells
+ TopCells ... only report top cells by UMI count, followed by the exact number of cells
+ CellRanger2.2 ... simple filtering of CellRanger 2.2.
+ Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count
+ The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10
+ EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y
+ Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN
+ The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000
+
+soloOutFormatFeaturesGeneField3 "Gene Expression"
+ string(s): field 3 in the Gene features.tsv file. If "-", then no 3rd field is output.
+
+soloCellReadStats None
+ string: Output reads statistics for each CB
+ Standard ... standard output
+
+#####UnderDevelopment_begin : not supported - do not use
+soloClusterCBfile -
+ string: file containing the cluster information for cell barcodes, two columns: CB cluster_index. Only used with --soloFeatures Transcript3p
+#####UnderDevelopment_end
diff --git a/src/star/star_genome_generate/script.sh b/src/star/star_genome_generate/script.sh
new file mode 100644
index 00000000..cb3b906c
--- /dev/null
+++ b/src/star/star_genome_generate/script.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+set -e
+
+## VIASH START
+## VIASH END
+
+mkdir -p "$par_index"
+
+STAR \
+  --runMode genomeGenerate \
+  --genomeDir "$par_index" \
+  --genomeFastaFiles $par_genomeFastaFiles \
+  ${meta_cpus:+--runThreadN "${meta_cpus}"} \
+  ${par_sjdbGTFfile:+--sjdbGTFfile "${par_sjdbGTFfile}"} \
+  ${par_sjdbOverhang:+--sjdbOverhang "${par_sjdbOverhang}"} \
+  ${par_genomeSAindexNbases:+--genomeSAindexNbases "${par_genomeSAindexNbases}"} \
+  ${par_sjdbGTFchrPrefix:+--sjdbGTFchrPrefix "${par_sjdbGTFchrPrefix}"} \
+  ${par_sjdbGTFfeatureExon:+--sjdbGTFfeatureExon "${par_sjdbGTFfeatureExon}"} \
+  ${par_sjdbGTFtagExonParentTranscript:+--sjdbGTFtagExonParentTranscript "${par_sjdbGTFtagExonParentTranscript}"} \
+  ${par_sjdbGTFtagExonParentGene:+--sjdbGTFtagExonParentGene "${par_sjdbGTFtagExonParentGene}"} \
+  ${par_sjdbGTFtagExonParentGeneName:+--sjdbGTFtagExonParentGeneName "${par_sjdbGTFtagExonParentGeneName}"} \
+  ${par_sjdbGTFtagExonParentGeneType:+--sjdbGTFtagExonParentGeneType "${par_sjdbGTFtagExonParentGeneType}"} \
+  ${par_limitGenomeGenerateRAM:+--limitGenomeGenerateRAM "${par_limitGenomeGenerateRAM}"} \
+  ${par_genomeChrBinNbits:+--genomeChrBinNbits "${par_genomeChrBinNbits}"} \
+  ${par_genomeSAsparseD:+--genomeSAsparseD "${par_genomeSAsparseD}"} \
+  ${par_genomeSuffixLengthMax:+--genomeSuffixLengthMax "${par_genomeSuffixLengthMax}"} \
+  ${par_genomeTransformType:+--genomeTransformType "${par_genomeTransformType}"} \
+  ${par_genomeTransformVCF:+--genomeTransformVCF "${par_genomeTransformVCF}"}
diff --git a/src/star/star_genome_generate/test.sh b/src/star/star_genome_generate/test.sh
new file mode 100644
index 00000000..fd0e4775
--- /dev/null
+++ b/src/star/star_genome_generate/test.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+set -e
+
+## VIASH START
+## VIASH END
+
+#########################################################################################
+
+echo "> Prepare test data"
+
+cat > genome.fasta <<'EOF'
+>chr1
+TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG
+GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA
+TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA
+EOF
+
+cat > genes.gtf <<'EOF'
+chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1";
+chr1 example_source exon 20 40 . + . gene_id "gene1"; transcript_id "transcript1";
+EOF
+
+#########################################################################################
+
+echo "> Generate index"
+"$meta_executable" \
+  ${meta_cpus:+---cpus $meta_cpus} \
+  --index "star_index/" \
+  --genomeFastaFiles "genome.fasta" \
+  --sjdbGTFfile "genes.gtf" \
+  --genomeSAindexNbases 2
+
+files=("Genome" "Log.out" "SA" "SAindex" "chrLength.txt" "chrName.txt" "chrNameLength.txt" "chrStart.txt" "exonGeTrInfo.tab" "exonInfo.tab" "geneInfo.tab" "genomeParameters.txt" "sjdbInfo.txt" "sjdbList.fromGTF.out.tab" "sjdbList.out.tab" "transcriptInfo.tab")
+
+echo ">> Check if output exists"
+[ ! -d "star_index" ] && echo "Directory 'star_index' does not exist!" && exit 1
+for file in "${files[@]}"; do
+  [ ! -f "star_index/$file" ] && echo "File '$file' does not exist in 'star_index'." && exit 1
+done
+
+echo ">> Check contents of output files"
+grep -q "200" "star_index/chrLength.txt" || (echo "Chromosome length in file 'chrLength.txt' is incorrect! " && exit 1)
+grep -q "chr1" "star_index/chrName.txt" || (echo "Chromosome name in file 'chrName.txt' is incorrect! " && exit 1)
+grep -q "chr1 200" "star_index/chrNameLength.txt" || (echo "Chromosome name in file 'chrNameLength.txt' is incorrect! " && exit 1)
+
+echo ">>> Test finished successfully"
+exit 0
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
new file mode 100644
index 00000000..a02e70a1
--- /dev/null
+++ b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -0,0 +1,303 @@
+name: umi_tools_dedup
+namespace: umi_tools
+description: |
+  Deduplicate reads based on the mapping co-ordinate and the UMI attached to the read.
+keywords: [umi_tools, deduplication, dedup]
+links:
+  homepage: https://umi-tools.readthedocs.io/en/latest/
+  documentation: https://umi-tools.readthedocs.io/en/latest/reference/dedup.html
+  repository: https://github.com/CGATOxford/UMI-tools
+references:
+  doi: 10.1101/gr.209601.116
+license: MIT
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input
+        alternatives: --stdin
+        type: file
+        description: Input BAM or SAM file. Use --in_sam to specify SAM format.
+        required: true
+      - name: --in_sam
+        type: boolean_true
+        description: |
+          By default, inputs are assumed to be in BAM format. Use this option to specify the use of SAM
+          format for input.
+      - name: --bai
+        type: file
+        description: BAM index.
+      - name: --random_seed
+        type: integer
+        description: Random seed to initialize number generator with.
+
+  - name: Outputs
+    arguments:
+      - name: --output
+        alternatives: --stdout
+        type: file
+        description: Deduplicated BAM file.
+        required: true
+        direction: output
+      - name: --out_sam
+        type: boolean_true
+        description: |
+          By default, output is written in BAM format. Use this option to specify the use of SAM format
+          for output.
+      - name: --paired
+        type: boolean_true
+        description: |
+          BAM is paired end - output both read pairs. This will also force the use of the template length
+          to determine reads with the same mapping coordinates.
+      - name: --output_stats
+        type: string
+        description: |
+          Generate files containing UMI-based deduplication statistics, using this prefix for the file names.
+      - name: --extract_umi_method
+        type: string
+        choices: [read_id, tag, umis]
+        description: |
+          Specify the method by which the barcodes were encoded in the read.
+          The options are:
+          * read_id (default)
+          * tag
+          * umis
+        example: "read_id"
+      - name: --umi_tag
+        type: string
+        description: |
+          The tag containing the UMI sequence. This is only required if extract_umi_method is set to tag.
+      - name: --umi_separator
+        type: string
+        description: |
+          The separator used to separate the UMI from the read sequence. This is only required if
+          extract_umi_method is set to read_id. Default: `_`.
+        example: '_'
+      - name: --umi_tag_split
+        type: string
+        description: Split the UMI tag on this string and take the first element.
+      - name: --umi_tag_delimiter
+        type: string
+        description: Split the UMI tag on this string and concatenate the elements.
+      - name: --cell_tag
+        type: string
+        description: |
+          The tag containing the cell barcode sequence. This is only required if extract_umi_method
+          is set to tag.
+      - name: --cell_tag_split
+        type: string
+        description: Split the cell barcode tag on this string and take the first element.
+      - name: --cell_tag_delimiter
+        type: string
+        description: Split the cell barcode tag on this string and concatenate the elements.
+
+  - name: Grouping Options
+    arguments:
+      - name: --method
+        type: string
+        choices: [unique, percentile, cluster, adjacency, directional]
+        description: |
+          The method to use for grouping reads.
+          The options are:
+          * unique
+          * percentile
+          * cluster
+          * adjacency
+          * directional (default)
+        example: "directional"
+      - name: --edit_distance_threshold
+        type: integer
+        description: |
+          For the adjacency and cluster methods, the threshold for the edit distance to connect two
+          UMIs in the network can be increased. The default value of 1 works best unless the UMI is
+          very long (>14bp). Default: `1`.
+        example: 1
+      - name: --spliced_is_unique
+        type: boolean_true
+        description: |
+          Causes two reads that start in the same position on the same strand and have the same UMI
+          to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation
+          to test for splicing.)
+      - name: --soft_clip_threshold
+        type: integer
+        description: |
+          Mappers that soft clip will sometimes do so rather than mapping a spliced read if there is only
+          a small overhang over the exon junction. By setting this option, you can treat reads with at
+          least this many bases soft-clipped at the 3' end as spliced. Default: `4`.
+        example: 4
+      - name: --multimapping_detection_method
+        type: string
+        description: |
+          If the sam/bam contains tags to identify multimapping reads, you can specify the tag to use when
+          selecting the best read at a given locus. Supported tags are `NH`, `X0` and `XT`. If not specified,
+          the read with the highest mapping quality will be selected.
+      - name: --read_length
+        type: boolean_true
+        description: Use the read length as a criterion when deduping, e.g. for sRNA-Seq.
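Taken together, the options above cover the common paired-end case; a hypothetical invocation of the built component (the executable name and file paths are placeholders, not from this changeset):

./umi_tools_dedup \
  --input sample.sorted.bam \
  --bai sample.sorted.bam.bai \
  --paired \
  --method directional \
  --output_stats dedup_stats \
  --output sample.dedup.bam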
+
+  - name: Single-cell RNA-Seq Options
+    arguments:
+      - name: --per_gene
+        type: boolean_true
+        description: |
+          Reads will be grouped together if they have the same gene. This is useful if your library prep
+          generates PCR duplicates with non-identical alignment positions such as CEL-Seq. Note this option
+          is hardcoded to be on with the count command, i.e. counting is always performed per-gene. Must be
+          combined with either the --gene_tag or --per_contig option.
+      - name: --gene_tag
+        type: string
+        description: |
+          Deduplicate per gene. The gene information is encoded in the bam read tag specified.
+      - name: --assigned_status_tag
+        type: string
+        description: |
+          BAM tag which describes whether a read is assigned to a gene. Defaults to the same value as given
+          for --gene_tag.
+      - name: --skip_tags_regex
+        type: string
+        description: |
+          Use in conjunction with the --assigned_status_tag option to skip any reads where the tag matches
+          this regex. Default ("^[__|Unassigned]") matches anything which starts with "__" or "Unassigned".
+      - name: --per_contig
+        type: boolean_true
+        description: |
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same contig will be considered to
+          have the same alignment position. This is useful if you have aligned to a reference transcriptome
+          with one transcript per gene. If you have aligned to a transcriptome with more than one transcript
+          per gene, you can supply a map between transcripts and genes using the --gene_transcript_map option.
+      - name: --gene_transcript_map
+        type: file
+        description: |
+          A file containing a mapping between gene names and transcript names. The file should be tab
+          separated with the gene name in the first column and the transcript name in the second column.
+      - name: --per_cell
+        type: boolean_true
+        description: |
+          Reads will only be grouped together if they have the same cell barcode. Can be combined with
+          --per_gene.
+
+  - name: SAM/BAM Options
+    arguments:
+      - name: --mapping_quality
+        type: integer
+        description: |
+          Minimum mapping quality (MAPQ) for a read to be retained. Default: `0`.
+        example: 0
+      - name: --unmapped_reads
+        type: string
+        choices: [discard, use, output]
+        description: |
+          How unmapped reads should be handled.
+          The options are:
+          * "discard": Discard all unmapped reads. (default)
+          * "use": If read2 is unmapped, deduplicate using read1 only. Requires --paired.
+          * "output": Output unmapped reads/read pairs without UMI grouping/deduplication. Only available in umi_tools group.
+        example: "discard"
+      - name: --chimeric_pairs
+        type: string
+        choices: [discard, use, output]
+        description: |
+          How chimeric pairs should be handled.
+          The options are:
+          * "discard": Discard all chimeric read pairs.
+          * "use": Deduplicate using read1 only. (default)
+          * "output": Output chimeric pairs without UMI grouping/deduplication. Only available in
+            umi_tools group.
+        example: "use"
+      - name: --unpaired_reads
+        type: string
+        choices: [discard, use, output]
+        description: |
+          How unpaired reads should be handled.
+          The options are:
+          * "discard": Discard all unpaired reads.
+          * "use": Deduplicate using read1 only. Requires --paired. (default)
+          * "output": Output unpaired reads without UMI grouping/deduplication. Only available
+            in umi_tools group.
+        example: "use"
+      - name: --ignore_umi
+        type: boolean_true
+        description: Ignore the UMI and group reads using mapping coordinates only.
+      - name: --subset
+        type: double
+        description: |
+          Only consider a fraction of the reads, chosen at random.
This is useful for doing saturation + analyses. + - name: --chrom + type: string + description: Only consider a single chromosome. This is useful for debugging/testing purposes. + + - name: Group/Dedup Options + arguments: + - name: --no_sort_output + type: boolean_true + description: | + By default, output is sorted. This involves the use of a temporary unsorted file (saved in + --temp_dir). Use this option to turn off sorting. + - name: --buffer_whole_contig + type: boolean_true + description: | + Forces dedup to parse an entire contig before yielding any reads for deduplication. This is the + only way to absolutely guarantee that all reads with the same start position are grouped together + for deduplication since dedup uses the start position of the read, not the alignment coordinate on + which the reads are sorted. However, by default, dedup reads for another 1000bp before outputting + read groups which will avoid any reads being missed with short read sequencing (<1000bp). + + - name: Common Options + arguments: + - name: --log + alternatives: -L + type: file + description: File with logging information. + - name: --log2stderr + type: boolean_true + description: Send logging information to stderr. + - name: --verbose + alternatives: -v + type: integer + description: | + Log level. The higher, the more output. Default: `0`. + example: 0 + - name: --error + alternatives: -E + type: file + description: File with error information. + - name: --temp_dir + type: string + description: | + Directory for temporary files. If not set, the bash environmental variable TMPDIR is used. + - name: --compresslevel + type: integer + description: | + Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default. + Default: `6`. + example: 6 + - name: --timeit + type: file + description: Store timing information in file. + - name: --timeit_name + type: string + description: | + Name in timing file for this class of jobs. Default: `all`. + example: "all" + - name: --timeit_header + type: string + description: Add header for timing information. + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: + - type: docker + image: quay.io/biocontainers/umi_tools:1.1.5--py39hf95cd2a_1 + setup: + - type: docker + run: | + umi_tools -v | sed 's/ version//g' > /var/software_versions.txt +runners: +- type: executable +- type: nextflow \ No newline at end of file diff --git a/src/umi_tools/umi_tools_dedup/help.txt b/src/umi_tools/umi_tools_dedup/help.txt new file mode 100644 index 00000000..87baf322 --- /dev/null +++ b/src/umi_tools/umi_tools_dedup/help.txt @@ -0,0 +1,113 @@ +''' +Generated from the following UMI-tools documentation: + https://umi-tools.readthedocs.io/en/latest/common_options.html#common-options + https://umi-tools.readthedocs.io/en/latest/reference/dedup.html +''' + + +dedup - Deduplicate reads using UMI and mapping coordinates + +Usage: umi_tools dedup [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM] + + note: If --stdout is ommited, standard out is output. To + generate a valid BAM file on standard out, please + redirect log with --log=LOGFILE or --log2stderr + +Common UMI-tools Options: + + -S, --stdout File where output is to go [default = stdout]. + -L, --log File with logging information [default = stdout]. + --log2stderr Send logging information to stderr [default = False]. + -v, --verbose Log level. The higher, the more output [default = 1]. 
+ -E, --error File with error information [default = stderr]. + --temp-dir Directory for temporary files. If not set, the bash environmental variable TMPDIR is used[default = None]. + --compresslevel Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default (which is 9) + + profiling and debugging options: + --timeit Store timing information in file [default=none]. + --timeit-name Name in timing file for this class of jobs [default=all]. + --timeit-header Add header for timing information [default=none]. + --random-seed Random seed to initialize number generator with [default=none]. + +Dedup Options: + --output-stats= One can use the edit distance between UMIs at the same position as an quality control for the + deduplication process by comparing with a null expectation of random sampling. For the random + sampling, the observed frequency of UMIs is used to more reasonably model the null expectation. + Use this option to generate a stats outfiles called: + [PREFIX]_stats_edit_distance.tsv + Reports the (binned) average edit distance between the UMIs at each position. + In addition, this option will trigger reporting of further summary statistics for the UMIs which + may be informative for selecting the optimal deduplication method or debugging. + Each unique UMI sequence may be observed [0-many] times at multiple positions in the BAM. The + following files report the distribution for the frequencies of each UMI. + [PREFIX]_stats_per_umi_per_position.tsv + Tabulates the counts for unique combinations of UMI and position. + [PREFIX]_stats_per_umi_per.tsv + The _stats_per_umi_per.tsv table provides UMI-level summary statistics. + --extract-umi-method= How are the barcodes encoded in the read? + Options are: read_id (default), tag, umis + --umi-separator= Separator between read id and UMI. See --extract-umi-method above. Default=_ + --umi-tag= Tag which contains UMI. See --extract-umi-method above + --umi-tag-split= Separate the UMI in tag by SPLIT and take the first element + --umi-tag-delimiter= Separate the UMI in by DELIMITER and concatenate the elements + --cell-tag= Tag which contains cell barcode. See --extract-umi-method above + --cell-tag-split= Separate the cell barcode in tag by SPLIT and take the first element + --cell-tag-delimiter= Separate the cell barcode in by DELIMITER and concatenate the elements + --method= What method to use to identify group of reads with the same (or similar) UMI(s)? + All methods start by identifying the reads with the same mapping position. + The simplest methods, unique and percentile, group reads with the exact same UMI. + The network-based methods, cluster, adjacency and directional, build networks where + nodes are UMIs and edges connect UMIs with an edit distance <= threshold (usually 1). + The groups of reads are then defined from the network in a method-specific manner. + For all the network-based methods, each read group is equivalent to one read count for the gene. + --edit-distance-threshold= For the adjacency and cluster methods the threshold for the edit distance to connect + two UMIs in the network can be increased. The default value of 1 works best unless + the UMI is very long (>14bp). + --spliced-is-unique Causes two reads that start in the same position on the same strand and having the + same UMI to be considered unique if one is spliced and the other is not. + (Uses the 'N' cigar operation to test for splicing). 
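For fixed-length UMIs, the edit distance that the network-based methods threshold on reduces to a Hamming distance; a small bash sketch of the quantity being compared (illustrative only, not part of this changeset):

# Edges in the cluster/adjacency/directional networks connect UMIs with distance <= threshold.
hamming() {
  local a=$1 b=$2 d=0 i
  for ((i = 0; i < ${#a}; i++)); do
    if [[ "${a:i:1}" != "${b:i:1}" ]]; then d=$((d + 1)); fi
  done
  echo "$d"
}
hamming ATGCATGCAT ATGCATGGAT   # -> 1, so these two UMIs would be connected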
+ --soft-clip-threshold= Mappers that soft clip will sometimes do so rather than mapping a spliced read if + there is only a small overhang over the exon junction. By setting this option, you + can treat reads with at least this many bases soft-clipped at the 3' end as spliced. + Default=4. + --multimapping-detection-method= If the sam/bam contains tags to identify multimapping reads, you can specify + for use when selecting the best read at a given loci. Supported tags are "NH", + "X0" and "XT". If not specified, the read with the highest mapping quality will be selected. + --read-length Use the read length as a criteria when deduping, for e.g sRNA-Seq. + --per-gene Reads will be grouped together if they have the same gene. This is useful if your + library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. + Note this option is hardcoded to be on with the count command. I.e counting is always + performed per-gene. Must be combined with either --gene-tag or --per-contig option. + --gene-tag= Deduplicate per gene. The gene information is encoded in the bam read tag specified + --assigned-status-tag= BAM tag which describes whether a read is assigned to a gene. Defaults to the same value + as given for --gene-tag + --skip-tags-regex= Use in conjunction with the --assigned-status-tag option to skip any reads where the + tag matches this regex. Default ("^[__|Unassigned]") matches anything which starts with "__" + or "Unassigned": + --per-contig Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same contig will be + considered to have the same alignment position. This is useful if you have aligned to a + reference transcriptome with one transcript per gene. If you have aligned to a transcriptome + with more than one transcript per gene, you can supply a map between transcripts and gene + using the --gene-transcript-map option + --gene-transcript-map= File mapping genes to transcripts (tab separated) + --per-cell Reads will only be grouped together if they have the same cell barcode. Can be combined with --per-gene. + --mapping-quality= Minimium mapping quality (MAPQ) for a read to be retained. Default is 0. + --unmapped-reads=
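A minimal sketch of the per-contig transcriptome workflow described above (file names are placeholders; the log is redirected to a file so a valid BAM can stream to --stdout, as the usage note at the top of this help text recommends):

umi_tools dedup \
  --stdin=aligned_to_transcriptome.bam \
  --per-contig \
  --gene-transcript-map=transcript2gene.tsv \
  --log=dedup.log \
  --stdout=deduplicated.bam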