diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..90963715 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a094248b..6e1fc4b3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -48,34 +48,45 @@ jobs: run: | LANG=C viash ns list > /dev/null - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v42 - with: - separator: ";" - diff_relative: true - - - id: ns_list - uses: viash-io/viash-actions/ns-list@v5 - with: - platform: docker - format: json - query: ^(?!workflows) + # see https://github.com/viash-io/viash/issues/654 + # and https://github.com/viash-io/viash-actions/pull/27 + # - name: Get changed files + # id: changed-files + # uses: tj-actions/changed-files@v42 + # with: + # separator: ";" + # diff_relative: true + # - id: ns_list + # uses: viash-io/viash-actions/ns-list@v5 + # with: + # platform: docker + # format: json + # query: ^(?!workflows) + # - id: ns_list_filtered + # uses: viash-io/viash-actions/project/detect-changed-components@v5 + # with: + # input_file: "${{ steps.ns_list.outputs.output_file }}" + # - id: set_matrix + # run: | + # echo "matrix=$(jq -c '[ .[] | + # { + # "name": (.functionality.namespace + "/" + .functionality.name), + # "config": .info.config, + # "dir": .info.config | capture("^(?.*\/)").dir + # } + # ]' ${{ contains(steps.get_head_commit_message.outputs.HEAD_COMMIT_MESSAGE, 'ci force') && steps.ns_list.outputs.output_file || steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT - - id: ns_list_filtered - uses: viash-io/viash-actions/project/detect-changed-components@v5 - with: - input_file: "${{ steps.ns_list.outputs.output_file }}" - id: set_matrix run: | + viash ns list --format json > 
ns_list.json echo "matrix=$(jq -c '[ .[] | { - "name": (.functionality.namespace + "/" + .functionality.name), - "config": .info.config, - "dir": .info.config | capture("^(?<dir>.*\/)").dir + "name": (.namespace + "/" + .name), + "config": .build_info.config, + "dir": .build_info.config | capture("^(?<dir>.*\/)").dir } - ]' ${{ contains(steps.get_head_commit_message.outputs.HEAD_COMMIT_MESSAGE, 'ci force') && steps.ns_list.outputs.output_file || steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT + ]' ns_list.json )" >> $GITHUB_OUTPUT # phase 2 viash_test: diff --git a/.vscode/viash_config.yaml b/.vscode/viash_config.yaml index b7a6aabd..0e38195f 100644 --- a/.vscode/viash_config.yaml +++ b/.vscode/viash_config.yaml @@ -7,29 +7,173 @@ definitions: \ you choose. \n" type: "object" properties: + organization: + description: "The organization of the package." + type: "string" + license: + description: "The license of the package." + type: "string" + authors: + description: "A list of authors. An author must at least have a name, but\ + \ can also have a list of roles, an e-mail address, and a map of custom\ + \ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\ + \ Description |\n|------|---------|-------------|\n| maintainer | mnt |\ + \ for the maintainer of the code. Ideally, exactly one maintainer is specified.\ + \ |\n| author | aut | for persons who have made substantial contributions\ + \ to the software. |\n| contributor | ctb| for persons who have made smaller\ + \ contributions (such as code patches).\n| datacontributor | dtc | for persons\ + \ or organisations that contributed data sets for the software\n| copyrightholder\ + \ | cph | for all copyright holders. 
This is a legal concept so should use\ + \ the legal name of an institution or corporate body.\n| funder | fnd |\ + \ for persons or organizations that furnished financial support for the\ + \ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ + \ is extremely comprehensive.\n" + type: "array" + items: + $ref: "#/definitions/Author" + status: + description: "Allows setting a component to active, deprecated or disabled." + $ref: "#/definitions/Status" + requirements: + description: "Computational requirements related to running the component.\ + \ \n`cpus` specifies the maximum number of (logical) cpus a component is\ + \ allowed to use, whereas\n`memory` specifies the maximum amount of memory\ + \ a component is allowed to allocate. Memory units must be\nin B, KB, MB,\ + \ GB, TB or PB." + $ref: "#/definitions/ComputationalRequirements" + repositories: + description: "(Pre-)defines repositories that can be used as repository in\ + \ dependencies.\nAllows reusing repository definitions in case it is used\ + \ in multiple dependencies." + type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + dependencies: + description: "Allows listing Viash components required by this Viash component" + type: "array" + items: + $ref: "#/definitions/Dependency" + namespace: + description: "Namespace this component is a part of. See the Namespaces guide\ + \ for more information on namespaces." + type: "string" functionality: description: "The functionality describes the behaviour of the script in terms\ \ of arguments and resources.\nBy specifying a few restrictions (e.g. 
mandatory\ \ arguments) and adding some descriptions, Viash will automatically generate\ \ a stylish command-line interface for you.\n" $ref: "#/definitions/Functionality" + runners: + description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ + \ - NextflowRunner\n" + type: "array" + items: + $ref: "#/definitions/Runner" + name: + description: "Name of the component and the filename of the executable when\ + \ built with `viash build`." + type: "string" + build_info: + $ref: "#/definitions/BuildInfo" + argument_groups: + description: "A grouping of the arguments, used to display the help message.\n\ + \n - `name: foo`, the name of the argument group. \n - `description: Description\ + \ of foo`, a description of the argument group. Multiline descriptions are\ + \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ + \n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" + description: + description: "A description of the component. This will be displayed with\ + \ `--help`." + type: "string" + usage: + description: "A description on how to use the component. This will be displayed\ + \ with `--help` under the 'Usage:' section." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + package_config: + description: "The package config content used during build." + $ref: "#/definitions/PackageConfig" platforms: description: "A list of platforms to generate target artifacts for.\n\n -\ \ Native\n - Docker\n - Nextflow\n" type: "array" items: $ref: "#/definitions/Platform" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + links: + description: "External links of the component." + $ref: "#/definitions/Links" + references: + description: "References to external resources related to the component." 
+ $ref: "#/definitions/References" + engines: + description: "A list of engine environments to execute target artifacts in.\n\ + \n - NativeEngine\n - DockerEngine\n" + type: "array" + items: + $ref: "#/definitions/Engine" + resources: + description: "Resources are files that support the component. The first resource\ + \ should be a script that will be executed when the component is run. Additional\ + \ resources will be copied to the same directory.\n\nCommon properties:\n\ + \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ + \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + keywords: + description: "The keywords of the components." + type: "array" + items: + type: "string" + test_resources: + description: "One or more scripts to be used to test the component behaviour\ + \ when `viash test` is invoked. Additional files of type `file` will be\ + \ made available only during testing. Each test script should expect no\ + \ command-line inputs, be platform-independent, and return an exit code\ + \ >0 when unexpected behaviour occurs during testing. See Unit Testing for\ + \ more info." 
+ type: "array" + items: + $ref: "#/definitions/Resource" required: - - "functionality" - - "platforms" + - "name" additionalProperties: false - Project: - description: "A Viash project configuration file. It's name should be `_viash.yaml`." + PackageConfig: + description: "A Viash package configuration file. Its name should be `_viash.yaml`." type: "object" properties: + organization: + description: "The organization of the package." + type: "string" + name: + description: "The name of the package." + type: "string" source: description: "Which source directory to use for the `viash ns` commands." type: "string" + description: + description: "A description of the package." + type: "string" viash_version: description: "Which version of Viash to use." type: "string" @@ -41,12 +185,43 @@ definitions: items: description: "Which config mods to apply." type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + license: + description: "The license of the package." + type: "string" + references: + description: "References to external resources related to the package." + $ref: "#/definitions/References" + authors: + description: "The authors of the package." + type: "array" + items: + $ref: "#/definitions/Author" + repositories: + description: "Common repository definitions for component dependencies." + type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + keywords: + description: "The keywords of the package." + type: "array" + items: + type: "string" target: description: "Which target directory to use for `viash ns build`." type: "string" + version: + description: "The version of the package." + type: "string" + links: + description: "External links of the package." + $ref: "#/definitions/Links" required: [] additionalProperties: false - Info: + BuildInfo: description: "Meta information fields filled in by Viash during build." 
type: "object" properties: @@ -59,41 +234,26 @@ definitions: viash_version: description: "The Viash version that was used to build the component." type: "string" - config: - description: "Path to the config used during build." - type: "string" output: description: "Folder path to the build artifacts." type: "string" - platform: - description: "The platform id used during build." - type: "string" git_commit: description: "Git commit hash." type: "string" executable: description: "Output folder with main executable path." type: "string" - required: - - "config" - additionalProperties: false - EnvironmentVariables: - description: "Viash checks several environment variables during operation." - type: "object" - properties: - VIASH_VERSION: - description: "A specific Viash version can be set to run the commands with.\ - \ If so required, the specific Viash version will be downloaded.\nThis is\ - \ useful when replicating older results or building Viash components that\ - \ use outdated code.\n" + engine: + description: "The engine id used during build." + type: "string" + runner: + description: "The runner id used during build." type: "string" - VIASH_HOME: - description: "If `VIASH_HOME` is not defined, the fallback `HOME`/.viash is\ - \ used.\n\nLocation where specific downloaded versions of Viash will be\ - \ cached and run from.\n" + config: + description: "Path to the config used during build." type: "string" required: - - "VIASH_HOME" + - "config" additionalProperties: false Functionality: description: "The functionality-part of the config file describes the behaviour\ @@ -102,18 +262,32 @@ definitions: \ generate a stylish command-line interface for you.\n" type: "object" properties: + organization: + description: "The organization of the package." + type: "string" name: description: "Name of the component and the filename of the executable when\ \ built with `viash build`." 
type: "string" + argument_groups: + description: "A grouping of the arguments, used to display the help message.\n\ + \n - `name: foo`, the name of the argument group. \n - `description: Description\ + \ of foo`, a description of the argument group. Multiline descriptions are\ + \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ + \n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." type: "object" - version: - description: "Version of the component. This field will be used to version\ - \ the executable and the Docker container." + license: + description: "The license of the package." type: "string" + references: + description: "References to external resources related to the component." + $ref: "#/definitions/References" authors: description: "A list of authors. An author must at least have a name, but\ \ can also have a list of roles, an e-mail address, and a map of custom\ @@ -149,25 +323,6 @@ definitions: type: "array" items: $ref: "#/definitions/RepositoryWithName" - resources: - description: "Resources are files that support the component. The first resource\ - \ should be a script that will be executed when the functionality is run.\ - \ Additional resources will be copied to the same directory.\n\nCommon properties:\n\ - \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ - \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ - \ The first resource cannot be of type `file`. When the type is not specified,\ - \ the default type is simply `file`.\n * dest: filename, the resulting name\ - \ of the resource. From within a script, the file can be accessed at `meta[\"\ - resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ - \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ - \ of the input file. 
Can be a relative or an absolute path, or a URI. Mutually\ - \ exclusive with `text`.\n * text: ...multiline text..., the content of\ - \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ - \ * is_executable: `true` / `false`, whether the resulting resource file\ - \ should be made executable.\n" - type: "array" - items: - $ref: "#/definitions/Resource" test_resources: description: "One or more scripts to be used to test the component behaviour\ \ when `viash test` is invoked. Additional files of type `file` will be\ @@ -183,15 +338,6 @@ definitions: type: "array" items: $ref: "#/definitions/Dependency" - argument_groups: - description: "A grouping of the arguments, used to display the help message.\n\ - \n - `name: foo`, the name of the argument group. \n - `description: Description\ - \ of foo`, a description of the argument group. Multiline descriptions are\ - \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ - \n" - type: "array" - items: - $ref: "#/definitions/ArgumentGroup" description: description: "A description of the component. This will be displayed with\ \ `--help`." @@ -200,6 +346,37 @@ definitions: description: "A description on how to use the component. This will be displayed\ \ with `--help` under the 'Usage:' section." type: "string" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + links: + description: "External links of the component." + $ref: "#/definitions/Links" + resources: + description: "Resources are files that support the component. 
The first resource\ + \ should be a script that will be executed when the functionality is run.\ + \ Additional resources will be copied to the same directory.\n\nCommon properties:\n\ + \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ + \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + keywords: + description: "The keywords of the components." + type: "array" + items: + type: "string" namespace: description: "Namespace this component is a part of. See the Namespaces guide\ \ for more information on namespaces." @@ -288,6 +465,295 @@ definitions: - "name" - "arguments" additionalProperties: false + Links: + description: "Links to external resources related to the component." + type: "object" + properties: + repository: + description: "Source repository url." + type: "string" + documentation: + description: "Documentation website url." + type: "string" + docker_registry: + description: "Docker registry url." + type: "string" + homepage: + description: "Homepage website url." + type: "string" + issue_tracker: + description: "Issue tracker url." 
+ type: "string" + required: [] + additionalProperties: false + References: + description: "References to external resources related to the component." + type: "object" + properties: + bibtex: + oneOf: + - description: "One or multiple BibTeX reference(s) of the component." + type: "string" + - type: "array" + items: + description: "One or multiple BibTeX reference(s) of the component." + type: "string" + doi: + oneOf: + - description: "One or multiple DOI reference(s) of the component." + type: "string" + - type: "array" + items: + description: "One or multiple DOI reference(s) of the component." + type: "string" + additionalProperties: false + Runner: + oneOf: + - $ref: "#/definitions/ExecutableRunner" + - $ref: "#/definitions/NextflowRunner" + ExecutableRunner: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner is\ + \ also used for the native engine.\n\nThis runner is also used for the docker\ + \ engine.\n" + type: "object" + properties: + docker_setup_strategy: + description: "The Docker setup strategy to use when building a docker engine\ + \ environment.\n\n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild`\ + \ / `build` / `b` | Always build the image from the dockerfile. 
This is\ + \ the default setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb`\ + \ | Always build the image from the dockerfile, with caching enabled.\n\ + | `ifneedbebuild` | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ + \ | Build the image with caching enabled if it does not exist locally, with\ + \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ + \ from [Docker Hub](https://hub.docker.com) or the specified docker registry.\n\ + | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ + \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ + \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ + \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ + \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ + \ If the image does not exist locally, pull the image. If the image does\ + \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ + \ not exist locally, pull the image. If the image does exist, build it with\ + \ caching enabled.\n| `push` | Push the container to [Docker Hub](https://hub.docker.com)\ + \ or the specified docker registry.\n| `pushifnotpresent` | Push the container\ + \ to [Docker Hub](https://hub.docker.com) or the specified docker registry\ + \ if the tag does not exist yet.\n| `donothing` / `meh` | Do not build or\ + \ pull anything.\n\n" + $ref: "#/definitions/DockerSetupStrategy" + workdir: + description: "The working directory when starting the engine. This doesn't\ + \ change the Dockerfile but gets added as a command-line argument at runtime." + type: "string" + docker_run_args: + oneOf: + - description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + - type: "array" + items: + description: "Provide runtime arguments to Docker. 
See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + id: + description: "Name of the runner. As with all runners, you can give an runner\ + \ a different name. By specifying `id: foo`, you can target this executor\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + port: + oneOf: + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "integer" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "string" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "array" + items: + type: "integer" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "array" + items: + type: "string" + type: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner\ + \ is also used for the native engine.\n\nThis runner is also used for the\ + \ docker engine.\n" + const: "executable" + required: + - "type" + additionalProperties: false + NextflowRunner: + description: "Run a Viash component on a Nextflow backend engine.\n" + type: "object" + properties: + auto: + description: "Automated processing flags which can be toggled on or off:\n\ + \n| Flag | Description | Default |\n|---|---------|----|\n| `simplifyInput`\ + \ | If `true`, an input tuple only containing only a single File (e.g. `[\"\ + foo\", file(\"in.h5ad\")]`) is automatically transformed to a map (i.e.\ + \ `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n| `simplifyOutput`\ + \ | If `true`, an output tuple containing a map with a File (e.g. 
`[\"foo\"\ + , [ output: file(\"out.h5ad\") ] ]`) is automatically transformed to a map\ + \ (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `false` |\n| `transcript` |\ + \ If `true`, the module's transcripts from `work/` are automatically published\ + \ to `params.transcriptDir`. If not defined, `params.publishDir + \"/_transcripts\"\ + ` will be used. Will throw an error if neither are defined. | `false` |\n\ + | `publish` | If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`. If equal to \"state\", also a `.state.yaml` file\ + \ will be published in the publish dir. Will throw an error if `params.publishDir`\ + \ is not defined. | `false` |\n\n" + $ref: "#/definitions/NextflowAuto" + directives: + description: "Directives are optional settings that affect the execution of\ + \ the process. These mostly match up with the Nextflow counterparts. \n" + $ref: "#/definitions/NextflowDirectives" + container: + description: "Specifies the Docker engine id to be used to run Nextflow." + type: "string" + config: + description: "Allows tweaking how the Nextflow Config file is generated." + $ref: "#/definitions/NextflowConfig" + debug: + description: "Whether or not to print debug messages." + type: "boolean" + id: + description: "Name of the runner. As with all runners, you can give an runner\ + \ a different name. By specifying `id: foo`, you can target this runner\ + \ (only) by specifying `...` in any of the Viash commands." 
+ type: "string" + type: + description: "Run a Viash component on a Nextflow backend engine.\n" + const: "nextflow" + required: + - "type" + additionalProperties: false + Engine: + oneOf: + - $ref: "#/definitions/DockerEngine" + - $ref: "#/definitions/NativeEngine" + NativeEngine: + description: "Running a Viash component on a native engine means that the script\ + \ will be executed in your current environment.\nAny dependencies are assumed\ + \ to have been installed by the user, so the native engine is meant for developers\ + \ (who know what they're doing) or for simple bash scripts (which have no extra\ + \ dependencies).\n" + type: "object" + properties: + id: + description: "Name of the engine. As with all engines, you can give an engine\ + \ a different name. By specifying `id: foo`, you can target this engine\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + type: + description: "Running a Viash component on a native engine means that the\ + \ script will be executed in your current environment.\nAny dependencies\ + \ are assumed to have been installed by the user, so the native engine is\ + \ meant for developers (who know what they're doing) or for simple bash\ + \ scripts (which have no extra dependencies).\n" + const: "native" + required: + - "type" + additionalProperties: false + DockerEngine: + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a docker\ + \ container from scratch using the setup flag, or pull it from a docker repository.\n" + type: "object" + properties: + organization: + description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)." + type: "string" + registry: + description: "The URL to a [custom Docker registry](https://docs.docker.com/registry/)" + type: "string" + image: + description: "The base container to start from. 
You can also add the tag here\ + \ if you wish." + type: "string" + tag: + description: "Specify a Docker image based on its tag." + type: "string" + target_image: + description: "If anything is specified in the setup section, running the `---setup`\ + \ will result in an image with the name of `:`. If\ + \ nothing is specified in the `setup` section, simply `image` will be used.\ + \ Advanced usage only." + type: "string" + target_tag: + description: "The tag the resulting image gets. Advanced usage only." + type: "string" + namespace_separator: + description: "The separator between the namespace and the name of the component,\ + \ used for determining the image name. Default: \"/\"." + type: "string" + id: + description: "Name of the engine. As with all engines, you can give a engine\ + \ a different name. By specifying `id: foo`, you can target this engine\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." + type: "string" + type: + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a\ + \ docker container from scratch using the setup flag, or pull it from a\ + \ docker repository.\n" + const: "docker" + target_organization: + description: "The organization set in the resulting image. Advanced usage\ + \ only." + type: "string" + setup: + description: "A list of requirements for installing the following types of\ + \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ + \ - Python\n - R\n - Ruby\n - yum\n\nThe order in which these dependencies\ + \ are specified determines the order in which they will be installed.\n" + type: "array" + items: + $ref: "#/definitions/Requirements" + cmd: + oneOf: + - description: "Set the default command being executed when running the Docker\ + \ container." 
+ type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" + target_image_source: + description: "The source of the target image. This is used for defining labels\ + \ in the dockerfile." + type: "string" + test_setup: + description: "Additional requirements specific for running unit tests." + type: "array" + items: + $ref: "#/definitions/Requirements" + entrypoint: + oneOf: + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "string" + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "array" + items: + type: "string" + required: + - "image" + - "type" + additionalProperties: false Platform: oneOf: - $ref: "#/definitions/NativePlatform" @@ -356,6 +822,16 @@ definitions: description: "Enables or disables automatic volume mapping. Enabled when set\ \ to `Automatic` or disabled when set to `Manual`. Default: `Automatic`." $ref: "#/definitions/DockerResolveVolume" + cmd: + oneOf: + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" id: description: "As with all platforms, you can give a platform a different name.\ \ By specifying `id: foo`, you can target this platform (only) by specifying\ @@ -366,19 +842,15 @@ definitions: - description: "A list of enabled ports. This doesn't change the Dockerfile\ \ but gets added as a command-line argument at runtime." type: "string" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "integer" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." 
- type: "array" + - type: "array" items: + description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." type: "string" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "array" - items: - type: "integer" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." + type: "string" setup: description: "A list of requirements for installing the following types of\ \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ @@ -397,16 +869,6 @@ definitions: \ nothing is specified in the `setup` section, simply `image` will be used.\ \ Advanced usage only." type: "string" - cmd: - oneOf: - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "string" - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "array" - items: - type: "string" target_image_source: description: "The source of the target image. This is used for defining labels\ \ in the dockerfile." @@ -426,10 +888,6 @@ definitions: type: "array" items: type: "string" - target_registry: - description: "The URL where the resulting image will be pushed to. Advanced\ - \ usage only." - type: "string" setup_strategy: description: "The Docker setup strategy to use when building a container.\n\ \n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild` / `build`\ @@ -464,13 +922,6 @@ definitions: description: "The organization set in the resulting image. Advanced usage\ \ only." type: "string" - chown: - description: "In Linux, files created by a Docker container will be owned\ - \ by `root`. 
With `chown: true`, Viash will automatically change the ownership\ - \ of output files (arguments with `type: file` and `direction: output`)\ - \ to the user running the Viash command after execution of the component.\ - \ Default value: `true`." - type: "boolean" required: - "image" - "type" @@ -969,6 +1420,8 @@ definitions: \ -t`\n - `trim` is an argument, which can be passed with `executable_name\ \ trim` \n" type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." @@ -1040,6 +1493,8 @@ definitions: \ -s`\n - `silent` is an argument, which can be passed with `executable_name\ \ silent` \n" type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." @@ -1078,6 +1533,8 @@ definitions: \ -n`\n - `no-log` is an argument, which can be passed with `executable_name\ \ no-log` \n" type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." @@ -1155,6 +1612,8 @@ definitions: \ value is lower than the minimum, an error will be produced. Can be combined\ \ with [`max`](#max) to clamp values." $ref: "#/definitions/DoubleWithInf" + direction: + $ref: "#/definitions/Direction" multiple: description: "Treat the argument value as an array. Arrays can be passed using\ \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ @@ -1323,6 +1782,8 @@ definitions: \ value is lower than the minimum, an error will be produced. Can be combined\ \ with [`max`](#max) to clamp values." type: "integer" + direction: + $ref: "#/definitions/Direction" multiple: description: "Treat the argument value as an array. 
Arrays can be passed using\ \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ @@ -1409,6 +1870,8 @@ definitions: \ value is lower than the minimum, an error will be produced. Can be combined\ \ with [`max`](#max) to clamp values." type: "integer" + direction: + $ref: "#/definitions/Direction" multiple: description: "Treat the argument value as an array. Arrays can be passed using\ \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ @@ -1455,6 +1918,8 @@ definitions: type: "array" items: type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." @@ -1516,10 +1981,10 @@ definitions: - $ref: "#/definitions/RScript" - $ref: "#/definitions/ScalaScript" BashScript: - description: "An executable Bash script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable Bash script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." type: "object" properties: path: @@ -1534,10 +1999,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable Bash script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." 
+ description: "An executable Bash script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "bash_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1548,10 +2013,10 @@ definitions: - "type" additionalProperties: false CSharpScript: - description: "An executable C# script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable C# script.\nWhen defined in resources, only the first\ + \ entry will be executed when running the built component or when running `viash\ + \ run`.\nWhen defined in test_resources, all entries will be executed during\ + \ `viash test`." type: "object" properties: path: @@ -1566,10 +2031,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable C# script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable C# script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "csharp_script" dest: description: "Resulting filename of the resource. 
From within a script, the\ @@ -1606,10 +2071,10 @@ definitions: - "type" additionalProperties: false JavaScriptScript: - description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable JavaScript script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will be\ + \ executed during `viash test`." type: "object" properties: path: @@ -1624,10 +2089,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ + description: "An executable JavaScript script.\nWhen defined in resources,\ \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + \ or when running `viash run`.\nWhen defined in test_resources, all entries\ + \ will be executed during `viash test`." const: "javascript_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1697,10 +2162,10 @@ definitions: required: [] additionalProperties: false PythonScript: - description: "An executable Python script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." 
+ description: "An executable Python script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." type: "object" properties: path: @@ -1715,10 +2180,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable Python script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable Python script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "python_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1729,10 +2194,10 @@ definitions: - "type" additionalProperties: false RScript: - description: "An executable R script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable R script.\nWhen defined in resources, only the first\ + \ entry will be executed when running the built component or when running `viash\ + \ run`.\nWhen defined in test_resources, all entries will be executed during\ + \ `viash test`." type: "object" properties: path: @@ -1747,10 +2212,10 @@ definitions: description: "Whether the resulting resource file should be made executable." 
type: "boolean" type: - description: "An executable R script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable R script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "r_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1761,10 +2226,10 @@ definitions: - "type" additionalProperties: false ScalaScript: - description: "An executable Scala script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable Scala script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." type: "object" properties: path: @@ -1779,10 +2244,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable Scala script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." 
+ description: "An executable Scala script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "scala_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -2196,7 +2661,7 @@ definitions: \ and follows a semi logarithmic scale (1, 2, 5 per decade).\n\nConceptually\ \ it is possible for a Viash Config to overwrite the full labels parameter,\ \ however likely it is more efficient to add additional labels\nin the Viash\ - \ Project with a config mod.\n" + \ Package with a config mod.\n" type: "object" additionalProperties: type: "string" @@ -2216,9 +2681,8 @@ definitions: additionalProperties: false Dependency: description: "Specifies a Viash component (script or executable) that should be\ - \ made available for the code defined in the functionality.\nThe dependency\ - \ components are collected and copied to the output folder during the Viash\ - \ build step.\n" + \ made available for the code defined in the component.\nThe dependency components\ + \ are collected and copied to the output folder during the Viash build step.\n" type: "object" properties: name: @@ -2227,19 +2691,21 @@ definitions: type: "string" repository: oneOf: - - description: "Specifies the location where the dependency component can\ - \ be found.\nThis must either be a full definition of the repository or\ - \ the name of a repository refenced as it is defined under functionality.repositories.\n\ + - description: "Specifies the repository location where the dependency component\ + \ can be found.\nThis must either be a full definition of the repository\ + \ or the name of a repository referenced as it is defined under repositories.\n\ Additionally, the full definition can be specified as a single string\ \ where all parameters such as repository type, url, branch or tag are\ 
- \ specified.\n" + \ specified.\nOmitting the value sets the dependency as a local dependency,\ + \ i.e. the dependency is available in the same namespace as the component.\n" type: "string" - - description: "Specifies the location where the dependency component can\ - \ be found.\nThis must either be a full definition of the repository or\ - \ the name of a repository refenced as it is defined under functionality.repositories.\n\ + - description: "Specifies the repository location where the dependency component\ + \ can be found.\nThis must either be a full definition of the repository\ + \ or the name of a repository referenced as it is defined under repositories.\n\ Additionally, the full definition can be specified as a single string\ \ where all parameters such as repository type, url, branch or tag are\ - \ specified.\n" + \ specified.\nOmitting the value sets the dependency as a local dependency,\ + \ i.e. the dependency is available in the same namespace as the component.\n" $ref: "#/definitions/Repository" alias: description: "An alternative name for the dependency component. This can include\ diff --git a/_viash.yaml b/_viash.yaml index 65344505..8e09d947 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,5 +1,6 @@ -viash_version: 0.8.5 +name: biobase +description: | + A collection of bioinformatics tools for working with sequence data. 
+license: MIT -config_mods: | - .functionality.arguments[.multiple == true].multiple_sep := ";" - .functionality.argument_groups[true].arguments[.multiple == true].multiple_sep := ";" \ No newline at end of file +viash_version: 0.9.0-RC2 \ No newline at end of file diff --git a/src/arriba/config.vsh.yaml b/src/arriba/config.vsh.yaml index ac847838..8d72d7eb 100644 --- a/src/arriba/config.vsh.yaml +++ b/src/arriba/config.vsh.yaml @@ -1,385 +1,385 @@ -functionality: - name: arriba - description: Detect gene fusions from RNA-Seq data - info: - keywords: [Gene fusion, RNA-Seq] - links: - homepage: https://arriba.readthedocs.io/en/latest/ - documentation: https://arriba.readthedocs.io/en/latest/ - repository: https://github.com/suhrig/arriba - references: - doi: 10.1101/gr.257246.119 - license: MIT - requirements: - cpus: 1 - commands: [ arriba ] - argument_groups: - - name: Inputs - arguments: - - name: --bam - alternatives: -x +name: arriba +description: Detect gene fusions from RNA-Seq data +keywords: [Gene fusion, RNA-Seq] +links: + homepage: https://arriba.readthedocs.io/en/latest/ + documentation: https://arriba.readthedocs.io/en/latest/ + repository: https://github.com/suhrig/arriba +references: + doi: 10.1101/gr.257246.119 +license: MIT +requirements: + cpus: 1 + commands: [ arriba ] +argument_groups: + - name: Inputs + arguments: + - name: --bam + alternatives: -x + type: file + description: | + File in SAM/BAM/CRAM format with main alignments as generated by STAR + (Aligned.out.sam). Arriba extracts candidate reads from this file. + required: true + example: Aligned.out.bam + - name: --genome + alternatives: -a + type: file + description: | + FastA file with genome sequence (assembly). The file may be gzip-compressed. An + index with the file extension .fai must exist only if CRAM files are processed. + required: true + example: assembly.fa + - name: --gene_annotation + alternatives: -g + type: file + description: | + GTF file with gene annotation. 
The file may be gzip-compressed. + required: true + example: annotation.gtf + - name: --known_fusions + alternatives: -k + type: file + description: | + File containing known/recurrent fusions. Some cancer entities are often + characterized by fusions between the same pair of genes. In order to boost + sensitivity, a list of known fusions can be supplied using this parameter. The list + must contain two columns with the names of the fused genes, separated by tabs. + required: false + example: known_fusions.tsv + - name: --blacklist + alternatives: -b + type: file + description: | + File containing blacklisted events (recurrent artifacts and transcripts + observed in healthy tissue). + required: false + example: blacklist.tsv + - name: --structural_variants + alternatives: -d + type: file + description: | + Tab-separated file with coordinates of structural variants found using + whole-genome sequencing data. These coordinates serve to increase sensitivity + towards weakly expressed fusions and to eliminate fusions with low evidence. + required: false + example: structural_variants_from_WGS.tsv + - name: --tags + alternatives: -t + type: file + description: | + Tab-separated file containing fusions to annotate with tags in the 'tags' column. + The first two columns specify the genes; the third column specifies the tag. The + file may be gzip-compressed. + required: false + example: tags.tsv + - name: --protein_domains + alternatives: -p + type: file + description: | + File in GFF3 format containing coordinates of the protein domains of genes. The + protein domains retained in a fusion are listed in the column + 'retained_protein_domains'. The file may be gzip-compressed. + required: false + example: protein_domains.gff3 + - name: Outputs + arguments: + - name: --fusions + alternatives: -o type: file + direction: output description: | - File in SAM/BAM/CRAM format with main alignments as generated by STAR - (Aligned.out.sam). 
Arriba extracts candidate reads from this file. + Output file with fusions that have passed all filters. required: true - example: Aligned.out.bam - - name: --genome - alternatives: -a + example: fusions.tsv + - name: --fusions_discarded + alternatives: -O type: file + direction: output description: | - FastA file with genome sequence (assembly). The file may be gzip-compressed. An - index with the file extension .fai must exist only if CRAM files are processed. - required: true - example: assembly.fa - - name: --gene_annotation - alternatives: -g - type: file + Output file with fusions that were discarded due to filtering. + required: false + example: fusions.discarded.tsv + - name: Arguments + arguments: + - name: --max_genomic_breakpoint_distance + alternatives: -D + type: long description: | - GTF file with gene annotation. The file may be gzip-compressed. - required: true - example: annotation.gtf - - name: --known_fusions - alternatives: -k - type: file + When a file with genomic breakpoints obtained via + whole-genome sequencing is supplied via the --structural_variants + parameter, this parameter determines how far a + genomic breakpoint may be away from a + transcriptomic breakpoint to consider it as a + related event. For events inside genes, the + distance is added to the end of the gene; for + intergenic events, the distance threshold is + applied as is. Default: 100000. + required: false + - name: --strandedness + alternatives: -s + type: string description: | - File containing known/recurrent fusions. Some cancer entities are often - characterized by fusions between the same pair of genes. In order to boost - sensitivity, a list of known fusions can be supplied using this parameter. The list - must contain two columns with the names of the fused genes, separated by tabs. + Whether a strand-specific protocol was used for library preparation, + and if so, the type of strandedness (auto/yes/no/reverse). 
When + unstranded data is processed, the strand can sometimes be inferred from + splice-patterns. But in unclear situations, stranded data helps + resolve ambiguities. Default: auto + choices: ["auto", "yes", "no", "reverse"] required: false - example: known_fusions.tsv - - name: --blacklist - alternatives: -b - type: file + - name: --interesting_contigs + alternatives: -i + type: string description: | - File containing blacklisted events (recurrent artifacts and transcripts - observed in healthy tissue). + List of interesting contigs. Fusions between genes + on other contigs are ignored. Contigs can be specified with or without the + prefix "chr". Asterisks (*) are treated as wild-cards. + Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* required: false - example: blacklist.tsv - - name: --structural_variants - alternatives: -d - type: file + multiple: true + example: ["1", "2", "AC_*", "NC_*"] + - name: --viral_contigs + alternatives: -v + type: string description: | - Tab-separated file with coordinates of structural variants found using - whole-genome sequencing data. These coordinates serve to increase sensitivity - towards weakly expressed fusions and to eliminate fusions with low evidence. + List of viral contigs. Asterisks (*) are treated as + wild-cards. + Default: AC_* NC_* required: false - example: structural_variants_from_WGS.tsv - - name: --tags - alternatives: -t - type: file + multiple: true + example: ["AC_*", "NC_*"] + - name: --disable_filters + alternatives: -f + type: string description: | - Tab-separated file containing fusions to annotate with tags in the 'tags' column. - The first two columns specify the genes; the third column specifies the tag. The - file may be gzip-compressed. + List of filters to disable. By default all filters are + enabled. 
+ choices: [ homologs, low_entropy, isoforms, + top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, + non_coding_neighbors, mismatches, duplicates, no_genomic_support, + genomic_support, intronic, end_to_end, relative_support, + low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, + same_gene, long_gap, internal_tandem_duplication, small_insert_size, + read_through, inconsistently_clipped, intragenic_exonic, + marginal_read_through, spliced, hairpin, blacklist, min_support, + select_best, in_vitro, short_anchor, known_fusions, no_coverage, + homopolymer, many_spliced ] required: false - example: tags.tsv - - name: --protein_domains - alternatives: -p - type: file + multiple: true + - name: --max_e_value + alternatives: -E + type: double description: | - File in GFF3 format containing coordinates of the protein domains of genes. The - protein domains retained in a fusion are listed in the column - 'retained_protein_domains'. The file may be gzip-compressed. + Arriba estimates the number of fusions with a given number of supporting + reads which one would expect to see by random chance. If the expected number + of fusions (e-value) is higher than this threshold, the fusion is + discarded by the 'relative_support' filter. Note: Increasing this + threshold can dramatically increase the number of false positives and may + increase the runtime of resource-intensive steps. Fractional values are + possible. Default: 0.300000 required: false - example: protein_domains.gff3 - - name: Outputs - arguments: - - name: --fusions - alternatives: -o - type: file - direction: output - description: | - Output file with fusions that have passed all filters. - required: true - example: fusions.tsv - - name: --fusions_discarded - alternatives: -O - type: file - direction: output - description: | - Output file with fusions that were discarded due to filtering. 
- required: false - example: fusions.discarded.tsv - - name: Arguments - arguments: - - name: --max_genomic_breakpoint_distance - alternatives: -D - type: long - description: | - When a file with genomic breakpoints obtained via - whole-genome sequencing is supplied via the --structural_variants - parameter, this parameter determines how far a - genomic breakpoint may be away from a - transcriptomic breakpoint to consider it as a - related event. For events inside genes, the - distance is added to the end of the gene; for - intergenic events, the distance threshold is - applied as is. Default: 100000. - required: false - - name: --strandedness - alternatives: -s - type: string - description: | - Whether a strand-specific protocol was used for library preparation, - and if so, the type of strandedness (auto/yes/no/reverse). When - unstranded data is processed, the strand can sometimes be inferred from - splice-patterns. But in unclear situations, stranded data helps - resolve ambiguities. Default: auto - choices: ["auto", "yes", "no", "reverse"] - required: false - - name: --interesting_contigs - alternatives: -i - type: string - description: | - List of interesting contigs. Fusions between genes - on other contigs are ignored. Contigs can be specified with or without the - prefix "chr". Asterisks (*) are treated as wild-cards. - Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* - required: false - multiple: true - example: ["1", "2", "AC_*", "NC_*"] - - name: --viral_contigs - alternatives: -v - type: string - description: | - List of viral contigs. Asterisks (*) are treated as - wild-cards. - Default: AC_* NC_* - required: false - multiple: true - example: ["AC_*", "NC_*"] - - name: --disable_filters - alternatives: -f - type: string - description: | - List of filters to disable. By default all filters are - enabled. 
- choices: [ homologs, low_entropy, isoforms, - top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, - non_coding_neighbors, mismatches, duplicates, no_genomic_support, - genomic_support, intronic, end_to_end, relative_support, - low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, - same_gene, long_gap, internal_tandem_duplication, small_insert_size, - read_through, inconsistently_clipped, intragenic_exonic, - marginal_read_through, spliced, hairpin, blacklist, min_support, - select_best, in_vitro, short_anchor, known_fusions, no_coverage, - homopolymer, many_spliced ] - required: false - multiple: true - - name: --max_e_value - alternatives: -E - type: double - description: | - Arriba estimates the number of fusions with a given number of supporting - reads which one would expect to see by random chance. If the expected number - of fusions (e-value) is higher than this threshold, the fusion is - discarded by the 'relative_support' filter. Note: Increasing this - threshold can dramatically increase the number of false positives and may - increase the runtime of resource-intensive steps. Fractional values are - possible. Default: 0.300000 - required: false - - name: --min_supporting_reads - alternatives: -S - type: integer - description: | - The 'min_support' filter discards all fusions with fewer than - this many supporting reads (split reads and discordant mates - combined). Default: 2 - required: false - example: 2 - - name: --max_mismappers - alternatives: -m - type: double - description: | - When more than this fraction of supporting reads turns out to be - mismappers, the 'mismappers' filter discards the fusion. Default: - 0.800000 - required: false - example: 0.8 - - name: --max_homolog_identity - alternatives: -L - type: double - description: | - Genes with more than the given fraction of sequence identity are - considered homologs and removed by the 'homologs' filter. 
- Default: 0.300000 - required: false - example: 0.3 - - name: --homopolymer_length - alternatives: -H - type: integer - description: | - The 'homopolymer' filter removes breakpoints adjacent to - homopolymers of the given length or more. Default: 6 - required: false - example: 6 - - name: --read_through_distance - alternatives: -R - type: integer - description: | - The 'read_through' filter removes read-through fusions - where the breakpoints are less than the given distance away - from each other. Default: 10000 - required: false - example: 10000 - - name : --min_anchor_length - alternatives: -A - type: integer - description: | - Alignment artifacts are often characterized by split reads coming - from only one gene and no discordant mates. Moreover, the split - reads only align to a short stretch in one of the genes. The - 'short_anchor' filter removes these fusions. This parameter sets - the threshold in bp for what the filter considers short. Default: 23 - required: false - example: 23 - - name: --many_spliced_events - alternatives: -M - type: integer - description: | - The 'many_spliced' filter recovers fusions between genes that - have at least this many spliced breakpoints. Default: 4 - required: false - example: 4 - - name: --max_kmer_content - alternatives: -K - type: double - description: | - The 'low_entropy' filter removes reads with repetitive 3-mers. If - the 3-mers make up more than the given fraction of the sequence, then - the read is discarded. Default: 0.600000 - required: false - example: 0.6 - - name: --max_mismatch_pvalue - alternatives: -V - type: double - description: | - The 'mismatches' filter uses a binomial model to calculate a - p-value for observing a given number of mismatches in a read. If - the number of mismatches is too high, the read is discarded. 
- Default: 0.010000 - required: false - example: 0.05 - - name: --fragment_length - alternatives: -F - type: integer - description: | - When paired-end data is given, the fragment length is estimated - automatically and this parameter has no effect. But when single-end - data is given, the mean fragment length should be specified to - effectively filter fusions that arise from hairpin structures. - Default: 200 - required: false - example: 200 - - name: --max_reads - alternatives: -U - type: integer - description: | - Subsample fusions with more than the given number of supporting reads. This - improves performance without compromising sensitivity, as long as the - threshold is high. Counting of supporting reads beyond the threshold is - inaccurate, obviously. Default: 300 - required: false - example: 300 - - name: --quantile - alternatives: -Q - type: double - description: | - Highly expressed genes are prone to produce artifacts during library - preparation. Genes with an expression above the given quantile are eligible - for filtering by the 'in_vitro' filter. Default: 0.998000 - required: false - example: 0.998 - - name: --exonic_fraction - alternatives: -e - type: double - description: | - The breakpoints of false-positive predictions of intragenic events - are often both in exons. True predictions are more likely to have at - least one breakpoint in an intron, because introns are larger. If the - fraction of exonic sequence between two breakpoints is smaller than - the given fraction, the 'intragenic_exonic' filter discards the - event. Default: 0.330000 - required: false - example: 0.33 - - name: --top_n - alternatives: -T - type: integer - description: | - Only report viral integration sites of the top N most highly expressed viral - contigs. 
Default: 5 - required: false - example: 5 - - name: --covered_fraction - alternatives: -C - type: double - description: | - Ignore virally associated events if the virus is not fully - expressed, i.e., less than the given fraction of the viral contig is - transcribed. Default: 0.050000 - required: false - example: 0.05 - - name: --max_itd_length - alternatives: -l - type: integer - description: | - Maximum length of internal tandem duplications. Note: Increasing - this value beyond the default can impair performance and lead to many - false positives. Default: 100 - required: false - example: 100 - - name: --min_itd_allele_fraction - alternatives: -z - type: double - description: | - Required fraction of supporting reads to report an internal - tandem duplication. Default: 0.070000 - required: false - example: 0.07 - - name: --min_itd_supporting_reads - alternatives: -Z - type: integer - description: | - Required absolute number of supporting reads to report an - internal tandem duplication. Default: 10 - required: false - example: 10 - - name: --skip_duplicate_marking - alternatives: -u - type: boolean_true - description: | - Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a - preceding program using the BAM_FDUP flag. This makes sense when unique molecular - identifiers (UMI) are used. - - name: --extra_information - alternatives: -X - type: boolean_true - description: | - To reduce the runtime and file size, by default, the columns 'fusion_transcript', - 'peptide_sequence', and 'read_identifiers' are left empty in the file containing - discarded fusion candidates (see parameter -O). When this flag is set, this extra - information is reported in the discarded fusions file. - - name: --fill_gaps - alternatives: -I - type: boolean_true - description: | - If assembly of the fusion transcript sequence from the supporting reads is incomplete - (denoted as '...'), fill the gaps using the assembly sequence wherever possible. 
- resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + - name: --min_supporting_reads + alternatives: -S + type: integer + description: | + The 'min_support' filter discards all fusions with fewer than + this many supporting reads (split reads and discordant mates + combined). Default: 2 + required: false + example: 2 + - name: --max_mismappers + alternatives: -m + type: double + description: | + When more than this fraction of supporting reads turns out to be + mismappers, the 'mismappers' filter discards the fusion. Default: + 0.800000 + required: false + example: 0.8 + - name: --max_homolog_identity + alternatives: -L + type: double + description: | + Genes with more than the given fraction of sequence identity are + considered homologs and removed by the 'homologs' filter. + Default: 0.300000 + required: false + example: 0.3 + - name: --homopolymer_length + alternatives: -H + type: integer + description: | + The 'homopolymer' filter removes breakpoints adjacent to + homopolymers of the given length or more. Default: 6 + required: false + example: 6 + - name: --read_through_distance + alternatives: -R + type: integer + description: | + The 'read_through' filter removes read-through fusions + where the breakpoints are less than the given distance away + from each other. Default: 10000 + required: false + example: 10000 + - name : --min_anchor_length + alternatives: -A + type: integer + description: | + Alignment artifacts are often characterized by split reads coming + from only one gene and no discordant mates. Moreover, the split + reads only align to a short stretch in one of the genes. The + 'short_anchor' filter removes these fusions. This parameter sets + the threshold in bp for what the filter considers short. 
Default: 23 + required: false + example: 23 + - name: --many_spliced_events + alternatives: -M + type: integer + description: | + The 'many_spliced' filter recovers fusions between genes that + have at least this many spliced breakpoints. Default: 4 + required: false + example: 4 + - name: --max_kmer_content + alternatives: -K + type: double + description: | + The 'low_entropy' filter removes reads with repetitive 3-mers. If + the 3-mers make up more than the given fraction of the sequence, then + the read is discarded. Default: 0.600000 + required: false + example: 0.6 + - name: --max_mismatch_pvalue + alternatives: -V + type: double + description: | + The 'mismatches' filter uses a binomial model to calculate a + p-value for observing a given number of mismatches in a read. If + the number of mismatches is too high, the read is discarded. + Default: 0.010000 + required: false + example: 0.05 + - name: --fragment_length + alternatives: -F + type: integer + description: | + When paired-end data is given, the fragment length is estimated + automatically and this parameter has no effect. But when single-end + data is given, the mean fragment length should be specified to + effectively filter fusions that arise from hairpin structures. + Default: 200 + required: false + example: 200 + - name: --max_reads + alternatives: -U + type: integer + description: | + Subsample fusions with more than the given number of supporting reads. This + improves performance without compromising sensitivity, as long as the + threshold is high. Counting of supporting reads beyond the threshold is + inaccurate, obviously. Default: 300 + required: false + example: 300 + - name: --quantile + alternatives: -Q + type: double + description: | + Highly expressed genes are prone to produce artifacts during library + preparation. Genes with an expression above the given quantile are eligible + for filtering by the 'in_vitro' filter. 
Default: 0.998000 + required: false + example: 0.998 + - name: --exonic_fraction + alternatives: -e + type: double + description: | + The breakpoints of false-positive predictions of intragenic events + are often both in exons. True predictions are more likely to have at + least one breakpoint in an intron, because introns are larger. If the + fraction of exonic sequence between two breakpoints is smaller than + the given fraction, the 'intragenic_exonic' filter discards the + event. Default: 0.330000 + required: false + example: 0.33 + - name: --top_n + alternatives: -T + type: integer + description: | + Only report viral integration sites of the top N most highly expressed viral + contigs. Default: 5 + required: false + example: 5 + - name: --covered_fraction + alternatives: -C + type: double + description: | + Ignore virally associated events if the virus is not fully + expressed, i.e., less than the given fraction of the viral contig is + transcribed. Default: 0.050000 + required: false + example: 0.05 + - name: --max_itd_length + alternatives: -l + type: integer + description: | + Maximum length of internal tandem duplications. Note: Increasing + this value beyond the default can impair performance and lead to many + false positives. Default: 100 + required: false + example: 100 + - name: --min_itd_allele_fraction + alternatives: -z + type: double + description: | + Required fraction of supporting reads to report an internal + tandem duplication. Default: 0.070000 + required: false + example: 0.07 + - name: --min_itd_supporting_reads + alternatives: -Z + type: integer + description: | + Required absolute number of supporting reads to report an + internal tandem duplication. Default: 10 + required: false + example: 10 + - name: --skip_duplicate_marking + alternatives: -u + type: boolean_true + description: | + Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a + preceding program using the BAM_FDUP flag. 
This makes sense when unique molecular + identifiers (UMI) are used. + - name: --extra_information + alternatives: -X + type: boolean_true + description: | + To reduce the runtime and file size, by default, the columns 'fusion_transcript', + 'peptide_sequence', and 'read_identifiers' are left empty in the file containing + discarded fusion candidates (see parameter -O). When this flag is set, this extra + information is reported in the discarded fusions file. + - name: --fill_gaps + alternatives: -I + type: boolean_true + description: | + If assembly of the fusion transcript sequence from the supporting reads is incomplete + (denoted as '...'), fill the gaps using the assembly sequence wherever possible. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/arriba:2.4.0--h0033a41_2 setup: - type: docker run: | arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/arriba: "\1"/' > /var/software_versions.txt - - type: nextflow +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bgzip/config.vsh.yaml b/src/bgzip/config.vsh.yaml index 049d0cbf..26e31ae4 100644 --- a/src/bgzip/config.vsh.yaml +++ b/src/bgzip/config.vsh.yaml @@ -1,128 +1,128 @@ -functionality: - name: bgzip - description: Block compression/decompression utility - info: - links: - homepage: https://www.htslib.org/ - documentation: https://www.htslib.org/doc/bgzip.html - repository: https://github.com/samtools/htslib - references: - doi: 10.1093/gigascience/giab007 - license: MIT - requirements: - commands: [ bgzip ] - argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - direction: input - description: file to be compressed or decompressed - required: true - - name: Outputs - arguments: - - name: --output - type: file - direction: output - description: compressed or decompressed output - 
required: true - - name: --index_name - alternatives: -I - type: file - direction: output - description: name of BGZF index file [file.gz.gzi] - - name: Arguments - arguments: - - name: --offset - alternatives: -b - type: integer - description: decompress at virtual file pointer (0-based uncompressed offset) - - name: --decompress - alternatives: -d - type: boolean_true - description: decompress the input file - - name: --rebgzip - alternatives: -g - type: boolean_true - description: use an index file to bgzip a file - - name: --index - alternatives: -i - type: boolean_true - description: compress and create BGZF index - - name: --compress_level - alternatives: -l - type: integer - description: compression level to use when compressing; 0 to 9, or -1 for default [-1] - min: -1 - max: 9 - - name: --reindex - alternatives: -r - type: boolean_true - description: (re)index the output file - - name: --size - alternatives: -s - type: integer - description: decompress INT bytes (uncompressed size) - min: 0 - - name: --test - alternatives: -t - type: boolean_true - description: test integrity of compressed file - - name: --binary - type: boolean_true - description: Don't align blocks with text lines - resources: - - type: bash_script - text: | - [[ "$par_decompress" == "false" ]] && unset par_decompress - [[ "$par_rebgzip" == "false" ]] && unset par_rebgzip - [[ "$par_index" == "false" ]] && unset par_index - [[ "$par_reindex" == "false" ]] && unset par_reindex - [[ "$par_test" == "false" ]] && unset par_test - [[ "$par_binary" == "false" ]] && unset par_binary - bgzip -c \ - ${meta_cpus:+--threads "${meta_cpus}"} \ - ${par_offset:+-b "${par_offset}"} \ - ${par_decompress:+-d} \ - ${par_rebgzip:+-g} \ - ${par_index:+-i} \ - ${par_index_name:+-I "${par_index_name}"} \ - ${par_compress_level:+-l "${par_compress_level}"} \ - ${par_reindex:+-r} \ - ${par_size:+-s "${par_size}"} \ - ${par_test:+-t} \ - ${par_binary:+--binary} \ - "$par_input" > "$par_output" - test_resources: - 
- type: bash_script - text: | - set -e +name: bgzip +description: Block compression/decompression utility +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/bgzip.html + repository: https://github.com/samtools/htslib +references: + doi: 10.1093/gigascience/giab007 +license: MIT +requirements: + commands: [ bgzip ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + direction: input + description: file to be compressed or decompressed + required: true + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: compressed or decompressed output + required: true + - name: --index_name + alternatives: -I + type: file + direction: output + description: name of BGZF index file [file.gz.gzi] + - name: Arguments + arguments: + - name: --offset + alternatives: -b + type: integer + description: decompress at virtual file pointer (0-based uncompressed offset) + - name: --decompress + alternatives: -d + type: boolean_true + description: decompress the input file + - name: --rebgzip + alternatives: -g + type: boolean_true + description: use an index file to bgzip a file + - name: --index + alternatives: -i + type: boolean_true + description: compress and create BGZF index + - name: --compress_level + alternatives: -l + type: integer + description: compression level to use when compressing; 0 to 9, or -1 for default [-1] + min: -1 + max: 9 + - name: --reindex + alternatives: -r + type: boolean_true + description: (re)index the output file + - name: --size + alternatives: -s + type: integer + description: decompress INT bytes (uncompressed size) + min: 0 + - name: --test + alternatives: -t + type: boolean_true + description: test integrity of compressed file + - name: --binary + type: boolean_true + description: Don't align blocks with text lines +resources: + - type: bash_script + text: | + [[ "$par_decompress" == "false" ]] && unset par_decompress + [[ "$par_rebgzip" == 
"false" ]] && unset par_rebgzip + [[ "$par_index" == "false" ]] && unset par_index + [[ "$par_reindex" == "false" ]] && unset par_reindex + [[ "$par_test" == "false" ]] && unset par_test + [[ "$par_binary" == "false" ]] && unset par_binary + bgzip -c \ + ${meta_cpus:+--threads "${meta_cpus}"} \ + ${par_offset:+-b "${par_offset}"} \ + ${par_decompress:+-d} \ + ${par_rebgzip:+-g} \ + ${par_index:+-i} \ + ${par_index_name:+-I "${par_index_name}"} \ + ${par_compress_level:+-l "${par_compress_level}"} \ + ${par_reindex:+-r} \ + ${par_size:+-s "${par_size}"} \ + ${par_test:+-t} \ + ${par_binary:+--binary} \ + "$par_input" > "$par_output" +test_resources: + - type: bash_script + text: | + set -e - "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz" + "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz" - echo ">> Checking output of compressing" - [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1 + echo ">> Checking output of compressing" + [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1 - "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress + "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress - echo ">> Checking output of decompressing" - [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1 + echo ">> Checking output of decompressing" + [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1 - echo ">> Checking original and decompressed files are the same" - set +e - cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf" - [ $? -ne 0 ] && echo "files are different" && exit 1 - set -e - - echo "> Test successful" - - type: file - path: test_data + echo ">> Checking original and decompressed files are the same" + set +e + cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf" + [ $? 
-ne 0 ] && echo "files are different" && exit 1 + set -e + + echo "> Test successful" + - type: file + path: test_data -platforms: +engines: - type: docker image: quay.io/biocontainers/htslib:1.19--h81da01d_0 setup: - type: docker run: | bgzip -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/bgzip: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow \ No newline at end of file diff --git a/src/busco/busco_download_datasets/config.vsh.yaml b/src/busco/busco_download_datasets/config.vsh.yaml index a592ed89..04d76dd6 100644 --- a/src/busco/busco_download_datasets/config.vsh.yaml +++ b/src/busco/busco_download_datasets/config.vsh.yaml @@ -1,46 +1,47 @@ -functionality: - name: busco_download_datasets - namespace: busco - description: Downloads available busco datasets - info: - links: - homepage: https://busco.ezlab.org/ - documentation: https://busco.ezlab.org/busco_userguide.html - repository: https://gitlab.com/ezlab/busco - references: - doi: 10.1007/978-1-4939-9173-0_14 - license: MIT - argument_groups: - - name: Inputs - arguments: - - name: --download - type: string - description: | - Download dataset. Possible values are a specific dataset name, "all", "prokaryota", "eukaryota", or "virus". - The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. 
- required: true - example: stramenopiles_odb10 - - name: Outputs - arguments: - - name: --download_path - direction: output - type: file - description: | - Local filepath for storing BUSCO dataset downloads - required: false - default: busco_downloads - example: busco_downloads - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh -platforms: +name: busco_download_datasets +namespace: busco +description: Downloads available busco datasets +keywords: [lineage datasets] +links: + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco +references: + doi: 10.1007/978-1-4939-9173-0_14 +license: MIT +argument_groups: + - name: Inputs + arguments: + - name: --download + type: string + description: | + Download dataset. Possible values are a specific dataset name, "all", "prokaryota", "eukaryota", or "virus". + The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. 
+ required: true + example: stramenopiles_odb10 + - name: Outputs + arguments: + - name: --download_path + direction: output + type: file + description: | + Local filepath for storing BUSCO dataset downloads + required: false + default: busco_downloads + example: busco_downloads +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: - type: docker image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 setup: - type: docker run: | busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/busco/busco_list_datasets/config.vsh.yaml b/src/busco/busco_list_datasets/config.vsh.yaml index 004628c9..6ada7c84 100644 --- a/src/busco/busco_list_datasets/config.vsh.yaml +++ b/src/busco/busco_list_datasets/config.vsh.yaml @@ -1,38 +1,39 @@ -functionality: - name: busco_list_datasets - namespace: busco - description: Lists the available busco datasets - info: - links: - homepage: https://busco.ezlab.org/ - documentation: https://busco.ezlab.org/busco_userguide.html - repository: https://gitlab.com/ezlab/busco - references: - doi: 10.1007/978-1-4939-9173-0_14 - license: MIT - argument_groups: - - name: Outputs - arguments: - - name: --output - alternatives: ["-o"] - direction: output - type: file - description: | - Output file of the available busco datasets - required: false - default: busco_dataset_list.txt - example: file.txt - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh -platforms: +name: busco_list_datasets +namespace: busco +description: Lists the available busco datasets +keywords: [lineage datasets] +links: + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco +references: + doi: 10.1007/978-1-4939-9173-0_14 +license: MIT +argument_groups: + - name: Outputs + arguments: + - 
name: --output + alternatives: ["-o"] + direction: output + type: file + description: | + Output file of the available busco datasets + required: false + default: busco_dataset_list.txt + example: file.txt +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: - type: docker image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 setup: - type: docker run: | busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/busco/busco_run/config.vsh.yaml b/src/busco/busco_run/config.vsh.yaml index 8524b068..d79f03f5 100644 --- a/src/busco/busco_run/config.vsh.yaml +++ b/src/busco/busco_run/config.vsh.yaml @@ -1,214 +1,214 @@ -functionality: - name: busco_run - namespace: busco - description: Assessment of genome assembly and annotation completeness with single copy orthologs - info: - keywords: [Genome assembly, quality control] - links: - homepage: https://busco.ezlab.org/ - documentation: https://busco.ezlab.org/busco_userguide.html - repository: https://gitlab.com/ezlab/busco - references: - doi: 10.1007/978-1-4939-9173-0_14 - license: MIT - argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: ["-i"] - type: file - description: | - Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. Also possible to use a path to a directory containing multiple input files. - required: true - example: file.fasta - - name: --mode - alternatives: ["-m"] - type: string - choices: [genome, geno, transcriptome, tran, proteins, prot] - required: true - description: | - Specify which BUSCO analysis mode to run. 
There are three valid modes: - - geno or genome, for genome assemblies (DNA) - - tran or transcriptome, for transcriptome assemblies (DNA) - - prot or proteins, for annotated gene sets (protein) - example: proteins - - name: --lineage_dataset - alternatives: ["-l"] - type: string - required: false - description: | - Specify a BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. - The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. - When unsure, the "--auto_lineage" flag can be set to automatically find the optimal lineage path. - BUSCO will automatically download the requested dataset if it is not already present in the download folder. - You can optionally provide a path to a local dataset instead of a name, e.g. path/to/dataset. - Datasets can be downloaded using the busco/busco_download_dataset component. - example: stramenopiles_odb10 +name: busco_run +namespace: busco +description: Assessment of genome assembly and annotation completeness with single copy orthologs +keywords: [Genome assembly, quality control] +links: + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco +references: + doi: 10.1007/978-1-4939-9173-0_14 +license: MIT +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: ["-i"] + type: file + description: | + Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. Also possible to use a path to a directory containing multiple input files. + required: true + example: file.fasta + - name: --mode + alternatives: ["-m"] + type: string + choices: [genome, geno, transcriptome, tran, proteins, prot] + required: true + description: | + Specify which BUSCO analysis mode to run. 
There are three valid modes: + - geno or genome, for genome assemblies (DNA) + - tran or transcriptome, for transcriptome assemblies (DNA) + - prot or proteins, for annotated gene sets (protein) + example: proteins + - name: --lineage_dataset + alternatives: ["-l"] + type: string + required: false + description: | + Specify a BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. + The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. + When unsure, the "--auto_lineage" flag can be set to automatically find the optimal lineage path. + BUSCO will automatically download the requested dataset if it is not already present in the download folder. + You can optionally provide a path to a local dataset instead of a name, e.g. path/to/dataset. + Datasets can be downloaded using the busco/busco_download_datasets component. + example: stramenopiles_odb10 - - name: Outputs - arguments: - - name: --short_summary_json - required: false - direction: output - type: file - example: short_summary.json - description: | - Output file for short summary in JSON format. - - name: --short_summary_txt - required: false - direction: output - type: file - example: short_summary.txt - description: | - Output file for short summary in TXT format. - - name: --full_table - required: false - direction: output - type: file - example: full_table.tsv - description: | - Full table output in TSV format. - - name: --missing_busco_list - required: false - direction: output - type: file - example: missing_busco_list.tsv - description: | - Missing list output in TSV format. - - name: --output_dir - required: false - direction: output - type: file - example: output_dir/ - description: | - The full output directory, if so desired.
+ - name: Outputs + arguments: + - name: --short_summary_json + required: false + direction: output + type: file + example: short_summary.json + description: | + Output file for short summary in JSON format. + - name: --short_summary_txt + required: false + direction: output + type: file + example: short_summary.txt + description: | + Output file for short summary in TXT format. + - name: --full_table + required: false + direction: output + type: file + example: full_table.tsv + description: | + Full table output in TSV format. + - name: --missing_busco_list + required: false + direction: output + type: file + example: missing_busco_list.tsv + description: | + Missing list output in TSV format. + - name: --output_dir + required: false + direction: output + type: file + example: output_dir/ + description: | + The full output directory, if so desired. - - name: Resource and Run Settings - arguments: - - name: --force - type: boolean_true - description: | - Force rewriting of existing files. Must be used when output files with the provided name already exist. - - name: --quiet - alternatives: ["-q"] - type: boolean_true - description: | - Disable the info logs, displays only errors. - - name: --restart - alternatives: ["-r"] - type: boolean_true - description: | - Continue a run that had already partially completed. Restarting skips calls to tools that have completed but performs all pre- and post-processing steps. - - name: --tar - type: boolean_true - description: | - Compress some subdirectories with many files to save space. + - name: Resource and Run Settings + arguments: + - name: --force + type: boolean_true + description: | + Force rewriting of existing files. Must be used when output files with the provided name already exist. + - name: --quiet + alternatives: ["-q"] + type: boolean_true + description: | + Disable the info logs, displays only errors. 
+ - name: --restart + alternatives: ["-r"] + type: boolean_true + description: | + Continue a run that had already partially completed. Restarting skips calls to tools that have completed but performs all pre- and post-processing steps. + - name: --tar + type: boolean_true + description: | + Compress some subdirectories with many files to save space. - - name: Lineage Dataset Settings - arguments: - - name: --auto_lineage - type: boolean_true - description: | - Run auto-lineage pipelilne to automatically determine BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. - - name: --auto_lineage_euk - type: boolean_true - description: | - Run auto-placement just on eukaryota tree to find optimal lineage path. - - name: --auto_lineage_prok - type: boolean_true - description: | - Run auto_lineage just on prokaryota trees to find optimum lineage path. - - name: --datasets_version - type: string - required: false - description: | - Specify the version of BUSCO datasets - example: odb10 + - name: Lineage Dataset Settings + arguments: + - name: --auto_lineage + type: boolean_true + description: | + Run auto-lineage pipeline to automatically determine BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. + - name: --auto_lineage_euk + type: boolean_true + description: | + Run auto-placement just on eukaryota tree to find optimal lineage path. + - name: --auto_lineage_prok + type: boolean_true + description: | + Run auto_lineage just on prokaryota trees to find optimum lineage path. + - name: --datasets_version + type: string + required: false + description: | + Specify the version of BUSCO datasets + example: odb10 - - name: Augustus Settings - arguments: - - name: --augustus - type: boolean_true - description: | - Use augustus gene predictor for eukaryote runs.
- - name: --augustus_parameters - type: string - required: false - description: | - Additional parameters to be passed to Augustus (see Augustus documentation: https://github.com/Gaius-Augustus/Augustus/blob/master/docs/RUNNING-AUGUSTUS.md). - Parameters should be contained within a single string, without whitespace and seperated by commas. - example: "--PARAM1=VALUE1,--PARAM2=VALUE2" - - name: --augustus_species - type: string - required: false - description: | - Specify the augustus species - - name: --long - type: boolean_true - description: | - Optimize Augustus self-training mode. This adds considerably to the run time, but can improve results for some non-model organisms. + - name: Augustus Settings + arguments: + - name: --augustus + type: boolean_true + description: | + Use augustus gene predictor for eukaryote runs. + - name: --augustus_parameters + type: string + required: false + description: | + Additional parameters to be passed to Augustus (see Augustus documentation: https://github.com/Gaius-Augustus/Augustus/blob/master/docs/RUNNING-AUGUSTUS.md). + Parameters should be contained within a single string, without whitespace and separated by commas. + example: "--PARAM1=VALUE1,--PARAM2=VALUE2" + - name: --augustus_species + type: string + required: false + description: | + Specify the augustus species + - name: --long + type: boolean_true + description: | + Optimize Augustus self-training mode. This adds considerably to the run time, but can improve results for some non-model organisms. - - name: BBTools Settings - arguments: - - name: --contig_break - type: integer - required: false - description: | - Number of contiguous Ns to signify a break between contigs in BBTools analysis. - - name: --limit - type: integer - required: false - description: | - Number of candidate regions (contig or transcript) from the BLAST output to consider per BUSCO. - This option is only effective in pipelines using BLAST, i.e.
the genome pipeline (see --augustus) or the prokaryota transcriptome pipeline. - - name: --scaffold_composition - type: boolean_true - description: | - Writes ACGTN content per scaffold to a file scaffold_composition.txt. + - name: BBTools Settings + arguments: + - name: --contig_break + type: integer + required: false + description: | + Number of contiguous Ns to signify a break between contigs in BBTools analysis. + - name: --limit + type: integer + required: false + description: | + Number of candidate regions (contig or transcript) from the BLAST output to consider per BUSCO. + This option is only effective in pipelines using BLAST, i.e. the genome pipeline (see --augustus) or the prokaryota transcriptome pipeline. + - name: --scaffold_composition + type: boolean_true + description: | + Writes ACGTN content per scaffold to a file scaffold_composition.txt. - - name: BLAST Settings - arguments: - - name: --e_value - type: double - required: false - description: | - E-value cutoff for BLAST searches. + - name: BLAST Settings + arguments: + - name: --e_value + type: double + required: false + description: | + E-value cutoff for BLAST searches. - - name: Protein Gene Prediction settings - arguments: - - name: --miniprot - type: boolean_true - description: | - Use Miniprot gene predictor. + - name: Protein Gene Prediction settings + arguments: + - name: --miniprot + type: boolean_true + description: | + Use Miniprot gene predictor. - - name: MetaEuk Settings - arguments: - - name: --metaeuk_parameters - type: string - description: | - Pass additional arguments to Metaeuk for the first run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). - All parameters should be contained within a single string with no white space, with each parameter separated by a comma. 
- example: "--max-overlap=15,--min-exon-aa=15" - - name: --metaeuk_rerun_parameters - type: string - description: | - Pass additional arguments to Metaeuk for the second run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). - All parameters should be contained within a single string with no white space, with each parameter separated by a comma. - example: "--max-overlap=15,--min-exon-aa=15" + - name: MetaEuk Settings + arguments: + - name: --metaeuk_parameters + type: string + description: | + Pass additional arguments to Metaeuk for the first run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). + All parameters should be contained within a single string with no white space, with each parameter separated by a comma. + example: "--max-overlap=15,--min-exon-aa=15" + - name: --metaeuk_rerun_parameters + type: string + description: | + Pass additional arguments to Metaeuk for the second run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). + All parameters should be contained within a single string with no white space, with each parameter separated by a comma. 
+ example: "--max-overlap=15,--min-exon-aa=15" - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 setup: - type: docker run: | busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/fastp/config.vsh.yaml b/src/fastp/config.vsh.yaml index 24db55d1..b7d9062a 100644 --- a/src/fastp/config.vsh.yaml +++ b/src/fastp/config.vsh.yaml @@ -1,576 +1,576 @@ -functionality: - name: fastp - description: | - An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...). +name: fastp +description: | + An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...). - Features: + Features: - - comprehensive quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents...) - - filter out bad reads (too low quality, too short, or too many N...) - - cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). - - trim all reads in front and tail - - cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. - - correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality - - trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. 
polyA tailing for mRNA-Seq data) - - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. - - report JSON format result for further interpreting. - - visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative). - - split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used, limiting the total split file number, or limitting the lines of each split file. - - support long reads (data from PacBio / Nanopore devices). - - support reading from STDIN and writing to STDOUT - - support interleaved input - - support ultra-fast FASTQ-level deduplication - info: - keywords: [RNA-Seq, Trimming, Quality control] - links: - repository: https://github.com/OpenGene/fastp - documentation: https://github.com/OpenGene/fastp/blob/master/README.md - references: - doi: 10.1093/bioinformatics/bty560 - license: MIT - argument_groups: - - name: Inputs - description: | - `fastp` supports both single-end (SE) and paired-end (PE) input. + - comprehensive quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents...) + - filter out bad reads (too low quality, too short, or too many N...) + - cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). + - trim all reads in front and tail + - cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. + - correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality + - trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. 
polyA tailing for mRNA-Seq data) + - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. + - report JSON format result for further interpreting. + - visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative). + - split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used, limiting the total split file number, or limitting the lines of each split file. + - support long reads (data from PacBio / Nanopore devices). + - support reading from STDIN and writing to STDOUT + - support interleaved input + - support ultra-fast FASTQ-level deduplication +keywords: [RNA-Seq, Trimming, Quality control] +links: + repository: https://github.com/OpenGene/fastp + documentation: https://github.com/OpenGene/fastp/blob/master/README.md +references: + doi: "10.1093/bioinformatics/bty560" +license: MIT +argument_groups: + - name: Inputs + description: | + `fastp` supports both single-end (SE) and paired-end (PE) input. - - for SE data, you only have to specify read1 input by `-i` or `--in1`. - - for PE data, you should also specify read2 input by `-I` or `--in2`. - arguments: - - name: --in1 - alternatives: [-i] - type: file - description: Input FastQ file. Must be single-end or paired-end R1. Can be gzipped. - required: true - example: in.R1.fq.gz - - name: --in2 - alternatives: [-I] - type: file - description: Input FastQ file. Must be paired-end R2. Can be gzipped. - required: false - example: in.R2.fq.gz - - name: Outputs - description: | + - for SE data, you only have to specify read1 input by `-i` or `--in1`. + - for PE data, you should also specify read2 input by `-I` or `--in2`. + arguments: + - name: --in1 + alternatives: [-i] + type: file + description: Input FastQ file. Must be single-end or paired-end R1. Can be gzipped. 
+ required: true + example: in.R1.fq.gz + - name: --in2 + alternatives: [-I] + type: file + description: Input FastQ file. Must be paired-end R2. Can be gzipped. + required: false + example: in.R2.fq.gz + - name: Outputs + description: | - - for SE data, you only have to specify read1 output by `-o` or `--out1`. - - for PE data, you should also specify read2 output by `-O` or `--out2`. - - if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. - - the output will be gzip-compressed if its file name ends with `.gz` - arguments: - - name: --out1 - alternatives: [-o] - type: file - description: The single-end or paired-end R1 reads that pass QC. Will be gzipped if its file name ends with `.gz`. - required: true - example: out.R1.fq.gz - direction: output - - name: --out2 - alternatives: [-O] - type: file - description: The paired-end R2 reads that pass QC. Will be gzipped if its file name ends with `.gz`. - required: false - example: out.R2.fq.gz - direction: output - - name: --unpaired1 - type: file - description: Store the reads that `read1` passes filters but its paired `read2` doesn't. - required: false - example: unpaired.R1.fq.gz - direction: output - - name: --unpaired2 - type: file - description: Store the reads that `read2` passes filters but its paired `read1` doesn't. - required: false - example: unpaired.R2.fq.gz - direction: output - - name: --failed_out - type: file - description: | - Store the reads that fail filters. + - for SE data, you only have to specify read1 output by `-o` or `--out1`. + - for PE data, you should also specify read2 output by `-O` or `--out2`. + - if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. 
+ - the output will be gzip-compressed if its file name ends with `.gz` + arguments: + - name: --out1 + alternatives: [-o] + type: file + description: The single-end or paired-end R1 reads that pass QC. Will be gzipped if its file name ends with `.gz`. + required: true + example: out.R1.fq.gz + direction: output + - name: --out2 + alternatives: [-O] + type: file + description: The paired-end R2 reads that pass QC. Will be gzipped if its file name ends with `.gz`. + required: false + example: out.R2.fq.gz + direction: output + - name: --unpaired1 + type: file + description: Store the reads that `read1` passes filters but its paired `read2` doesn't. + required: false + example: unpaired.R1.fq.gz + direction: output + - name: --unpaired2 + type: file + description: Store the reads that `read2` passes filters but its paired `read1` doesn't. + required: false + example: unpaired.R2.fq.gz + direction: output + - name: --failed_out + type: file + description: | + Store the reads that fail filters. - If one read failed and is written to --failed_out, its failure reason will be appended to its read name. For example, failed_quality_filter, failed_too_short etc. - For PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the failure reason will be paired_read_is_failing. - required: false - example: failed.fq.gz - direction: output - - name: --overlapped_out - type: file - description: | - For each read pair, output the overlapped region if it has no any mismatched base. 
- direction: output - - name: Report output arguments - arguments: - - name: --json - alternatives: [-j] - type: file - description: | - The json format report file name - example: out.json - direction: output - - name: --html - type: file - description: | - The html format report file name - example: out.html - direction: output - - name: --report_title - type: string - description: | - The title of the html report, default is "fastp report". - example: fastp report - - name: Adapter trimming - description: | - Adapter trimming is enabled by default, but you can disable it by `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically detected for both PE/SE data. + If one read failed and is written to --failed_out, its failure reason will be appended to its read name. For example, failed_quality_filter, failed_too_short etc. + For PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the failure reason will be paired_read_is_failing. + required: false + example: failed.fq.gz + direction: output + - name: --overlapped_out + type: file + description: | + For each read pair, output the overlapped region if it has no any mismatched base. + direction: output + - name: Report output arguments + arguments: + - name: --json + alternatives: [-j] + type: file + description: | + The json format report file name + example: out.json + direction: output + - name: --html + type: file + description: | + The html format report file name + example: out.html + direction: output + - name: --report_title + type: string + description: | + The title of the html report, default is "fastp report". + example: fastp report + - name: Adapter trimming + description: | + Adapter trimming is enabled by default, but you can disable it by `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically detected for both PE/SE data. 
- - For SE data, the adapters are evaluated by analyzing the tails of first ~1M reads. This evaluation may be inacurrate, and you can specify the adapter sequence by `-a` or `--adapter_sequence` option. If adapter sequence is specified, the auto detection for SE data will be disabled. - - For PE data, the adapters can be detected by per-read overlap analysis, which seeks for the overlap of each pair of reads. This method is robust and fast, so normally you don't have to input the adapter sequence even you know it. But you can still specify the adapter sequences for read1 by `--adapter_sequence`, and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap (i.e. due to low quality bases), it will use these sequences to trim adapters for read1 and read2 respectively. - - For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify `--detect_adapter_for_pe` to enable it. - - For PE data, `fastp` will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. - - The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`. - - `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue. + - For SE data, the adapters are evaluated by analyzing the tails of first ~1M reads. This evaluation may be inacurrate, and you can specify the adapter sequence by `-a` or `--adapter_sequence` option. 
If adapter sequence is specified, the auto detection for SE data will be disabled. + - For PE data, the adapters can be detected by per-read overlap analysis, which seeks for the overlap of each pair of reads. This method is robust and fast, so normally you don't have to input the adapter sequence even you know it. But you can still specify the adapter sequences for read1 by `--adapter_sequence`, and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap (i.e. due to low quality bases), it will use these sequences to trim adapters for read1 and read2 respectively. + - For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify `--detect_adapter_for_pe` to enable it. + - For PE data, `fastp` will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. + - The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`. + - `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue. - You can also specify --adapter_fasta to give a FASTA file to tell fastp to trim multiple adapters in this FASTA file. Here is a sample of such adapter FASTA file: + You can also specify --adapter_fasta to give a FASTA file to tell fastp to trim multiple adapters in this FASTA file. 
Here is a sample of such adapter FASTA file: - ``` - >Illumina TruSeq Adapter Read 1 - AGATCGGAAGAGCACACGTCTGAACTCCAGTCA - >Illumina TruSeq Adapter Read 2 - AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT - >polyA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - ``` + ``` + >Illumina TruSeq Adapter Read 1 + AGATCGGAAGAGCACACGTCTGAACTCCAGTCA + >Illumina TruSeq Adapter Read 2 + AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT + >polyA + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + ``` - The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). + The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). - `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. + `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. - The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. - arguments: - - name: --disable_adapter_trimming - alternatives: [-A] - type: boolean_true - description: | - Disable adapter trimming. - - name: --detect_adapter_for_pe - type: boolean_true - description: | - By default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data. - - name: --adapter_sequence - alternatives: [-a] - type: string - description: | - The adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. 
For PE data, this is used if R1/R2 are found not overlapped - - name: --adapter_sequence_r2 - type: string - description: | - The adapter sequences to be trimmed for R2. This is used for PE data if R1/R2 are found overlapped. - - name: --adapter_fasta - type: file - description: | - A FASTA file containing all the adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. - - name: Base trimming - arguments: - - name: --trim_front1 - alternatives: [-f] - type: integer - description: | - Trimming how many bases in front for read1, default is 0. - example: 0 - - name: --trim_tail1 - alternatives: [-t] - type: integer - description: | - Trimming how many bases in tail for read1, default is 0. - example: 0 - - name: --max_len1 - alternatives: [-b] - type: integer - min: 0 - description: | - If read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation. - - name: --trim_front2 - alternatives: [-F] - type: integer - description: | - Trimming how many bases in front for read2, default is 0. - example: 0 - - name: --trim_tail2 - alternatives: [-T] - type: integer - description: | - Trimming how many bases in tail for read2, default is 0. - example: 0 - - name: --max_len2 - alternatives: [-B] - type: integer - min: 0 - description: | - If read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. - - name: Merging mode - description: Allows merging paired-end reads into a single longer read if they are overlapping. - arguments: - - name: --merge - alternatives: [-m] - type: boolean_true - description: | - For paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. 
The merging mode is disabled by default. - - name: --merged_out - type: file - description: | - In the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output. - direction: output - example: merged.fq.gz - - name: --include_unmerged - type: boolean_true - description: | - In the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. - - name: Additional input arguments - description: Affects how the input is read. - arguments: - - name: --interleaved_in - type: boolean_true - description: | - Indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default. - - name: --fix_mgi_id - type: boolean_true - description: | - The MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it. - - name: --phred64 - alternatives: ["-6"] - type: boolean_true - description: | - Indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) - - name: Additional output arguments - description: Affects how the output is written. - arguments: - - name: --compression - alternatives: ["-z"] - type: integer - description: | - Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. - example: 4 - min: 1 - max: 9 - - name: --dont_overwrite - type: boolean_true - description: | - Don't overwrite existing files. Overwritting is allowed by default. - - name: Logging arguments - arguments: - - name: --verbose - alternatives: [-V] - type: boolean_true - description: Output verbose log information (i.e. when every 1M reads are processed). - - name: Processing arguments - arguments: - - name: --reads_to_process - type: long - description: | - Specify how many reads/pairs to be processed. Default 0 means process all reads. 
- example: 1000000 - min: 0 - - name: Deduplication arguments - arguments: - - name: --dedup - type: boolean_true - description: | - Enable deduplication to drop the duplicated reads/pairs - - name: --dup_calc_accuracy - type: integer - description: | - Accuracy level to calculate duplication (1~6). Higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. - example: 3 - min: 1 - max: 6 - - name: --dont_eval_duplication - type: boolean_true - description: | - Don't evaluate duplication rate to save time and use less memory. - - name: PolyG tail trimming arguments - arguments: - - name: --trim_poly_g - alternatives: [-g] - type: boolean_true - description: | - Force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data - - name: --poly_g_min_len - type: integer - description: | - The minimum length to detect polyG in the read tail. 10 by default. - example: 10 - min: 1 - - name: --disable_trim_poly_g - alternatives: [-G] - type: boolean_true - description: | - Disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data - - name: PolyX tail trimming arguments - arguments: - - name: --trim_poly_x - alternatives: [-x] - type: boolean_true - description: | - Enable polyX trimming in 3' ends. - - name: --poly_x_min_len - type: integer - description: | - The minimum length to detect polyX in the read tail. 10 by default. - example: 10 - min: 1 - - name: Cut arguments - arguments: - - name: --cut_front - alternatives: ["-5"] - type: integer - description: | - Move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. - - name: --cut_tail - alternatives: ["-3"] - type: integer - description: | - Move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. 
- - name: --cut_right - alternatives: ["-r"] - type: integer - description: | - Move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. - - name: --cut_window_size - alternatives: ["-W"] - type: integer - description: | - The window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4. - example: 4 - min: 1 - - name: --cut_mean_quality - alternatives: ["-M"] - type: integer - description: | - The mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) - example: 20 - min: 0 - - name: --cut_front_window_size - type: integer - description: | - The window size option of cut_front, default to cut_window_size if not specified. - example: 4 - min: 1 - - name: --cut_front_mean_quality - type: integer - description: | - The mean quality requirement option of cut_front, default to cut_mean_quality if not specified. - example: 20 - min: 0 - - name: --cut_tail_window_size - type: integer - description: | - The window size option of cut_tail, default to cut_window_size if not specified. - example: 4 - min: 1 - - name: --cut_tail_mean_quality - type: integer - description: | - The mean quality requirement option of cut_tail, default to cut_mean_quality if not specified. - example: 20 - min: 0 - - name: --cut_right_window_size - type: integer - description: | - The window size option of cut_right, default to cut_window_size if not specified. - example: 4 - min: 1 - - name: --cut_right_mean_quality - type: integer - description: | - The mean quality requirement option of cut_right, default to cut_mean_quality if not specified. - example: 20 - min: 0 - - name: Quality filtering arguments - arguments: - - name: --disable_quality_filtering - alternatives: [-Q] - type: boolean_true - description: | - Quality filtering is enabled by default. If this option is specified, quality filtering is disabled. 
- - name: --qualified_quality_phred - alternatives: [-q] - type: integer - description: | - The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. - example: 15 - min: 0 - - name: --unqualified_percent_limit - alternatives: [-u] - type: integer - description: | - How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%. - example: 40 - min: 0 - max: 100 - - name: --n_base_limit - alternatives: [-n] - type: integer - description: | - If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5. - example: 5 - min: 0 - - name: --average_qual - alternatives: [-e] - type: integer - description: | - If one read's average quality score =1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default. - # - name: --split_prefix_digits - # type: integer - # description: | - # The digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding. - # example: 4 - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. + arguments: + - name: --disable_adapter_trimming + alternatives: [-A] + type: boolean_true + description: | + Disable adapter trimming. + - name: --detect_adapter_for_pe + type: boolean_true + description: | + By default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data. + - name: --adapter_sequence + alternatives: [-a] + type: string + description: | + The adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. 
For PE data, this is used if R1/R2 are found not overlapped + - name: --adapter_sequence_r2 + type: string + description: | + The adapter sequences to be trimmed for R2. This is used for PE data if R1/R2 are found overlapped. + - name: --adapter_fasta + type: file + description: | + A FASTA file containing all the adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. + - name: Base trimming + arguments: + - name: --trim_front1 + alternatives: [-f] + type: integer + description: | + Trimming how many bases in front for read1, default is 0. + example: 0 + - name: --trim_tail1 + alternatives: [-t] + type: integer + description: | + Trimming how many bases in tail for read1, default is 0. + example: 0 + - name: --max_len1 + alternatives: [-b] + type: integer + min: 0 + description: | + If read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation. + - name: --trim_front2 + alternatives: [-F] + type: integer + description: | + Trimming how many bases in front for read2, default is 0. + example: 0 + - name: --trim_tail2 + alternatives: [-T] + type: integer + description: | + Trimming how many bases in tail for read2, default is 0. + example: 0 + - name: --max_len2 + alternatives: [-B] + type: integer + min: 0 + description: | + If read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. + - name: Merging mode + description: Allows merging paired-end reads into a single longer read if they are overlapping. + arguments: + - name: --merge + alternatives: [-m] + type: boolean_true + description: | + For paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. 
The merging mode is disabled by default. + - name: --merged_out + type: file + description: | + In the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output. + direction: output + example: merged.fq.gz + - name: --include_unmerged + type: boolean_true + description: | + In the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. + - name: Additional input arguments + description: Affects how the input is read. + arguments: + - name: --interleaved_in + type: boolean_true + description: | + Indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default. + - name: --fix_mgi_id + type: boolean_true + description: | + The MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it. + - name: --phred64 + alternatives: ["-6"] + type: boolean_true + description: | + Indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) + - name: Additional output arguments + description: Affects how the output is written. + arguments: + - name: --compression + alternatives: ["-z"] + type: integer + description: | + Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. + example: 4 + min: 1 + max: 9 + - name: --dont_overwrite + type: boolean_true + description: | + Don't overwrite existing files. Overwritting is allowed by default. + - name: Logging arguments + arguments: + - name: --verbose + alternatives: [-V] + type: boolean_true + description: Output verbose log information (i.e. when every 1M reads are processed). + - name: Processing arguments + arguments: + - name: --reads_to_process + type: long + description: | + Specify how many reads/pairs to be processed. Default 0 means process all reads. 
+ example: 1000000 + min: 0 + - name: Deduplication arguments + arguments: + - name: --dedup + type: boolean_true + description: | + Enable deduplication to drop the duplicated reads/pairs + - name: --dup_calc_accuracy + type: integer + description: | + Accuracy level to calculate duplication (1~6). Higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. + example: 3 + min: 1 + max: 6 + - name: --dont_eval_duplication + type: boolean_true + description: | + Don't evaluate duplication rate to save time and use less memory. + - name: PolyG tail trimming arguments + arguments: + - name: --trim_poly_g + alternatives: [-g] + type: boolean_true + description: | + Force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + - name: --poly_g_min_len + type: integer + description: | + The minimum length to detect polyG in the read tail. 10 by default. + example: 10 + min: 1 + - name: --disable_trim_poly_g + alternatives: [-G] + type: boolean_true + description: | + Disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + - name: PolyX tail trimming arguments + arguments: + - name: --trim_poly_x + alternatives: [-x] + type: boolean_true + description: | + Enable polyX trimming in 3' ends. + - name: --poly_x_min_len + type: integer + description: | + The minimum length to detect polyX in the read tail. 10 by default. + example: 10 + min: 1 + - name: Cut arguments + arguments: + - name: --cut_front + alternatives: ["-5"] + type: integer + description: | + Move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. + - name: --cut_tail + alternatives: ["-3"] + type: integer + description: | + Move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. 
+ - name: --cut_right + alternatives: ["-r"] + type: integer + description: | + Move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. + - name: --cut_window_size + alternatives: ["-W"] + type: integer + description: | + The window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4. + example: 4 + min: 1 + - name: --cut_mean_quality + alternatives: ["-M"] + type: integer + description: | + The mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) + example: 20 + min: 0 + - name: --cut_front_window_size + type: integer + description: | + The window size option of cut_front, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_front_mean_quality + type: integer + description: | + The mean quality requirement option of cut_front, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: --cut_tail_window_size + type: integer + description: | + The window size option of cut_tail, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_tail_mean_quality + type: integer + description: | + The mean quality requirement option of cut_tail, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: --cut_right_window_size + type: integer + description: | + The window size option of cut_right, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_right_mean_quality + type: integer + description: | + The mean quality requirement option of cut_right, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: Quality filtering arguments + arguments: + - name: --disable_quality_filtering + alternatives: [-Q] + type: boolean_true + description: | + Quality filtering is enabled by default. If this option is specified, quality filtering is disabled. 
+ - name: --qualified_quality_phred + alternatives: [-q] + type: integer + description: | + The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. + example: 15 + min: 0 + - name: --unqualified_percent_limit + alternatives: [-u] + type: integer + description: | + How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%. + example: 40 + min: 0 + max: 100 + - name: --n_base_limit + alternatives: [-n] + type: integer + description: | + If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5. + example: 5 + min: 0 + - name: --average_qual + alternatives: [-e] + type: integer + description: | + If one read's average quality score =1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default. + # - name: --split_prefix_digits + # type: integer + # description: | + # The digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding. + # example: 4 +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/fastp:0.23.4--hadf994f_2 setup: - type: docker run: | fastp --version 2>&1 | sed 's# #: "#;s#$#"#' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/featurecounts/config.vsh.yaml b/src/featurecounts/config.vsh.yaml index 01ae400a..8697b1fe 100644 --- a/src/featurecounts/config.vsh.yaml +++ b/src/featurecounts/config.vsh.yaml @@ -1,336 +1,336 @@ -functionality: - name: featurecounts - description: | - featureCounts is a read summarization program for counting reads generated from either RNA or genomic DNA sequencing experiments by implementing highly efficient chromosome hashing and feature blocking techniques. 
It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. - info: - keywords: ["Read counting", "Genomic features"] - links: - homepage: https://subread.sourceforge.net/ - documentation: https://subread.sourceforge.net/SubreadUsersGuide.pdf - repository: https://github.com/ShiLab-Bioinformatics/subread - references: - doi: 10.1093/bioinformatics/btt656 - license: GPL-3.0 - requirements: - commands: [ featureCounts ] +name: featurecounts +description: | + featureCounts is a read summarization program for counting reads generated from either RNA or genomic DNA sequencing experiments by implementing highly efficient chromosome hashing and feature blocking techniques. It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. +keywords: ["Read counting", "Genomic features"] +links: + homepage: https://subread.sourceforge.net/ + documentation: https://subread.sourceforge.net/SubreadUsersGuide.pdf + repository: https://github.com/ShiLab-Bioinformatics/subread +references: + doi: "10.1093/bioinformatics/btt656" +license: GPL-3.0 +requirements: + commands: [ featureCounts ] - argument_groups: - - name: Inputs - arguments: - - name: --annotation - alternatives: ["-a"] - type: file - description: | - Name of an annotation file. GTF/GFF format by default. See '--format' option for more format information. - required: true - example: annotation.gtf - - name: --input - alternatives: ["-i"] - type: file - multiple: true - description: | - A list of SAM or BAM format files separated by semi-colon (;). They can be either name or location sorted. Location-sorted paired-end reads are automatically sorted by read names. 
- required: true - example: input_file1.bam - - - name: Outputs - arguments: - - name: --counts - alternatives: ["-o"] - type: file - direction: output - description: | - Name of output file including read counts in tab delimited format. - required: true - example: features.tsv - - name: --summary - type: file - direction: output - description: | - Summary statistics of counting results in tab delimited format. - required: false - example: summary.tsv - - name: --junctions - type: file - direction: output - description: | - Count number of reads supporting each exon-exon junction. Junctions were identified from those exon-spanning reads in the input (containing 'N' in CIGAR string). - example: junctions.txt - required: false +argument_groups: + - name: Inputs + arguments: + - name: --annotation + alternatives: ["-a"] + type: file + description: | + Name of an annotation file. GTF/GFF format by default. See '--format' option for more format information. + required: true + example: annotation.gtf + - name: --input + alternatives: ["-i"] + type: file + multiple: true + description: | + A list of SAM or BAM format files separated by semi-colon (;). They can be either name or location sorted. Location-sorted paired-end reads are automatically sorted by read names. + required: true + example: input_file1.bam + + - name: Outputs + arguments: + - name: --counts + alternatives: ["-o"] + type: file + direction: output + description: | + Name of output file including read counts in tab delimited format. + required: true + example: features.tsv + - name: --summary + type: file + direction: output + description: | + Summary statistics of counting results in tab delimited format. + required: false + example: summary.tsv + - name: --junctions + type: file + direction: output + description: | + Count number of reads supporting each exon-exon junction. Junctions were identified from those exon-spanning reads in the input (containing 'N' in CIGAR string). 
+ example: junctions.txt + required: false - - name: Annotation - arguments: - - name: --format - alternatives: ["-F"] - type: string - description: | - Specify format of the provided annotation file. Acceptable formats include 'GTF' (or compatible GFF format) and 'SAF'. 'GTF' by default. - choices: [GTF, GFF, SAF] - example: "GTF" - required: false - - name: --feature_type - alternatives: ["-t"] - type: string - description: | - Specify feature type(s) in a GTF annotation. If multiple types are provided, they should be separated by ';' with no space in between. 'exon' by default. Rows in the annotation with a matched feature will be extracted and used for read mapping. - example: "exon" - required: false - multiple: true - - name: --attribute_type - alternatives: ["-g"] - type: string - description: | - Specify attribute type in GTF annotation. 'gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value. - example: "gene_id" - required: false - - name: --extra_attributes - type: string - description: | - Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. If more than one attribute type is provided they should be separated by semicolon (;). - required: false - multiple: true - - name: --chrom_alias - alternatives: ["-A"] - type: file - description: | - Provide a chromosome name alias file to match chr names in annotation with those in the reads. This should be a two-column comma-delimited text file. Its first column should include chr names in the annotation and its second column should include chr names in the reads. Chr names are case sensitive. No column header should be included in the file. 
- required: false - example: chrom_alias.csv - - - name: Level of summarization - arguments: - - name: --feature_level - alternatives: ["-f"] - type: boolean_true - description: | - Perform read counting at feature level (eg. counting reads for exons rather than genes). - - - name: Overlap between reads and features - arguments: - - name: --overlapping - alternatives: ["-O"] - type: boolean_true - description: | - Assign reads to all their overlapping meta-features (or features if '--feature_level' is specified). - - name: --min_overlap - type: integer - description: | - Minimum number of overlapping bases in a read that is required for read assignment. 1 by default. Number of overlapping bases is counted from both reads if paired end. If a negative value is provided, then a gap of up to specified size will be allowed between read and the feature that the read is assigned to. - required: false - example: 1 - - name: --frac_overlap - type: double - description: | - Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--min_overlap' option need to be satisfied for read assignment. - required: false - min: 0 - max: 1 - example: 0 - - name: --frac_overlap_feature - type: double - description: | - Minimum fraction of overlapping bases in a feature that is required for read assignment. Value should be within range [0,1]. 0 by default. - required: false - min: 0 - max: 1 - example: 0 - - name: --largest_overlap - type: boolean_true - description: | - Assign reads to a meta-feature/feature that has the largest number of overlapping bases. - - name: --non_overlap - type: integer - description: | - Maximum number of non-overlapping bases in a read (or a read pair) that is allowed when being assigned to a feature. No limit is set by default. 
- required: false - - name: --non_overlap_feature - type: integer - description: | - Maximum number of non-overlapping bases in a feature that is allowed in read assignment. No limit is set by default. - required: false - - name: --read_extension5 - type: integer - description: | - Reads are extended upstream by bases from their 5' end. - required: false - - name: --read_extension3 - type: integer - description: | - Reads are extended upstream by bases from their 3' end. - required: false - - name: --read2pos - type: integer - description: | - Reduce reads to their 5' most base or 3' most base. Read counting is then performed based on the single base the read is reduced to. - required: false - choices: [3, 5] - - - name: Multi-mapping reads - arguments: - - name: --multi_mapping - alternatives: ["-M"] - type: boolean_true - description: | - Multi-mapping reads will also be counted. For a multi-mapping read, all its reported alignments will be counted. The 'NH' tag in BAM/SAM input is used to detect multi-mapping reads. - - - name: Fractional counting - arguments: - - name: --fraction - type: boolean_true - description: | - Assign fractional counts to features. This option must be used together with '--multi_mapping' or '--overlapping' or both. When '--multi_mapping' is specified, each reported alignment from a multi-mapping read (identified via 'NH' tag) will carry a fractional count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When '--overlapping' is specified, each overlapping feature will receive a fractional count of 1/y, where y is the total number of features overlapping with the read. When both '--multi_mapping' and '--overlapping' are specified, each alignment will carry a fractional count of 1/(x*y). + - name: Annotation + arguments: + - name: --format + alternatives: ["-F"] + type: string + description: | + Specify format of the provided annotation file. 
Acceptable formats include 'GTF' (or compatible GFF format) and 'SAF'. 'GTF' by default. + choices: [GTF, GFF, SAF] + example: "GTF" + required: false + - name: --feature_type + alternatives: ["-t"] + type: string + description: | + Specify feature type(s) in a GTF annotation. If multiple types are provided, they should be separated by ';' with no space in between. 'exon' by default. Rows in the annotation with a matched feature will be extracted and used for read mapping. + example: "exon" + required: false + multiple: true + - name: --attribute_type + alternatives: ["-g"] + type: string + description: | + Specify attribute type in GTF annotation. 'gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value. + example: "gene_id" + required: false + - name: --extra_attributes + type: string + description: | + Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. If more than one attribute type is provided they should be separated by semicolon (;). + required: false + multiple: true + - name: --chrom_alias + alternatives: ["-A"] + type: file + description: | + Provide a chromosome name alias file to match chr names in annotation with those in the reads. This should be a two-column comma-delimited text file. Its first column should include chr names in the annotation and its second column should include chr names in the reads. Chr names are case sensitive. No column header should be included in the file. + required: false + example: chrom_alias.csv + + - name: Level of summarization + arguments: + - name: --feature_level + alternatives: ["-f"] + type: boolean_true + description: | + Perform read counting at feature level (eg. counting reads for exons rather than genes). 
+ + - name: Overlap between reads and features + arguments: + - name: --overlapping + alternatives: ["-O"] + type: boolean_true + description: | + Assign reads to all their overlapping meta-features (or features if '--feature_level' is specified). + - name: --min_overlap + type: integer + description: | + Minimum number of overlapping bases in a read that is required for read assignment. 1 by default. Number of overlapping bases is counted from both reads if paired end. If a negative value is provided, then a gap of up to specified size will be allowed between read and the feature that the read is assigned to. + required: false + example: 1 + - name: --frac_overlap + type: double + description: | + Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--min_overlap' option need to be satisfied for read assignment. + required: false + min: 0 + max: 1 + example: 0 + - name: --frac_overlap_feature + type: double + description: | + Minimum fraction of overlapping bases in a feature that is required for read assignment. Value should be within range [0,1]. 0 by default. + required: false + min: 0 + max: 1 + example: 0 + - name: --largest_overlap + type: boolean_true + description: | + Assign reads to a meta-feature/feature that has the largest number of overlapping bases. + - name: --non_overlap + type: integer + description: | + Maximum number of non-overlapping bases in a read (or a read pair) that is allowed when being assigned to a feature. No limit is set by default. + required: false + - name: --non_overlap_feature + type: integer + description: | + Maximum number of non-overlapping bases in a feature that is allowed in read assignment. No limit is set by default. 
+ required: false + - name: --read_extension5 + type: integer + description: | + Reads are extended upstream by the specified number of bases from their 5' end. + required: false + - name: --read_extension3 + type: integer + description: | + Reads are extended downstream by the specified number of bases from their 3' end. + required: false + - name: --read2pos + type: integer + description: | + Reduce reads to their 5' most base or 3' most base. Read counting is then performed based on the single base the read is reduced to. + required: false + choices: [3, 5] + + - name: Multi-mapping reads + arguments: + - name: --multi_mapping + alternatives: ["-M"] + type: boolean_true + description: | + Multi-mapping reads will also be counted. For a multi-mapping read, all its reported alignments will be counted. The 'NH' tag in BAM/SAM input is used to detect multi-mapping reads. + + - name: Fractional counting + arguments: + - name: --fraction + type: boolean_true + description: | + Assign fractional counts to features. This option must be used together with '--multi_mapping' or '--overlapping' or both. When '--multi_mapping' is specified, each reported alignment from a multi-mapping read (identified via 'NH' tag) will carry a fractional count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When '--overlapping' is specified, each overlapping feature will receive a fractional count of 1/y, where y is the total number of features overlapping with the read. When both '--multi_mapping' and '--overlapping' are specified, each alignment will carry a fractional count of 1/(x*y). - - name: Read filtering - arguments: - - name: --min_map_quality - alternatives: ["-Q"] - type: integer - description: | - The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default. 
- required: false - example: 0 - - name: --split_only - type: boolean_true - description: | - Count split alignments only (ie. alignments with CIGAR string containing 'N'). An example of split alignments is exon-spanning reads in RNA-seq data. - - name: --non_split_only - type: boolean_true - description: | - If specified, only non-split alignments (CIGAR strings do not contain letter 'N') will be counted. All the other alignments will be ignored. - - name: --primary - type: boolean_true - description: | - Count primary alignments only. Primary alignments are identified using bit 0x100 in SAM/BAM FLAG field. - - name: --ignore_dup - type: boolean_true - description: | - Ignore duplicate reads in read counting. Duplicate reads are identified using bit Ox400 in BAM/SAM FLAG field. The whole read pair is ignored if one of the reads is a duplicate read for paired end data. - - - name: Strandedness - arguments: - - name: --strand - alternatives: ["-s"] - type: integer - description: | - Perform strand-specific read counting. A single integer value (applied to all input files) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). - choices: [0, 1, 2] - example: 0 - required: false + - name: Read filtering + arguments: + - name: --min_map_quality + alternatives: ["-Q"] + type: integer + description: | + The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default. + required: false + example: 0 + - name: --split_only + type: boolean_true + description: | + Count split alignments only (ie. alignments with CIGAR string containing 'N'). An example of split alignments is exon-spanning reads in RNA-seq data. 
+ - name: --non_split_only + type: boolean_true + description: | + If specified, only non-split alignments (CIGAR strings do not contain letter 'N') will be counted. All the other alignments will be ignored. + - name: --primary + type: boolean_true + description: | + Count primary alignments only. Primary alignments are identified using bit 0x100 in SAM/BAM FLAG field. + - name: --ignore_dup + type: boolean_true + description: | + Ignore duplicate reads in read counting. Duplicate reads are identified using bit 0x400 in BAM/SAM FLAG field. The whole read pair is ignored if one of the reads is a duplicate read for paired end data. + + - name: Strandedness + arguments: + - name: --strand + alternatives: ["-s"] + type: integer + description: | + Perform strand-specific read counting. A single integer value (applied to all input files) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). + choices: [0, 1, 2] + example: 0 + required: false - - name: Exon-exon junctions - arguments: - - name: --ref_fasta - alternatives: ["-G"] - type: file - description: | - Provide the name of a FASTA-format file that contains the reference sequences used in read mapping that produced the provided SAM/BAM files. - required: false - example: reference.fasta - - - name: Parameters specific to paired end reads - arguments: - - name: --paired - alternatives: ["-p"] - type: boolean_true - description: | - Specify that input data contain paired-end reads. To perform fragment counting (ie. counting read pairs), the '--countReadPairs' parameter should also be specified in addition to this parameter. - - name: --count_read_pairs - type: boolean_true - description: | - Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads. 
- - name: --both_aligned - alternatives: ["-B"] - type: boolean_true - description: | - Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads. - - name: --check_pe_dist - alternatives: ["-P"] - type: boolean_true - description: | - Check validity of paired-end distance when counting read pairs. Use '--min_length' and '--max_length' to set thresholds. - - name: --min_length - alternatives: ["-d"] - type: integer - description: | - Minimum fragment/template length, 50 by default. - required: false - example: 50 - - name: --max_length - alternatives: ["-D"] - type: integer - description: | - Maximum fragment/template length, 600 by default. - required: false - example: 600 - - name: --same_strand - alternatives: ["-C"] - type: boolean_true - description: | - Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands. - - name: --donotsort - type: boolean_true - description: | - Do not sort reads in BAM/SAM input. Note that reads from the same pair are required to be located next to each other in the input. - - - name: Read groups - arguments: - - name: --by_read_group - type: boolean_true - description: | - Assign reads by read group. "RG" tag is required to be present in the input BAM/SAM files. - - - name: Long reads - arguments: - - name: --long_reads - type: boolean_true - description: | - Count long reads such as Nanopore and PacBio reads. Long read counting can only run in one thread and only reads (not read-pairs) can be counted. There is no limitation on the number of 'M' operations allowed in a CIGAR string in long read counting. - - - name: Assignment results for each read - arguments: - - name: --detailed_results - type: file - direction: output - description: | - Directory to save the detailed assignment results. Use `--detailed_results_format` to determine the format of the detailed results. 
- example: detailed_results/ - required: false - - name: --detailed_results_format - alternatives: ["-R"] - type: string - description: | - Output detailed assignment results for each read or read-pair. Results are saved to a file that is in one of the following formats: CORE, SAM and BAM. See documentaiton for more info about these formats. - required: false - choices: [CORE, SAM, BAM] - - - name: Miscellaneous - arguments: - - name: --max_M_op - type: integer - description: | - Maximum number of 'M' operations allowed in a CIGAR string. 10 by default. Both 'X' and '=' are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string. - required: false - example: 10 - - name: --verbose - type: boolean_true - description: | - Output verbose information for debugging, such as un-matched chromosome/contig names. + - name: Exon-exon junctions + arguments: + - name: --ref_fasta + alternatives: ["-G"] + type: file + description: | + Provide the name of a FASTA-format file that contains the reference sequences used in read mapping that produced the provided SAM/BAM files. + required: false + example: reference.fasta + + - name: Parameters specific to paired end reads + arguments: + - name: --paired + alternatives: ["-p"] + type: boolean_true + description: | + Specify that input data contain paired-end reads. To perform fragment counting (ie. counting read pairs), the '--count_read_pairs' parameter should also be specified in addition to this parameter. + - name: --count_read_pairs + type: boolean_true + description: | + Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads. + - name: --both_aligned + alternatives: ["-B"] + type: boolean_true + description: | + Only count read pairs that have both ends aligned. This option is only applicable for paired-end reads. + - name: --check_pe_dist + alternatives: ["-P"] + type: boolean_true + description: | + Check validity of paired-end distance when counting read pairs. 
Use '--min_length' and '--max_length' to set thresholds. + - name: --min_length + alternatives: ["-d"] + type: integer + description: | + Minimum fragment/template length, 50 by default. + required: false + example: 50 + - name: --max_length + alternatives: ["-D"] + type: integer + description: | + Maximum fragment/template length, 600 by default. + required: false + example: 600 + - name: --same_strand + alternatives: ["-C"] + type: boolean_true + description: | + Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands. + - name: --donotsort + type: boolean_true + description: | + Do not sort reads in BAM/SAM input. Note that reads from the same pair are required to be located next to each other in the input. + + - name: Read groups + arguments: + - name: --by_read_group + type: boolean_true + description: | + Assign reads by read group. "RG" tag is required to be present in the input BAM/SAM files. - resources: - - type: bash_script - path: script.sh + - name: Long reads + arguments: + - name: --long_reads + type: boolean_true + description: | + Count long reads such as Nanopore and PacBio reads. Long read counting can only run in one thread and only reads (not read-pairs) can be counted. There is no limitation on the number of 'M' operations allowed in a CIGAR string in long read counting. + + - name: Assignment results for each read + arguments: + - name: --detailed_results + type: file + direction: output + description: | + Directory to save the detailed assignment results. Use `--detailed_results_format` to determine the format of the detailed results. + example: detailed_results/ + required: false + - name: --detailed_results_format + alternatives: ["-R"] + type: string + description: | + Output detailed assignment results for each read or read-pair. Results are saved to a file that is in one of the following formats: CORE, SAM and BAM. 
See documentation for more info about these formats. + required: false + choices: [CORE, SAM, BAM] + + - name: Miscellaneous + arguments: + - name: --max_M_op + type: integer + description: | + Maximum number of 'M' operations allowed in a CIGAR string. 10 by default. Both 'X' and '=' are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string. + required: false + example: 10 + - name: --verbose + type: boolean_true + description: | + Output verbose information for debugging, such as un-matched chromosome/contig names. + +resources: + - type: bash_script + path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data -platforms: +engines: - type: docker image: quay.io/biocontainers/subread:2.0.6--he4a0461_0 setup: - type: docker run: | featureCounts -v 2>&1 | sed 's/featureCounts v\([0-9.]*\)/featureCounts: \1/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow \ No newline at end of file diff --git a/src/lofreq/call/config.vsh.yaml b/src/lofreq/call/config.vsh.yaml index 97b98e6f..c547de9d 100644 --- a/src/lofreq/call/config.vsh.yaml +++ b/src/lofreq/call/config.vsh.yaml @@ -1,245 +1,243 @@ -functionality: - name: lofreq_call - namespace: lofreq - description: | - Call variants from a BAM file. +name: lofreq_call +namespace: lofreq +description: | + Call variants from a BAM file. - LoFreq* (i.e. LoFreq version 2) is a fast and sensitive variant-caller for inferring SNVs and indels from next-generation sequencing data. It makes full use of base-call qualities and other sources of errors inherent in sequencing (e.g. mapping or base/indel alignment uncertainty), which are usually ignored by other methods or only used for filtering. + LoFreq* (i.e. LoFreq version 2) is a fast and sensitive variant-caller for inferring SNVs and indels from next-generation sequencing data. 
It makes full use of base-call qualities and other sources of errors inherent in sequencing (e.g. mapping or base/indel alignment uncertainty), which are usually ignored by other methods or only used for filtering. - LoFreq* can run on almost any type of aligned sequencing data (e.g. Illumina, IonTorrent or Pacbio) since no machine- or sequencing-technology dependent thresholds are used. It automatically adapts to changes in coverage and sequencing quality and can therefore be applied to a variety of data-sets e.g. viral/quasispecies, bacterial, metagenomics or somatic data. + LoFreq* can run on almost any type of aligned sequencing data (e.g. Illumina, IonTorrent or Pacbio) since no machine- or sequencing-technology dependent thresholds are used. It automatically adapts to changes in coverage and sequencing quality and can therefore be applied to a variety of data-sets e.g. viral/quasispecies, bacterial, metagenomics or somatic data. - LoFreq* is very sensitive; most notably, it is able to predict variants below the average base-call quality (i.e. sequencing error rate). Each variant call is assigned a p-value which allows for rigorous false positive control. Even though it uses no approximations or heuristics, it is very efficient due to several runtime optimizations and also provides a (pseudo-)parallel implementation. LoFreq* is generic and fast enough to be applied to high-coverage data and large genomes. On a single processor it takes a minute to analyze Dengue genome sequencing data with nearly 4000X coverage, roughly one hour to call SNVs on a 600X coverage E.coli genome and also roughly an hour to run on a 100X coverage human exome dataset. 
- info: - keywords: [ "variant calling", "low frequancy variant calling", "lofreq", "lofreq/call"] - links: - homepage: https://csb5.github.io/lofreq/ - documentation: https://csb5.github.io/lofreq/commands/ - reference: - doi: 10.1093/nar/gks918 - license: "MIT" - requirements: - commands: [ lofreq ] - argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - description: | - Input BAM file. - required: true - example: "normal.bam" - - name: --input_bai - type: file - description: | - Index file for the input BAM file. - required: true - example: "normal.bai" - - name: --ref - alternatives: -f - type: file - description: | - Indexed reference fasta file (gzip supported). Default: none. - required: true - example: "reference.fasta" - - name: Outputs - arguments: - - name: --out - alternatives: -o - type: file - description: | - Vcf output file. Default: stdout. - required: true - direction: output - example: "output.vcf" - - name: Arguments - arguments: - - name: --region - alternatives: -r - type: string - description: | - Limit calls to this region (chrom:start-end). Default: none. - required: false - example: "chr1:1000-2000" - - name: --bed - alternatives: -l - type: file - description: | - List of positions (chr pos) or regions (BED). Default: none. - required: false - example: "regions.bed" - - name: --min_bq - alternatives: -q - type: integer - description: | - Skip any base with baseQ smaller than INT. Default: 6. - required: false - example: 6 - - name: --min_alt_bq - alternatives: -Q - type: integer - description: | - Skip alternate bases with baseQ smaller than INT. Default: 6. - required: false - example: 6 - - name: --def_alt_bq - alternatives: -R - type: integer - description: | - Overwrite baseQs of alternate bases (that passed bq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. 
- required: false - example: 0 - - name: --min_jq - alternatives: -j - type: integer - description: | - Skip any base with joinedQ smaller than INT. Default: 0. - example: 0 - - name: --min_alt_jq - alternatives: -J - type: integer - description: | - Skip alternate bases with joinedQ smaller than INT. Default: 0. - required: false - example: 0 - - name: --def_alt_jq - alternatives: -K - type: integer - description: | - Overwrite joinedQs of alternate bases (that passed jq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. - required: false - example: 0 - - name: --no_baq - alternatives: -B - type: boolean_true - description: | - Disable use of base-alignment quality (BAQ). - - name: --no_idaq - alternatives: -A - type: boolean_true - description: | - Don't use IDAQ values (NOT recommended under ANY circumstances other than debugging). - - name: --del_baq - alternatives: -D - type: boolean_true - description: | - Delete pre-existing BAQ values, i.e. compute even if already present in BAM. - - name: --no_ext_baq - alternatives: -e - type: boolean_true - description: | - Use 'normal' BAQ (samtools default) instead of extended BAQ (both computed on the fly if not already present in lb tag). - - name: --min_mq - alternatives: -m - type: integer - description: | - Skip reads with mapping quality smaller than INT. Default: 0. - required: false - example: 0 - - name: --max_mq - alternatives: -M - type: integer - description: | - Cap mapping quality at INT. Default: 255. - required: false - example: 255 - - name: --no_mq - alternatives: -N - type: boolean_true - description: | - Don't merge mapping quality in LoFreq's model. - - name: --call_indels - type: boolean_true - description: | - Enable indel calls (note: preprocess your file to include indel alignment qualities!). - - name: --only_indels - type: boolean_true - description: | - Only call indels; no SNVs. 
- - name: --src_qual - alternatives: -s - type: boolean_true - description: | - Enable computation of source quality. - - name: --ign_vcf - alternatives: -S - type: file - description: | - Ignore variants in this vcf file for source quality computation. Multiple files can be given separated by commas. - required: false - example: "variants.vcf" - - name: --def_nm_q - alternatives: -T - type: integer - description: | - If >= 0, then replace non-match base qualities with this default value. Default: -1. - required: false - example: -1 - - name: --sig - alternatives: -a - type: double - description: | - P-Value cutoff / significance level. Default: 0.010000. - required: false - example: 0.01 - - name: --bonf - alternatives: -b - type: string - description: | - Bonferroni factor. 'dynamic' (increase per actually performed test) or INT. Default: Dynamic. - required: false - example: "dynamic" - - name: --min_cov - alternatives: -C - type: integer - description: | - Test only positions having at least this coverage. Default: 1. - (note: without --no-default-filter default filters (incl. coverage) kick in after predictions are done). - required: false - example: 1 - - name: --max_depth - alternatives: -d - type: integer - description: | - Cap coverage at this depth. Default: 1000000. - required: false - example: 1000000 - - name: --illumina_13 - type: boolean_true - description: | - Assume the quality is Illumina-1.3-1.7/ASCII+64 encoded. - - name: --use_orphan - type: boolean_true - description: | - Count anomalous read pairs (i.e. where mate is not aligned properly). - - name: --plp_summary_only - type: boolean_true - description: | - No variant calling. Just output pileup summary per column. - - name: --no_default_filter - type: boolean_true - description: | - Don't run default 'lofreq filter' automatically after calling variants. - - name: --force_overwrite - type: boolean_true - description: | - Overwrite any existing output. 
- - name: --verbose - type: boolean_true - description: | - Be verbose. - - name: --debug - type: boolean_true - description: | - Enable debugging. - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + LoFreq* is very sensitive; most notably, it is able to predict variants below the average base-call quality (i.e. sequencing error rate). Each variant call is assigned a p-value which allows for rigorous false positive control. Even though it uses no approximations or heuristics, it is very efficient due to several runtime optimizations and also provides a (pseudo-)parallel implementation. LoFreq* is generic and fast enough to be applied to high-coverage data and large genomes. On a single processor it takes a minute to analyze Dengue genome sequencing data with nearly 4000X coverage, roughly one hour to call SNVs on a 600X coverage E.coli genome and also roughly an hour to run on a 100X coverage human exome dataset. +keywords: [ "variant calling", "low frequency variant calling", "lofreq", "lofreq/call"] +links: + homepage: https://csb5.github.io/lofreq/ + documentation: https://csb5.github.io/lofreq/commands/ +references: + doi: 10.1093/nar/gks918 +license: "MIT" +requirements: + commands: [ lofreq ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: | + Input BAM file. + required: true + example: "normal.bam" + - name: --input_bai + type: file + description: | + Index file for the input BAM file. + required: true + example: "normal.bai" + - name: --ref + alternatives: -f + type: file + description: | + Indexed reference fasta file (gzip supported). Default: none. + required: true + example: "reference.fasta" + - name: Outputs + arguments: + - name: --out + alternatives: -o + type: file + description: | + Vcf output file. Default: stdout. 
+ required: true + direction: output + example: "output.vcf" + - name: Arguments + arguments: + - name: --region + alternatives: -r + type: string + description: | + Limit calls to this region (chrom:start-end). Default: none. + required: false + example: "chr1:1000-2000" + - name: --bed + alternatives: -l + type: file + description: | + List of positions (chr pos) or regions (BED). Default: none. + required: false + example: "regions.bed" + - name: --min_bq + alternatives: -q + type: integer + description: | + Skip any base with baseQ smaller than INT. Default: 6. + required: false + example: 6 + - name: --min_alt_bq + alternatives: -Q + type: integer + description: | + Skip alternate bases with baseQ smaller than INT. Default: 6. + required: false + example: 6 + - name: --def_alt_bq + alternatives: -R + type: integer + description: | + Overwrite baseQs of alternate bases (that passed bq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. + required: false + example: 0 + - name: --min_jq + alternatives: -j + type: integer + description: | + Skip any base with joinedQ smaller than INT. Default: 0. + example: 0 + - name: --min_alt_jq + alternatives: -J + type: integer + description: | + Skip alternate bases with joinedQ smaller than INT. Default: 0. + required: false + example: 0 + - name: --def_alt_jq + alternatives: -K + type: integer + description: | + Overwrite joinedQs of alternate bases (that passed jq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. + required: false + example: 0 + - name: --no_baq + alternatives: -B + type: boolean_true + description: | + Disable use of base-alignment quality (BAQ). + - name: --no_idaq + alternatives: -A + type: boolean_true + description: | + Don't use IDAQ values (NOT recommended under ANY circumstances other than debugging). + - name: --del_baq + alternatives: -D + type: boolean_true + description: | + Delete pre-existing BAQ values, i.e. compute even if already present in BAM. 
+ - name: --no_ext_baq + alternatives: -e + type: boolean_true + description: | + Use 'normal' BAQ (samtools default) instead of extended BAQ (both computed on the fly if not already present in lb tag). + - name: --min_mq + alternatives: -m + type: integer + description: | + Skip reads with mapping quality smaller than INT. Default: 0. + required: false + example: 0 + - name: --max_mq + alternatives: -M + type: integer + description: | + Cap mapping quality at INT. Default: 255. + required: false + example: 255 + - name: --no_mq + alternatives: -N + type: boolean_true + description: | + Don't merge mapping quality in LoFreq's model. + - name: --call_indels + type: boolean_true + description: | + Enable indel calls (note: preprocess your file to include indel alignment qualities!). + - name: --only_indels + type: boolean_true + description: | + Only call indels; no SNVs. + - name: --src_qual + alternatives: -s + type: boolean_true + description: | + Enable computation of source quality. + - name: --ign_vcf + alternatives: -S + type: file + description: | + Ignore variants in this vcf file for source quality computation. Multiple files can be given separated by commas. + required: false + example: "variants.vcf" + - name: --def_nm_q + alternatives: -T + type: integer + description: | + If >= 0, then replace non-match base qualities with this default value. Default: -1. + required: false + example: -1 + - name: --sig + alternatives: -a + type: double + description: | + P-Value cutoff / significance level. Default: 0.010000. + required: false + example: 0.01 + - name: --bonf + alternatives: -b + type: string + description: | + Bonferroni factor. 'dynamic' (increase per actually performed test) or INT. Default: Dynamic. + required: false + example: "dynamic" + - name: --min_cov + alternatives: -C + type: integer + description: | + Test only positions having at least this coverage. Default: 1. + (note: without --no-default-filter default filters (incl. 
coverage) kick in after predictions are done). + required: false + example: 1 + - name: --max_depth + alternatives: -d + type: integer + description: | + Cap coverage at this depth. Default: 1000000. + required: false + example: 1000000 + - name: --illumina_13 + type: boolean_true + description: | + Assume the quality is Illumina-1.3-1.7/ASCII+64 encoded. + - name: --use_orphan + type: boolean_true + description: | + Count anomalous read pairs (i.e. where mate is not aligned properly). + - name: --plp_summary_only + type: boolean_true + description: | + No variant calling. Just output pileup summary per column. + - name: --no_default_filter + type: boolean_true + description: | + Don't run default 'lofreq filter' automatically after calling variants. + - name: --force_overwrite + type: boolean_true + description: | + Overwrite any existing output. + - name: --verbose + type: boolean_true + description: | + Be verbose. + - name: --debug + type: boolean_true + description: | + Enable debugging. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/lofreq:2.1.5--py38h794fc9e_10 setup: @@ -247,5 +245,6 @@ platforms: run: | version=$(lofreq version | grep 'version' | sed 's/version: //') && \ echo "lofreq: $version" > /var/software_versions.txt +runners: + - type: executable - type: nextflow - diff --git a/src/lofreq/indelqual/config.vsh.yaml b/src/lofreq/indelqual/config.vsh.yaml index 821d5d72..0524458e 100644 --- a/src/lofreq/indelqual/config.vsh.yaml +++ b/src/lofreq/indelqual/config.vsh.yaml @@ -1,77 +1,75 @@ -functionality: - name: lofreq_indelqual - namespace: lofreq - description: | - Insert indel qualities into BAM file (required for indel predictions). +name: lofreq_indelqual +namespace: lofreq +description: | + Insert indel qualities into BAM file (required for indel predictions). 
- The preferred way of inserting indel qualities should be via GATK's BQSR (>=2) If that's not possible, use this subcommand. - The command has two modes: 'uniform' and 'dindel': - - 'uniform' will assign a given value uniformly, whereas - - 'dindel' will insert indel qualities based on Dindel (PMID 20980555). - Both will overwrite any existing values. - Do not realign your BAM file afterwards! - info: - keywords: [ "bam", "indel", "qualities", "indelqual", "lofreq", "lofreq/indelqual"] - links: - homepage: https://csb5.github.io/lofreq/ - documentation: https://csb5.github.io/lofreq/commands/ - reference: - doi: 10.1093/nar/gks918 - license: "MIT" - requirements: - commands: [ lofreq ] - argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - description: | - Input BAM file. - required: true - example: "normal.bam" - - name: --ref - alternatives: -f - type: file - description: | - Reference sequence used for mapping (Only required for --dindel). - required: false - example: "reference.fasta" - - name: Outputs - arguments: - - name: --out - alternatives: -o - type: file - description: | - Output BAM file. - required: true - direction: output - example: "output.bam" - - name: Arguments - arguments: - - name: --uniform - alternatives: -u - type: string - description: | - Add this indel quality uniformly to all bases. Use two comma separated values to specify insertion and deletion quality separately. (clashes with --dindel). - required: false - example: "50,50" - - name: --dindel - type: boolean_true - description: | - Add Dindel's indel qualities (Illumina specific) (clashes with -u; needs --ref). - - name: --verbose - type: boolean_true - description: | - Be verbose. - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + The preferred way of inserting indel qualities should be via GATK's BQSR (>=2) If that's not possible, use this subcommand. 
+ The command has two modes: 'uniform' and 'dindel': + - 'uniform' will assign a given value uniformly, whereas + - 'dindel' will insert indel qualities based on Dindel (PMID 20980555). + Both will overwrite any existing values. + Do not realign your BAM file afterwards! +keywords: [ "bam", "indel", "qualities", "indelqual", "lofreq", "lofreq/indelqual"] +links: + homepage: https://csb5.github.io/lofreq/ + documentation: https://csb5.github.io/lofreq/commands/ +references: + doi: 10.1093/nar/gks918 +license: "MIT" +requirements: + commands: [ lofreq ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: | + Input BAM file. + required: true + example: "normal.bam" + - name: --ref + alternatives: -f + type: file + description: | + Reference sequence used for mapping (Only required for --dindel). + required: false + example: "reference.fasta" + - name: Outputs + arguments: + - name: --out + alternatives: -o + type: file + description: | + Output BAM file. + required: true + direction: output + example: "output.bam" + - name: Arguments + arguments: + - name: --uniform + alternatives: -u + type: string + description: | + Add this indel quality uniformly to all bases. Use two comma separated values to specify insertion and deletion quality separately. (clashes with --dindel). + required: false + example: "50,50" + - name: --dindel + type: boolean_true + description: | + Add Dindel's indel qualities (Illumina specific) (clashes with -u; needs --ref). + - name: --verbose + type: boolean_true + description: | + Be verbose. 
+resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/lofreq:2.1.5--py38h794fc9e_10 setup: @@ -79,4 +77,6 @@ platforms: run: | version=$(lofreq version | grep 'version' | sed 's/version: //') && \ echo "lofreq: $version" > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/pear/config.vsh.yaml b/src/pear/config.vsh.yaml index 53921baa..d6dbe6c9 100644 --- a/src/pear/config.vsh.yaml +++ b/src/pear/config.vsh.yaml @@ -1,156 +1,154 @@ -functionality: - name: pear - description: | - PEAR is an ultrafast, memory-efficient and highly accurate pair-end read merger. It is fully parallelized and can run with as low as just a few kilobytes of memory. - - PEAR evaluates all possible paired-end read overlaps and without requiring the target fragment size as input. In addition, it implements a statistical test for minimizing false-positive results. Together with a highly optimized implementation, it can merge millions of paired end reads within a couple of minutes on a standard desktop computer. 
- info: - keywords: [ "pair-end", "read", "merge" ] - links: - homepage: https://cme.h-its.org/exelixis/web/software/pear - repository: https://github.com/tseemann/PEAR - documentation: https://cme.h-its.org/exelixis/web/software/pear/doc.html - references: - doi: 10.1093/bioinformatics/btt593 - license: "CC-BY-NC-SA-3.0" - requirements: - commands: [ pear , gzip ] - argument_groups: - - name: Inputs - arguments: - - name: --forward_fastq - alternatives: -f - type: file - description: Forward paired-end FASTQ file - required: true - example: "forward.fastq" - - name: --reverse_fastq - alternatives: -r - type: file - description: Reverse paired-end FASTQ file - required: true - example: "reverse.fastq" - - name: Outputs - arguments: - - name: --assembled - type: file - description: The output file containing assembled reads. Can be compressed with gzip. - required: true - direction: output - - name: --unassembled_forward - type: file - description: The output file containing forward reads that could not be assembled. Can be compressed with gzip. - required: true - direction: output - - name: --unassembled_reverse - type: file - description: The output file containing reverse reads that could not be assembled. Can be compressed with gzip. - required: true - direction: output - - name: --discarded - type: file - description: The output file containing reads that were discarded due to too low quality or too many uncalled bases. Can be compressed with gzip. - required: true - direction: output - - name: Arguments - arguments: - - name: --p_value - alternatives: -p - type: double - description: | - Specify a p-value for the statistical test. If the computed p-value of a possible assembly exceeds the specified p-value then paired-end read will not be assembled. Valid options are: 0.0001, 0.001, 0.01, 0.05 and 1.0. Setting 1.0 disables the test. 
- example: 0.01 - required: false - - name: --min_overlap - alternatives: -v - type: integer - description: | - Specify the minimum overlap size. The minimum overlap may be set to 1 when the statistical test is used. However, further restricting the minimum overlap size to a proper value may reduce false-positive assembles. - required: false - example: 10 - - name: --max_assembly_length - alternatives: -m - type: integer - description: | - Specify the maximum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary long. - required: false - example: 0 - - name: --min_assembly_length - alternatives: -n - type: integer - description: | - Specify the minimum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary short. - required: false - example: 0 - - name: --min_trim_length - alternatives: -t - type: integer - description: | - Specify the minimum length of reads after trimming the low quality part (see option -q) - required: false - example: 1 - - name: --quality_threshold - alternatives: -q - type: integer - description: | - Specify the quality threshold for trimming the low quality part of a read. If the quality scores of two consecutive bases are strictly less than the specified threshold, the rest of the read will be trimmed. - required: false - example: 0 - - name: --max_uncalled_base - alternatives: -u - type: double - description: | - Specify the maximal proportion of uncalled bases in a read. Setting this value to 0 will cause PEAR to discard all reads containing uncalled bases. The other extreme setting is 1 which causes PEAR to process all reads independent on the number of uncalled bases. - example: 1.0 - required: false - - name: --test_method - alternatives: -g - type: integer - description: | - Specify the type of statistical test. Two options are available. 
1: Given the minimum allowed overlap, test using the highest OES. Note that due to its discrete nature, this test usually yields a lower p-value for the assembled read than the cut- off (specified by -p). For example, setting the cut-off to 0.05 using this test, the assembled reads might have an actual p-value of 0.02. - 2. Use the acceptance probability (m.a.p). This test methods computes the same probability as test method 1. However, it assumes that the minimal overlap is the observed overlap with the highest OES, instead of the one specified by -v. Therefore, this is not a valid statistical test and the 'p-value' is in fact the maximal probability for accepting the assembly. Nevertheless, we observed in practice that for the case the actual overlap sizes are relatively small, test 2 can correctly assemble more reads with only slightly higher false-positive rate. - required: false - example: 1 - - name: --emperical_freqs - alternatives: -e - type: boolean_true - description: | - Disable empirical base frequencies. - - name: --score_method - alternatives: -s - type: integer - description: | - Specify the scoring method. 1. OES with +1 for match and -1 for mismatch. 2: Assembly score (AS). Use +1 for match and -1 for mismatch multiplied by base quality scores. 3: Ignore quality scores and use +1 for a match and -1 for a mismatch. - required: false - example: 2 - - name: --phred_base - alternatives: -b - type: integer - description: | - Base PHRED quality score. - required: false - example: 33 - - name: --cap - alternatives: -c - type: integer - description: | - Specify the upper bound for the resulting quality score. If set to zero, capping is disabled. 
- required: false - example: 40 - - name: --nbase - alternatives: -z - type: boolean_true - description: | - When merging a base-pair that consists of two non-equal bases out of which none is degenerate, set the merged base to N and use the highest quality score of the two bases - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: +name: pear +description: | + PEAR is an ultrafast, memory-efficient and highly accurate pair-end read merger. It is fully parallelized and can run with as low as just a few kilobytes of memory. + + PEAR evaluates all possible paired-end read overlaps and without requiring the target fragment size as input. In addition, it implements a statistical test for minimizing false-positive results. Together with a highly optimized implementation, it can merge millions of paired end reads within a couple of minutes on a standard desktop computer. +keywords: [ "pair-end", "read", "merge" ] +links: + homepage: https://cme.h-its.org/exelixis/web/software/pear + repository: https://github.com/tseemann/PEAR + documentation: https://cme.h-its.org/exelixis/web/software/pear/doc.html +references: + doi: 10.1093/bioinformatics/btt593 +license: "CC-BY-NC-SA-3.0" +requirements: + commands: [ pear , gzip ] +argument_groups: + - name: Inputs + arguments: + - name: --forward_fastq + alternatives: -f + type: file + description: Forward paired-end FASTQ file + required: true + example: "forward.fastq" + - name: --reverse_fastq + alternatives: -r + type: file + description: Reverse paired-end FASTQ file + required: true + example: "reverse.fastq" + - name: Outputs + arguments: + - name: --assembled + type: file + description: The output file containing assembled reads. Can be compressed with gzip. + required: true + direction: output + - name: --unassembled_forward + type: file + description: The output file containing forward reads that could not be assembled. 
Can be compressed with gzip. + required: true + direction: output + - name: --unassembled_reverse + type: file + description: The output file containing reverse reads that could not be assembled. Can be compressed with gzip. + required: true + direction: output + - name: --discarded + type: file + description: The output file containing reads that were discarded due to too low quality or too many uncalled bases. Can be compressed with gzip. + required: true + direction: output + - name: Arguments + arguments: + - name: --p_value + alternatives: -p + type: double + description: | + Specify a p-value for the statistical test. If the computed p-value of a possible assembly exceeds the specified p-value then paired-end read will not be assembled. Valid options are: 0.0001, 0.001, 0.01, 0.05 and 1.0. Setting 1.0 disables the test. + example: 0.01 + required: false + - name: --min_overlap + alternatives: -v + type: integer + description: | + Specify the minimum overlap size. The minimum overlap may be set to 1 when the statistical test is used. However, further restricting the minimum overlap size to a proper value may reduce false-positive assembles. + required: false + example: 10 + - name: --max_assembly_length + alternatives: -m + type: integer + description: | + Specify the maximum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary long. + required: false + example: 0 + - name: --min_assembly_length + alternatives: -n + type: integer + description: | + Specify the minimum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary short. 
+ required: false + example: 0 + - name: --min_trim_length + alternatives: -t + type: integer + description: | + Specify the minimum length of reads after trimming the low quality part (see option -q) + required: false + example: 1 + - name: --quality_threshold + alternatives: -q + type: integer + description: | + Specify the quality threshold for trimming the low quality part of a read. If the quality scores of two consecutive bases are strictly less than the specified threshold, the rest of the read will be trimmed. + required: false + example: 0 + - name: --max_uncalled_base + alternatives: -u + type: double + description: | + Specify the maximal proportion of uncalled bases in a read. Setting this value to 0 will cause PEAR to discard all reads containing uncalled bases. The other extreme setting is 1 which causes PEAR to process all reads independent of the number of uncalled bases. + example: 1.0 + required: false + - name: --test_method + alternatives: -g + type: integer + description: | + Specify the type of statistical test. Two options are available. 1: Given the minimum allowed overlap, test using the highest OES. Note that due to its discrete nature, this test usually yields a lower p-value for the assembled read than the cut-off (specified by -p). For example, setting the cut-off to 0.05 using this test, the assembled reads might have an actual p-value of 0.02. + 2. Use the acceptance probability (m.a.p). This test method computes the same probability as test method 1. However, it assumes that the minimal overlap is the observed overlap with the highest OES, instead of the one specified by -v. Therefore, this is not a valid statistical test and the 'p-value' is in fact the maximal probability for accepting the assembly. Nevertheless, we observed in practice that for the case the actual overlap sizes are relatively small, test 2 can correctly assemble more reads with only slightly higher false-positive rate. 
+ required: false + example: 1 + - name: --emperical_freqs + alternatives: -e + type: boolean_true + description: | + Disable empirical base frequencies. + - name: --score_method + alternatives: -s + type: integer + description: | + Specify the scoring method. 1. OES with +1 for match and -1 for mismatch. 2: Assembly score (AS). Use +1 for match and -1 for mismatch multiplied by base quality scores. 3: Ignore quality scores and use +1 for a match and -1 for a mismatch. + required: false + example: 2 + - name: --phred_base + alternatives: -b + type: integer + description: | + Base PHRED quality score. + required: false + example: 33 + - name: --cap + alternatives: -c + type: integer + description: | + Specify the upper bound for the resulting quality score. If set to zero, capping is disabled. + required: false + example: 40 + - name: --nbase + alternatives: -z + type: boolean_true + description: | + When merging a base-pair that consists of two non-equal bases out of which none is degenerate, set the merged base to N and use the highest quality score of the two bases +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/pear:0.9.6--h9d449c0_10 setup: @@ -158,4 +156,6 @@ platforms: run: | version=$(pear -h | grep 'PEAR v' | sed 's/PEAR v//' | sed 's/ .*//') && \ echo "pear: $version" > /var/software_versions.txt +runners: + - type: executable - type: nextflow \ No newline at end of file