From b3be000dda2624a083784a1ce18f0619bdf4e660 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 26 Feb 2024 12:12:57 +0100 Subject: [PATCH 1/3] update to viash 0.9.0-RC2 (#15) * fix argument issues * update to viash 0.9.0-RC1 * move links and reference into info for now * try to use ns-list without platform * update reference to references * update pear * update busco to viash 0.9.0 * update json schema * remove functionality: * remove info: * change platforms into engines and runners * add dependabot * fix matrix jq query * try with patched viash actions * try to get the ci to work * fix * another attempt * fix --- .github/dependabot.yml | 6 + .github/workflows/test.yaml | 53 +- .vscode/viash_config.yaml | 766 ++++++++--- _viash.yaml | 9 +- src/arriba/config.vsh.yaml | 726 +++++------ src/bgzip/config.vsh.yaml | 230 ++-- .../busco_download_datasets/config.vsh.yaml | 79 +- src/busco/busco_list_datasets/config.vsh.yaml | 63 +- src/busco/busco_run/config.vsh.yaml | 396 +++--- src/fastp/config.vsh.yaml | 1116 ++++++++--------- src/featurecounts/config.vsh.yaml | 644 +++++----- src/lofreq/call/config.vsh.yaml | 479 ++++--- src/lofreq/indelqual/config.vsh.yaml | 146 +-- src/pear/config.vsh.yaml | 306 ++--- 14 files changed, 2752 insertions(+), 2267 deletions(-) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..90963715 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a094248b..6e1fc4b3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -48,34 +48,45 @@ jobs: run: | LANG=C viash ns list > /dev/null - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v42 - with: - separator: ";" - diff_relative: 
true - - - id: ns_list - uses: viash-io/viash-actions/ns-list@v5 - with: - platform: docker - format: json - query: ^(?!workflows) + # see https://github.com/viash-io/viash/issues/654 + # and https://github.com/viash-io/viash-actions/pull/27 + # - name: Get changed files + # id: changed-files + # uses: tj-actions/changed-files@v42 + # with: + # separator: ";" + # diff_relative: true + # - id: ns_list + # uses: viash-io/viash-actions/ns-list@v5 + # with: + # platform: docker + # format: json + # query: ^(?!workflows) + # - id: ns_list_filtered + # uses: viash-io/viash-actions/project/detect-changed-components@v5 + # with: + # input_file: "${{ steps.ns_list.outputs.output_file }}" + # - id: set_matrix + # run: | + # echo "matrix=$(jq -c '[ .[] | + # { + # "name": (.functionality.namespace + "/" + .functionality.name), + # "config": .info.config, + # "dir": .info.config | capture("^(?<dir>.*\/)").dir + # } + # ]' ${{ contains(steps.get_head_commit_message.outputs.HEAD_COMMIT_MESSAGE, 'ci force') && steps.ns_list.outputs.output_file || steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT - - id: ns_list_filtered - uses: viash-io/viash-actions/project/detect-changed-components@v5 - with: - input_file: "${{ steps.ns_list.outputs.output_file }}" - id: set_matrix run: | + viash ns list --format json > ns_list.json echo "matrix=$(jq -c '[ .[] | { - "name": (.functionality.namespace + "/" + .functionality.name), - "config": .info.config, - "dir": .info.config | capture("^(?<dir>.*\/)").dir + "name": (.namespace + "/" + .name), + "config": .build_info.config, + "dir": .build_info.config | capture("^(?<dir>.*\/)").dir } - ]' ${{ contains(steps.get_head_commit_message.outputs.HEAD_COMMIT_MESSAGE, 'ci force') && steps.ns_list.outputs.output_file || steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT + ]' ns_list.json )" >> $GITHUB_OUTPUT # phase 2 viash_test: diff --git a/.vscode/viash_config.yaml b/.vscode/viash_config.yaml index b7a6aabd..0e38195f 100644 ---
a/.vscode/viash_config.yaml +++ b/.vscode/viash_config.yaml @@ -7,29 +7,173 @@ definitions: \ you choose. \n" type: "object" properties: + organization: + description: "The organization of the package." + type: "string" + license: + description: "The license of the package." + type: "string" + authors: + description: "A list of authors. An author must at least have a name, but\ + \ can also have a list of roles, an e-mail address, and a map of custom\ + \ properties.\n\nSuggested values for roles are:\n \n| Role | Abbrev. |\ + \ Description |\n|------|---------|-------------|\n| maintainer | mnt |\ + \ for the maintainer of the code. Ideally, exactly one maintainer is specified.\ + \ |\n| author | aut | for persons who have made substantial contributions\ + \ to the software. |\n| contributor | ctb| for persons who have made smaller\ + \ contributions (such as code patches).\n| datacontributor | dtc | for persons\ + \ or organisations that contributed data sets for the software\n| copyrightholder\ + \ | cph | for all copyright holders. This is a legal concept so should use\ + \ the legal name of an institution or corporate body.\n| funder | fnd |\ + \ for persons or organizations that furnished financial support for the\ + \ development of the software\n\nThe [full list of roles](https://www.loc.gov/marc/relators/relaterm.html)\ + \ is extremely comprehensive.\n" + type: "array" + items: + $ref: "#/definitions/Author" + status: + description: "Allows setting a component to active, deprecated or disabled." + $ref: "#/definitions/Status" + requirements: + description: "Computational requirements related to running the component.\ + \ \n`cpus` specifies the maximum number of (logical) cpus a component is\ + \ allowed to use., whereas\n`memory` specifies the maximum amount of memory\ + \ a component is allowed to allicate. Memory units must be\nin B, KB, MB,\ + \ GB, TB or PB." 
+ $ref: "#/definitions/ComputationalRequirements" + repositories: + description: "(Pre-)defines repositories that can be used as repository in\ + \ dependencies.\nAllows reusing repository definitions in case it is used\ + \ in multiple dependencies." + type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + dependencies: + description: "Allows listing Viash components required by this Viash component" + type: "array" + items: + $ref: "#/definitions/Dependency" + namespace: + description: "Namespace this component is a part of. See the Namespaces guide\ + \ for more information on namespaces." + type: "string" functionality: description: "The functionality describes the behaviour of the script in terms\ \ of arguments and resources.\nBy specifying a few restrictions (e.g. mandatory\ \ arguments) and adding some descriptions, Viash will automatically generate\ \ a stylish command-line interface for you.\n" $ref: "#/definitions/Functionality" + runners: + description: "A list of runners to execute target artifacts.\n\n - ExecutableRunner\n\ + \ - NextflowRunner\n" + type: "array" + items: + $ref: "#/definitions/Runner" + name: + description: "Name of the component and the filename of the executable when\ + \ built with `viash build`." + type: "string" + build_info: + $ref: "#/definitions/BuildInfo" + argument_groups: + description: "A grouping of the arguments, used to display the help message.\n\ + \n - `name: foo`, the name of the argument group. \n - `description: Description\ + \ of foo`, a description of the argument group. Multiline descriptions are\ + \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ + \n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" + description: + description: "A description of the component. This will be displayed with\ + \ `--help`." + type: "string" + usage: + description: "A description on how to use the component. 
This will be displayed\ + \ with `--help` under the 'Usage:' section." + type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + package_config: + description: "The package config content used during build." + $ref: "#/definitions/PackageConfig" platforms: description: "A list of platforms to generate target artifacts for.\n\n -\ \ Native\n - Docker\n - Nextflow\n" type: "array" items: $ref: "#/definitions/Platform" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + links: + description: "External links of the component." + $ref: "#/definitions/Links" + references: + description: "References to external resources related to the component." + $ref: "#/definitions/References" + engines: + description: "A list of engine environments to execute target artifacts in.\n\ + \n - NativeEngine\n - DockerEngine\n" + type: "array" + items: + $ref: "#/definitions/Engine" + resources: + description: "Resources are files that support the component. The first resource\ + \ should be a script that will be executed when the component is run. Additional\ + \ resources will be copied to the same directory.\n\nCommon properties:\n\ + \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ + \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. 
Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + keywords: + description: "The keywords of the components." + type: "array" + items: + type: "string" + test_resources: + description: "One or more scripts to be used to test the component behaviour\ + \ when `viash test` is invoked. Additional files of type `file` will be\ + \ made available only during testing. Each test script should expect no\ + \ command-line inputs, be platform-independent, and return an exit code\ + \ >0 when unexpected behaviour occurs during testing. See Unit Testing for\ + \ more info." + type: "array" + items: + $ref: "#/definitions/Resource" required: - - "functionality" - - "platforms" + - "name" additionalProperties: false - Project: - description: "A Viash project configuration file. It's name should be `_viash.yaml`." + PackageConfig: + description: "A Viash package configuration file. It's name should be `_viash.yaml`." type: "object" properties: + organization: + description: "The organization of the package." + type: "string" + name: + description: "The name of the package." + type: "string" source: description: "Which source directory to use for the `viash ns` commands." type: "string" + description: + description: "A description of the package." + type: "string" viash_version: description: "Which version of Viash to use." type: "string" @@ -41,12 +185,43 @@ definitions: items: description: "Which config mods to apply." type: "string" + info: + description: "Structured information. Can be any shape: a string, vector,\ + \ map or even nested map." + type: "object" + license: + description: "The license of the package." 
+ type: "string" + references: + description: "References to external resources related to the package." + $ref: "#/definitions/References" + authors: + description: "The authors of the package." + type: "array" + items: + $ref: "#/definitions/Author" + repositories: + description: "Common repository definitions for component dependencies." + type: "array" + items: + $ref: "#/definitions/RepositoryWithName" + keywords: + description: "The keywords of the package." + type: "array" + items: + type: "string" target: description: "Which target directory to use for `viash ns build`." type: "string" + version: + description: "The version of the package." + type: "string" + links: + description: "External links of the package." + $ref: "#/definitions/Links" required: [] additionalProperties: false - Info: + BuildInfo: description: "Meta information fields filled in by Viash during build." type: "object" properties: @@ -59,41 +234,26 @@ definitions: viash_version: description: "The Viash version that was used to build the component." type: "string" - config: - description: "Path to the config used during build." - type: "string" output: description: "Folder path to the build artifacts." type: "string" - platform: - description: "The platform id used during build." - type: "string" git_commit: description: "Git commit hash." type: "string" executable: description: "Output folder with main executable path." type: "string" - required: - - "config" - additionalProperties: false - EnvironmentVariables: - description: "Viash checks several environment variables during operation." - type: "object" - properties: - VIASH_VERSION: - description: "A specific Viash version can be set to run the commands with.\ - \ If so required, the specific Viash version will be downloaded.\nThis is\ - \ useful when replicating older results or building Viash components that\ - \ use outdated code.\n" + engine: + description: "The engine id used during build." 
+ type: "string" + runner: + description: "The runner id used during build." type: "string" - VIASH_HOME: - description: "If `VIASH_HOME` is not defined, the fallback `HOME`/.viash is\ - \ used.\n\nLocation where specific downloaded versions of Viash will be\ - \ cached and run from.\n" + config: + description: "Path to the config used during build." type: "string" required: - - "VIASH_HOME" + - "config" additionalProperties: false Functionality: description: "The functionality-part of the config file describes the behaviour\ @@ -102,18 +262,32 @@ definitions: \ generate a stylish command-line interface for you.\n" type: "object" properties: + organization: + description: "The organization of the package." + type: "string" name: description: "Name of the component and the filename of the executable when\ \ built with `viash build`." type: "string" + argument_groups: + description: "A grouping of the arguments, used to display the help message.\n\ + \n - `name: foo`, the name of the argument group. \n - `description: Description\ + \ of foo`, a description of the argument group. Multiline descriptions are\ + \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ + \n" + type: "array" + items: + $ref: "#/definitions/ArgumentGroup" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." type: "object" - version: - description: "Version of the component. This field will be used to version\ - \ the executable and the Docker container." + license: + description: "The license of the package." type: "string" + references: + description: "References to external resources related to the component." + $ref: "#/definitions/References" authors: description: "A list of authors. 
An author must at least have a name, but\ \ can also have a list of roles, an e-mail address, and a map of custom\ @@ -149,25 +323,6 @@ definitions: type: "array" items: $ref: "#/definitions/RepositoryWithName" - resources: - description: "Resources are files that support the component. The first resource\ - \ should be a script that will be executed when the functionality is run.\ - \ Additional resources will be copied to the same directory.\n\nCommon properties:\n\ - \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ - \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ - \ The first resource cannot be of type `file`. When the type is not specified,\ - \ the default type is simply `file`.\n * dest: filename, the resulting name\ - \ of the resource. From within a script, the file can be accessed at `meta[\"\ - resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ - \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ - \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ - \ exclusive with `text`.\n * text: ...multiline text..., the content of\ - \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ - \ * is_executable: `true` / `false`, whether the resulting resource file\ - \ should be made executable.\n" - type: "array" - items: - $ref: "#/definitions/Resource" test_resources: description: "One or more scripts to be used to test the component behaviour\ \ when `viash test` is invoked. Additional files of type `file` will be\ @@ -183,15 +338,6 @@ definitions: type: "array" items: $ref: "#/definitions/Dependency" - argument_groups: - description: "A grouping of the arguments, used to display the help message.\n\ - \n - `name: foo`, the name of the argument group. \n - `description: Description\ - \ of foo`, a description of the argument group. 
Multiline descriptions are\ - \ supported.\n - `arguments: [arg1, arg2, ...]`, list of the arguments.\n\ - \n" - type: "array" - items: - $ref: "#/definitions/ArgumentGroup" description: description: "A description of the component. This will be displayed with\ \ `--help`." @@ -200,6 +346,37 @@ definitions: description: "A description on how to use the component. This will be displayed\ \ with `--help` under the 'Usage:' section." type: "string" + version: + description: "Version of the component. This field will be used to version\ + \ the executable and the Docker container." + type: "string" + links: + description: "External links of the component." + $ref: "#/definitions/Links" + resources: + description: "Resources are files that support the component. The first resource\ + \ should be a script that will be executed when the functionality is run.\ + \ Additional resources will be copied to the same directory.\n\nCommon properties:\n\ + \n * type: `file` / `r_script` / `python_script` / `bash_script` / `javascript_script`\ + \ / `scala_script` / `csharp_script`, specifies the type of the resource.\ + \ The first resource cannot be of type `file`. When the type is not specified,\ + \ the default type is simply `file`.\n * dest: filename, the resulting name\ + \ of the resource. From within a script, the file can be accessed at `meta[\"\ + resources_dir\"] + \"/\" + dest`. If unspecified, `dest` will be set to\ + \ the basename of the `path` parameter.\n * path: `path/to/file`, the path\ + \ of the input file. Can be a relative or an absolute path, or a URI. Mutually\ + \ exclusive with `text`.\n * text: ...multiline text..., the content of\ + \ the resulting file specified as a string. Mutually exclusive with `path`.\n\ + \ * is_executable: `true` / `false`, whether the resulting resource file\ + \ should be made executable.\n" + type: "array" + items: + $ref: "#/definitions/Resource" + keywords: + description: "The keywords of the components." 
+ type: "array" + items: + type: "string" namespace: description: "Namespace this component is a part of. See the Namespaces guide\ \ for more information on namespaces." @@ -288,6 +465,295 @@ definitions: - "name" - "arguments" additionalProperties: false + Links: + description: "Links to external resources related to the component." + type: "object" + properties: + repository: + description: "Source repository url." + type: "string" + documentation: + description: "Documentation website url." + type: "string" + docker_registry: + description: "Docker registry url." + type: "string" + homepage: + description: "Homepage website url." + type: "string" + issue_tracker: + description: "Issue tracker url." + type: "string" + required: [] + additionalProperties: false + References: + description: "References to external resources related to the component." + type: "object" + properties: + bibtex: + oneOf: + - description: "One or multiple BibTeX reference(s) of the component." + type: "string" + - type: "array" + items: + description: "One or multiple BibTeX reference(s) of the component." + type: "string" + doi: + oneOf: + - description: "One or multiple DOI reference(s) of the component." + type: "string" + - type: "array" + items: + description: "One or multiple DOI reference(s) of the component." 
+ type: "string" + additionalProperties: false + Runner: + oneOf: + - $ref: "#/definitions/ExecutableRunner" + - $ref: "#/definitions/NextflowRunner" + ExecutableRunner: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner is\ + \ also used for the native engine.\n\nThis runner is also used for the docker\ + \ engine.\n" + type: "object" + properties: + docker_setup_strategy: + description: "The Docker setup strategy to use when building a docker engine\ + \ enrivonment.\n\n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild`\ + \ / `build` / `b` | Always build the image from the dockerfile. This is\ + \ the default setup strategy.\n| `alwayscachedbuild` / `cachedbuild` / `cb`\ + \ | Always build the image from the dockerfile, with caching enabled.\n\ + | `ifneedbebuild` | Build the image if it does not exist locally.\n| `ifneedbecachedbuild`\ + \ | Build the image with caching enabled if it does not exist locally, with\ + \ caching enabled.\n| `alwayspull` / `pull` / `p` | Try to pull the container\ + \ from [Docker Hub](https://hub.docker.com) or the specified docker registry.\n\ + | `alwayspullelsebuild` / `pullelsebuild` | Try to pull the image from\ + \ a registry and build it if it doesn't exist.\n| `alwayspullelsecachedbuild`\ + \ / `pullelsecachedbuild` | Try to pull the image from a registry and build\ + \ it with caching if it doesn't exist.\n| `ifneedbepull` | If the image\ + \ does not exist locally, pull the image.\n| `ifneedbepullelsebuild` | \ + \ If the image does not exist locally, pull the image. If the image does\ + \ exist, build it.\n| `ifneedbepullelsecachedbuild` | If the image does\ + \ not exist locally, pull the image. 
If the image does exist, build it with\ + \ caching enabled.\n| `push` | Push the container to [Docker Hub](https://hub.docker.com)\ + \ or the specified docker registry.\n| `pushifnotpresent` | Push the container\ + \ to [Docker Hub](https://hub.docker.com) or the specified docker registry\ + \ if the tag does not exist yet.\n| `donothing` / `meh` | Do not build or\ + \ pull anything.\n\n" + $ref: "#/definitions/DockerSetupStrategy" + workdir: + description: "The working directory when starting the engine. This doesn't\ + \ change the Dockerfile but gets added as a command-line argument at runtime." + type: "string" + docker_run_args: + oneOf: + - description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + - type: "array" + items: + description: "Provide runtime arguments to Docker. See the documentation\ + \ on [`docker run`](https://docs.docker.com/engine/reference/run/) for\ + \ more information." + type: "string" + id: + description: "Name of the runner. As with all runners, you can give an runner\ + \ a different name. By specifying `id: foo`, you can target this executor\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + port: + oneOf: + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "integer" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "string" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." + type: "array" + items: + type: "integer" + - description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." 
+ type: "array" + items: + type: "string" + type: + description: "Run code as an executable.\n\nThis runner is the default runner.\ + \ It will generate a bash script that can be run directly.\n\nThis runner\ + \ is also used for the native engine.\n\nThis runner is also used for the\ + \ docker engine.\n" + const: "executable" + required: + - "type" + additionalProperties: false + NextflowRunner: + description: "Run a Viash component on a Nextflow backend engine.\n" + type: "object" + properties: + auto: + description: "Automated processing flags which can be toggled on or off:\n\ + \n| Flag | Description | Default |\n|---|---------|----|\n| `simplifyInput`\ + \ | If `true`, an input tuple only containing only a single File (e.g. `[\"\ + foo\", file(\"in.h5ad\")]`) is automatically transformed to a map (i.e.\ + \ `[\"foo\", [ input: file(\"in.h5ad\") ] ]`). | `true` |\n| `simplifyOutput`\ + \ | If `true`, an output tuple containing a map with a File (e.g. `[\"foo\"\ + , [ output: file(\"out.h5ad\") ] ]`) is automatically transformed to a map\ + \ (i.e. `[\"foo\", file(\"out.h5ad\")]`). | `false` |\n| `transcript` |\ + \ If `true`, the module's transcripts from `work/` are automatically published\ + \ to `params.transcriptDir`. If not defined, `params.publishDir + \"/_transcripts\"\ + ` will be used. Will throw an error if neither are defined. | `false` |\n\ + | `publish` | If `true`, the module's outputs are automatically published\ + \ to `params.publishDir`. If equal to \"state\", also a `.state.yaml` file\ + \ will be published in the publish dir. Will throw an error if `params.publishDir`\ + \ is not defined. | `false` |\n\n" + $ref: "#/definitions/NextflowAuto" + directives: + description: "Directives are optional settings that affect the execution of\ + \ the process. These mostly match up with the Nextflow counterparts. \n" + $ref: "#/definitions/NextflowDirectives" + container: + description: "Specifies the Docker engine id to be used to run Nextflow." 
+ type: "string" + config: + description: "Allows tweaking how the Nextflow Config file is generated." + $ref: "#/definitions/NextflowConfig" + debug: + description: "Whether or not to print debug messages." + type: "boolean" + id: + description: "Name of the runner. As with all runners, you can give an runner\ + \ a different name. By specifying `id: foo`, you can target this runner\ + \ (only) by specifying `...` in any of the Viash commands." + type: "string" + type: + description: "Run a Viash component on a Nextflow backend engine.\n" + const: "nextflow" + required: + - "type" + additionalProperties: false + Engine: + oneOf: + - $ref: "#/definitions/DockerEngine" + - $ref: "#/definitions/NativeEngine" + NativeEngine: + description: "Running a Viash component on a native engine means that the script\ + \ will be executed in your current environment.\nAny dependencies are assumed\ + \ to have been installed by the user, so the native engine is meant for developers\ + \ (who know what they're doing) or for simple bash scripts (which have no extra\ + \ dependencies).\n" + type: "object" + properties: + id: + description: "Name of the engine. As with all engines, you can give an engine\ + \ a different name. By specifying `id: foo`, you can target this engine\ + \ (only) by specifying `...` in any of the Viash commands." 
+ type: "string" + type: + description: "Running a Viash component on a native engine means that the\ + \ script will be executed in your current environment.\nAny dependencies\ + \ are assumed to have been installed by the user, so the native engine is\ + \ meant for developers (who know what they're doing) or for simple bash\ + \ scripts (which have no extra dependencies).\n" + const: "native" + required: + - "type" + additionalProperties: false + DockerEngine: + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a docker\ + \ container from scratch using the setup flag, or pull it from a docker repository.\n" + type: "object" + properties: + organization: + description: "Name of a container's [organization](https://docs.docker.com/docker-hub/orgs/)." + type: "string" + registry: + description: "The URL to the a [custom Docker registry](https://docs.docker.com/registry/)" + type: "string" + image: + description: "The base container to start from. You can also add the tag here\ + \ if you wish." + type: "string" + tag: + description: "Specify a Docker image based on its tag." + type: "string" + target_image: + description: "If anything is specified in the setup section, running the `---setup`\ + \ will result in an image with the name of `<target_image>:<version>`. If\ + \ nothing is specified in the `setup` section, simply `image` will be used.\ + \ Advanced usage only." + type: "string" + target_tag: + description: "The tag the resulting image gets. Advanced usage only." + type: "string" + namespace_separator: + description: "The separator between the namespace and the name of the component,\ + \ used for determining the image name. Default: \"/\"." + type: "string" + id: + description: "Name of the engine. As with all engines, you can give a engine\ + \ a different name. By specifying `id: foo`, you can target this engine\ + \ (only) by specifying `...` in any of the Viash commands."
+ type: "string" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." + type: "string" + type: + description: "Run a Viash component on a Docker backend engine.\nBy specifying\ + \ which dependencies your component needs, users will be able to build a\ + \ docker container from scratch using the setup flag, or pull it from a\ + \ docker repository.\n" + const: "docker" + target_organization: + description: "The organization set in the resulting image. Advanced usage\ + \ only." + type: "string" + setup: + description: "A list of requirements for installing the following types of\ + \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ + \ - Python\n - R\n - Ruby\n - yum\n\nThe order in which these dependencies\ + \ are specified determines the order in which they will be installed.\n" + type: "array" + items: + $ref: "#/definitions/Requirements" + cmd: + oneOf: + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" + target_image_source: + description: "The source of the target image. This is used for defining labels\ + \ in the dockerfile." + type: "string" + test_setup: + description: "Additional requirements specific for running unit tests." + type: "array" + items: + $ref: "#/definitions/Requirements" + entrypoint: + oneOf: + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." + type: "string" + - description: "Override the entrypoint of the base container. Default set\ + \ `ENTRYPOINT []`." 
+ type: "array" + items: + type: "string" + required: + - "image" + - "type" + additionalProperties: false Platform: oneOf: - $ref: "#/definitions/NativePlatform" @@ -356,6 +822,16 @@ definitions: description: "Enables or disables automatic volume mapping. Enabled when set\ \ to `Automatic` or disabled when set to `Manual`. Default: `Automatic`." $ref: "#/definitions/DockerResolveVolume" + cmd: + oneOf: + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "string" + - description: "Set the default command being executed when running the Docker\ + \ container." + type: "array" + items: + type: "string" id: description: "As with all platforms, you can give a platform a different name.\ \ By specifying `id: foo`, you can target this platform (only) by specifying\ @@ -366,19 +842,15 @@ definitions: - description: "A list of enabled ports. This doesn't change the Dockerfile\ \ but gets added as a command-line argument at runtime." type: "string" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "integer" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "array" + - type: "array" items: + description: "A list of enabled ports. This doesn't change the Dockerfile\ + \ but gets added as a command-line argument at runtime." type: "string" - - description: "A list of enabled ports. This doesn't change the Dockerfile\ - \ but gets added as a command-line argument at runtime." - type: "array" - items: - type: "integer" + target_registry: + description: "The URL where the resulting image will be pushed to. Advanced\ + \ usage only." 
+ type: "string" setup: description: "A list of requirements for installing the following types of\ \ packages:\n\n - apt\n - apk\n - Docker setup instructions\n - JavaScript\n\ @@ -397,16 +869,6 @@ definitions: \ nothing is specified in the `setup` section, simply `image` will be used.\ \ Advanced usage only." type: "string" - cmd: - oneOf: - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "string" - - description: "Set the default command being executed when running the Docker\ - \ container." - type: "array" - items: - type: "string" target_image_source: description: "The source of the target image. This is used for defining labels\ \ in the dockerfile." @@ -426,10 +888,6 @@ definitions: type: "array" items: type: "string" - target_registry: - description: "The URL where the resulting image will be pushed to. Advanced\ - \ usage only." - type: "string" setup_strategy: description: "The Docker setup strategy to use when building a container.\n\ \n| Strategy | Description |\n|-----|----------|\n| `alwaysbuild` / `build`\ @@ -464,13 +922,6 @@ definitions: description: "The organization set in the resulting image. Advanced usage\ \ only." type: "string" - chown: - description: "In Linux, files created by a Docker container will be owned\ - \ by `root`. With `chown: true`, Viash will automatically change the ownership\ - \ of output files (arguments with `type: file` and `direction: output`)\ - \ to the user running the Viash command after execution of the component.\ - \ Default value: `true`." - type: "boolean" required: - "image" - "type" @@ -969,6 +1420,8 @@ definitions: \ -t`\n - `trim` is an argument, which can be passed with `executable_name\ \ trim` \n" type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." 
@@ -1040,6 +1493,8 @@ definitions: \ -s`\n - `silent` is an argument, which can be passed with `executable_name\ \ silent` \n" type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." @@ -1078,6 +1533,8 @@ definitions: \ -n`\n - `no-log` is an argument, which can be passed with `executable_name\ \ no-log` \n" type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." @@ -1155,6 +1612,8 @@ definitions: \ value is lower than the minimum, an error will be produced. Can be combined\ \ with [`max`](#max) to clamp values." $ref: "#/definitions/DoubleWithInf" + direction: + $ref: "#/definitions/Direction" multiple: description: "Treat the argument value as an array. Arrays can be passed using\ \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ @@ -1323,6 +1782,8 @@ definitions: \ value is lower than the minimum, an error will be produced. Can be combined\ \ with [`max`](#max) to clamp values." type: "integer" + direction: + $ref: "#/definitions/Direction" multiple: description: "Treat the argument value as an array. Arrays can be passed using\ \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ @@ -1409,6 +1870,8 @@ definitions: \ value is lower than the minimum, an error will be produced. Can be combined\ \ with [`max`](#max) to clamp values." type: "integer" + direction: + $ref: "#/definitions/Direction" multiple: description: "Treat the argument value as an array. Arrays can be passed using\ \ the delimiter `--foo=1:2:3` or by providing the same argument multiple\ @@ -1455,6 +1918,8 @@ definitions: type: "array" items: type: "string" + direction: + $ref: "#/definitions/Direction" info: description: "Structured information. Can be any shape: a string, vector,\ \ map or even nested map." 
@@ -1516,10 +1981,10 @@ definitions: - $ref: "#/definitions/RScript" - $ref: "#/definitions/ScalaScript" BashScript: - description: "An executable Bash script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable Bash script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." type: "object" properties: path: @@ -1534,10 +1999,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable Bash script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable Bash script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "bash_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1548,10 +2013,10 @@ definitions: - "type" additionalProperties: false CSharpScript: - description: "An executable C# script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." 
+ description: "An executable C# script.\nWhen defined in resources, only the first\ + \ entry will be executed when running the built component or when running `viash\ + \ run`.\nWhen defined in test_resources, all entries will be executed during\ + \ `viash test`." type: "object" properties: path: @@ -1566,10 +2031,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable C# script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable C# script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "csharp_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1606,10 +2071,10 @@ definitions: - "type" additionalProperties: false JavaScriptScript: - description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable JavaScript script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will be\ + \ executed during `viash test`." type: "object" properties: path: @@ -1624,10 +2089,10 @@ definitions: description: "Whether the resulting resource file should be made executable." 
type: "boolean" type: - description: "An executable JavaScript script.\nWhen defined in functionality.resources,\ + description: "An executable JavaScript script.\nWhen defined in resources,\ \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + \ or when running `viash run`.\nWhen defined in test_resources, all entries\ + \ will be executed during `viash test`." const: "javascript_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1697,10 +2162,10 @@ definitions: required: [] additionalProperties: false PythonScript: - description: "An executable Python script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable Python script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." type: "object" properties: path: @@ -1715,10 +2180,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable Python script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." 
+ description: "An executable Python script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "python_script" dest: description: "Resulting filename of the resource. From within a script, the\ @@ -1729,10 +2194,10 @@ definitions: - "type" additionalProperties: false RScript: - description: "An executable R script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable R script.\nWhen defined in resources, only the first\ + \ entry will be executed when running the built component or when running `viash\ + \ run`.\nWhen defined in test_resources, all entries will be executed during\ + \ `viash test`." type: "object" properties: path: @@ -1747,10 +2212,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable R script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable R script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "r_script" dest: description: "Resulting filename of the resource. 
From within a script, the\ @@ -1761,10 +2226,10 @@ definitions: - "type" additionalProperties: false ScalaScript: - description: "An executable Scala script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component or\ - \ when running `viash run`.\nWhen defined in functionality.test_resources, all\ - \ entries will be executed during `viash test`." + description: "An executable Scala script.\nWhen defined in resources, only the\ + \ first entry will be executed when running the built component or when running\ + \ `viash run`.\nWhen defined in test_resources, all entries will be executed\ + \ during `viash test`." type: "object" properties: path: @@ -1779,10 +2244,10 @@ definitions: description: "Whether the resulting resource file should be made executable." type: "boolean" type: - description: "An executable Scala script.\nWhen defined in functionality.resources,\ - \ only the first entry will be executed when running the built component\ - \ or when running `viash run`.\nWhen defined in functionality.test_resources,\ - \ all entries will be executed during `viash test`." + description: "An executable Scala script.\nWhen defined in resources, only\ + \ the first entry will be executed when running the built component or when\ + \ running `viash run`.\nWhen defined in test_resources, all entries will\ + \ be executed during `viash test`." const: "scala_script" dest: description: "Resulting filename of the resource. 
From within a script, the\ @@ -2196,7 +2661,7 @@ definitions: \ and follows a semi logarithmic scale (1, 2, 5 per decade).\n\nConceptually\ \ it is possible for a Viash Config to overwrite the full labels parameter,\ \ however likely it is more efficient to add additional labels\nin the Viash\ - \ Project with a config mod.\n" + \ Package with a config mod.\n" type: "object" additionalProperties: type: "string" @@ -2216,9 +2681,8 @@ definitions: additionalProperties: false Dependency: description: "Specifies a Viash component (script or executable) that should be\ - \ made available for the code defined in the functionality.\nThe dependency\ - \ components are collected and copied to the output folder during the Viash\ - \ build step.\n" + \ made available for the code defined in the component.\nThe dependency components\ + \ are collected and copied to the output folder during the Viash build step.\n" type: "object" properties: name: @@ -2227,19 +2691,21 @@ definitions: type: "string" repository: oneOf: - - description: "Specifies the location where the dependency component can\ - \ be found.\nThis must either be a full definition of the repository or\ - \ the name of a repository refenced as it is defined under functionality.repositories.\n\ + - description: "Specifies the repository location where the dependency component\ + \ can be found.\nThis must either be a full definition of the repository\ + \ or the name of a repository referenced as it is defined under repositories.\n\ Additionally, the full definition can be specified as a single string\ \ where all parameters such as repository type, url, branch or tag are\ - \ specified.\n" + \ specified.\nOmitting the value sets the dependency as a local dependency,\ + \ ie. 
the dependency is available in the same namespace as the component.\n" type: "string" - - description: "Specifies the location where the dependency component can\ - \ be found.\nThis must either be a full definition of the repository or\ - \ the name of a repository refenced as it is defined under functionality.repositories.\n\ + - description: "Specifies the repository location where the dependency component\ + \ can be found.\nThis must either be a full definition of the repository\ + \ or the name of a repository referenced as it is defined under repositories.\n\ Additionally, the full definition can be specified as a single string\ \ where all parameters such as repository type, url, branch or tag are\ - \ specified.\n" + \ specified.\nOmitting the value sets the dependency as a local dependency,\ + \ ie. the dependency is available in the same namespace as the component.\n" $ref: "#/definitions/Repository" alias: description: "An alternative name for the dependency component. This can include\ diff --git a/_viash.yaml b/_viash.yaml index 65344505..8e09d947 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,5 +1,6 @@ -viash_version: 0.8.5 +name: biobase +description: | + A collection of bioinformatics tools for working with sequence data. 
+license: MIT -config_mods: | - .functionality.arguments[.multiple == true].multiple_sep := ";" - .functionality.argument_groups[true].arguments[.multiple == true].multiple_sep := ";" \ No newline at end of file +viash_version: 0.9.0-RC2 \ No newline at end of file diff --git a/src/arriba/config.vsh.yaml b/src/arriba/config.vsh.yaml index ac847838..8d72d7eb 100644 --- a/src/arriba/config.vsh.yaml +++ b/src/arriba/config.vsh.yaml @@ -1,385 +1,385 @@ -functionality: - name: arriba - description: Detect gene fusions from RNA-Seq data - info: - keywords: [Gene fusion, RNA-Seq] - links: - homepage: https://arriba.readthedocs.io/en/latest/ - documentation: https://arriba.readthedocs.io/en/latest/ - repository: https://github.com/suhrig/arriba - references: - doi: 10.1101/gr.257246.119 - license: MIT - requirements: - cpus: 1 - commands: [ arriba ] - argument_groups: - - name: Inputs - arguments: - - name: --bam - alternatives: -x +name: arriba +description: Detect gene fusions from RNA-Seq data +keywords: [Gene fusion, RNA-Seq] +links: + homepage: https://arriba.readthedocs.io/en/latest/ + documentation: https://arriba.readthedocs.io/en/latest/ + repository: https://github.com/suhrig/arriba +references: + doi: 10.1101/gr.257246.119 +license: MIT +requirements: + cpus: 1 + commands: [ arriba ] +argument_groups: + - name: Inputs + arguments: + - name: --bam + alternatives: -x + type: file + description: | + File in SAM/BAM/CRAM format with main alignments as generated by STAR + (Aligned.out.sam). Arriba extracts candidate reads from this file. + required: true + example: Aligned.out.bam + - name: --genome + alternatives: -a + type: file + description: | + FastA file with genome sequence (assembly). The file may be gzip-compressed. An + index with the file extension .fai must exist only if CRAM files are processed. + required: true + example: assembly.fa + - name: --gene_annotation + alternatives: -g + type: file + description: | + GTF file with gene annotation. 
The file may be gzip-compressed. + required: true + example: annotation.gtf + - name: --known_fusions + alternatives: -k + type: file + description: | + File containing known/recurrent fusions. Some cancer entities are often + characterized by fusions between the same pair of genes. In order to boost + sensitivity, a list of known fusions can be supplied using this parameter. The list + must contain two columns with the names of the fused genes, separated by tabs. + required: false + example: known_fusions.tsv + - name: --blacklist + alternatives: -b + type: file + description: | + File containing blacklisted events (recurrent artifacts and transcripts + observed in healthy tissue). + required: false + example: blacklist.tsv + - name: --structural_variants + alternatives: -d + type: file + description: | + Tab-separated file with coordinates of structural variants found using + whole-genome sequencing data. These coordinates serve to increase sensitivity + towards weakly expressed fusions and to eliminate fusions with low evidence. + required: false + example: structural_variants_from_WGS.tsv + - name: --tags + alternatives: -t + type: file + description: | + Tab-separated file containing fusions to annotate with tags in the 'tags' column. + The first two columns specify the genes; the third column specifies the tag. The + file may be gzip-compressed. + required: false + example: tags.tsv + - name: --protein_domains + alternatives: -p + type: file + description: | + File in GFF3 format containing coordinates of the protein domains of genes. The + protein domains retained in a fusion are listed in the column + 'retained_protein_domains'. The file may be gzip-compressed. + required: false + example: protein_domains.gff3 + - name: Outputs + arguments: + - name: --fusions + alternatives: -o type: file + direction: output description: | - File in SAM/BAM/CRAM format with main alignments as generated by STAR - (Aligned.out.sam). 
Arriba extracts candidate reads from this file. + Output file with fusions that have passed all filters. required: true - example: Aligned.out.bam - - name: --genome - alternatives: -a + example: fusions.tsv + - name: --fusions_discarded + alternatives: -O type: file + direction: output description: | - FastA file with genome sequence (assembly). The file may be gzip-compressed. An - index with the file extension .fai must exist only if CRAM files are processed. - required: true - example: assembly.fa - - name: --gene_annotation - alternatives: -g - type: file + Output file with fusions that were discarded due to filtering. + required: false + example: fusions.discarded.tsv + - name: Arguments + arguments: + - name: --max_genomic_breakpoint_distance + alternatives: -D + type: long description: | - GTF file with gene annotation. The file may be gzip-compressed. - required: true - example: annotation.gtf - - name: --known_fusions - alternatives: -k - type: file + When a file with genomic breakpoints obtained via + whole-genome sequencing is supplied via the --structural_variants + parameter, this parameter determines how far a + genomic breakpoint may be away from a + transcriptomic breakpoint to consider it as a + related event. For events inside genes, the + distance is added to the end of the gene; for + intergenic events, the distance threshold is + applied as is. Default: 100000. + required: false + - name: --strandedness + alternatives: -s + type: string description: | - File containing known/recurrent fusions. Some cancer entities are often - characterized by fusions between the same pair of genes. In order to boost - sensitivity, a list of known fusions can be supplied using this parameter. The list - must contain two columns with the names of the fused genes, separated by tabs. + Whether a strand-specific protocol was used for library preparation, + and if so, the type of strandedness (auto/yes/no/reverse). 
When + unstranded data is processed, the strand can sometimes be inferred from + splice-patterns. But in unclear situations, stranded data helps + resolve ambiguities. Default: auto + choices: ["auto", "yes", "no", "reverse"] required: false - example: known_fusions.tsv - - name: --blacklist - alternatives: -b - type: file + - name: --interesting_contigs + alternatives: -i + type: string description: | - File containing blacklisted events (recurrent artifacts and transcripts - observed in healthy tissue). + List of interesting contigs. Fusions between genes + on other contigs are ignored. Contigs can be specified with or without the + prefix "chr". Asterisks (*) are treated as wild-cards. + Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* required: false - example: blacklist.tsv - - name: --structural_variants - alternatives: -d - type: file + multiple: true + example: ["1", "2", "AC_*", "NC_*"] + - name: --viral_contigs + alternatives: -v + type: string description: | - Tab-separated file with coordinates of structural variants found using - whole-genome sequencing data. These coordinates serve to increase sensitivity - towards weakly expressed fusions and to eliminate fusions with low evidence. + List of viral contigs. Asterisks (*) are treated as + wild-cards. + Default: AC_* NC_* required: false - example: structural_variants_from_WGS.tsv - - name: --tags - alternatives: -t - type: file + multiple: true + example: ["AC_*", "NC_*"] + - name: --disable_filters + alternatives: -f + type: string description: | - Tab-separated file containing fusions to annotate with tags in the 'tags' column. - The first two columns specify the genes; the third column specifies the tag. The - file may be gzip-compressed. + List of filters to disable. By default all filters are + enabled. 
+ choices: [ homologs, low_entropy, isoforms, + top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, + non_coding_neighbors, mismatches, duplicates, no_genomic_support, + genomic_support, intronic, end_to_end, relative_support, + low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, + same_gene, long_gap, internal_tandem_duplication, small_insert_size, + read_through, inconsistently_clipped, intragenic_exonic, + marginal_read_through, spliced, hairpin, blacklist, min_support, + select_best, in_vitro, short_anchor, known_fusions, no_coverage, + homopolymer, many_spliced ] required: false - example: tags.tsv - - name: --protein_domains - alternatives: -p - type: file + multiple: true + - name: --max_e_value + alternatives: -E + type: double description: | - File in GFF3 format containing coordinates of the protein domains of genes. The - protein domains retained in a fusion are listed in the column - 'retained_protein_domains'. The file may be gzip-compressed. + Arriba estimates the number of fusions with a given number of supporting + reads which one would expect to see by random chance. If the expected number + of fusions (e-value) is higher than this threshold, the fusion is + discarded by the 'relative_support' filter. Note: Increasing this + threshold can dramatically increase the number of false positives and may + increase the runtime of resource-intensive steps. Fractional values are + possible. Default: 0.300000 required: false - example: protein_domains.gff3 - - name: Outputs - arguments: - - name: --fusions - alternatives: -o - type: file - direction: output - description: | - Output file with fusions that have passed all filters. - required: true - example: fusions.tsv - - name: --fusions_discarded - alternatives: -O - type: file - direction: output - description: | - Output file with fusions that were discarded due to filtering. 
- required: false - example: fusions.discarded.tsv - - name: Arguments - arguments: - - name: --max_genomic_breakpoint_distance - alternatives: -D - type: long - description: | - When a file with genomic breakpoints obtained via - whole-genome sequencing is supplied via the --structural_variants - parameter, this parameter determines how far a - genomic breakpoint may be away from a - transcriptomic breakpoint to consider it as a - related event. For events inside genes, the - distance is added to the end of the gene; for - intergenic events, the distance threshold is - applied as is. Default: 100000. - required: false - - name: --strandedness - alternatives: -s - type: string - description: | - Whether a strand-specific protocol was used for library preparation, - and if so, the type of strandedness (auto/yes/no/reverse). When - unstranded data is processed, the strand can sometimes be inferred from - splice-patterns. But in unclear situations, stranded data helps - resolve ambiguities. Default: auto - choices: ["auto", "yes", "no", "reverse"] - required: false - - name: --interesting_contigs - alternatives: -i - type: string - description: | - List of interesting contigs. Fusions between genes - on other contigs are ignored. Contigs can be specified with or without the - prefix "chr". Asterisks (*) are treated as wild-cards. - Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* - required: false - multiple: true - example: ["1", "2", "AC_*", "NC_*"] - - name: --viral_contigs - alternatives: -v - type: string - description: | - List of viral contigs. Asterisks (*) are treated as - wild-cards. - Default: AC_* NC_* - required: false - multiple: true - example: ["AC_*", "NC_*"] - - name: --disable_filters - alternatives: -f - type: string - description: | - List of filters to disable. By default all filters are - enabled. 
- choices: [ homologs, low_entropy, isoforms, - top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, - non_coding_neighbors, mismatches, duplicates, no_genomic_support, - genomic_support, intronic, end_to_end, relative_support, - low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, - same_gene, long_gap, internal_tandem_duplication, small_insert_size, - read_through, inconsistently_clipped, intragenic_exonic, - marginal_read_through, spliced, hairpin, blacklist, min_support, - select_best, in_vitro, short_anchor, known_fusions, no_coverage, - homopolymer, many_spliced ] - required: false - multiple: true - - name: --max_e_value - alternatives: -E - type: double - description: | - Arriba estimates the number of fusions with a given number of supporting - reads which one would expect to see by random chance. If the expected number - of fusions (e-value) is higher than this threshold, the fusion is - discarded by the 'relative_support' filter. Note: Increasing this - threshold can dramatically increase the number of false positives and may - increase the runtime of resource-intensive steps. Fractional values are - possible. Default: 0.300000 - required: false - - name: --min_supporting_reads - alternatives: -S - type: integer - description: | - The 'min_support' filter discards all fusions with fewer than - this many supporting reads (split reads and discordant mates - combined). Default: 2 - required: false - example: 2 - - name: --max_mismappers - alternatives: -m - type: double - description: | - When more than this fraction of supporting reads turns out to be - mismappers, the 'mismappers' filter discards the fusion. Default: - 0.800000 - required: false - example: 0.8 - - name: --max_homolog_identity - alternatives: -L - type: double - description: | - Genes with more than the given fraction of sequence identity are - considered homologs and removed by the 'homologs' filter. 
- Default: 0.300000 - required: false - example: 0.3 - - name: --homopolymer_length - alternatives: -H - type: integer - description: | - The 'homopolymer' filter removes breakpoints adjacent to - homopolymers of the given length or more. Default: 6 - required: false - example: 6 - - name: --read_through_distance - alternatives: -R - type: integer - description: | - The 'read_through' filter removes read-through fusions - where the breakpoints are less than the given distance away - from each other. Default: 10000 - required: false - example: 10000 - - name : --min_anchor_length - alternatives: -A - type: integer - description: | - Alignment artifacts are often characterized by split reads coming - from only one gene and no discordant mates. Moreover, the split - reads only align to a short stretch in one of the genes. The - 'short_anchor' filter removes these fusions. This parameter sets - the threshold in bp for what the filter considers short. Default: 23 - required: false - example: 23 - - name: --many_spliced_events - alternatives: -M - type: integer - description: | - The 'many_spliced' filter recovers fusions between genes that - have at least this many spliced breakpoints. Default: 4 - required: false - example: 4 - - name: --max_kmer_content - alternatives: -K - type: double - description: | - The 'low_entropy' filter removes reads with repetitive 3-mers. If - the 3-mers make up more than the given fraction of the sequence, then - the read is discarded. Default: 0.600000 - required: false - example: 0.6 - - name: --max_mismatch_pvalue - alternatives: -V - type: double - description: | - The 'mismatches' filter uses a binomial model to calculate a - p-value for observing a given number of mismatches in a read. If - the number of mismatches is too high, the read is discarded. 
- Default: 0.010000 - required: false - example: 0.05 - - name: --fragment_length - alternatives: -F - type: integer - description: | - When paired-end data is given, the fragment length is estimated - automatically and this parameter has no effect. But when single-end - data is given, the mean fragment length should be specified to - effectively filter fusions that arise from hairpin structures. - Default: 200 - required: false - example: 200 - - name: --max_reads - alternatives: -U - type: integer - description: | - Subsample fusions with more than the given number of supporting reads. This - improves performance without compromising sensitivity, as long as the - threshold is high. Counting of supporting reads beyond the threshold is - inaccurate, obviously. Default: 300 - required: false - example: 300 - - name: --quantile - alternatives: -Q - type: double - description: | - Highly expressed genes are prone to produce artifacts during library - preparation. Genes with an expression above the given quantile are eligible - for filtering by the 'in_vitro' filter. Default: 0.998000 - required: false - example: 0.998 - - name: --exonic_fraction - alternatives: -e - type: double - description: | - The breakpoints of false-positive predictions of intragenic events - are often both in exons. True predictions are more likely to have at - least one breakpoint in an intron, because introns are larger. If the - fraction of exonic sequence between two breakpoints is smaller than - the given fraction, the 'intragenic_exonic' filter discards the - event. Default: 0.330000 - required: false - example: 0.33 - - name: --top_n - alternatives: -T - type: integer - description: | - Only report viral integration sites of the top N most highly expressed viral - contigs. 
Default: 5 - required: false - example: 5 - - name: --covered_fraction - alternatives: -C - type: double - description: | - Ignore virally associated events if the virus is not fully - expressed, i.e., less than the given fraction of the viral contig is - transcribed. Default: 0.050000 - required: false - example: 0.05 - - name: --max_itd_length - alternatives: -l - type: integer - description: | - Maximum length of internal tandem duplications. Note: Increasing - this value beyond the default can impair performance and lead to many - false positives. Default: 100 - required: false - example: 100 - - name: --min_itd_allele_fraction - alternatives: -z - type: double - description: | - Required fraction of supporting reads to report an internal - tandem duplication. Default: 0.070000 - required: false - example: 0.07 - - name: --min_itd_supporting_reads - alternatives: -Z - type: integer - description: | - Required absolute number of supporting reads to report an - internal tandem duplication. Default: 10 - required: false - example: 10 - - name: --skip_duplicate_marking - alternatives: -u - type: boolean_true - description: | - Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a - preceding program using the BAM_FDUP flag. This makes sense when unique molecular - identifiers (UMI) are used. - - name: --extra_information - alternatives: -X - type: boolean_true - description: | - To reduce the runtime and file size, by default, the columns 'fusion_transcript', - 'peptide_sequence', and 'read_identifiers' are left empty in the file containing - discarded fusion candidates (see parameter -O). When this flag is set, this extra - information is reported in the discarded fusions file. - - name: --fill_gaps - alternatives: -I - type: boolean_true - description: | - If assembly of the fusion transcript sequence from the supporting reads is incomplete - (denoted as '...'), fill the gaps using the assembly sequence wherever possible. 
- resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + - name: --min_supporting_reads + alternatives: -S + type: integer + description: | + The 'min_support' filter discards all fusions with fewer than + this many supporting reads (split reads and discordant mates + combined). Default: 2 + required: false + example: 2 + - name: --max_mismappers + alternatives: -m + type: double + description: | + When more than this fraction of supporting reads turns out to be + mismappers, the 'mismappers' filter discards the fusion. Default: + 0.800000 + required: false + example: 0.8 + - name: --max_homolog_identity + alternatives: -L + type: double + description: | + Genes with more than the given fraction of sequence identity are + considered homologs and removed by the 'homologs' filter. + Default: 0.300000 + required: false + example: 0.3 + - name: --homopolymer_length + alternatives: -H + type: integer + description: | + The 'homopolymer' filter removes breakpoints adjacent to + homopolymers of the given length or more. Default: 6 + required: false + example: 6 + - name: --read_through_distance + alternatives: -R + type: integer + description: | + The 'read_through' filter removes read-through fusions + where the breakpoints are less than the given distance away + from each other. Default: 10000 + required: false + example: 10000 + - name : --min_anchor_length + alternatives: -A + type: integer + description: | + Alignment artifacts are often characterized by split reads coming + from only one gene and no discordant mates. Moreover, the split + reads only align to a short stretch in one of the genes. The + 'short_anchor' filter removes these fusions. This parameter sets + the threshold in bp for what the filter considers short. 
Default: 23 + required: false + example: 23 + - name: --many_spliced_events + alternatives: -M + type: integer + description: | + The 'many_spliced' filter recovers fusions between genes that + have at least this many spliced breakpoints. Default: 4 + required: false + example: 4 + - name: --max_kmer_content + alternatives: -K + type: double + description: | + The 'low_entropy' filter removes reads with repetitive 3-mers. If + the 3-mers make up more than the given fraction of the sequence, then + the read is discarded. Default: 0.600000 + required: false + example: 0.6 + - name: --max_mismatch_pvalue + alternatives: -V + type: double + description: | + The 'mismatches' filter uses a binomial model to calculate a + p-value for observing a given number of mismatches in a read. If + the number of mismatches is too high, the read is discarded. + Default: 0.010000 + required: false + example: 0.05 + - name: --fragment_length + alternatives: -F + type: integer + description: | + When paired-end data is given, the fragment length is estimated + automatically and this parameter has no effect. But when single-end + data is given, the mean fragment length should be specified to + effectively filter fusions that arise from hairpin structures. + Default: 200 + required: false + example: 200 + - name: --max_reads + alternatives: -U + type: integer + description: | + Subsample fusions with more than the given number of supporting reads. This + improves performance without compromising sensitivity, as long as the + threshold is high. Counting of supporting reads beyond the threshold is + inaccurate, obviously. Default: 300 + required: false + example: 300 + - name: --quantile + alternatives: -Q + type: double + description: | + Highly expressed genes are prone to produce artifacts during library + preparation. Genes with an expression above the given quantile are eligible + for filtering by the 'in_vitro' filter. 
Default: 0.998000 + required: false + example: 0.998 + - name: --exonic_fraction + alternatives: -e + type: double + description: | + The breakpoints of false-positive predictions of intragenic events + are often both in exons. True predictions are more likely to have at + least one breakpoint in an intron, because introns are larger. If the + fraction of exonic sequence between two breakpoints is smaller than + the given fraction, the 'intragenic_exonic' filter discards the + event. Default: 0.330000 + required: false + example: 0.33 + - name: --top_n + alternatives: -T + type: integer + description: | + Only report viral integration sites of the top N most highly expressed viral + contigs. Default: 5 + required: false + example: 5 + - name: --covered_fraction + alternatives: -C + type: double + description: | + Ignore virally associated events if the virus is not fully + expressed, i.e., less than the given fraction of the viral contig is + transcribed. Default: 0.050000 + required: false + example: 0.05 + - name: --max_itd_length + alternatives: -l + type: integer + description: | + Maximum length of internal tandem duplications. Note: Increasing + this value beyond the default can impair performance and lead to many + false positives. Default: 100 + required: false + example: 100 + - name: --min_itd_allele_fraction + alternatives: -z + type: double + description: | + Required fraction of supporting reads to report an internal + tandem duplication. Default: 0.070000 + required: false + example: 0.07 + - name: --min_itd_supporting_reads + alternatives: -Z + type: integer + description: | + Required absolute number of supporting reads to report an + internal tandem duplication. Default: 10 + required: false + example: 10 + - name: --skip_duplicate_marking + alternatives: -u + type: boolean_true + description: | + Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a + preceding program using the BAM_FDUP flag. 
This makes sense when unique molecular + identifiers (UMI) are used. + - name: --extra_information + alternatives: -X + type: boolean_true + description: | + To reduce the runtime and file size, by default, the columns 'fusion_transcript', + 'peptide_sequence', and 'read_identifiers' are left empty in the file containing + discarded fusion candidates (see parameter -O). When this flag is set, this extra + information is reported in the discarded fusions file. + - name: --fill_gaps + alternatives: -I + type: boolean_true + description: | + If assembly of the fusion transcript sequence from the supporting reads is incomplete + (denoted as '...'), fill the gaps using the assembly sequence wherever possible. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/arriba:2.4.0--h0033a41_2 setup: - type: docker run: | arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/arriba: "\1"/' > /var/software_versions.txt - - type: nextflow +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/bgzip/config.vsh.yaml b/src/bgzip/config.vsh.yaml index 049d0cbf..26e31ae4 100644 --- a/src/bgzip/config.vsh.yaml +++ b/src/bgzip/config.vsh.yaml @@ -1,128 +1,128 @@ -functionality: - name: bgzip - description: Block compression/decompression utility - info: - links: - homepage: https://www.htslib.org/ - documentation: https://www.htslib.org/doc/bgzip.html - repository: https://github.com/samtools/htslib - references: - doi: 10.1093/gigascience/giab007 - license: MIT - requirements: - commands: [ bgzip ] - argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - direction: input - description: file to be compressed or decompressed - required: true - - name: Outputs - arguments: - - name: --output - type: file - direction: output - description: compressed or decompressed output - 
required: true - - name: --index_name - alternatives: -I - type: file - direction: output - description: name of BGZF index file [file.gz.gzi] - - name: Arguments - arguments: - - name: --offset - alternatives: -b - type: integer - description: decompress at virtual file pointer (0-based uncompressed offset) - - name: --decompress - alternatives: -d - type: boolean_true - description: decompress the input file - - name: --rebgzip - alternatives: -g - type: boolean_true - description: use an index file to bgzip a file - - name: --index - alternatives: -i - type: boolean_true - description: compress and create BGZF index - - name: --compress_level - alternatives: -l - type: integer - description: compression level to use when compressing; 0 to 9, or -1 for default [-1] - min: -1 - max: 9 - - name: --reindex - alternatives: -r - type: boolean_true - description: (re)index the output file - - name: --size - alternatives: -s - type: integer - description: decompress INT bytes (uncompressed size) - min: 0 - - name: --test - alternatives: -t - type: boolean_true - description: test integrity of compressed file - - name: --binary - type: boolean_true - description: Don't align blocks with text lines - resources: - - type: bash_script - text: | - [[ "$par_decompress" == "false" ]] && unset par_decompress - [[ "$par_rebgzip" == "false" ]] && unset par_rebgzip - [[ "$par_index" == "false" ]] && unset par_index - [[ "$par_reindex" == "false" ]] && unset par_reindex - [[ "$par_test" == "false" ]] && unset par_test - [[ "$par_binary" == "false" ]] && unset par_binary - bgzip -c \ - ${meta_cpus:+--threads "${meta_cpus}"} \ - ${par_offset:+-b "${par_offset}"} \ - ${par_decompress:+-d} \ - ${par_rebgzip:+-g} \ - ${par_index:+-i} \ - ${par_index_name:+-I "${par_index_name}"} \ - ${par_compress_level:+-l "${par_compress_level}"} \ - ${par_reindex:+-r} \ - ${par_size:+-s "${par_size}"} \ - ${par_test:+-t} \ - ${par_binary:+--binary} \ - "$par_input" > "$par_output" - test_resources: - 
- type: bash_script - text: | - set -e +name: bgzip +description: Block compression/decompression utility +links: + homepage: https://www.htslib.org/ + documentation: https://www.htslib.org/doc/bgzip.html + repository: https://github.com/samtools/htslib +references: + doi: 10.1093/gigascience/giab007 +license: MIT +requirements: + commands: [ bgzip ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + direction: input + description: file to be compressed or decompressed + required: true + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: compressed or decompressed output + required: true + - name: --index_name + alternatives: -I + type: file + direction: output + description: name of BGZF index file [file.gz.gzi] + - name: Arguments + arguments: + - name: --offset + alternatives: -b + type: integer + description: decompress at virtual file pointer (0-based uncompressed offset) + - name: --decompress + alternatives: -d + type: boolean_true + description: decompress the input file + - name: --rebgzip + alternatives: -g + type: boolean_true + description: use an index file to bgzip a file + - name: --index + alternatives: -i + type: boolean_true + description: compress and create BGZF index + - name: --compress_level + alternatives: -l + type: integer + description: compression level to use when compressing; 0 to 9, or -1 for default [-1] + min: -1 + max: 9 + - name: --reindex + alternatives: -r + type: boolean_true + description: (re)index the output file + - name: --size + alternatives: -s + type: integer + description: decompress INT bytes (uncompressed size) + min: 0 + - name: --test + alternatives: -t + type: boolean_true + description: test integrity of compressed file + - name: --binary + type: boolean_true + description: Don't align blocks with text lines +resources: + - type: bash_script + text: | + [[ "$par_decompress" == "false" ]] && unset par_decompress + [[ "$par_rebgzip" == 
"false" ]] && unset par_rebgzip + [[ "$par_index" == "false" ]] && unset par_index + [[ "$par_reindex" == "false" ]] && unset par_reindex + [[ "$par_test" == "false" ]] && unset par_test + [[ "$par_binary" == "false" ]] && unset par_binary + bgzip -c \ + ${meta_cpus:+--threads "${meta_cpus}"} \ + ${par_offset:+-b "${par_offset}"} \ + ${par_decompress:+-d} \ + ${par_rebgzip:+-g} \ + ${par_index:+-i} \ + ${par_index_name:+-I "${par_index_name}"} \ + ${par_compress_level:+-l "${par_compress_level}"} \ + ${par_reindex:+-r} \ + ${par_size:+-s "${par_size}"} \ + ${par_test:+-t} \ + ${par_binary:+--binary} \ + "$par_input" > "$par_output" +test_resources: + - type: bash_script + text: | + set -e - "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz" + "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz" - echo ">> Checking output of compressing" - [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1 + echo ">> Checking output of compressing" + [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1 - "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress + "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress - echo ">> Checking output of decompressing" - [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1 + echo ">> Checking output of decompressing" + [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1 - echo ">> Checking original and decompressed files are the same" - set +e - cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf" - [ $? -ne 0 ] && echo "files are different" && exit 1 - set -e - - echo "> Test successful" - - type: file - path: test_data + echo ">> Checking original and decompressed files are the same" + set +e + cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf" + [ $? 
-ne 0 ] && echo "files are different" && exit 1 + set -e + + echo "> Test successful" + - type: file + path: test_data -platforms: +engines: - type: docker image: quay.io/biocontainers/htslib:1.19--h81da01d_0 setup: - type: docker run: | bgzip -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/bgzip: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow \ No newline at end of file diff --git a/src/busco/busco_download_datasets/config.vsh.yaml b/src/busco/busco_download_datasets/config.vsh.yaml index a592ed89..04d76dd6 100644 --- a/src/busco/busco_download_datasets/config.vsh.yaml +++ b/src/busco/busco_download_datasets/config.vsh.yaml @@ -1,46 +1,47 @@ -functionality: - name: busco_download_datasets - namespace: busco - description: Downloads available busco datasets - info: - links: - homepage: https://busco.ezlab.org/ - documentation: https://busco.ezlab.org/busco_userguide.html - repository: https://gitlab.com/ezlab/busco - references: - doi: 10.1007/978-1-4939-9173-0_14 - license: MIT - argument_groups: - - name: Inputs - arguments: - - name: --download - type: string - description: | - Download dataset. Possible values are a specific dataset name, "all", "prokaryota", "eukaryota", or "virus". - The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. 
- required: true - example: stramenopiles_odb10 - - name: Outputs - arguments: - - name: --download_path - direction: output - type: file - description: | - Local filepath for storing BUSCO dataset downloads - required: false - default: busco_downloads - example: busco_downloads - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh -platforms: +name: busco_download_datasets +namespace: busco +description: Downloads available busco datasets +keywords: [lineage datasets] +links: + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco +references: + doi: 10.1007/978-1-4939-9173-0_14 +license: MIT +argument_groups: + - name: Inputs + arguments: + - name: --download + type: string + description: | + Download dataset. Possible values are a specific dataset name, "all", "prokaryota", "eukaryota", or "virus". + The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. 
+ required: true + example: stramenopiles_odb10 + - name: Outputs + arguments: + - name: --download_path + direction: output + type: file + description: | + Local filepath for storing BUSCO dataset downloads + required: false + default: busco_downloads + example: busco_downloads +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: - type: docker image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 setup: - type: docker run: | busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/busco/busco_list_datasets/config.vsh.yaml b/src/busco/busco_list_datasets/config.vsh.yaml index 004628c9..6ada7c84 100644 --- a/src/busco/busco_list_datasets/config.vsh.yaml +++ b/src/busco/busco_list_datasets/config.vsh.yaml @@ -1,38 +1,39 @@ -functionality: - name: busco_list_datasets - namespace: busco - description: Lists the available busco datasets - info: - links: - homepage: https://busco.ezlab.org/ - documentation: https://busco.ezlab.org/busco_userguide.html - repository: https://gitlab.com/ezlab/busco - references: - doi: 10.1007/978-1-4939-9173-0_14 - license: MIT - argument_groups: - - name: Outputs - arguments: - - name: --output - alternatives: ["-o"] - direction: output - type: file - description: | - Output file of the available busco datasets - required: false - default: busco_dataset_list.txt - example: file.txt - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh -platforms: +name: busco_list_datasets +namespace: busco +description: Lists the available busco datasets +keywords: [lineage datasets] +links: + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco +references: + doi: 10.1007/978-1-4939-9173-0_14 +license: MIT +argument_groups: + - name: Outputs + arguments: + - 
name: --output + alternatives: ["-o"] + direction: output + type: file + description: | + Output file of the available busco datasets + required: false + default: busco_dataset_list.txt + example: file.txt +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh +engines: - type: docker image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 setup: - type: docker run: | busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/busco/busco_run/config.vsh.yaml b/src/busco/busco_run/config.vsh.yaml index 8524b068..d79f03f5 100644 --- a/src/busco/busco_run/config.vsh.yaml +++ b/src/busco/busco_run/config.vsh.yaml @@ -1,214 +1,214 @@ -functionality: - name: busco_run - namespace: busco - description: Assessment of genome assembly and annotation completeness with single copy orthologs - info: - keywords: [Genome assembly, quality control] - links: - homepage: https://busco.ezlab.org/ - documentation: https://busco.ezlab.org/busco_userguide.html - repository: https://gitlab.com/ezlab/busco - references: - doi: 10.1007/978-1-4939-9173-0_14 - license: MIT - argument_groups: - - name: Inputs - arguments: - - name: --input - alternatives: ["-i"] - type: file - description: | - Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. Also possible to use a path to a directory containing multiple input files. - required: true - example: file.fasta - - name: --mode - alternatives: ["-m"] - type: string - choices: [genome, geno, transcriptome, tran, proteins, prot] - required: true - description: | - Specify which BUSCO analysis mode to run. 
There are three valid modes: - - geno or genome, for genome assemblies (DNA) - - tran or transcriptome, for transcriptome assemblies (DNA) - - prot or proteins, for annotated gene sets (protein) - example: proteins - - name: --lineage_dataset - alternatives: ["-l"] - type: string - required: false - description: | - Specify a BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. - The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. - When unsure, the "--auto_lineage" flag can be set to automatically find the optimal lineage path. - BUSCO will automatically download the requested dataset if it is not already present in the download folder. - You can optionally provide a path to a local dataset instead of a name, e.g. path/to/dataset. - Datasets can be downloaded using the busco/busco_download_dataset component. - example: stramenopiles_odb10 +name: busco_run +namespace: busco +description: Assessment of genome assembly and annotation completeness with single copy orthologs +keywords: [Genome assembly, quality control] +links: + homepage: https://busco.ezlab.org/ + documentation: https://busco.ezlab.org/busco_userguide.html + repository: https://gitlab.com/ezlab/busco +references: + doi: 10.1007/978-1-4939-9173-0_14 +license: MIT +argument_groups: + - name: Inputs + arguments: + - name: --input + alternatives: ["-i"] + type: file + description: | + Input sequence file in FASTA format. Can be an assembled genome or transcriptome (DNA), or protein sequences from an annotated gene set. Also possible to use a path to a directory containing multiple input files. + required: true + example: file.fasta + - name: --mode + alternatives: ["-m"] + type: string + choices: [genome, geno, transcriptome, tran, proteins, prot] + required: true + description: | + Specify which BUSCO analysis mode to run. 
There are three valid modes: + - geno or genome, for genome assemblies (DNA) + - tran or transcriptome, for transcriptome assemblies (DNA) + - prot or proteins, for annotated gene sets (protein) + example: proteins + - name: --lineage_dataset + alternatives: ["-l"] + type: string + required: false + description: | + Specify a BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. + The full list of available datasets can be viewed [here](https://busco-data.ezlab.org/v5/data/lineages/) or by running the busco/busco_list_datasets component. + When unsure, the "--auto_lineage" flag can be set to automatically find the optimal lineage path. + BUSCO will automatically download the requested dataset if it is not already present in the download folder. + You can optionally provide a path to a local dataset instead of a name, e.g. path/to/dataset. + Datasets can be downloaded using the busco/busco_download_datasets component. + example: stramenopiles_odb10 - - name: Outputs - arguments: - - name: --short_summary_json - required: false - direction: output - type: file - example: short_summary.json - description: | - Output file for short summary in JSON format. - - name: --short_summary_txt - required: false - direction: output - type: file - example: short_summary.txt - description: | - Output file for short summary in TXT format. - - name: --full_table - required: false - direction: output - type: file - example: full_table.tsv - description: | - Full table output in TSV format. - - name: --missing_busco_list - required: false - direction: output - type: file - example: missing_busco_list.tsv - description: | - Missing list output in TSV format. - - name: --output_dir - required: false - direction: output - type: file - example: output_dir/ - description: | - The full output directory, if so desired.
+ - name: Outputs + arguments: + - name: --short_summary_json + required: false + direction: output + type: file + example: short_summary.json + description: | + Output file for short summary in JSON format. + - name: --short_summary_txt + required: false + direction: output + type: file + example: short_summary.txt + description: | + Output file for short summary in TXT format. + - name: --full_table + required: false + direction: output + type: file + example: full_table.tsv + description: | + Full table output in TSV format. + - name: --missing_busco_list + required: false + direction: output + type: file + example: missing_busco_list.tsv + description: | + Missing list output in TSV format. + - name: --output_dir + required: false + direction: output + type: file + example: output_dir/ + description: | + The full output directory, if so desired. - - name: Resource and Run Settings - arguments: - - name: --force - type: boolean_true - description: | - Force rewriting of existing files. Must be used when output files with the provided name already exist. - - name: --quiet - alternatives: ["-q"] - type: boolean_true - description: | - Disable the info logs, displays only errors. - - name: --restart - alternatives: ["-r"] - type: boolean_true - description: | - Continue a run that had already partially completed. Restarting skips calls to tools that have completed but performs all pre- and post-processing steps. - - name: --tar - type: boolean_true - description: | - Compress some subdirectories with many files to save space. + - name: Resource and Run Settings + arguments: + - name: --force + type: boolean_true + description: | + Force rewriting of existing files. Must be used when output files with the provided name already exist. + - name: --quiet + alternatives: ["-q"] + type: boolean_true + description: | + Disable the info logs, displays only errors. 
+ - name: --restart + alternatives: ["-r"] + type: boolean_true + description: | + Continue a run that had already partially completed. Restarting skips calls to tools that have completed but performs all pre- and post-processing steps. + - name: --tar + type: boolean_true + description: | + Compress some subdirectories with many files to save space. - - name: Lineage Dataset Settings - arguments: - - name: --auto_lineage - type: boolean_true - description: | - Run auto-lineage pipelilne to automatically determine BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. - - name: --auto_lineage_euk - type: boolean_true - description: | - Run auto-placement just on eukaryota tree to find optimal lineage path. - - name: --auto_lineage_prok - type: boolean_true - description: | - Run auto_lineage just on prokaryota trees to find optimum lineage path. - - name: --datasets_version - type: string - required: false - description: | - Specify the version of BUSCO datasets - example: odb10 + - name: Lineage Dataset Settings + arguments: + - name: --auto_lineage + type: boolean_true + description: | + Run auto-lineage pipeline to automatically determine BUSCO lineage dataset that is most closely related to the assembly or gene set being assessed. + - name: --auto_lineage_euk + type: boolean_true + description: | + Run auto-placement just on eukaryota tree to find optimal lineage path. + - name: --auto_lineage_prok + type: boolean_true + description: | + Run auto_lineage just on prokaryota trees to find optimum lineage path. + - name: --datasets_version + type: string + required: false + description: | + Specify the version of BUSCO datasets + example: odb10 - - name: Augustus Settings - arguments: - - name: --augustus - type: boolean_true - description: | - Use augustus gene predictor for eukaryote runs.
- - name: --augustus_parameters - type: string - required: false - description: | - Additional parameters to be passed to Augustus (see Augustus documentation: https://github.com/Gaius-Augustus/Augustus/blob/master/docs/RUNNING-AUGUSTUS.md). - Parameters should be contained within a single string, without whitespace and seperated by commas. - example: "--PARAM1=VALUE1,--PARAM2=VALUE2" - - name: --augustus_species - type: string - required: false - description: | - Specify the augustus species - - name: --long - type: boolean_true - description: | - Optimize Augustus self-training mode. This adds considerably to the run time, but can improve results for some non-model organisms. + - name: Augustus Settings + arguments: + - name: --augustus + type: boolean_true + description: | + Use augustus gene predictor for eukaryote runs. + - name: --augustus_parameters + type: string + required: false + description: | + Additional parameters to be passed to Augustus (see Augustus documentation: https://github.com/Gaius-Augustus/Augustus/blob/master/docs/RUNNING-AUGUSTUS.md). + Parameters should be contained within a single string, without whitespace and separated by commas. + example: "--PARAM1=VALUE1,--PARAM2=VALUE2" + - name: --augustus_species + type: string + required: false + description: | + Specify the augustus species + - name: --long + type: boolean_true + description: | + Optimize Augustus self-training mode. This adds considerably to the run time, but can improve results for some non-model organisms. - - name: BBTools Settings - arguments: - - name: --contig_break - type: integer - required: false - description: | - Number of contiguous Ns to signify a break between contigs in BBTools analysis. - - name: --limit - type: integer - required: false - description: | - Number of candidate regions (contig or transcript) from the BLAST output to consider per BUSCO. - This option is only effective in pipelines using BLAST, i.e.
the genome pipeline (see --augustus) or the prokaryota transcriptome pipeline. - - name: --scaffold_composition - type: boolean_true - description: | - Writes ACGTN content per scaffold to a file scaffold_composition.txt. + - name: BBTools Settings + arguments: + - name: --contig_break + type: integer + required: false + description: | + Number of contiguous Ns to signify a break between contigs in BBTools analysis. + - name: --limit + type: integer + required: false + description: | + Number of candidate regions (contig or transcript) from the BLAST output to consider per BUSCO. + This option is only effective in pipelines using BLAST, i.e. the genome pipeline (see --augustus) or the prokaryota transcriptome pipeline. + - name: --scaffold_composition + type: boolean_true + description: | + Writes ACGTN content per scaffold to a file scaffold_composition.txt. - - name: BLAST Settings - arguments: - - name: --e_value - type: double - required: false - description: | - E-value cutoff for BLAST searches. + - name: BLAST Settings + arguments: + - name: --e_value + type: double + required: false + description: | + E-value cutoff for BLAST searches. - - name: Protein Gene Prediction settings - arguments: - - name: --miniprot - type: boolean_true - description: | - Use Miniprot gene predictor. + - name: Protein Gene Prediction settings + arguments: + - name: --miniprot + type: boolean_true + description: | + Use Miniprot gene predictor. - - name: MetaEuk Settings - arguments: - - name: --metaeuk_parameters - type: string - description: | - Pass additional arguments to Metaeuk for the first run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). - All parameters should be contained within a single string with no white space, with each parameter separated by a comma. 
- example: "--max-overlap=15,--min-exon-aa=15" - - name: --metaeuk_rerun_parameters - type: string - description: | - Pass additional arguments to Metaeuk for the second run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). - All parameters should be contained within a single string with no white space, with each parameter separated by a comma. - example: "--max-overlap=15,--min-exon-aa=15" + - name: MetaEuk Settings + arguments: + - name: --metaeuk_parameters + type: string + description: | + Pass additional arguments to Metaeuk for the first run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). + All parameters should be contained within a single string with no white space, with each parameter separated by a comma. + example: "--max-overlap=15,--min-exon-aa=15" + - name: --metaeuk_rerun_parameters + type: string + description: | + Pass additional arguments to Metaeuk for the second run (see Metaeuk documentation https://github.com/soedinglab/metaeuk). + All parameters should be contained within a single string with no white space, with each parameter separated by a comma. 
+ example: "--max-overlap=15,--min-exon-aa=15" - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/busco:5.6.1--pyhdfd78af_0 setup: - type: docker run: | busco --version | sed 's/BUSCO\s\(.*\)/busco: "\1"/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/fastp/config.vsh.yaml b/src/fastp/config.vsh.yaml index 24db55d1..b7d9062a 100644 --- a/src/fastp/config.vsh.yaml +++ b/src/fastp/config.vsh.yaml @@ -1,576 +1,576 @@ -functionality: - name: fastp - description: | - An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...). +name: fastp +description: | + An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...). - Features: + Features: - - comprehensive quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents...) - - filter out bad reads (too low quality, too short, or too many N...) - - cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). - - trim all reads in front and tail - - cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. - - correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality - - trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. 
polyA tailing for mRNA-Seq data) - - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. - - report JSON format result for further interpreting. - - visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative). - - split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used, limiting the total split file number, or limitting the lines of each split file. - - support long reads (data from PacBio / Nanopore devices). - - support reading from STDIN and writing to STDOUT - - support interleaved input - - support ultra-fast FASTQ-level deduplication - info: - keywords: [RNA-Seq, Trimming, Quality control] - links: - repository: https://github.com/OpenGene/fastp - documentation: https://github.com/OpenGene/fastp/blob/master/README.md - references: - doi: 10.1093/bioinformatics/bty560 - license: MIT - argument_groups: - - name: Inputs - description: | - `fastp` supports both single-end (SE) and paired-end (PE) input. + - comprehensive quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents...) + - filter out bad reads (too low quality, too short, or too many N...) + - cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). + - trim all reads in front and tail + - cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. + - correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality + - trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. 
polyA tailing for mRNA-Seq data) + - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. + - report JSON format result for further interpreting. + - visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative). + - split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used, limiting the total split file number, or limitting the lines of each split file. + - support long reads (data from PacBio / Nanopore devices). + - support reading from STDIN and writing to STDOUT + - support interleaved input + - support ultra-fast FASTQ-level deduplication +keywords: [RNA-Seq, Trimming, Quality control] +links: + repository: https://github.com/OpenGene/fastp + documentation: https://github.com/OpenGene/fastp/blob/master/README.md +references: + doi: "10.1093/bioinformatics/bty560" +license: MIT +argument_groups: + - name: Inputs + description: | + `fastp` supports both single-end (SE) and paired-end (PE) input. - - for SE data, you only have to specify read1 input by `-i` or `--in1`. - - for PE data, you should also specify read2 input by `-I` or `--in2`. - arguments: - - name: --in1 - alternatives: [-i] - type: file - description: Input FastQ file. Must be single-end or paired-end R1. Can be gzipped. - required: true - example: in.R1.fq.gz - - name: --in2 - alternatives: [-I] - type: file - description: Input FastQ file. Must be paired-end R2. Can be gzipped. - required: false - example: in.R2.fq.gz - - name: Outputs - description: | + - for SE data, you only have to specify read1 input by `-i` or `--in1`. + - for PE data, you should also specify read2 input by `-I` or `--in2`. + arguments: + - name: --in1 + alternatives: [-i] + type: file + description: Input FastQ file. Must be single-end or paired-end R1. Can be gzipped. 
+ required: true + example: in.R1.fq.gz + - name: --in2 + alternatives: [-I] + type: file + description: Input FastQ file. Must be paired-end R2. Can be gzipped. + required: false + example: in.R2.fq.gz + - name: Outputs + description: | - - for SE data, you only have to specify read1 output by `-o` or `--out1`. - - for PE data, you should also specify read2 output by `-O` or `--out2`. - - if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. - - the output will be gzip-compressed if its file name ends with `.gz` - arguments: - - name: --out1 - alternatives: [-o] - type: file - description: The single-end or paired-end R1 reads that pass QC. Will be gzipped if its file name ends with `.gz`. - required: true - example: out.R1.fq.gz - direction: output - - name: --out2 - alternatives: [-O] - type: file - description: The paired-end R2 reads that pass QC. Will be gzipped if its file name ends with `.gz`. - required: false - example: out.R2.fq.gz - direction: output - - name: --unpaired1 - type: file - description: Store the reads that `read1` passes filters but its paired `read2` doesn't. - required: false - example: unpaired.R1.fq.gz - direction: output - - name: --unpaired2 - type: file - description: Store the reads that `read2` passes filters but its paired `read1` doesn't. - required: false - example: unpaired.R2.fq.gz - direction: output - - name: --failed_out - type: file - description: | - Store the reads that fail filters. + - for SE data, you only have to specify read1 output by `-o` or `--out1`. + - for PE data, you should also specify read2 output by `-O` or `--out2`. + - if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. 
+ - the output will be gzip-compressed if its file name ends with `.gz` + arguments: + - name: --out1 + alternatives: [-o] + type: file + description: The single-end or paired-end R1 reads that pass QC. Will be gzipped if its file name ends with `.gz`. + required: true + example: out.R1.fq.gz + direction: output + - name: --out2 + alternatives: [-O] + type: file + description: The paired-end R2 reads that pass QC. Will be gzipped if its file name ends with `.gz`. + required: false + example: out.R2.fq.gz + direction: output + - name: --unpaired1 + type: file + description: Store the reads that `read1` passes filters but its paired `read2` doesn't. + required: false + example: unpaired.R1.fq.gz + direction: output + - name: --unpaired2 + type: file + description: Store the reads that `read2` passes filters but its paired `read1` doesn't. + required: false + example: unpaired.R2.fq.gz + direction: output + - name: --failed_out + type: file + description: | + Store the reads that fail filters. - If one read failed and is written to --failed_out, its failure reason will be appended to its read name. For example, failed_quality_filter, failed_too_short etc. - For PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the failure reason will be paired_read_is_failing. - required: false - example: failed.fq.gz - direction: output - - name: --overlapped_out - type: file - description: | - For each read pair, output the overlapped region if it has no any mismatched base. 
- direction: output - - name: Report output arguments - arguments: - - name: --json - alternatives: [-j] - type: file - description: | - The json format report file name - example: out.json - direction: output - - name: --html - type: file - description: | - The html format report file name - example: out.html - direction: output - - name: --report_title - type: string - description: | - The title of the html report, default is "fastp report". - example: fastp report - - name: Adapter trimming - description: | - Adapter trimming is enabled by default, but you can disable it by `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically detected for both PE/SE data. + If one read failed and is written to --failed_out, its failure reason will be appended to its read name. For example, failed_quality_filter, failed_too_short etc. + For PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the failure reason will be paired_read_is_failing. + required: false + example: failed.fq.gz + direction: output + - name: --overlapped_out + type: file + description: | + For each read pair, output the overlapped region if it has no any mismatched base. + direction: output + - name: Report output arguments + arguments: + - name: --json + alternatives: [-j] + type: file + description: | + The json format report file name + example: out.json + direction: output + - name: --html + type: file + description: | + The html format report file name + example: out.html + direction: output + - name: --report_title + type: string + description: | + The title of the html report, default is "fastp report". + example: fastp report + - name: Adapter trimming + description: | + Adapter trimming is enabled by default, but you can disable it by `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically detected for both PE/SE data. 
- - For SE data, the adapters are evaluated by analyzing the tails of first ~1M reads. This evaluation may be inacurrate, and you can specify the adapter sequence by `-a` or `--adapter_sequence` option. If adapter sequence is specified, the auto detection for SE data will be disabled. - - For PE data, the adapters can be detected by per-read overlap analysis, which seeks for the overlap of each pair of reads. This method is robust and fast, so normally you don't have to input the adapter sequence even you know it. But you can still specify the adapter sequences for read1 by `--adapter_sequence`, and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap (i.e. due to low quality bases), it will use these sequences to trim adapters for read1 and read2 respectively. - - For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify `--detect_adapter_for_pe` to enable it. - - For PE data, `fastp` will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. - - The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`. - - `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue. + - For SE data, the adapters are evaluated by analyzing the tails of first ~1M reads. This evaluation may be inacurrate, and you can specify the adapter sequence by `-a` or `--adapter_sequence` option. 
If adapter sequence is specified, the auto detection for SE data will be disabled. + - For PE data, the adapters can be detected by per-read overlap analysis, which seeks for the overlap of each pair of reads. This method is robust and fast, so normally you don't have to input the adapter sequence even you know it. But you can still specify the adapter sequences for read1 by `--adapter_sequence`, and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap (i.e. due to low quality bases), it will use these sequences to trim adapters for read1 and read2 respectively. + - For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify `--detect_adapter_for_pe` to enable it. + - For PE data, `fastp` will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. + - The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`. + - `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue. - You can also specify --adapter_fasta to give a FASTA file to tell fastp to trim multiple adapters in this FASTA file. Here is a sample of such adapter FASTA file: + You can also specify --adapter_fasta to give a FASTA file to tell fastp to trim multiple adapters in this FASTA file. 
Here is a sample of such adapter FASTA file: - ``` - >Illumina TruSeq Adapter Read 1 - AGATCGGAAGAGCACACGTCTGAACTCCAGTCA - >Illumina TruSeq Adapter Read 2 - AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT - >polyA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - ``` + ``` + >Illumina TruSeq Adapter Read 1 + AGATCGGAAGAGCACACGTCTGAACTCCAGTCA + >Illumina TruSeq Adapter Read 2 + AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT + >polyA + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + ``` - The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). + The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). - `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. + `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. - The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. - arguments: - - name: --disable_adapter_trimming - alternatives: [-A] - type: boolean_true - description: | - Disable adapter trimming. - - name: --detect_adapter_for_pe - type: boolean_true - description: | - By default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data. - - name: --adapter_sequence - alternatives: [-a] - type: string - description: | - The adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. 
For PE data, this is used if R1/R2 are found not overlapped - - name: --adapter_sequence_r2 - type: string - description: | - The adapter sequences to be trimmed for R2. This is used for PE data if R1/R2 are found overlapped. - - name: --adapter_fasta - type: file - description: | - A FASTA file containing all the adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. - - name: Base trimming - arguments: - - name: --trim_front1 - alternatives: [-f] - type: integer - description: | - Trimming how many bases in front for read1, default is 0. - example: 0 - - name: --trim_tail1 - alternatives: [-t] - type: integer - description: | - Trimming how many bases in tail for read1, default is 0. - example: 0 - - name: --max_len1 - alternatives: [-b] - type: integer - min: 0 - description: | - If read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation. - - name: --trim_front2 - alternatives: [-F] - type: integer - description: | - Trimming how many bases in front for read2, default is 0. - example: 0 - - name: --trim_tail2 - alternatives: [-T] - type: integer - description: | - Trimming how many bases in tail for read2, default is 0. - example: 0 - - name: --max_len2 - alternatives: [-B] - type: integer - min: 0 - description: | - If read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. - - name: Merging mode - description: Allows merging paired-end reads into a single longer read if they are overlapping. - arguments: - - name: --merge - alternatives: [-m] - type: boolean_true - description: | - For paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. 
The merging mode is disabled by default. - - name: --merged_out - type: file - description: | - In the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output. - direction: output - example: merged.fq.gz - - name: --include_unmerged - type: boolean_true - description: | - In the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. - - name: Additional input arguments - description: Affects how the input is read. - arguments: - - name: --interleaved_in - type: boolean_true - description: | - Indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default. - - name: --fix_mgi_id - type: boolean_true - description: | - The MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it. - - name: --phred64 - alternatives: ["-6"] - type: boolean_true - description: | - Indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) - - name: Additional output arguments - description: Affects how the output is written. - arguments: - - name: --compression - alternatives: ["-z"] - type: integer - description: | - Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. - example: 4 - min: 1 - max: 9 - - name: --dont_overwrite - type: boolean_true - description: | - Don't overwrite existing files. Overwritting is allowed by default. - - name: Logging arguments - arguments: - - name: --verbose - alternatives: [-V] - type: boolean_true - description: Output verbose log information (i.e. when every 1M reads are processed). - - name: Processing arguments - arguments: - - name: --reads_to_process - type: long - description: | - Specify how many reads/pairs to be processed. Default 0 means process all reads. 
- example: 1000000 - min: 0 - - name: Deduplication arguments - arguments: - - name: --dedup - type: boolean_true - description: | - Enable deduplication to drop the duplicated reads/pairs - - name: --dup_calc_accuracy - type: integer - description: | - Accuracy level to calculate duplication (1~6). Higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. - example: 3 - min: 1 - max: 6 - - name: --dont_eval_duplication - type: boolean_true - description: | - Don't evaluate duplication rate to save time and use less memory. - - name: PolyG tail trimming arguments - arguments: - - name: --trim_poly_g - alternatives: [-g] - type: boolean_true - description: | - Force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data - - name: --poly_g_min_len - type: integer - description: | - The minimum length to detect polyG in the read tail. 10 by default. - example: 10 - min: 1 - - name: --disable_trim_poly_g - alternatives: [-G] - type: boolean_true - description: | - Disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data - - name: PolyX tail trimming arguments - arguments: - - name: --trim_poly_x - alternatives: [-x] - type: boolean_true - description: | - Enable polyX trimming in 3' ends. - - name: --poly_x_min_len - type: integer - description: | - The minimum length to detect polyX in the read tail. 10 by default. - example: 10 - min: 1 - - name: Cut arguments - arguments: - - name: --cut_front - alternatives: ["-5"] - type: integer - description: | - Move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. - - name: --cut_tail - alternatives: ["-3"] - type: integer - description: | - Move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. 
- - name: --cut_right - alternatives: ["-r"] - type: integer - description: | - Move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. - - name: --cut_window_size - alternatives: ["-W"] - type: integer - description: | - The window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4. - example: 4 - min: 1 - - name: --cut_mean_quality - alternatives: ["-M"] - type: integer - description: | - The mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) - example: 20 - min: 0 - - name: --cut_front_window_size - type: integer - description: | - The window size option of cut_front, default to cut_window_size if not specified. - example: 4 - min: 1 - - name: --cut_front_mean_quality - type: integer - description: | - The mean quality requirement option of cut_front, default to cut_mean_quality if not specified. - example: 20 - min: 0 - - name: --cut_tail_window_size - type: integer - description: | - The window size option of cut_tail, default to cut_window_size if not specified. - example: 4 - min: 1 - - name: --cut_tail_mean_quality - type: integer - description: | - The mean quality requirement option of cut_tail, default to cut_mean_quality if not specified. - example: 20 - min: 0 - - name: --cut_right_window_size - type: integer - description: | - The window size option of cut_right, default to cut_window_size if not specified. - example: 4 - min: 1 - - name: --cut_right_mean_quality - type: integer - description: | - The mean quality requirement option of cut_right, default to cut_mean_quality if not specified. - example: 20 - min: 0 - - name: Quality filtering arguments - arguments: - - name: --disable_quality_filtering - alternatives: [-Q] - type: boolean_true - description: | - Quality filtering is enabled by default. If this option is specified, quality filtering is disabled. 
- - name: --qualified_quality_phred - alternatives: [-q] - type: integer - description: | - The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. - example: 15 - min: 0 - - name: --unqualified_percent_limit - alternatives: [-u] - type: integer - description: | - How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%. - example: 40 - min: 0 - max: 100 - - name: --n_base_limit - alternatives: [-n] - type: integer - description: | - If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5. - example: 5 - min: 0 - - name: --average_qual - alternatives: [-e] - type: integer - description: | - If one read's average quality score =1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default. - # - name: --split_prefix_digits - # type: integer - # description: | - # The digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding. - # example: 4 - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. + arguments: + - name: --disable_adapter_trimming + alternatives: [-A] + type: boolean_true + description: | + Disable adapter trimming. + - name: --detect_adapter_for_pe + type: boolean_true + description: | + By default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data. + - name: --adapter_sequence + alternatives: [-a] + type: string + description: | + The adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. 
For PE data, this is used if R1/R2 are found not overlapped + - name: --adapter_sequence_r2 + type: string + description: | + The adapter sequences to be trimmed for R2. This is used for PE data if R1/R2 are found overlapped. + - name: --adapter_fasta + type: file + description: | + A FASTA file containing all the adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. + - name: Base trimming + arguments: + - name: --trim_front1 + alternatives: [-f] + type: integer + description: | + Trimming how many bases in front for read1, default is 0. + example: 0 + - name: --trim_tail1 + alternatives: [-t] + type: integer + description: | + Trimming how many bases in tail for read1, default is 0. + example: 0 + - name: --max_len1 + alternatives: [-b] + type: integer + min: 0 + description: | + If read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation. + - name: --trim_front2 + alternatives: [-F] + type: integer + description: | + Trimming how many bases in front for read2, default is 0. + example: 0 + - name: --trim_tail2 + alternatives: [-T] + type: integer + description: | + Trimming how many bases in tail for read2, default is 0. + example: 0 + - name: --max_len2 + alternatives: [-B] + type: integer + min: 0 + description: | + If read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. + - name: Merging mode + description: Allows merging paired-end reads into a single longer read if they are overlapping. + arguments: + - name: --merge + alternatives: [-m] + type: boolean_true + description: | + For paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. 
The merging mode is disabled by default. + - name: --merged_out + type: file + description: | + In the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output. + direction: output + example: merged.fq.gz + - name: --include_unmerged + type: boolean_true + description: | + In the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. + - name: Additional input arguments + description: Affects how the input is read. + arguments: + - name: --interleaved_in + type: boolean_true + description: | + Indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default. + - name: --fix_mgi_id + type: boolean_true + description: | + The MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it. + - name: --phred64 + alternatives: ["-6"] + type: boolean_true + description: | + Indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) + - name: Additional output arguments + description: Affects how the output is written. + arguments: + - name: --compression + alternatives: ["-z"] + type: integer + description: | + Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. + example: 4 + min: 1 + max: 9 + - name: --dont_overwrite + type: boolean_true + description: | + Don't overwrite existing files. Overwritting is allowed by default. + - name: Logging arguments + arguments: + - name: --verbose + alternatives: [-V] + type: boolean_true + description: Output verbose log information (i.e. when every 1M reads are processed). + - name: Processing arguments + arguments: + - name: --reads_to_process + type: long + description: | + Specify how many reads/pairs to be processed. Default 0 means process all reads. 
+ example: 1000000 + min: 0 + - name: Deduplication arguments + arguments: + - name: --dedup + type: boolean_true + description: | + Enable deduplication to drop the duplicated reads/pairs + - name: --dup_calc_accuracy + type: integer + description: | + Accuracy level to calculate duplication (1~6). Higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. + example: 3 + min: 1 + max: 6 + - name: --dont_eval_duplication + type: boolean_true + description: | + Don't evaluate duplication rate to save time and use less memory. + - name: PolyG tail trimming arguments + arguments: + - name: --trim_poly_g + alternatives: [-g] + type: boolean_true + description: | + Force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + - name: --poly_g_min_len + type: integer + description: | + The minimum length to detect polyG in the read tail. 10 by default. + example: 10 + min: 1 + - name: --disable_trim_poly_g + alternatives: [-G] + type: boolean_true + description: | + Disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + - name: PolyX tail trimming arguments + arguments: + - name: --trim_poly_x + alternatives: [-x] + type: boolean_true + description: | + Enable polyX trimming in 3' ends. + - name: --poly_x_min_len + type: integer + description: | + The minimum length to detect polyX in the read tail. 10 by default. + example: 10 + min: 1 + - name: Cut arguments + arguments: + - name: --cut_front + alternatives: ["-5"] + type: integer + description: | + Move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. + - name: --cut_tail + alternatives: ["-3"] + type: integer + description: | + Move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. 
+ - name: --cut_right + alternatives: ["-r"] + type: integer + description: | + Move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. + - name: --cut_window_size + alternatives: ["-W"] + type: integer + description: | + The window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4. + example: 4 + min: 1 + - name: --cut_mean_quality + alternatives: ["-M"] + type: integer + description: | + The mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) + example: 20 + min: 0 + - name: --cut_front_window_size + type: integer + description: | + The window size option of cut_front, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_front_mean_quality + type: integer + description: | + The mean quality requirement option of cut_front, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: --cut_tail_window_size + type: integer + description: | + The window size option of cut_tail, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_tail_mean_quality + type: integer + description: | + The mean quality requirement option of cut_tail, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: --cut_right_window_size + type: integer + description: | + The window size option of cut_right, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_right_mean_quality + type: integer + description: | + The mean quality requirement option of cut_right, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: Quality filtering arguments + arguments: + - name: --disable_quality_filtering + alternatives: [-Q] + type: boolean_true + description: | + Quality filtering is enabled by default. If this option is specified, quality filtering is disabled. 
+ - name: --qualified_quality_phred + alternatives: [-q] + type: integer + description: | + The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. + example: 15 + min: 0 + - name: --unqualified_percent_limit + alternatives: [-u] + type: integer + description: | + How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%. + example: 40 + min: 0 + max: 100 + - name: --n_base_limit + alternatives: [-n] + type: integer + description: | + If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5. + example: 5 + min: 0 + - name: --average_qual + alternatives: [-e] + type: integer + description: | + If one read's average quality score =1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default. + # - name: --split_prefix_digits + # type: integer + # description: | + # The digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding. + # example: 4 +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/fastp:0.23.4--hadf994f_2 setup: - type: docker run: | fastp --version 2>&1 | sed 's# #: "#;s#$#"#' > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/featurecounts/config.vsh.yaml b/src/featurecounts/config.vsh.yaml index 01ae400a..8697b1fe 100644 --- a/src/featurecounts/config.vsh.yaml +++ b/src/featurecounts/config.vsh.yaml @@ -1,336 +1,336 @@ -functionality: - name: featurecounts - description: | - featureCounts is a read summarization program for counting reads generated from either RNA or genomic DNA sequencing experiments by implementing highly efficient chromosome hashing and feature blocking techniques. 
It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. - info: - keywords: ["Read counting", "Genomic features"] - links: - homepage: https://subread.sourceforge.net/ - documentation: https://subread.sourceforge.net/SubreadUsersGuide.pdf - repository: https://github.com/ShiLab-Bioinformatics/subread - references: - doi: 10.1093/bioinformatics/btt656 - license: GPL-3.0 - requirements: - commands: [ featureCounts ] +name: featurecounts +description: | + featureCounts is a read summarization program for counting reads generated from either RNA or genomic DNA sequencing experiments by implementing highly efficient chromosome hashing and feature blocking techniques. It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. +keywords: ["Read counting", "Genomic features"] +links: + homepage: https://subread.sourceforge.net/ + documentation: https://subread.sourceforge.net/SubreadUsersGuide.pdf + repository: https://github.com/ShiLab-Bioinformatics/subread +references: + doi: "10.1093/bioinformatics/btt656" +license: GPL-3.0 +requirements: + commands: [ featureCounts ] - argument_groups: - - name: Inputs - arguments: - - name: --annotation - alternatives: ["-a"] - type: file - description: | - Name of an annotation file. GTF/GFF format by default. See '--format' option for more format information. - required: true - example: annotation.gtf - - name: --input - alternatives: ["-i"] - type: file - multiple: true - description: | - A list of SAM or BAM format files separated by semi-colon (;). They can be either name or location sorted. Location-sorted paired-end reads are automatically sorted by read names. 
- required: true - example: input_file1.bam - - - name: Outputs - arguments: - - name: --counts - alternatives: ["-o"] - type: file - direction: output - description: | - Name of output file including read counts in tab delimited format. - required: true - example: features.tsv - - name: --summary - type: file - direction: output - description: | - Summary statistics of counting results in tab delimited format. - required: false - example: summary.tsv - - name: --junctions - type: file - direction: output - description: | - Count number of reads supporting each exon-exon junction. Junctions were identified from those exon-spanning reads in the input (containing 'N' in CIGAR string). - example: junctions.txt - required: false +argument_groups: + - name: Inputs + arguments: + - name: --annotation + alternatives: ["-a"] + type: file + description: | + Name of an annotation file. GTF/GFF format by default. See '--format' option for more format information. + required: true + example: annotation.gtf + - name: --input + alternatives: ["-i"] + type: file + multiple: true + description: | + A list of SAM or BAM format files separated by semi-colon (;). They can be either name or location sorted. Location-sorted paired-end reads are automatically sorted by read names. + required: true + example: input_file1.bam + + - name: Outputs + arguments: + - name: --counts + alternatives: ["-o"] + type: file + direction: output + description: | + Name of output file including read counts in tab delimited format. + required: true + example: features.tsv + - name: --summary + type: file + direction: output + description: | + Summary statistics of counting results in tab delimited format. + required: false + example: summary.tsv + - name: --junctions + type: file + direction: output + description: | + Count number of reads supporting each exon-exon junction. Junctions were identified from those exon-spanning reads in the input (containing 'N' in CIGAR string). 
+ example: junctions.txt + required: false - - name: Annotation - arguments: - - name: --format - alternatives: ["-F"] - type: string - description: | - Specify format of the provided annotation file. Acceptable formats include 'GTF' (or compatible GFF format) and 'SAF'. 'GTF' by default. - choices: [GTF, GFF, SAF] - example: "GTF" - required: false - - name: --feature_type - alternatives: ["-t"] - type: string - description: | - Specify feature type(s) in a GTF annotation. If multiple types are provided, they should be separated by ';' with no space in between. 'exon' by default. Rows in the annotation with a matched feature will be extracted and used for read mapping. - example: "exon" - required: false - multiple: true - - name: --attribute_type - alternatives: ["-g"] - type: string - description: | - Specify attribute type in GTF annotation. 'gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value. - example: "gene_id" - required: false - - name: --extra_attributes - type: string - description: | - Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. If more than one attribute type is provided they should be separated by semicolon (;). - required: false - multiple: true - - name: --chrom_alias - alternatives: ["-A"] - type: file - description: | - Provide a chromosome name alias file to match chr names in annotation with those in the reads. This should be a two-column comma-delimited text file. Its first column should include chr names in the annotation and its second column should include chr names in the reads. Chr names are case sensitive. No column header should be included in the file. 
- required: false - example: chrom_alias.csv - - - name: Level of summarization - arguments: - - name: --feature_level - alternatives: ["-f"] - type: boolean_true - description: | - Perform read counting at feature level (eg. counting reads for exons rather than genes). - - - name: Overlap between reads and features - arguments: - - name: --overlapping - alternatives: ["-O"] - type: boolean_true - description: | - Assign reads to all their overlapping meta-features (or features if '--feature_level' is specified). - - name: --min_overlap - type: integer - description: | - Minimum number of overlapping bases in a read that is required for read assignment. 1 by default. Number of overlapping bases is counted from both reads if paired end. If a negative value is provided, then a gap of up to specified size will be allowed between read and the feature that the read is assigned to. - required: false - example: 1 - - name: --frac_overlap - type: double - description: | - Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--min_overlap' option need to be satisfied for read assignment. - required: false - min: 0 - max: 1 - example: 0 - - name: --frac_overlap_feature - type: double - description: | - Minimum fraction of overlapping bases in a feature that is required for read assignment. Value should be within range [0,1]. 0 by default. - required: false - min: 0 - max: 1 - example: 0 - - name: --largest_overlap - type: boolean_true - description: | - Assign reads to a meta-feature/feature that has the largest number of overlapping bases. - - name: --non_overlap - type: integer - description: | - Maximum number of non-overlapping bases in a read (or a read pair) that is allowed when being assigned to a feature. No limit is set by default. 
- required: false - - name: --non_overlap_feature - type: integer - description: | - Maximum number of non-overlapping bases in a feature that is allowed in read assignment. No limit is set by default. - required: false - - name: --read_extension5 - type: integer - description: | - Reads are extended upstream by bases from their 5' end. - required: false - - name: --read_extension3 - type: integer - description: | - Reads are extended upstream by bases from their 3' end. - required: false - - name: --read2pos - type: integer - description: | - Reduce reads to their 5' most base or 3' most base. Read counting is then performed based on the single base the read is reduced to. - required: false - choices: [3, 5] - - - name: Multi-mapping reads - arguments: - - name: --multi_mapping - alternatives: ["-M"] - type: boolean_true - description: | - Multi-mapping reads will also be counted. For a multi-mapping read, all its reported alignments will be counted. The 'NH' tag in BAM/SAM input is used to detect multi-mapping reads. - - - name: Fractional counting - arguments: - - name: --fraction - type: boolean_true - description: | - Assign fractional counts to features. This option must be used together with '--multi_mapping' or '--overlapping' or both. When '--multi_mapping' is specified, each reported alignment from a multi-mapping read (identified via 'NH' tag) will carry a fractional count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When '--overlapping' is specified, each overlapping feature will receive a fractional count of 1/y, where y is the total number of features overlapping with the read. When both '--multi_mapping' and '--overlapping' are specified, each alignment will carry a fractional count of 1/(x*y). + - name: Annotation + arguments: + - name: --format + alternatives: ["-F"] + type: string + description: | + Specify format of the provided annotation file. 
Acceptable formats include 'GTF' (or compatible GFF format) and 'SAF'. 'GTF' by default. + choices: [GTF, GFF, SAF] + example: "GTF" + required: false + - name: --feature_type + alternatives: ["-t"] + type: string + description: | + Specify feature type(s) in a GTF annotation. If multiple types are provided, they should be separated by ';' with no space in between. 'exon' by default. Rows in the annotation with a matched feature will be extracted and used for read mapping. + example: "exon" + required: false + multiple: true + - name: --attribute_type + alternatives: ["-g"] + type: string + description: | + Specify attribute type in GTF annotation. 'gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value. + example: "gene_id" + required: false + - name: --extra_attributes + type: string + description: | + Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. If more than one attribute type is provided they should be separated by semicolon (;). + required: false + multiple: true + - name: --chrom_alias + alternatives: ["-A"] + type: file + description: | + Provide a chromosome name alias file to match chr names in annotation with those in the reads. This should be a two-column comma-delimited text file. Its first column should include chr names in the annotation and its second column should include chr names in the reads. Chr names are case sensitive. No column header should be included in the file. + required: false + example: chrom_alias.csv + + - name: Level of summarization + arguments: + - name: --feature_level + alternatives: ["-f"] + type: boolean_true + description: | + Perform read counting at feature level (eg. counting reads for exons rather than genes). 
+ + - name: Overlap between reads and features + arguments: + - name: --overlapping + alternatives: ["-O"] + type: boolean_true + description: | + Assign reads to all their overlapping meta-features (or features if '--feature_level' is specified). + - name: --min_overlap + type: integer + description: | + Minimum number of overlapping bases in a read that is required for read assignment. 1 by default. Number of overlapping bases is counted from both reads if paired end. If a negative value is provided, then a gap of up to specified size will be allowed between read and the feature that the read is assigned to. + required: false + example: 1 + - name: --frac_overlap + type: double + description: | + Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--min_overlap' option need to be satisfied for read assignment. + required: false + min: 0 + max: 1 + example: 0 + - name: --frac_overlap_feature + type: double + description: | + Minimum fraction of overlapping bases in a feature that is required for read assignment. Value should be within range [0,1]. 0 by default. + required: false + min: 0 + max: 1 + example: 0 + - name: --largest_overlap + type: boolean_true + description: | + Assign reads to a meta-feature/feature that has the largest number of overlapping bases. + - name: --non_overlap + type: integer + description: | + Maximum number of non-overlapping bases in a read (or a read pair) that is allowed when being assigned to a feature. No limit is set by default. + required: false + - name: --non_overlap_feature + type: integer + description: | + Maximum number of non-overlapping bases in a feature that is allowed in read assignment. No limit is set by default. 
+ required: false + - name: --read_extension5 + type: integer + description: | + Reads are extended upstream by bases from their 5' end. + required: false + - name: --read_extension3 + type: integer + description: | + Reads are extended upstream by bases from their 3' end. + required: false + - name: --read2pos + type: integer + description: | + Reduce reads to their 5' most base or 3' most base. Read counting is then performed based on the single base the read is reduced to. + required: false + choices: [3, 5] + + - name: Multi-mapping reads + arguments: + - name: --multi_mapping + alternatives: ["-M"] + type: boolean_true + description: | + Multi-mapping reads will also be counted. For a multi-mapping read, all its reported alignments will be counted. The 'NH' tag in BAM/SAM input is used to detect multi-mapping reads. + + - name: Fractional counting + arguments: + - name: --fraction + type: boolean_true + description: | + Assign fractional counts to features. This option must be used together with '--multi_mapping' or '--overlapping' or both. When '--multi_mapping' is specified, each reported alignment from a multi-mapping read (identified via 'NH' tag) will carry a fractional count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When '--overlapping' is specified, each overlapping feature will receive a fractional count of 1/y, where y is the total number of features overlapping with the read. When both '--multi_mapping' and '--overlapping' are specified, each alignment will carry a fractional count of 1/(x*y). - - name: Read filtering - arguments: - - name: --min_map_quality - alternatives: ["-Q"] - type: integer - description: | - The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default. 
- required: false - example: 0 - - name: --split_only - type: boolean_true - description: | - Count split alignments only (ie. alignments with CIGAR string containing 'N'). An example of split alignments is exon-spanning reads in RNA-seq data. - - name: --non_split_only - type: boolean_true - description: | - If specified, only non-split alignments (CIGAR strings do not contain letter 'N') will be counted. All the other alignments will be ignored. - - name: --primary - type: boolean_true - description: | - Count primary alignments only. Primary alignments are identified using bit 0x100 in SAM/BAM FLAG field. - - name: --ignore_dup - type: boolean_true - description: | - Ignore duplicate reads in read counting. Duplicate reads are identified using bit Ox400 in BAM/SAM FLAG field. The whole read pair is ignored if one of the reads is a duplicate read for paired end data. - - - name: Strandedness - arguments: - - name: --strand - alternatives: ["-s"] - type: integer - description: | - Perform strand-specific read counting. A single integer value (applied to all input files) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). - choices: [0, 1, 2] - example: 0 - required: false + - name: Read filtering + arguments: + - name: --min_map_quality + alternatives: ["-Q"] + type: integer + description: | + The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default. + required: false + example: 0 + - name: --split_only + type: boolean_true + description: | + Count split alignments only (ie. alignments with CIGAR string containing 'N'). An example of split alignments is exon-spanning reads in RNA-seq data. 
+ - name: --non_split_only + type: boolean_true + description: | + If specified, only non-split alignments (CIGAR strings do not contain letter 'N') will be counted. All the other alignments will be ignored. + - name: --primary + type: boolean_true + description: | + Count primary alignments only. Primary alignments are identified using bit 0x100 in SAM/BAM FLAG field. + - name: --ignore_dup + type: boolean_true + description: | + Ignore duplicate reads in read counting. Duplicate reads are identified using bit 0x400 in BAM/SAM FLAG field. The whole read pair is ignored if one of the reads is a duplicate read for paired end data. + + - name: Strandedness + arguments: + - name: --strand + alternatives: ["-s"] + type: integer + description: | + Perform strand-specific read counting. A single integer value (applied to all input files) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). + choices: [0, 1, 2] + example: 0 + required: false - - name: Exon-exon junctions - arguments: - - name: --ref_fasta - alternatives: ["-G"] - type: file - description: | - Provide the name of a FASTA-format file that contains the reference sequences used in read mapping that produced the provided SAM/BAM files. - required: false - example: reference.fasta - - - name: Parameters specific to paired end reads - arguments: - - name: --paired - alternatives: ["-p"] - type: boolean_true - description: | - Specify that input data contain paired-end reads. To perform fragment counting (ie. counting read pairs), the '--countReadPairs' parameter should also be specified in addition to this parameter. - - name: --count_read_pairs - type: boolean_true - description: | - Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads.
- - name: --both_aligned - alternatives: ["-B"] - type: boolean_true - description: | - Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads. - - name: --check_pe_dist - alternatives: ["-P"] - type: boolean_true - description: | - Check validity of paired-end distance when counting read pairs. Use '--min_length' and '--max_length' to set thresholds. - - name: --min_length - alternatives: ["-d"] - type: integer - description: | - Minimum fragment/template length, 50 by default. - required: false - example: 50 - - name: --max_length - alternatives: ["-D"] - type: integer - description: | - Maximum fragment/template length, 600 by default. - required: false - example: 600 - - name: --same_strand - alternatives: ["-C"] - type: boolean_true - description: | - Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands. - - name: --donotsort - type: boolean_true - description: | - Do not sort reads in BAM/SAM input. Note that reads from the same pair are required to be located next to each other in the input. - - - name: Read groups - arguments: - - name: --by_read_group - type: boolean_true - description: | - Assign reads by read group. "RG" tag is required to be present in the input BAM/SAM files. - - - name: Long reads - arguments: - - name: --long_reads - type: boolean_true - description: | - Count long reads such as Nanopore and PacBio reads. Long read counting can only run in one thread and only reads (not read-pairs) can be counted. There is no limitation on the number of 'M' operations allowed in a CIGAR string in long read counting. - - - name: Assignment results for each read - arguments: - - name: --detailed_results - type: file - direction: output - description: | - Directory to save the detailed assignment results. Use `--detailed_results_format` to determine the format of the detailed results. 
- example: detailed_results/ - required: false - - name: --detailed_results_format - alternatives: ["-R"] - type: string - description: | - Output detailed assignment results for each read or read-pair. Results are saved to a file that is in one of the following formats: CORE, SAM and BAM. See documentaiton for more info about these formats. - required: false - choices: [CORE, SAM, BAM] - - - name: Miscellaneous - arguments: - - name: --max_M_op - type: integer - description: | - Maximum number of 'M' operations allowed in a CIGAR string. 10 by default. Both 'X' and '=' are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string. - required: false - example: 10 - - name: --verbose - type: boolean_true - description: | - Output verbose information for debugging, such as un-matched chromosome/contig names. + - name: Exon-exon junctions + arguments: + - name: --ref_fasta + alternatives: ["-G"] + type: file + description: | + Provide the name of a FASTA-format file that contains the reference sequences used in read mapping that produced the provided SAM/BAM files. + required: false + example: reference.fasta + + - name: Parameters specific to paired end reads + arguments: + - name: --paired + alternatives: ["-p"] + type: boolean_true + description: | + Specify that input data contain paired-end reads. To perform fragment counting (ie. counting read pairs), the '--count_read_pairs' parameter should also be specified in addition to this parameter. + - name: --count_read_pairs + type: boolean_true + description: | + Count read pairs (fragments) instead of reads. This option is only applicable for paired-end reads. + - name: --both_aligned + alternatives: ["-B"] + type: boolean_true + description: | + Only count read pairs that have both ends aligned. This option is only applicable for paired-end reads. + - name: --check_pe_dist + alternatives: ["-P"] + type: boolean_true + description: | + Check validity of paired-end distance when counting read pairs.
Use '--min_length' and '--max_length' to set thresholds. + - name: --min_length + alternatives: ["-d"] + type: integer + description: | + Minimum fragment/template length, 50 by default. + required: false + example: 50 + - name: --max_length + alternatives: ["-D"] + type: integer + description: | + Maximum fragment/template length, 600 by default. + required: false + example: 600 + - name: --same_strand + alternatives: ["-C"] + type: boolean_true + description: | + Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands. + - name: --donotsort + type: boolean_true + description: | + Do not sort reads in BAM/SAM input. Note that reads from the same pair are required to be located next to each other in the input. + + - name: Read groups + arguments: + - name: --by_read_group + type: boolean_true + description: | + Assign reads by read group. "RG" tag is required to be present in the input BAM/SAM files. - resources: - - type: bash_script - path: script.sh + - name: Long reads + arguments: + - name: --long_reads + type: boolean_true + description: | + Count long reads such as Nanopore and PacBio reads. Long read counting can only run in one thread and only reads (not read-pairs) can be counted. There is no limitation on the number of 'M' operations allowed in a CIGAR string in long read counting. + + - name: Assignment results for each read + arguments: + - name: --detailed_results + type: file + direction: output + description: | + Directory to save the detailed assignment results. Use `--detailed_results_format` to determine the format of the detailed results. + example: detailed_results/ + required: false + - name: --detailed_results_format + alternatives: ["-R"] + type: string + description: | + Output detailed assignment results for each read or read-pair. Results are saved to a file that is in one of the following formats: CORE, SAM and BAM. 
See documentation for more info about these formats. + required: false + choices: [CORE, SAM, BAM] + + - name: Miscellaneous + arguments: + - name: --max_M_op + type: integer + description: | + Maximum number of 'M' operations allowed in a CIGAR string. 10 by default. Both 'X' and '=' are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string. + required: false + example: 10 + - name: --verbose + type: boolean_true + description: | + Output verbose information for debugging, such as un-matched chromosome/contig names. + +resources: + - type: bash_script + path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data -platforms: +engines: - type: docker image: quay.io/biocontainers/subread:2.0.6--he4a0461_0 setup: - type: docker run: | featureCounts -v 2>&1 | sed 's/featureCounts v\([0-9.]*\)/featureCounts: \1/' > /var/software_versions.txt +runners: + - type: executable - type: nextflow \ No newline at end of file diff --git a/src/lofreq/call/config.vsh.yaml b/src/lofreq/call/config.vsh.yaml index 97b98e6f..c547de9d 100644 --- a/src/lofreq/call/config.vsh.yaml +++ b/src/lofreq/call/config.vsh.yaml @@ -1,245 +1,243 @@ -functionality: - name: lofreq_call - namespace: lofreq - description: | - Call variants from a BAM file. +name: lofreq_call +namespace: lofreq +description: | + Call variants from a BAM file. - LoFreq* (i.e. LoFreq version 2) is a fast and sensitive variant-caller for inferring SNVs and indels from next-generation sequencing data. It makes full use of base-call qualities and other sources of errors inherent in sequencing (e.g. mapping or base/indel alignment uncertainty), which are usually ignored by other methods or only used for filtering. + LoFreq* (i.e. LoFreq version 2) is a fast and sensitive variant-caller for inferring SNVs and indels from next-generation sequencing data.
It makes full use of base-call qualities and other sources of errors inherent in sequencing (e.g. mapping or base/indel alignment uncertainty), which are usually ignored by other methods or only used for filtering. - LoFreq* can run on almost any type of aligned sequencing data (e.g. Illumina, IonTorrent or Pacbio) since no machine- or sequencing-technology dependent thresholds are used. It automatically adapts to changes in coverage and sequencing quality and can therefore be applied to a variety of data-sets e.g. viral/quasispecies, bacterial, metagenomics or somatic data. + LoFreq* can run on almost any type of aligned sequencing data (e.g. Illumina, IonTorrent or Pacbio) since no machine- or sequencing-technology dependent thresholds are used. It automatically adapts to changes in coverage and sequencing quality and can therefore be applied to a variety of data-sets e.g. viral/quasispecies, bacterial, metagenomics or somatic data. - LoFreq* is very sensitive; most notably, it is able to predict variants below the average base-call quality (i.e. sequencing error rate). Each variant call is assigned a p-value which allows for rigorous false positive control. Even though it uses no approximations or heuristics, it is very efficient due to several runtime optimizations and also provides a (pseudo-)parallel implementation. LoFreq* is generic and fast enough to be applied to high-coverage data and large genomes. On a single processor it takes a minute to analyze Dengue genome sequencing data with nearly 4000X coverage, roughly one hour to call SNVs on a 600X coverage E.coli genome and also roughly an hour to run on a 100X coverage human exome dataset. 
- info: - keywords: [ "variant calling", "low frequancy variant calling", "lofreq", "lofreq/call"] - links: - homepage: https://csb5.github.io/lofreq/ - documentation: https://csb5.github.io/lofreq/commands/ - reference: - doi: 10.1093/nar/gks918 - license: "MIT" - requirements: - commands: [ lofreq ] - argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - description: | - Input BAM file. - required: true - example: "normal.bam" - - name: --input_bai - type: file - description: | - Index file for the input BAM file. - required: true - example: "normal.bai" - - name: --ref - alternatives: -f - type: file - description: | - Indexed reference fasta file (gzip supported). Default: none. - required: true - example: "reference.fasta" - - name: Outputs - arguments: - - name: --out - alternatives: -o - type: file - description: | - Vcf output file. Default: stdout. - required: true - direction: output - example: "output.vcf" - - name: Arguments - arguments: - - name: --region - alternatives: -r - type: string - description: | - Limit calls to this region (chrom:start-end). Default: none. - required: false - example: "chr1:1000-2000" - - name: --bed - alternatives: -l - type: file - description: | - List of positions (chr pos) or regions (BED). Default: none. - required: false - example: "regions.bed" - - name: --min_bq - alternatives: -q - type: integer - description: | - Skip any base with baseQ smaller than INT. Default: 6. - required: false - example: 6 - - name: --min_alt_bq - alternatives: -Q - type: integer - description: | - Skip alternate bases with baseQ smaller than INT. Default: 6. - required: false - example: 6 - - name: --def_alt_bq - alternatives: -R - type: integer - description: | - Overwrite baseQs of alternate bases (that passed bq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. 
- required: false - example: 0 - - name: --min_jq - alternatives: -j - type: integer - description: | - Skip any base with joinedQ smaller than INT. Default: 0. - example: 0 - - name: --min_alt_jq - alternatives: -J - type: integer - description: | - Skip alternate bases with joinedQ smaller than INT. Default: 0. - required: false - example: 0 - - name: --def_alt_jq - alternatives: -K - type: integer - description: | - Overwrite joinedQs of alternate bases (that passed jq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. - required: false - example: 0 - - name: --no_baq - alternatives: -B - type: boolean_true - description: | - Disable use of base-alignment quality (BAQ). - - name: --no_idaq - alternatives: -A - type: boolean_true - description: | - Don't use IDAQ values (NOT recommended under ANY circumstances other than debugging). - - name: --del_baq - alternatives: -D - type: boolean_true - description: | - Delete pre-existing BAQ values, i.e. compute even if already present in BAM. - - name: --no_ext_baq - alternatives: -e - type: boolean_true - description: | - Use 'normal' BAQ (samtools default) instead of extended BAQ (both computed on the fly if not already present in lb tag). - - name: --min_mq - alternatives: -m - type: integer - description: | - Skip reads with mapping quality smaller than INT. Default: 0. - required: false - example: 0 - - name: --max_mq - alternatives: -M - type: integer - description: | - Cap mapping quality at INT. Default: 255. - required: false - example: 255 - - name: --no_mq - alternatives: -N - type: boolean_true - description: | - Don't merge mapping quality in LoFreq's model. - - name: --call_indels - type: boolean_true - description: | - Enable indel calls (note: preprocess your file to include indel alignment qualities!). - - name: --only_indels - type: boolean_true - description: | - Only call indels; no SNVs. 
- - name: --src_qual - alternatives: -s - type: boolean_true - description: | - Enable computation of source quality. - - name: --ign_vcf - alternatives: -S - type: file - description: | - Ignore variants in this vcf file for source quality computation. Multiple files can be given separated by commas. - required: false - example: "variants.vcf" - - name: --def_nm_q - alternatives: -T - type: integer - description: | - If >= 0, then replace non-match base qualities with this default value. Default: -1. - required: false - example: -1 - - name: --sig - alternatives: -a - type: double - description: | - P-Value cutoff / significance level. Default: 0.010000. - required: false - example: 0.01 - - name: --bonf - alternatives: -b - type: string - description: | - Bonferroni factor. 'dynamic' (increase per actually performed test) or INT. Default: Dynamic. - required: false - example: "dynamic" - - name: --min_cov - alternatives: -C - type: integer - description: | - Test only positions having at least this coverage. Default: 1. - (note: without --no-default-filter default filters (incl. coverage) kick in after predictions are done). - required: false - example: 1 - - name: --max_depth - alternatives: -d - type: integer - description: | - Cap coverage at this depth. Default: 1000000. - required: false - example: 1000000 - - name: --illumina_13 - type: boolean_true - description: | - Assume the quality is Illumina-1.3-1.7/ASCII+64 encoded. - - name: --use_orphan - type: boolean_true - description: | - Count anomalous read pairs (i.e. where mate is not aligned properly). - - name: --plp_summary_only - type: boolean_true - description: | - No variant calling. Just output pileup summary per column. - - name: --no_default_filter - type: boolean_true - description: | - Don't run default 'lofreq filter' automatically after calling variants. - - name: --force_overwrite - type: boolean_true - description: | - Overwrite any existing output. 
- - name: --verbose - type: boolean_true - description: | - Be verbose. - - name: --debug - type: boolean_true - description: | - Enable debugging. - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + LoFreq* is very sensitive; most notably, it is able to predict variants below the average base-call quality (i.e. sequencing error rate). Each variant call is assigned a p-value which allows for rigorous false positive control. Even though it uses no approximations or heuristics, it is very efficient due to several runtime optimizations and also provides a (pseudo-)parallel implementation. LoFreq* is generic and fast enough to be applied to high-coverage data and large genomes. On a single processor it takes a minute to analyze Dengue genome sequencing data with nearly 4000X coverage, roughly one hour to call SNVs on a 600X coverage E.coli genome and also roughly an hour to run on a 100X coverage human exome dataset. +keywords: [ "variant calling", "low frequancy variant calling", "lofreq", "lofreq/call"] +links: + homepage: https://csb5.github.io/lofreq/ + documentation: https://csb5.github.io/lofreq/commands/ +references: + doi: 10.1093/nar/gks918 +license: "MIT" +requirements: + commands: [ lofreq ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: | + Input BAM file. + required: true + example: "normal.bam" + - name: --input_bai + type: file + description: | + Index file for the input BAM file. + required: true + example: "normal.bai" + - name: --ref + alternatives: -f + type: file + description: | + Indexed reference fasta file (gzip supported). Default: none. + required: true + example: "reference.fasta" + - name: Outputs + arguments: + - name: --out + alternatives: -o + type: file + description: | + Vcf output file. Default: stdout. 
+ required: true + direction: output + example: "output.vcf" + - name: Arguments + arguments: + - name: --region + alternatives: -r + type: string + description: | + Limit calls to this region (chrom:start-end). Default: none. + required: false + example: "chr1:1000-2000" + - name: --bed + alternatives: -l + type: file + description: | + List of positions (chr pos) or regions (BED). Default: none. + required: false + example: "regions.bed" + - name: --min_bq + alternatives: -q + type: integer + description: | + Skip any base with baseQ smaller than INT. Default: 6. + required: false + example: 6 + - name: --min_alt_bq + alternatives: -Q + type: integer + description: | + Skip alternate bases with baseQ smaller than INT. Default: 6. + required: false + example: 6 + - name: --def_alt_bq + alternatives: -R + type: integer + description: | + Overwrite baseQs of alternate bases (that passed bq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. + required: false + example: 0 + - name: --min_jq + alternatives: -j + type: integer + description: | + Skip any base with joinedQ smaller than INT. Default: 0. + example: 0 + - name: --min_alt_jq + alternatives: -J + type: integer + description: | + Skip alternate bases with joinedQ smaller than INT. Default: 0. + required: false + example: 0 + - name: --def_alt_jq + alternatives: -K + type: integer + description: | + Overwrite joinedQs of alternate bases (that passed jq filter) with this value (-1: use median ref-bq; 0: keep). Default: 0. + required: false + example: 0 + - name: --no_baq + alternatives: -B + type: boolean_true + description: | + Disable use of base-alignment quality (BAQ). + - name: --no_idaq + alternatives: -A + type: boolean_true + description: | + Don't use IDAQ values (NOT recommended under ANY circumstances other than debugging). + - name: --del_baq + alternatives: -D + type: boolean_true + description: | + Delete pre-existing BAQ values, i.e. compute even if already present in BAM. 
+ - name: --no_ext_baq + alternatives: -e + type: boolean_true + description: | + Use 'normal' BAQ (samtools default) instead of extended BAQ (both computed on the fly if not already present in lb tag). + - name: --min_mq + alternatives: -m + type: integer + description: | + Skip reads with mapping quality smaller than INT. Default: 0. + required: false + example: 0 + - name: --max_mq + alternatives: -M + type: integer + description: | + Cap mapping quality at INT. Default: 255. + required: false + example: 255 + - name: --no_mq + alternatives: -N + type: boolean_true + description: | + Don't merge mapping quality in LoFreq's model. + - name: --call_indels + type: boolean_true + description: | + Enable indel calls (note: preprocess your file to include indel alignment qualities!). + - name: --only_indels + type: boolean_true + description: | + Only call indels; no SNVs. + - name: --src_qual + alternatives: -s + type: boolean_true + description: | + Enable computation of source quality. + - name: --ign_vcf + alternatives: -S + type: file + description: | + Ignore variants in this vcf file for source quality computation. Multiple files can be given separated by commas. + required: false + example: "variants.vcf" + - name: --def_nm_q + alternatives: -T + type: integer + description: | + If >= 0, then replace non-match base qualities with this default value. Default: -1. + required: false + example: -1 + - name: --sig + alternatives: -a + type: double + description: | + P-Value cutoff / significance level. Default: 0.010000. + required: false + example: 0.01 + - name: --bonf + alternatives: -b + type: string + description: | + Bonferroni factor. 'dynamic' (increase per actually performed test) or INT. Default: Dynamic. + required: false + example: "dynamic" + - name: --min_cov + alternatives: -C + type: integer + description: | + Test only positions having at least this coverage. Default: 1. + (note: without --no-default-filter default filters (incl. 
coverage) kick in after predictions are done). + required: false + example: 1 + - name: --max_depth + alternatives: -d + type: integer + description: | + Cap coverage at this depth. Default: 1000000. + required: false + example: 1000000 + - name: --illumina_13 + type: boolean_true + description: | + Assume the quality is Illumina-1.3-1.7/ASCII+64 encoded. + - name: --use_orphan + type: boolean_true + description: | + Count anomalous read pairs (i.e. where mate is not aligned properly). + - name: --plp_summary_only + type: boolean_true + description: | + No variant calling. Just output pileup summary per column. + - name: --no_default_filter + type: boolean_true + description: | + Don't run default 'lofreq filter' automatically after calling variants. + - name: --force_overwrite + type: boolean_true + description: | + Overwrite any existing output. + - name: --verbose + type: boolean_true + description: | + Be verbose. + - name: --debug + type: boolean_true + description: | + Enable debugging. +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/lofreq:2.1.5--py38h794fc9e_10 setup: @@ -247,5 +245,6 @@ platforms: run: | version=$(lofreq version | grep 'version' | sed 's/version: //') && \ echo "lofreq: $version" > /var/software_versions.txt +runners: + - type: executable - type: nextflow - diff --git a/src/lofreq/indelqual/config.vsh.yaml b/src/lofreq/indelqual/config.vsh.yaml index 821d5d72..0524458e 100644 --- a/src/lofreq/indelqual/config.vsh.yaml +++ b/src/lofreq/indelqual/config.vsh.yaml @@ -1,77 +1,75 @@ -functionality: - name: lofreq_indelqual - namespace: lofreq - description: | - Insert indel qualities into BAM file (required for indel predictions). +name: lofreq_indelqual +namespace: lofreq +description: | + Insert indel qualities into BAM file (required for indel predictions). 
- The preferred way of inserting indel qualities should be via GATK's BQSR (>=2) If that's not possible, use this subcommand. - The command has two modes: 'uniform' and 'dindel': - - 'uniform' will assign a given value uniformly, whereas - - 'dindel' will insert indel qualities based on Dindel (PMID 20980555). - Both will overwrite any existing values. - Do not realign your BAM file afterwards! - info: - keywords: [ "bam", "indel", "qualities", "indelqual", "lofreq", "lofreq/indelqual"] - links: - homepage: https://csb5.github.io/lofreq/ - documentation: https://csb5.github.io/lofreq/commands/ - reference: - doi: 10.1093/nar/gks918 - license: "MIT" - requirements: - commands: [ lofreq ] - argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - description: | - Input BAM file. - required: true - example: "normal.bam" - - name: --ref - alternatives: -f - type: file - description: | - Reference sequence used for mapping (Only required for --dindel). - required: false - example: "reference.fasta" - - name: Outputs - arguments: - - name: --out - alternatives: -o - type: file - description: | - Output BAM file. - required: true - direction: output - example: "output.bam" - - name: Arguments - arguments: - - name: --uniform - alternatives: -u - type: string - description: | - Add this indel quality uniformly to all bases. Use two comma separated values to specify insertion and deletion quality separately. (clashes with --dindel). - required: false - example: "50,50" - - name: --dindel - type: boolean_true - description: | - Add Dindel's indel qualities (Illumina specific) (clashes with -u; needs --ref). - - name: --verbose - type: boolean_true - description: | - Be verbose. - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: + The preferred way of inserting indel qualities should be via GATK's BQSR (>=2) If that's not possible, use this subcommand. 
+ The command has two modes: 'uniform' and 'dindel': + - 'uniform' will assign a given value uniformly, whereas + - 'dindel' will insert indel qualities based on Dindel (PMID 20980555). + Both will overwrite any existing values. + Do not realign your BAM file afterwards! +keywords: [ "bam", "indel", "qualities", "indelqual", "lofreq", "lofreq/indelqual"] +links: + homepage: https://csb5.github.io/lofreq/ + documentation: https://csb5.github.io/lofreq/commands/ +references: + doi: 10.1093/nar/gks918 +license: "MIT" +requirements: + commands: [ lofreq ] +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: | + Input BAM file. + required: true + example: "normal.bam" + - name: --ref + alternatives: -f + type: file + description: | + Reference sequence used for mapping (Only required for --dindel). + required: false + example: "reference.fasta" + - name: Outputs + arguments: + - name: --out + alternatives: -o + type: file + description: | + Output BAM file. + required: true + direction: output + example: "output.bam" + - name: Arguments + arguments: + - name: --uniform + alternatives: -u + type: string + description: | + Add this indel quality uniformly to all bases. Use two comma separated values to specify insertion and deletion quality separately. (clashes with --dindel). + required: false + example: "50,50" + - name: --dindel + type: boolean_true + description: | + Add Dindel's indel qualities (Illumina specific) (clashes with -u; needs --ref). + - name: --verbose + type: boolean_true + description: | + Be verbose. 
+resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/lofreq:2.1.5--py38h794fc9e_10 setup: @@ -79,4 +77,6 @@ platforms: run: | version=$(lofreq version | grep 'version' | sed 's/version: //') && \ echo "lofreq: $version" > /var/software_versions.txt +runners: + - type: executable - type: nextflow diff --git a/src/pear/config.vsh.yaml b/src/pear/config.vsh.yaml index 53921baa..d6dbe6c9 100644 --- a/src/pear/config.vsh.yaml +++ b/src/pear/config.vsh.yaml @@ -1,156 +1,154 @@ -functionality: - name: pear - description: | - PEAR is an ultrafast, memory-efficient and highly accurate pair-end read merger. It is fully parallelized and can run with as low as just a few kilobytes of memory. - - PEAR evaluates all possible paired-end read overlaps and without requiring the target fragment size as input. In addition, it implements a statistical test for minimizing false-positive results. Together with a highly optimized implementation, it can merge millions of paired end reads within a couple of minutes on a standard desktop computer. 
- info: - keywords: [ "pair-end", "read", "merge" ] - links: - homepage: https://cme.h-its.org/exelixis/web/software/pear - repository: https://github.com/tseemann/PEAR - documentation: https://cme.h-its.org/exelixis/web/software/pear/doc.html - references: - doi: 10.1093/bioinformatics/btt593 - license: "CC-BY-NC-SA-3.0" - requirements: - commands: [ pear , gzip ] - argument_groups: - - name: Inputs - arguments: - - name: --forward_fastq - alternatives: -f - type: file - description: Forward paired-end FASTQ file - required: true - example: "forward.fastq" - - name: --reverse_fastq - alternatives: -r - type: file - description: Reverse paired-end FASTQ file - required: true - example: "reverse.fastq" - - name: Outputs - arguments: - - name: --assembled - type: file - description: The output file containing assembled reads. Can be compressed with gzip. - required: true - direction: output - - name: --unassembled_forward - type: file - description: The output file containing forward reads that could not be assembled. Can be compressed with gzip. - required: true - direction: output - - name: --unassembled_reverse - type: file - description: The output file containing reverse reads that could not be assembled. Can be compressed with gzip. - required: true - direction: output - - name: --discarded - type: file - description: The output file containing reads that were discarded due to too low quality or too many uncalled bases. Can be compressed with gzip. - required: true - direction: output - - name: Arguments - arguments: - - name: --p_value - alternatives: -p - type: double - description: | - Specify a p-value for the statistical test. If the computed p-value of a possible assembly exceeds the specified p-value then paired-end read will not be assembled. Valid options are: 0.0001, 0.001, 0.01, 0.05 and 1.0. Setting 1.0 disables the test. 
- example: 0.01 - required: false - - name: --min_overlap - alternatives: -v - type: integer - description: | - Specify the minimum overlap size. The minimum overlap may be set to 1 when the statistical test is used. However, further restricting the minimum overlap size to a proper value may reduce false-positive assembles. - required: false - example: 10 - - name: --max_assembly_length - alternatives: -m - type: integer - description: | - Specify the maximum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary long. - required: false - example: 0 - - name: --min_assembly_length - alternatives: -n - type: integer - description: | - Specify the minimum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary short. - required: false - example: 0 - - name: --min_trim_length - alternatives: -t - type: integer - description: | - Specify the minimum length of reads after trimming the low quality part (see option -q) - required: false - example: 1 - - name: --quality_threshold - alternatives: -q - type: integer - description: | - Specify the quality threshold for trimming the low quality part of a read. If the quality scores of two consecutive bases are strictly less than the specified threshold, the rest of the read will be trimmed. - required: false - example: 0 - - name: --max_uncalled_base - alternatives: -u - type: double - description: | - Specify the maximal proportion of uncalled bases in a read. Setting this value to 0 will cause PEAR to discard all reads containing uncalled bases. The other extreme setting is 1 which causes PEAR to process all reads independent on the number of uncalled bases. - example: 1.0 - required: false - - name: --test_method - alternatives: -g - type: integer - description: | - Specify the type of statistical test. Two options are available. 
1: Given the minimum allowed overlap, test using the highest OES. Note that due to its discrete nature, this test usually yields a lower p-value for the assembled read than the cut- off (specified by -p). For example, setting the cut-off to 0.05 using this test, the assembled reads might have an actual p-value of 0.02. - 2. Use the acceptance probability (m.a.p). This test methods computes the same probability as test method 1. However, it assumes that the minimal overlap is the observed overlap with the highest OES, instead of the one specified by -v. Therefore, this is not a valid statistical test and the 'p-value' is in fact the maximal probability for accepting the assembly. Nevertheless, we observed in practice that for the case the actual overlap sizes are relatively small, test 2 can correctly assemble more reads with only slightly higher false-positive rate. - required: false - example: 1 - - name: --emperical_freqs - alternatives: -e - type: boolean_true - description: | - Disable empirical base frequencies. - - name: --score_method - alternatives: -s - type: integer - description: | - Specify the scoring method. 1. OES with +1 for match and -1 for mismatch. 2: Assembly score (AS). Use +1 for match and -1 for mismatch multiplied by base quality scores. 3: Ignore quality scores and use +1 for a match and -1 for a mismatch. - required: false - example: 2 - - name: --phred_base - alternatives: -b - type: integer - description: | - Base PHRED quality score. - required: false - example: 33 - - name: --cap - alternatives: -c - type: integer - description: | - Specify the upper bound for the resulting quality score. If set to zero, capping is disabled. 
- required: false - example: 40 - - name: --nbase - alternatives: -z - type: boolean_true - description: | - When merging a base-pair that consists of two non-equal bases out of which none is degenerate, set the merged base to N and use the highest quality score of the two bases - resources: - - type: bash_script - path: script.sh - test_resources: - - type: bash_script - path: test.sh - - type: file - path: test_data -platforms: +name: pear +description: | + PEAR is an ultrafast, memory-efficient and highly accurate pair-end read merger. It is fully parallelized and can run with as low as just a few kilobytes of memory. + + PEAR evaluates all possible paired-end read overlaps and without requiring the target fragment size as input. In addition, it implements a statistical test for minimizing false-positive results. Together with a highly optimized implementation, it can merge millions of paired end reads within a couple of minutes on a standard desktop computer. +keywords: [ "pair-end", "read", "merge" ] +links: + homepage: https://cme.h-its.org/exelixis/web/software/pear + repository: https://github.com/tseemann/PEAR + documentation: https://cme.h-its.org/exelixis/web/software/pear/doc.html +references: + doi: 10.1093/bioinformatics/btt593 +license: "CC-BY-NC-SA-3.0" +requirements: + commands: [ pear , gzip ] +argument_groups: + - name: Inputs + arguments: + - name: --forward_fastq + alternatives: -f + type: file + description: Forward paired-end FASTQ file + required: true + example: "forward.fastq" + - name: --reverse_fastq + alternatives: -r + type: file + description: Reverse paired-end FASTQ file + required: true + example: "reverse.fastq" + - name: Outputs + arguments: + - name: --assembled + type: file + description: The output file containing assembled reads. Can be compressed with gzip. + required: true + direction: output + - name: --unassembled_forward + type: file + description: The output file containing forward reads that could not be assembled. 
Can be compressed with gzip. + required: true + direction: output + - name: --unassembled_reverse + type: file + description: The output file containing reverse reads that could not be assembled. Can be compressed with gzip. + required: true + direction: output + - name: --discarded + type: file + description: The output file containing reads that were discarded due to too low quality or too many uncalled bases. Can be compressed with gzip. + required: true + direction: output + - name: Arguments + arguments: + - name: --p_value + alternatives: -p + type: double + description: | + Specify a p-value for the statistical test. If the computed p-value of a possible assembly exceeds the specified p-value then paired-end read will not be assembled. Valid options are: 0.0001, 0.001, 0.01, 0.05 and 1.0. Setting 1.0 disables the test. + example: 0.01 + required: false + - name: --min_overlap + alternatives: -v + type: integer + description: | + Specify the minimum overlap size. The minimum overlap may be set to 1 when the statistical test is used. However, further restricting the minimum overlap size to a proper value may reduce false-positive assembles. + required: false + example: 10 + - name: --max_assembly_length + alternatives: -m + type: integer + description: | + Specify the maximum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary long. + required: false + example: 0 + - name: --min_assembly_length + alternatives: -n + type: integer + description: | + Specify the minimum possible length of the assembled sequences. Setting this value to 0 disables the restriction and assembled sequences may be arbitrary short. 
+ required: false + example: 0 + - name: --min_trim_length + alternatives: -t + type: integer + description: | + Specify the minimum length of reads after trimming the low quality part (see option -q) + required: false + example: 1 + - name: --quality_threshold + alternatives: -q + type: integer + description: | + Specify the quality threshold for trimming the low quality part of a read. If the quality scores of two consecutive bases are strictly less than the specified threshold, the rest of the read will be trimmed. + required: false + example: 0 + - name: --max_uncalled_base + alternatives: -u + type: double + description: | + Specify the maximal proportion of uncalled bases in a read. Setting this value to 0 will cause PEAR to discard all reads containing uncalled bases. The other extreme setting is 1 which causes PEAR to process all reads independent of the number of uncalled bases. + example: 1.0 + required: false + - name: --test_method + alternatives: -g + type: integer + description: | + Specify the type of statistical test. Two options are available. 1: Given the minimum allowed overlap, test using the highest OES. Note that due to its discrete nature, this test usually yields a lower p-value for the assembled read than the cut-off (specified by -p). For example, setting the cut-off to 0.05 using this test, the assembled reads might have an actual p-value of 0.02. + 2. Use the acceptance probability (m.a.p). This test method computes the same probability as test method 1. However, it assumes that the minimal overlap is the observed overlap with the highest OES, instead of the one specified by -v. Therefore, this is not a valid statistical test and the 'p-value' is in fact the maximal probability for accepting the assembly. Nevertheless, we observed in practice that for the case the actual overlap sizes are relatively small, test 2 can correctly assemble more reads with only slightly higher false-positive rate. 
+ required: false + example: 1 + - name: --emperical_freqs + alternatives: -e + type: boolean_true + description: | + Disable empirical base frequencies. + - name: --score_method + alternatives: -s + type: integer + description: | + Specify the scoring method. 1. OES with +1 for match and -1 for mismatch. 2: Assembly score (AS). Use +1 for match and -1 for mismatch multiplied by base quality scores. 3: Ignore quality scores and use +1 for a match and -1 for a mismatch. + required: false + example: 2 + - name: --phred_base + alternatives: -b + type: integer + description: | + Base PHRED quality score. + required: false + example: 33 + - name: --cap + alternatives: -c + type: integer + description: | + Specify the upper bound for the resulting quality score. If set to zero, capping is disabled. + required: false + example: 40 + - name: --nbase + alternatives: -z + type: boolean_true + description: | + When merging a base-pair that consists of two non-equal bases out of which none is degenerate, set the merged base to N and use the highest quality score of the two bases +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +engines: - type: docker image: quay.io/biocontainers/pear:0.9.6--h9d449c0_10 setup: @@ -158,4 +156,6 @@ platforms: run: | version=$(pear -h | grep 'PEAR v' | sed 's/PEAR v//' | sed 's/ .*//') && \ echo "pear: $version" > /var/software_versions.txt +runners: + - type: executable - type: nextflow \ No newline at end of file From 8acef309bf08b3c99c720b54c2ebdda433bdc710 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 27 Feb 2024 14:29:21 +0100 Subject: [PATCH 2/3] Add star_align_reads component (#22) * add star align component * refactor script * refactor variables and script * ensure zcat and bzcat are also installed * change utf8 into ascii * clean up quotes * fix renamed argument * better utf8 to ascii conversion * update format to viash 0.9 * add star 
version to docker * add TODOs * add newline * wip check contents of star output * remove test data * rename component * remove todo * remove starsolo params * Apply suggestions from code review Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> * add missing slash --------- Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- CHANGELOG.md | 2 + .../star_align_reads/argument_groups.yaml | 1088 +++++++++++++++++ src/star/star_align_reads/config.vsh.yaml | 115 ++ src/star/star_align_reads/help.txt | 927 ++++++++++++++ src/star/star_align_reads/script.py | 109 ++ src/star/star_align_reads/test.sh | 173 +++ .../star_align_reads/utils/process_params.R | 189 +++ 7 files changed, 2603 insertions(+) create mode 100644 src/star/star_align_reads/argument_groups.yaml create mode 100644 src/star/star_align_reads/config.vsh.yaml create mode 100644 src/star/star_align_reads/help.txt create mode 100644 src/star/star_align_reads/script.py create mode 100644 src/star/star_align_reads/test.sh create mode 100644 src/star/star_align_reads/utils/process_params.R diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f6fd740..bd6a639a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ * `lofreq/indelqual`: Insert indel qualities into BAM file (PR #17). +* `star/star_align_reads`: Align reads to a reference genome (PR #22). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/star/star_align_reads/argument_groups.yaml b/src/star/star_align_reads/argument_groups.yaml new file mode 100644 index 00000000..e6a1c874 --- /dev/null +++ b/src/star/star_align_reads/argument_groups.yaml @@ -0,0 +1,1088 @@ +argument_groups: +- name: Run Parameters + arguments: + - name: --runRNGseed + type: integer + description: random number generator seed. 
+ example: 777 +- name: Genome Parameters + arguments: + - name: --genomeDir + type: file + description: path to the directory where genome files are stored (for --runMode + alignReads) or will be generated (for --runMode generateGenome) + example: ./GenomeDir/ + required: yes + - name: --genomeLoad + type: string + description: |- + mode of shared memory usage for the genome files. Only used with --runMode alignReads. + + - LoadAndKeep ... load genome into shared and keep it in memory after run + - LoadAndRemove ... load genome into shared but remove it after run + - LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + - Remove ... do not map anything, just remove loaded genome from memory + - NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + example: NoSharedMemory + - name: --genomeFastaFiles + type: file + description: |- + path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + multiple: yes + multiple_sep: ; + - name: --genomeFileSizes + type: integer + description: genome files exact sizes in bytes. Typically, this should not be + defined by the user. + example: 0 + multiple: yes + multiple_sep: ; + - name: --genomeTransformOutput + type: string + description: |- + which output to transform back to original genome + + - SAM ... SAM/BAM alignments + - SJ ... splice junctions (SJ.out.tab) + - Quant ... quantifications (from --quantMode option) + - None ... no transformation of the output + multiple: yes + multiple_sep: ; + - name: --genomeChrSetMitochondrial + type: string + description: names of the mitochondrial chromosomes. 
Presently only used for STARsolo + statistics output/ + example: + - chrM + - M + - MT + multiple: yes + multiple_sep: ; +- name: Splice Junctions Database + arguments: + - name: --sjdbFileChrStartEnd + type: string + description: path to the files with genomic coordinates (chr start + end strand) for the splice junction introns. Multiple files can be supplied + and will be concatenated. + multiple: yes + multiple_sep: ; + - name: --sjdbGTFfile + type: file + description: path to the GTF file with annotations + - name: --sjdbGTFchrPrefix + type: string + description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSEMBL + annotations with UCSC genomes) + - name: --sjdbGTFfeatureExon + type: string + description: feature type in GTF file to be used as exons for building transcripts + example: exon + - name: --sjdbGTFtagExonParentTranscript + type: string + description: GTF attribute name for parent transcript ID (default "transcript_id" + works for GTF files) + example: transcript_id + - name: --sjdbGTFtagExonParentGene + type: string + description: GTF attribute name for parent gene ID (default "gene_id" works for + GTF files) + example: gene_id + - name: --sjdbGTFtagExonParentGeneName + type: string + description: GTF attribute name for parent gene name + example: gene_name + multiple: yes + multiple_sep: ; + - name: --sjdbGTFtagExonParentGeneType + type: string + description: GTF attribute name for parent gene type + example: + - gene_type + - gene_biotype + multiple: yes + multiple_sep: ; + - name: --sjdbOverhang + type: integer + description: length of the donor/acceptor sequence on each side of the junctions, + ideally = (mate_length - 1) + example: 100 + - name: --sjdbScore + type: integer + description: extra alignment score for alignments that cross database junctions + example: 2 + - name: --sjdbInsertSave + type: string + description: |- + which files to save when sjdb junctions are inserted on the fly at the mapping step + + - Basic ... 
only small junction / transcript files + - All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + example: Basic +- name: Variation parameters + arguments: + - name: --varVCFfile + type: string + description: path to the VCF file that contains variation data. The 10th column + should contain the genotype information, e.g. 0/1 +- name: Read Parameters + arguments: + - name: --readFilesType + type: string + description: |- + format of input read files + + - Fastx ... FASTA or FASTQ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + - SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + example: Fastx + - name: --readFilesSAMattrKeep + type: string + description: |- + for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMattrKeep RG PL + + - All ... keep all tags + - None ... do not keep any tags + example: All + multiple: yes + multiple_sep: ; + - name: --readFilesManifest + type: file + description: |- + path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be copied verbatim into SAM @RG header line. + - name: --readFilesPrefix + type: string + description: prefix for the read files names, i.e. it will be added in front of + the strings in --readFilesIn + - name: --readFilesCommand + type: string + description: |- + command line to execute for each of the input file. 
This command should generate FASTA or FASTQ text and send it to stdout + + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + multiple: yes + multiple_sep: ; + - name: --readMapNumber + type: integer + description: |- + number of reads to map from the beginning of the file + + -1: map all reads + example: -1 + - name: --readMatesLengthsIn + type: string + description: Equal/NotEqual - lengths of names,sequences,qualities for both mates + are the same / not the same. NotEqual is safe in all situations. + example: NotEqual + - name: --readNameSeparator + type: string + description: character(s) separating the part of the read names that will be trimmed + in output (read name after space is always trimmed) + example: / + multiple: yes + multiple_sep: ; + - name: --readQualityScoreBase + type: integer + description: number to be subtracted from the ASCII code to get Phred quality + score + example: 33 +- name: Read Clipping + arguments: + - name: --clipAdapterType + type: string + description: |- + adapter clipping type + + - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal + - None ... no adapter clipping, all other clip* parameters are disregarded + example: Hamming + - name: --clip3pNbases + type: integer + description: number(s) of bases to clip from 3p of each mate. If one value is + given, it will be assumed the same for both mates. + example: 0 + multiple: yes + multiple_sep: ; + - name: --clip3pAdapterSeq + type: string + description: |- + adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + + - polyA ... 
polyA sequence with the length equal to read length + multiple: yes + multiple_sep: ; + - name: --clip3pAdapterMMp + type: double + description: max proportion of mismatches for 3p adapter clipping for each mate. If + one value is given, it will be assumed the same for both mates. + example: 0.1 + multiple: yes + multiple_sep: ; + - name: --clip3pAfterAdapterNbases + type: integer + description: number of bases to clip from 3p of each mate after the adapter clipping. + If one value is given, it will be assumed the same for both mates. + example: 0 + multiple: yes + multiple_sep: ; + - name: --clip5pNbases + type: integer + description: number(s) of bases to clip from 5p of each mate. If one value is + given, it will be assumed the same for both mates. + example: 0 + multiple: yes + multiple_sep: ; +- name: Limits + arguments: + - name: --limitGenomeGenerateRAM + type: long + description: maximum available RAM (bytes) for genome generation + example: '31000000000' + - name: --limitIObufferSize + type: long + description: max available buffers size (bytes) for input/output, per thread + example: + - 30000000 + - 50000000 + multiple: yes + multiple_sep: ; + - name: --limitOutSAMoneReadBytes + type: long + description: 'max size of the SAM record (bytes) for one read. Recommended value: + >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax' + example: 100000 + - name: --limitOutSJoneRead + type: integer + description: max number of junctions for one read (including all multi-mappers) + example: 1000 + - name: --limitOutSJcollapsed + type: integer + description: max number of collapsed junctions + example: 1000000 + - name: --limitBAMsortRAM + type: long + description: maximum available RAM (bytes) for sorting BAM. If =0, it will be + set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory + option. 
+ example: 0 + - name: --limitSjdbInsertNsj + type: integer + description: maximum number of junctions to be inserted to the genome on the fly + at the mapping stage, including those from annotations and those detected in + the 1st step of the 2-pass run + example: 1000000 + - name: --limitNreadsSoft + type: integer + description: soft limit on the number of reads + example: -1 +- name: 'Output: general' + arguments: + - name: --outTmpKeep + type: string + description: |- + whether to keep the temporary files after STAR runs is finished + + - None ... remove all temporary files + - All ... keep all files + - name: --outStd + type: string + description: |- + which output will be directed to stdout (standard out) + + - Log ... log messages + - SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + - BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + - BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + example: Log + - name: --outReadsUnmapped + type: string + description: |- + output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + + - None ... no output + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + - name: --outQSconversionAdd + type: integer + description: add this number to the quality score (e.g. to convert from Illumina + to Sanger, use -31) + example: 0 + - name: --outMultimapperOrder + type: string + description: |- + order of multimapping alignments in the output files + + - Old_2.4 ... quasi-random order used before 2.5.0 + - Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. 
This option will become default in the future releases. + example: Old_2.4 +- name: 'Output: SAM and BAM' + arguments: + - name: --outSAMtype + type: string + description: |- + type of SAM/BAM output + + 1st word: + - BAM ... output BAM without sorting + - SAM ... output SAM without sorting + - None ... no SAM/BAM output + 2nd, 3rd: + - Unsorted ... standard unsorted + - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + example: SAM + multiple: yes + multiple_sep: ; + - name: --outSAMmode + type: string + description: |- + mode of SAM output + + - None ... no SAM output + - Full ... full SAM output + - NoQS ... full SAM but without quality scores + example: Full + - name: --outSAMstrandField + type: string + description: |- + Cufflinks-like strand field flag + + - None ... not used + - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + - name: --outSAMattributes + type: string + description: |- + a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + + ***Presets: + - None ... no attributes + - Standard ... NH HI AS nM + - All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + - NH ... number of loci the read maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + - HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + - AS ... local alignment score, +1/-1 for matches/mismatches, score* penalties for indels and gaps. For PE reads, total score for two mates. Standard SAM tag. + - nM ... number of mismatches. For PE reads, sum over two mates. + - NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + - MD ... 
string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + - jI ... start and end of introns for all junctions (1-based). + - XS ... alignment strand according to --outSAMstrandField. + - MC ... mate's CIGAR string. Standard SAM tag. + - ch ... marks all segments of all chimeric alignments for --chimOutType WithinBAM output. + - cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + - vA ... variant allele + - vG ... genomic coordinate of the variant overlapped by the read. + - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + - ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + ***STARsolo: + - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + - GX GN ... gene ID and gene name for unique-gene reads. + - gx gn ... gene IDs and gene names for unique- and multi-gene reads. + - CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + - sM ... assessment of CB and UMI. + - sS ... sequence of the entire barcode (CB,UMI,adapter). + - sQ ... quality of the entire barcode. + - sF ... type of feature overlap and number of features for each alignment + ***Unsupported/undocumented: + - rB ... alignment block read/genomic coordinates. + - vR ... read coordinate of the variant. + example: Standard + multiple: yes + multiple_sep: ; + - name: --outSAMattrIHstart + type: integer + description: start value for the IH attribute. 
0 may be required by some downstream + software, such as Cufflinks or StringTie. + example: 1 + - name: --outSAMunmapped + type: string + description: |- + output of unmapped reads in the SAM format + + 1st word: + - None ... no output + - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + multiple: yes + multiple_sep: ; + - name: --outSAMorder + type: string + description: |- + type of sorting for the SAM output + + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + example: Paired + - name: --outSAMprimaryFlag + type: string + description: |- + which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + + - OneBestScore ... only one alignment with the best score is primary + - AllBestScore ... all alignments with the best score are primary + example: OneBestScore + - name: --outSAMreadID + type: string + description: |- + read ID record type + + - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + - Number ... read number (index) in the FASTx file + example: Standard + - name: --outSAMmapqUnique + type: integer + description: '0 to 255: the MAPQ value for unique mappers' + example: 255 + - name: --outSAMflagOR + type: integer + description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e. + FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, + and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.' + example: 0 + - name: --outSAMflagAND + type: integer + description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e. + FLAG=FLAG & outSAMflagAND. 
This is applied after all flags have been set by STAR, + but before outSAMflagOR. Can be used to unset specific bits that are not set + otherwise.' + example: 65535 + - name: --outSAMattrRGline + type: string + description: |- + SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspond to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + multiple: yes + multiple_sep: ; + - name: --outSAMheaderHD + type: string + description: '@HD (header) line of the SAM header' + multiple: yes + multiple_sep: ; + - name: --outSAMheaderPG + type: string + description: extra @PG (software) line of the SAM header (in addition to STAR) + multiple: yes + multiple_sep: ; + - name: --outSAMheaderCommentFile + type: string + description: path to the file with @CO (comment) lines of the SAM header + - name: --outSAMfilter + type: string + description: |- + filter the output into main SAM/BAM files + + - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + - KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + multiple: yes + multiple_sep: ; + - name: --outSAMmultNmax + type: integer + description: |- + max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + + - -1 ... 
all alignments (up to --outFilterMultimapNmax) will be output + example: -1 + - name: --outSAMtlen + type: integer + description: |- + calculation method for the TLEN field in the SAM/BAM files + + - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + example: 1 + - name: --outBAMcompression + type: integer + description: -1 to 10 BAM compression level, -1=default compression (6?), 0=no + compression, 10=maximum compression + example: 1 + - name: --outBAMsortingThreadN + type: integer + description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).' + example: 0 + - name: --outBAMsortingBinsN + type: integer + description: '>0: number of genome bins for coordinate-sorting' + example: 50 +- name: BAM processing + arguments: + - name: --bamRemoveDuplicatesType + type: string + description: |- + mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + + - - ... no duplicate removal/marking + - UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + - name: --bamRemoveDuplicatesMate2basesN + type: integer + description: number of bases from the 5' of mate 2 to use in collapsing (e.g. + for RAMPAGE) + example: 0 +- name: Output Wiggle + arguments: + - name: --outWigType + type: string + description: |- + type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + + 1st word: + - None ... no signal output + - bedGraph ... bedGraph format + - wiggle ... wiggle format + 2nd word: + - read1_5p ... 
signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + - read2 ... signal from only 2nd read + multiple: yes + multiple_sep: ; + - name: --outWigStrand + type: string + description: |- + strandedness of wiggle/bedGraph output + + - Stranded ... separate strands, str1 and str2 + - Unstranded ... collapsed strands + example: Stranded + - name: --outWigReferencesPrefix + type: string + description: prefix matching reference names to include in the output wiggle file, + e.g. "chr", default "-" - include all references + - name: --outWigNorm + type: string + description: |- + type of normalization for the signal + + - RPM ... reads per million of mapped reads + - None ... no normalization, "raw" counts + example: RPM +- name: Output Filtering + arguments: + - name: --outFilterType + type: string + description: |- + type of filtering + + - Normal ... standard filtering using only current alignment + - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + example: Normal + - name: --outFilterMultimapScoreRange + type: integer + description: the score range below the maximum score for multimapping alignments + example: 1 + - name: --outFilterMultimapNmax + type: integer + description: |- + maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + example: 10 + - name: --outFilterMismatchNmax + type: integer + description: alignment will be output only if it has no more mismatches than this + value. + example: 10 + - name: --outFilterMismatchNoverLmax + type: double + description: alignment will be output only if its ratio of mismatches to *mapped* + length is less than or equal to this value. 
+ example: 0.3 + - name: --outFilterMismatchNoverReadLmax + type: double + description: alignment will be output only if its ratio of mismatches to *read* + length is less than or equal to this value. + example: 1.0 + - name: --outFilterScoreMin + type: integer + description: alignment will be output only if its score is higher than or equal + to this value. + example: 0 + - name: --outFilterScoreMinOverLread + type: double + description: same as outFilterScoreMin, but normalized to read length (sum of + mates' lengths for paired-end reads) + example: 0.66 + - name: --outFilterMatchNmin + type: integer + description: alignment will be output only if the number of matched bases is higher + than or equal to this value. + example: 0 + - name: --outFilterMatchNminOverLread + type: double + description: same as outFilterMatchNmin, but normalized to the read length (sum + of mates' lengths for paired-end reads). + example: 0.66 + - name: --outFilterIntronMotifs + type: string + description: |- + filter alignment using their motifs + + - None ... no filtering + - RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + - RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + - name: --outFilterIntronStrands + type: string + description: |- + filter alignments + + - RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + - None ... no filtering + example: RemoveInconsistentStrands +- name: Output splice junctions (SJ.out.tab) + arguments: + - name: --outSJtype + type: string + description: |- + type of splice junction output + + - Standard ... standard SJ.out.tab output + - None ... 
no splice junction output + example: Standard +- name: 'Output Filtering: Splice Junctions' + arguments: + - name: --outSJfilterReads + type: string + description: |- + which reads to consider for collapsed splice junctions output + + - All ... all reads, unique- and multi-mappers + - Unique ... uniquely mapping reads only + example: All + - name: --outSJfilterOverhangMin + type: integer + description: |- + minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + does not apply to annotated junctions + example: + - 30 + - 12 + - 12 + - 12 + multiple: yes + multiple_sep: ; + - name: --outSJfilterCountUniqueMin + type: integer + description: |- + minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + multiple_sep: ; + - name: --outSJfilterCountTotalMin + type: integer + description: |- + minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. 
-1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + multiple_sep: ; + - name: --outSJfilterDistToOtherSJmin + type: integer + description: |- + minimum allowed distance to other junctions' donor/acceptor + + does not apply to annotated junctions + example: + - 10 + - 0 + - 5 + - 10 + multiple: yes + multiple_sep: ; + - name: --outSJfilterIntronMaxVsReadN + type: integer + description: |- + maximum gap allowed for junctions supported by 1,2,3,,,N reads + + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + example: + - 50000 + - 100000 + - 200000 + multiple: yes + multiple_sep: ; +- name: Scoring + arguments: + - name: --scoreGap + type: integer + description: splice junction penalty (independent on intron motif) + example: 0 + - name: --scoreGapNoncan + type: integer + description: non-canonical junction penalty (in addition to scoreGap) + example: -8 + - name: --scoreGapGCAG + type: integer + description: GC/AG and CT/GC junction penalty (in addition to scoreGap) + example: -4 + - name: --scoreGapATAC + type: integer + description: AT/AC and GT/AT junction penalty (in addition to scoreGap) + example: -8 + - name: --scoreGenomicLengthLog2scale + type: integer + description: 'extra score logarithmically scaled with genomic length of the alignment: + scoreGenomicLengthLog2scale*log2(genomicLength)' + example: 0 + - name: --scoreDelOpen + type: integer + description: deletion open penalty + example: -2 + - name: --scoreDelBase + type: integer + description: deletion extension penalty per base (in addition to scoreDelOpen) + example: -2 + - name: --scoreInsOpen + type: integer + description: insertion open penalty + example: -2 + - name: 
--scoreInsBase + type: integer + description: insertion extension penalty per base (in addition to scoreInsOpen) + example: -2 + - name: --scoreStitchSJshift + type: integer + description: maximum score reduction while searching for SJ boundaries in the + stitching step + example: 1 +- name: Alignments and Seeding + arguments: + - name: --seedSearchStartLmax + type: integer + description: defines the search start point through the read - the read is split + into pieces no longer than this value + example: 50 + - name: --seedSearchStartLmaxOverLread + type: double + description: seedSearchStartLmax normalized to read length (sum of mates' lengths + for paired-end reads) + example: 1.0 + - name: --seedSearchLmax + type: integer + description: defines the maximum length of the seeds, if =0 seed length is not + limited + example: 0 + - name: --seedMultimapNmax + type: integer + description: only pieces that map fewer than this value are utilized in the stitching + procedure + example: 10000 + - name: --seedPerReadNmax + type: integer + description: max number of seeds per read + example: 1000 + - name: --seedPerWindowNmax + type: integer + description: max number of seeds per window + example: 50 + - name: --seedNoneLociPerWindow + type: integer + description: max number of one seed loci per window + example: 10 + - name: --seedSplitMin + type: integer + description: min length of the seed sequences split by Ns or mate gap + example: 12 + - name: --seedMapMin + type: integer + description: min length of seeds to be mapped + example: 5 + - name: --alignIntronMin + type: integer + description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, + otherwise it is considered Deletion + example: 21 + - name: --alignIntronMax + type: integer + description: maximum intron size, if 0, max intron size will be determined by + (2^winBinNbits)*winAnchorDistNbins + example: 0 + - name: --alignMatesGapMax + type: integer + description: maximum gap 
between two mates, if 0, max intron gap will be determined + by (2^winBinNbits)*winAnchorDistNbins + example: 0 + - name: --alignSJoverhangMin + type: integer + description: minimum overhang (i.e. block size) for spliced alignments + example: 5 + - name: --alignSJstitchMismatchNmax + type: integer + description: |- + maximum number of mismatches for stitching of the splice junctions (-1: no limit). + + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + example: + - 0 + - -1 + - 0 + - 0 + multiple: yes + multiple_sep: ; + - name: --alignSJDBoverhangMin + type: integer + description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + example: 3 + - name: --alignSplicedMateMapLmin + type: integer + description: minimum mapped length for a read mate that is spliced + example: 0 + - name: --alignSplicedMateMapLminOverLmate + type: double + description: alignSplicedMateMapLmin normalized to mate length + example: 0.66 + - name: --alignWindowsPerReadNmax + type: integer + description: max number of windows per read + example: 10000 + - name: --alignTranscriptsPerWindowNmax + type: integer + description: max number of transcripts per window + example: 100 + - name: --alignTranscriptsPerReadNmax + type: integer + description: max number of different alignments per read to consider + example: 10000 + - name: --alignEndsType + type: string + description: |- + type of read ends alignment + + - Local ... standard local alignment with soft-clipping allowed + - EndToEnd ... force end-to-end read alignment, do not soft-clip + - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + example: Local + - name: --alignEndsProtrude + type: string + description: |- + allow protrusion of alignment ends, i.e. 
start (end) of the +strand mate downstream of the start (end) of the -strand mate + + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + - ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + - DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + example: 0 ConcordantPair + - name: --alignSoftClipAtReferenceEnds + type: string + description: |- + allow the soft-clipping of the alignments past the end of the chromosomes + + - Yes ... allow + - No ... prohibit, useful for compatibility with Cufflinks + example: 'Yes' + - name: --alignInsertionFlush + type: string + description: |- + how to flush ambiguous insertion positions + + - None ... insertions are not flushed + - Right ... insertions are flushed to the right +- name: Paired-End reads + arguments: + - name: --peOverlapNbasesMin + type: integer + description: minimum number of overlapping bases to trigger mates merging and + realignment. Specify >0 value to switch on the "merging of overlapping mates" + algorithm. + example: 0 + - name: --peOverlapMMp + type: double + description: maximum proportion of mismatched bases in the overlap area + example: 0.01 +- name: Windows, Anchors, Binning + arguments: + - name: --winAnchorMultimapNmax + type: integer + description: max number of loci anchors are allowed to map to + example: 50 + - name: --winBinNbits + type: integer + description: =log2(winBin), where winBin is the size of the bin for the windows/clustering, + each window will occupy an integer number of bins.
+ example: 16 + - name: --winAnchorDistNbins + type: integer + description: max number of bins between two anchors that allows aggregation of + anchors into one window + example: 9 + - name: --winFlankNbins + type: integer + description: log2(winFlank), where winFlank is the size of the left and right + flanking regions for each window + example: 4 + - name: --winReadCoverageRelativeMin + type: double + description: minimum relative coverage of the read sequence by the seeds in a + window, for STARlong algorithm only. + example: 0.5 + - name: --winReadCoverageBasesMin + type: integer + description: minimum number of bases covered by the seeds in a window, for STARlong + algorithm only. + example: 0 +- name: Chimeric Alignments + arguments: + - name: --chimOutType + type: string + description: |- + type of chimeric output + + - Junctions ... Chimeric.out.junction + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + - WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + - WithinBAM SoftClip ...
soft-clipping in the CIGAR for supplemental chimeric alignments + example: Junctions + multiple: yes + multiple_sep: ; + - name: --chimSegmentMin + type: integer + description: minimum length of chimeric segment length, if ==0, no chimeric output + example: 0 + - name: --chimScoreMin + type: integer + description: minimum total (summed) score of the chimeric segments + example: 0 + - name: --chimScoreDropMax + type: integer + description: max drop (difference) of chimeric score (the sum of scores of all + chimeric segments) from the read length + example: 20 + - name: --chimScoreSeparation + type: integer + description: minimum difference (separation) between the best chimeric score and + the next one + example: 10 + - name: --chimScoreJunctionNonGTAG + type: integer + description: penalty for a non-GT/AG chimeric junction + example: -1 + - name: --chimJunctionOverhangMin + type: integer + description: minimum overhang for a chimeric junction + example: 20 + - name: --chimSegmentReadGapMax + type: integer + description: maximum gap in the read sequence between chimeric segments + example: 0 + - name: --chimFilter + type: string + description: |- + different filters for chimeric alignments + + - None ... no filtering + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + example: banGenomicN + multiple: yes + multiple_sep: ; + - name: --chimMainSegmentMultNmax + type: integer + description: maximum number of multi-alignments for the main chimeric segment. + =1 will prohibit multimapping main segments. + example: 10 + - name: --chimMultimapNmax + type: integer + description: |- + maximum number of chimeric multi-alignments + + - 0 ... use the old scheme for chimeric detection which only considered unique alignments + example: 0 + - name: --chimMultimapScoreRange + type: integer + description: the score range for multi-mapping chimeras below the best chimeric + score. 
Only works with --chimMultimapNmax > 1 + example: 1 + - name: --chimNonchimScoreDropMin + type: integer + description: to trigger chimeric detection, the drop in the best non-chimeric + alignment score with respect to the read length has to be greater than this + value + example: 20 + - name: --chimOutJunctionFormat + type: integer + description: |- + formatting type for the Chimeric.out.junction file + + - 0 ... no comment lines/headers + - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + example: 0 +- name: Quantification of Annotations + arguments: + - name: --quantMode + type: string + description: |- + types of quantification requested + + - - ... none + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + - GeneCounts ... count reads per gene + multiple: yes + multiple_sep: ; + - name: --quantTranscriptomeBAMcompression + type: integer + description: |- + -2 to 10 transcriptome BAM compression level + + - -2 ... no BAM output + - -1 ... default compression (6?) + - 0 ... no compression + - 10 ... maximum compression + example: 1 + - name: --quantTranscriptomeSAMoutput + type: string + description: |- + alignment filtering for TranscriptomeSAM output + + - BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM + - BanSingleEnd ... prohibit single-end alignments, allow indels and softclips + - BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels + example: BanSingleEnd_BanIndels_ExtendSoftclip +- name: 2-pass Mapping + arguments: + - name: --twopassMode + type: string + description: |- + 2-pass mapping mode. + + - None ... 1-pass mapping + - Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + - name: --twopass1readsN + type: integer + description: number of reads to process for the 1st step. 
Use very large number + (or default -1) to map all reads in the first step. + example: -1 +- name: WASP parameters + arguments: + - name: --waspOutputMode + type: string + description: |- + WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 . + + - SAMtag ... add WASP tags to the alignments that pass WASP filtering diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml new file mode 100644 index 00000000..8fdd5256 --- /dev/null +++ b/src/star/star_align_reads/config.vsh.yaml @@ -0,0 +1,115 @@ +name: star_align_reads +namespace: star +description: | + Aligns reads to a reference genome using STAR. +keywords: [align, fasta, genome] +links: + repository: https://github.com/alexdobin/STAR + documentation: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf +references: + doi: 10.1093/bioinformatics/bts635 +license: MIT +requirements: + commands: [ STAR, python, ps, zcat, bzcat ] +# manually taking care of the main input and output arguments +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + alternatives: --readFilesIn + required: true + description: The single-end or paired-end R1 FastQ files to be processed. + example: [ mysample_S1_L001_R1_001.fastq.gz ] + multiple: true + - type: file + name: --input_r2 + required: false + description: The paired-end R2 FastQ files to be processed. Only required if --input is a paired-end R1 file. + example: [ mysample_S1_L001_R2_001.fastq.gz ] + multiple: true + - name: Outputs + arguments: + - type: file + name: --aligned_reads + required: true + description: The output file containing the aligned reads. 
+ direction: output + example: aligned_reads.bam + - type: file + name: --reads_per_gene + required: false + description: The output file containing the number of reads per gene. + direction: output + example: reads_per_gene.tsv + - type: file + name: --unmapped + required: false + description: The output file containing the unmapped reads. + direction: output + example: unmapped.fastq + - type: file + name: --unmapped_r2 + required: false + description: The output file containing the unmapped R2 reads. + direction: output + example: unmapped_r2.fastq + - type: file + name: --chimeric_junctions + required: false + description: The output file containing the chimeric junctions. + direction: output + example: chimeric_junctions.tsv + - type: file + name: --log + required: false + description: The output file containing the log of the alignment process. + direction: output + example: log.txt + - type: file + name: --splice_junctions + required: false + description: The output file containing the splice junctions. 
+ direction: output + example: splice_junctions.tsv +# other arguments are defined in a separate file +__merge__: argument_groups.yaml +resources: + - type: python_script + path: script.py +test_resources: + - type: bash_script + path: test.sh +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - gzip + - bzip2 + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.11b + - PACKAGES gcc g++ make wget zlib1g-dev unzip xxd + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + - type: docker + run: | + STAR --version | sed 's#\(.*\)#star: "\1"#' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/star/star_align_reads/help.txt b/src/star/star_align_reads/help.txt new file mode 100644 index 00000000..940f639d --- /dev/null +++ b/src/star/star_align_reads/help.txt @@ -0,0 +1,927 @@ +Usage: STAR [options]... --genomeDir /path/to/genome/index/ --readFilesIn R1.fq R2.fq +Spliced Transcripts Alignment to a Reference (c) Alexander Dobin, 2009-2022 + +STAR version=2.7.11b +STAR compilation time,server,dir=2024-02-11T19:36:26+00:00 :/tmp/STAR-2.7.11b/source +For more details see: + + +### versions +versionGenome 2.7.4a + string: earliest genome index version compatible with this STAR release. Please do not change this value! + +### Parameter Files +parametersFiles - + string: name of a user-defined parameters file, "-": none. 
Can only be defined on the command line. + +### System +sysShell - + string: path to the shell binary, preferably bash, e.g. /bin/bash. + - ... the default shell is executed, typically /bin/sh. This was reported to fail on some Ubuntu systems - then you need to specify path to bash. + +### Run Parameters +runMode alignReads + string: type of the run. + alignReads ... map reads + genomeGenerate ... generate genome files + inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options. + liftOver ... lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles. + soloCellFiltering ... STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix + +runThreadN 1 + int: number of threads to run STAR + +runDirPerm User_RWX + string: permissions for the directories created at the run-time. + User_RWX ... user-read/write/execute + All_RWX ... all-read/write/execute (same as chmod 777) + +runRNGseed 777 + int: random number generator seed. + + +### Genome Parameters +genomeDir ./GenomeDir/ + string: path to the directory where genome files are stored (for --runMode alignReads) or will be generated (for --runMode generateGenome) + +genomeLoad NoSharedMemory + string: mode of shared memory usage for the genome files. Only used with --runMode alignReads. + LoadAndKeep ... load genome into shared and keep it in memory after run + LoadAndRemove ... load genome into shared but remove it after run + LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + Remove ... do not map anything, just remove loaded genome from memory + NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + +genomeFastaFiles - + string(s): path(s) to the fasta files with the genome sequences, separated by spaces. 
These files should be plain text FASTA files, they *cannot* be zipped. + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + +genomeChainFiles - + string: chain files for genomic liftover. Only used with --runMode liftOver . + +genomeFileSizes 0 + uint(s)>0: genome files exact sizes in bytes. Typically, this should not be defined by the user. + +genomeTransformOutput None + string(s): which output to transform back to original genome + SAM ... SAM/BAM alignments + SJ ... splice junctions (SJ.out.tab) + Quant ... quantifications (from --quantMode option) + None ... no transformation of the output + +genomeChrSetMitochondrial chrM M MT + string(s): names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/ + +### Genome Indexing Parameters - only used with --runMode genomeGenerate +genomeChrBinNbits 18 + int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). + +genomeSAindexNbases 14 + int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1). + +genomeSAsparseD 1 + int>0: suffix array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + +genomeSuffixLengthMax -1 + int: maximum length of the suffixes, has to be longer than read length. -1 = infinite. + +genomeTransformType None + string: type of genome transformation + None ... no transformation + Haploid ...
replace reference alleles with alternative alleles from VCF file (e.g. consensus allele) + Diploid ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome) + +genomeTransformVCF - + string: path to VCF file for genome transformation + + + +#####UnderDevelopment_begin : not supported - do not use +genomeType Full + string: type of genome to generate + Full ... full (normal) genome + Transcriptome ... genome consists of transcript sequences + SuperTransriptome ... genome consists of superTranscript sequences +#####UnderDevelopment_end + +# DEPRECATED: please use --genomeTransformVCF and --genomeTransformType options instead. +#genomeConsensusFile - +# string: VCF file with consensus SNPs (i.e. alternative allele is the major (AF>0.5) allele) +# DEPRECATED + + + +### Splice Junctions Database +sjdbFileChrStartEnd - + string(s): path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated. + +sjdbGTFfile - + string: path to the GTF file with annotations + +sjdbGTFchrPrefix - + string: prefix for chromosome names in a GTF file (e.g. 
'chr' for using ENSEMBL annotations with UCSC genomes) + +sjdbGTFfeatureExon exon + string: feature type in GTF file to be used as exons for building transcripts + +sjdbGTFtagExonParentTranscript transcript_id + string: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + +sjdbGTFtagExonParentGene gene_id + string: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + +sjdbGTFtagExonParentGeneName gene_name + string(s): GTF attribute name for parent gene name + +sjdbGTFtagExonParentGeneType gene_type gene_biotype + string(s): GTF attribute name for parent gene type + +sjdbOverhang 100 + int>0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + +sjdbScore 2 + int: extra alignment score for alignments that cross database junctions + +sjdbInsertSave Basic + string: which files to save when sjdb junctions are inserted on the fly at the mapping step + Basic ... only small junction / transcript files + All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + +### Variation parameters +varVCFfile - + string: path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1 + +### Input Files +inputBAMfile - + string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + +### Read Parameters +readFilesType Fastx + string: format of input read files + Fastx ... FASTA or FASTQ + SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + +readFilesSAMattrKeep All + string(s): for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMattrKeep RG PL + All ... keep all tags + None ...
do not keep any tags + +readFilesIn Read1 Read2 + string(s): paths to files that contain input read1 (and, if needed, read2) + +readFilesManifest - + string: path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + +readFilesPrefix - + string: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn + +readFilesCommand - + string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +readMapNumber -1 + int: number of reads to map from the beginning of the file + -1: map all reads + +readMatesLengthsIn NotEqual + string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +readNameSeparator / + string(s): character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed) + +readQualityScoreBase 33 + int>=0: number to be subtracted from the ASCII code to get Phred quality score + +### Read Clipping + +clipAdapterType Hamming + string: adapter clipping type + Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. 
Utilizes Opal package by Martin Šošić: https://github.com/Martinsos/opal + None ... no adapter clipping, all other clip* parameters are disregarded + +clip3pNbases 0 + int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq - + string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + polyA ... polyA sequence with the length equal to read length + +clip3pAdapterMMp 0.1 + double(s): max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAfterAdapterNbases 0 + int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + +clip5pNbases 0 + int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +#####UnderDevelopment_begin : not supported - do not use +clip5pAdapterSeq - + string(s): adapter sequences to clip from 5p of each mate, separated by space. + +clip5pAdapterMMp 0.1 + double(s): max proportion of mismatches for 5p adapter clipping for each mate, separated by space + +clip5pAfterAdapterNbases 0 + int(s): number of bases to clip from 5p of each mate after the adapter clipping, separated by space. +#####UnderDevelopment_end + +### Limits +limitGenomeGenerateRAM 31000000000 + int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 30000000 50000000 + int(s)>0: max available buffers size (bytes) for input/output, per thread + +limitOutSAMoneReadBytes 100000 + int>0: max size of the SAM record (bytes) for one read. 
Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax + +limitOutSJoneRead 1000 + int>0: max number of junctions for one read (including all multi-mappers) + +limitOutSJcollapsed 1000000 + int>0: max number of collapsed junctions + +limitBAMsortRAM 0 + int>=0: maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option. + +limitSjdbInsertNsj 1000000 + int>=0: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run + +limitNreadsSoft -1 + int: soft limit on the number of reads + +### Output: general +outFileNamePrefix ./ + string: output files name prefix (including full or relative path). Can only be defined on the command line. + +outTmpDir - + string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! + - ... the temp directory will default to outFileNamePrefix_STARtmp + +outTmpKeep None + string: whether to keep the temporary files after STAR runs is finished + None ... remove all temporary files + All ... keep all files + +outStd Log + string: which output will be directed to stdout (standard out) + Log ... log messages + SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped None + string: output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + None ... no output + Fastx ... 
output in separate fasta/fastq files, Unmapped.out.mate1/2 + +outQSconversionAdd 0 + int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + +outMultimapperOrder Old_2.4 + string: order of multimapping alignments in the output files + Old_2.4 ... quasi-random order used before 2.5.0 + Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + +### Output: SAM and BAM +outSAMtype SAM + strings: type of SAM/BAM output + 1st word: + BAM ... output BAM without sorting + SAM ... output SAM without sorting + None ... no SAM/BAM output + 2nd, 3rd: + Unsorted ... standard unsorted + SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + +outSAMmode Full + string: mode of SAM output + None ... no SAM output + Full ... full SAM output + NoQS ... full SAM but without quality scores + +outSAMstrandField None + string: Cufflinks-like strand field flag + None ... not used + intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + +outSAMattributes Standard + string(s): a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + ***Presets: + None ... no attributes + Standard ... NH HI AS nM + All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + nM ... number of mismatches. 
For PE reads, sum over two mates. + NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + jI ... start and end of introns for all junctions (1-based). + XS ... alignment strand according to --outSAMstrandField. + MC ... mate's CIGAR string. Standard SAM tag. + ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + vA ... variant allele + vG ... genomic coordinate of the variant overlapped by the read. + vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + ***STARsolo: + CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + GX GN ... gene ID and gene name for unique-gene reads. + gx gn ... gene IDs and gene names for unique- and multi-gene reads. + CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + sM ... assessment of CB and UMI. + sS ... sequence of the entire barcode (CB,UMI,adapter). + sQ ... quality of the entire barcode. + sF ... type of feature overlap and number of features for each alignment + ***Unsupported/undocumented: + rB ... alignment block read/genomic coordinates. + vR ... read coordinate of the variant. + +outSAMattrIHstart 1 + int>=0: start value for the IH attribute. 
0 may be required by some downstream software, such as Cufflinks or StringTie. + +outSAMunmapped None + string(s): output of unmapped reads in the SAM format + 1st word: + None ... no output + Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + +outSAMorder Paired + string: type of sorting for the SAM output + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +outSAMprimaryFlag OneBestScore + string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + OneBestScore ... only one alignment with the best score is primary + AllBestScore ... all alignments with the best score are primary + +outSAMreadID Standard + string: read ID record type + Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + Number ... read number (index) in the FASTx file + +outSAMmapqUnique 255 + int: 0 to 255: the MAPQ value for unique mappers + +outSAMflagOR 0 + int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise. + +outSAMflagAND 65535 + int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagAND. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise. + +outSAMattrRGline - + string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
+ xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + +outSAMheaderHD - + strings: @HD (header) line of the SAM header + +outSAMheaderPG - + strings: extra @PG (software) line of the SAM header (in addition to STAR) + +outSAMheaderCommentFile - + string: path to the file with @CO (comment) lines of the SAM header + +outSAMfilter None + string(s): filter the output into main SAM/BAM files + KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + + +outSAMmultNmax -1 + int: max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + -1 ... all alignments (up to --outFilterMultimapNmax) will be output + +outSAMtlen 1 + int: calculation method for the TLEN field in the SAM/BAM files + 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + +outBAMcompression 1 + int: -1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression + +outBAMsortingThreadN 0 + int: >=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN). 
+ +outBAMsortingBinsN 50 + int: >0: number of genome bins for coordinate-sorting + +### BAM processing +bamRemoveDuplicatesType - + string: mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + - ... no duplicate removal/marking + UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + +bamRemoveDuplicatesMate2basesN 0 + int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) + +### Output Wiggle +outWigType None + string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + 1st word: + None ... no signal output + bedGraph ... bedGraph format + wiggle ... wiggle format + 2nd word: + read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + read2 ... signal from only 2nd read + +outWigStrand Stranded + string: strandedness of wiggle/bedGraph output + Stranded ... separate strands, str1 and str2 + Unstranded ... collapsed strands + +outWigReferencesPrefix - + string: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references + +outWigNorm RPM + string: type of normalization for the signal + RPM ... reads per million of mapped reads + None ... no normalization, "raw" counts + +### Output Filtering +outFilterType Normal + string: type of filtering + Normal ... standard filtering using only current alignment + BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + +outFilterMultimapScoreRange 1 + int: the score range below the maximum score for multimapping alignments + +outFilterMultimapNmax 10 + int: maximum number of loci the read is allowed to map to. 
Alignments (all of them) will be output only if the read maps to no more loci than this value. + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + +outFilterMismatchNmax 10 + int: alignment will be output only if it has no more mismatches than this value. + +outFilterMismatchNoverLmax 0.3 + real: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value. + +outFilterMismatchNoverReadLmax 1.0 + real: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. + + +outFilterScoreMin 0 + int: alignment will be output only if its score is higher than or equal to this value. + +outFilterScoreMinOverLread 0.66 + real: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads) + +outFilterMatchNmin 0 + int: alignment will be output only if the number of matched bases is higher than or equal to this value. + +outFilterMatchNminOverLread 0.66 + real: same as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads). + +outFilterIntronMotifs None + string: filter alignment using their motifs + None ... no filtering + RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + +outFilterIntronStrands RemoveInconsistentStrands + string: filter alignments + RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + None ... no filtering + +### Output splice junctions (SJ.out.tab) +outSJtype Standard + string: type of splice junction output + Standard ... standard SJ.out.tab output + None ... 
no splice junction output + +### Output Filtering: Splice Junctions +outSJfilterReads All + string: which reads to consider for collapsed splice junctions output + All ... all reads, unique- and multi-mappers + Unique ... uniquely mapping reads only + +outSJfilterOverhangMin 30 12 12 12 + 4 integers: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + does not apply to annotated junctions + +outSJfilterCountUniqueMin 3 1 1 1 + 4 integers: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterCountTotalMin 3 1 1 1 + 4 integers: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 10 0 5 10 + 4 integers>=0: minimum allowed distance to other junctions' donor/acceptor + does not apply to annotated junctions + +outSJfilterIntronMaxVsReadN 50000 100000 200000 + N integers>=0: maximum gap allowed for junctions supported by 1,2,3,,,N reads + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. 
by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + +### Scoring +scoreGap 0 + int: splice junction penalty (independent on intron motif) + +scoreGapNoncan -8 + int: non-canonical junction penalty (in addition to scoreGap) + +scoreGapGCAG -4 + int: GC/AG and CT/GC junction penalty (in addition to scoreGap) + +scoreGapATAC -8 + int: AT/AC and GT/AT junction penalty (in addition to scoreGap) + +scoreGenomicLengthLog2scale -0.25 + int: extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength) + +scoreDelOpen -2 + int: deletion open penalty + +scoreDelBase -2 + int: deletion extension penalty per base (in addition to scoreDelOpen) + +scoreInsOpen -2 + int: insertion open penalty + +scoreInsBase -2 + int: insertion extension penalty per base (in addition to scoreInsOpen) + +scoreStitchSJshift 1 + int: maximum score reduction while searching for SJ boundaries in the stitching step + + +### Alignments and Seeding + +seedSearchStartLmax 50 + int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +seedSearchStartLmaxOverLread 1.0 + real: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +seedSearchLmax 0 + int>=0: defines the maximum length of the seeds, if =0 seed length is not limited + +seedMultimapNmax 10000 + int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +seedPerReadNmax 1000 + int>0: max number of seeds per read + +seedPerWindowNmax 50 + int>0: max number of seeds per window + +seedNoneLociPerWindow 10 + int>0: max number of one seed loci per window + +seedSplitMin 12 + int>0: min length of the seed sequences split by Ns or mate gap + +seedMapMin 5 + int>0: min length of seeds to be mapped + +alignIntronMin 21 + int: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered 
Deletion + +alignIntronMax 0 + int: maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignMatesGapMax 0 + int: maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignSJoverhangMin 5 + int>0: minimum overhang (i.e. block size) for spliced alignments + +alignSJstitchMismatchNmax 0 -1 0 0 + 4*int>=0: maximum number of mismatches for stitching of the splice junctions (-1: no limit). + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + +alignSJDBoverhangMin 3 + int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + +alignSplicedMateMapLmin 0 + int>0: minimum mapped length for a read mate that is spliced + +alignSplicedMateMapLminOverLmate 0.66 + real>0: alignSplicedMateMapLmin normalized to mate length + +alignWindowsPerReadNmax 10000 + int>0: max number of windows per read + +alignTranscriptsPerWindowNmax 100 + int>0: max number of transcripts per window + +alignTranscriptsPerReadNmax 10000 + int>0: max number of different alignments per read to consider + +alignEndsType Local + string: type of read ends alignment + Local ... standard local alignment with soft-clipping allowed + EndToEnd ... force end-to-end read alignment, do not soft-clip + Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + +alignEndsProtrude 0 ConcordantPair + int, string: allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + DiscordantPair ... 
report alignments with non-zero protrusion as discordant pairs + +alignSoftClipAtReferenceEnds Yes + string: allow the soft-clipping of the alignments past the end of the chromosomes + Yes ... allow + No ... prohibit, useful for compatibility with Cufflinks + +alignInsertionFlush None + string: how to flush ambiguous insertion positions + None ... insertions are not flushed + Right ... insertions are flushed to the right + +### Paired-End reads +peOverlapNbasesMin 0 + int>=0: minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the "merging of overlapping mates" algorithm. + +peOverlapMMp 0.01 + real, >=0 & <1: maximum proportion of mismatched bases in the overlap area + +### Windows, Anchors, Binning + +winAnchorMultimapNmax 50 + int>0: max number of loci anchors are allowed to map to + +winBinNbits 16 + int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins. + +winAnchorDistNbins 9 + int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +winFlankNbins 4 + int>0: log2(winFlank), where winFlank is the size of the left and right flanking regions for each window + +winReadCoverageRelativeMin 0.5 + real>=0: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only. + +winReadCoverageBasesMin 0 + int>0: minimum number of bases covered by the seeds in a window , for STARlong algorithm only. + +### Chimeric Alignments +chimOutType Junctions + string(s): type of chimeric output + Junctions ... Chimeric.out.junction + SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + WithinBAM SoftClip ... 
soft-clipping in the CIGAR for supplemental chimeric alignments + +chimSegmentMin 0 + int>=0: minimum length of chimeric segment length, if ==0, no chimeric output + +chimScoreMin 0 + int>=0: minimum total (summed) score of the chimeric segments + +chimScoreDropMax 20 + int>=0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length + +chimScoreSeparation 10 + int>=0: minimum difference (separation) between the best chimeric score and the next one + +chimScoreJunctionNonGTAG -1 + int: penalty for a non-GT/AG chimeric junction + +chimJunctionOverhangMin 20 + int>=0: minimum overhang for a chimeric junction + +chimSegmentReadGapMax 0 + int>=0: maximum gap in the read sequence between chimeric segments + +chimFilter banGenomicN + string(s): different filters for chimeric alignments + None ... no filtering + banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + +chimMainSegmentMultNmax 10 + int>=1: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments. + +chimMultimapNmax 0 + int>=0: maximum number of chimeric multi-alignments + 0 ... use the old scheme for chimeric detection which only considered unique alignments + +chimMultimapScoreRange 1 + int>=0: the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1 + +chimNonchimScoreDropMin 20 + int>=0: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value + +chimOutJunctionFormat 0 + int: formatting type for the Chimeric.out.junction file + 0 ... no comment lines/headers + 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + +### Quantification of Annotations +quantMode - + string(s): types of quantification requested + - ... none + TranscriptomeSAM ... 
output SAM/BAM alignments to transcriptome into a separate file + GeneCounts ... count reads per gene + +quantTranscriptomeBAMcompression 1 + int: -2 to 10 transcriptome BAM compression level + -2 ... no BAM output + -1 ... default compression (6?) + 0 ... no compression + 10 ... maximum compression + +quantTranscriptomeSAMoutput BanSingleEnd_BanIndels_ExtendSoftclip + string: alignment filtering for TranscriptomeSAM output + BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM + BanSingleEnd ... prohibit single-end alignments, allow indels and softclips + BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels + + +### 2-pass Mapping +twopassMode None + string: 2-pass mapping mode. + None ... 1-pass mapping + Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + +twopass1readsN -1 + int: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step. + + +### WASP parameters +waspOutputMode None + string: WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061–1063 (2015), https://www.nature.com/articles/nmeth.3582 . + SAMtag ... add WASP tags to the alignments that pass WASP filtering + +### STARsolo (single cell RNA-seq) parameters +soloType None + string(s): type of single-cell RNA-seq + CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. 
--readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate] + SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + +soloCBtype Sequence + string: cell barcode type + Sequence: cell barcode is a sequence (standard option) + String: cell barcode is an arbitrary string + +soloCBwhitelist - + string(s): file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file. + None ... no whitelist: all cell barcodes are allowed + +soloCBstart 1 + int>0: cell barcode start base + +soloCBlen 16 + int>0: cell barcode length + +soloUMIstart 17 + int>0: UMI start base + +soloUMIlen 10 + int>0: UMI length + +soloBarcodeReadLength 1 + int: length of the barcode read + 1 ... equal to sum of soloCBlen+soloUMIlen + 0 ... not defined, do not check + +soloBarcodeMate 0 + int: identifies which read mate contains the barcode (CB+UMI) sequence + 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + 1 ... barcode sequence is a part of mate 1 + 2 ... barcode sequence is a part of mate 2 + +soloCBposition - + strings(s): position of Cell Barcode(s) on the barcode read. + Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. 
Protocols, 2017): + --soloCBposition 0_0_2_-1 3_1_3_8 + +soloUMIposition - + string: position of the UMI on the barcode read, same as soloCBposition + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloUMIposition 3_9_3_14 + +soloAdapterSequence - + string: adapter sequence to anchor barcodes. Only one adapter sequence is allowed. + +soloAdapterMismatchesNmax 1 + int>0: maximum number of mismatches allowed in adapter sequence. + +soloCBmatchWLtype 1MM_multi + string: matching the Cell Barcodes to the WhiteList + Exact ... only exact matches allowed + 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used to choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + 1MM_multi_pseudocounts ... same as 1MM_multi, but pseudocounts of 1 are added to all whitelist barcodes. + 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + EditDist_2 ... allow up to edit distance of 3 for each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcodes are not allowed. Similar to ParseBio Split-seq pipeline. + +soloInputSAMattrBarcodeSeq - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR . + This parameter is required when running STARsolo with input from SAM. 
+ +soloInputSAMattrBarcodeQual - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + +soloStrand Forward + string: strandedness of the solo libraries: + Unstranded ... no strand information + Forward ... read strand same as the original RNA molecule + Reverse ... read strand opposite to the original RNA molecule + +soloFeatures Gene + string(s): genomic features for which the UMI counts per Cell Barcode are collected + Gene ... genes: reads match the gene transcript + SJ ... splice junctions: reported in SJ.out.tab + GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + +#####UnderDevelopment_begin : not supported - do not use + Transcript3p ... quantification of transcript for 3' protocols +#####UnderDevelopment_end + +soloMultiMappers Unique + string(s): counting method for reads mapping to multiple genes + Unique ... count only reads that map to unique genes + Uniform ... uniformly distribute multi-genic UMIs to all genes + Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + +soloUMIdedup 1MM_All + string(s): type of UMI deduplication (collapsing) algorithm + 1MM_All ... 
all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + Exact ... only exactly matching UMIs are collapsed. + NoDedup ... no deduplication of UMIs, count all reads. + 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + +soloUMIfiltering - + string(s): type of UMI filtering (for reads uniquely mapping to genes) + - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. + MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 . + Only works with --soloUMIdedup 1MM_CR + +soloOutFileNames Solo.out/ features.tsv barcodes.tsv matrix.mtx + string(s): file names for STARsolo output: + file_name_prefix gene_names barcode_sequences cell_feature_count_matrix + +soloCellFilter CellRanger2.2 3000 0.99 10 + string(s): cell filtering type and parameters + None ... do not output filtered cells + TopCells ... only report top cells by UMI count, followed by the exact number of cells + CellRanger2.2 ... simple filtering of CellRanger 2.2. + Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count + The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 + EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. 
Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN + The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 + +soloOutFormatFeaturesGeneField3 "Gene Expression" + string(s): field 3 in the Gene features.tsv file. If "-", then no 3rd field is output. + +soloCellReadStats None + string: Output reads statistics for each CB + Standard ... standard output + +#####UnderDevelopment_begin : not supported - do not use +soloClusterCBfile - + string: file containing the cluster information for cell barcodes, two columns: CB cluster_index. Only used with --soloFeatures Transcript3p +#####UnderDevelopment_end diff --git a/src/star/star_align_reads/script.py b/src/star/star_align_reads/script.py new file mode 100644 index 00000000..2bde8798 --- /dev/null +++ b/src/star/star_align_reads/script.py @@ -0,0 +1,109 @@ +import tempfile +import subprocess +import shutil +from pathlib import Path + +## VIASH START +par = { + "input": [ + "src/star/star_align_reads/test_data/a_R1.1.fastq", + "src/star/star_align_reads/test_data/a_R1.2.fastq", + ], + "input_r2": [ + "src/star/star_align_reads/test_data/a_R2.1.fastq", + "src/star/star_align_reads/test_data/a_R2.2.fastq", + ], + "genomeDir": "src/star/star_align_reads/test_data/genome.fasta", + "aligned_reads": "aligned_reads.sam" +} +meta = { + "cpus": 8, + "temp_dir": "/tmp" +} +## VIASH END + +################################################## +# check and process SE / PE R1 input files +input_r1 = par["input"] +readFilesIn = ",".join(par["input"]) +par["input"] = None + +# check and process PE R2 input files +input_r2 = par["input_r2"] +if input_r2 is not None: + if len(input_r1) != len(input_r2): + raise ValueError("The number of R1 and R2 files do 
not match.") + readFilesIn = [readFilesIn, ",".join(par["input_r2"])] + par["input_r2"] = None + +# store readFilesIn +par["readFilesIn"] = readFilesIn + +################################################## + +# determine readFilesCommand +if input_r1[0].endswith(".gz"): + print(">> Input files are gzipped, setting readFilesCommand to zcat", flush=True) + par["readFilesCommand"] = "zcat" +elif input_r1[0].endswith(".bz2"): + print(">> Input files are bzipped, setting readFilesCommand to bzcat", flush=True) + par["readFilesCommand"] = "bzcat" + +################################################## +# store output paths +expected_outputs = { + "aligned_reads": ["Aligned.out.sam", "Aligned.out.bam"], + "reads_per_gene": "ReadsPerGene.out.tab", + "chimeric_junctions": "Chimeric.out.junction", + "log": "Log.final.out", + "splice_junctions": "SJ.out.tab", + "unmapped": "Unmapped.out.mate1", + "unmapped_r2": "Unmapped.out.mate2" +} +output_paths = {name: par[name] for name in expected_outputs.keys()} +for name in expected_outputs.keys(): + par[name] = None + +################################################## +# process other args +par["runMode"] = "alignReads" + +if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + +################################################## +# run STAR and move output to final destination +with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True) as temp_dir: + print(">> Constructing command", flush=True) + + # set output paths + temp_dir = Path(temp_dir) + par["outTmpDir"] = temp_dir / "tempdir" + out_dir = temp_dir / "out" + par["outFileNamePrefix"] = f"{out_dir}/" # star needs this slash + + # construct command + cmd_args = [ "STAR" ] + for name, value in par.items(): + if value is not None: + val_to_add = value if isinstance(value, list) else [value] + cmd_args.extend([f"--{name}"] + [str(x) for x in val_to_add]) + print("", flush=True) + + # run command + print(">> Running STAR with 
command:", flush=True) + print(f"+ {' '.join(cmd_args)}", end="\n\n", flush=True) + subprocess.run( + cmd_args, + check=True + ) + print(">> STAR finished successfully", end="\n\n", flush=True) + + # move output to final destination + print(">> Moving output to final destination", flush=True) + for name, paths in expected_outputs.items(): + for expected_path in [paths] if isinstance(paths, str) else paths: + expected_full_path = out_dir / expected_path + if output_paths[name] and expected_full_path.is_file(): + print(f">> Moving {expected_path} to {output_paths[name]}", flush=True) + shutil.move(expected_full_path, output_paths[name]) diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh new file mode 100644 index 00000000..374b9014 --- /dev/null +++ b/src/star/star_align_reads/test.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +set -e + +## VIASH START +meta_executable="target/docker/star/star_align_reads/star_align_reads" +meta_resources_dir="src/star/star_align_reads" +## VIASH END + +######################################################################################### + +# helper functions +assert_file_exists() { + [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) +} +assert_file_empty() { + [ ! 
-s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) +} +assert_file_not_empty() { + [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) +} +assert_file_contains() { + grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains() { + grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} +assert_file_contains_regex() { + grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains_regex() { + grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} + +######################################################################################### +echo "> Prepare test data" + +cat > reads_R1.fastq <<'EOF' +@SEQ_ID1 +ACGCTGCCTCATAAGCCTCACACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +ACCCGCAAGATTAGGCTCCGTACAC ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > reads_R2.fastq <<'EOF' +@SEQ_ID1 +ATGTGTGAGGCTTATGAGGCAGCGT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +GTGTACGGAGCCTAATCTTGCAGGG ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > genome.fasta <<'EOF' +>chr1 +TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG +GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA +TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA +EOF + +cat > genes.gtf <<'EOF' +chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source exon 20 40 . + . 
gene_id "gene1"; transcript_id "transcript1"; +EOF + +echo "> Generate index" +STAR \ + ${meta_cpus:+--runThreadN $meta_cpus} \ + --runMode genomeGenerate \ + --genomeDir "index/" \ + --genomeFastaFiles "genome.fasta" \ + --sjdbGTFfile "genes.gtf" \ + --genomeSAindexNbases 2 + +######################################################################################### + +mkdir star_align_reads_se +cd star_align_reads_se + +echo "> Run star_align_reads on SE" +"$meta_executable" \ + --input "../reads_R1.fastq" \ + --genomeDir "../index/" \ + --aligned_reads "output.sam" \ + --log "log.txt" \ + --outReadsUnmapped "Fastx" \ + --unmapped "unmapped.sam" \ + --quantMode "TranscriptomeSAM;GeneCounts" \ + --reads_per_gene "reads_per_gene.tsv" \ + --outSJtype Standard \ + --splice_junctions "splice_junctions.tsv" \ + ${meta_cpus:+---cpus $meta_cpus} + +# TODO: Test data doesn't contain any chimeric reads yet +# --chimOutType "Junctions" \ +# --chimeric_junctions "chimeric_junctions.tsv" \ + +echo ">> Check if output exists" +assert_file_exists "output.sam" +assert_file_exists "log.txt" +assert_file_exists "reads_per_gene.tsv" +# assert_file_exists "chimeric_junctions.tsv" +assert_file_exists "splice_junctions.tsv" +assert_file_exists "unmapped.sam" + +echo ">> Check if output contents are not empty" +assert_file_not_empty "output.sam" +assert_file_not_empty "log.txt" +assert_file_not_empty "reads_per_gene.tsv" +# assert_file_not_empty "chimeric_junctions.tsv" +# assert_file_not_empty "splice_junctions.tsv" # TODO: test data doesn't contain any splice junctions yet +assert_file_not_empty "unmapped.sam" + +echo ">> Check if output contents are correct" +assert_file_contains "log.txt" "Number of input reads \\| 2" +assert_file_contains "log.txt" "Number of reads unmapped: too short \\| 1" +assert_file_contains "log.txt" "Uniquely mapped reads number \\| 1" +assert_file_contains "reads_per_gene.tsv" "gene1 1 1 0" +assert_file_contains "reads_per_gene.tsv" "N_unmapped 1 1 1" 
+assert_file_contains "output.sam" "SEQ_ID1 0 chr1 17 255 25M \\* 0 0 ACGCTGCCTCATAAGCCTCACACAT IIIIIIIIIIIIIIIIIIIIIIIII NH:i:1 HI:i:1 AS:i:24 nM:i:0"
+assert_file_contains "unmapped.sam" "@SEQ_ID2 0:N:"
+assert_file_contains "unmapped.sam" "ACCCGCAAGATTAGGCTCCGTACAC"
+
+cd ..
+
+#########################################################################################
+
+mkdir star_align_reads_pe_minimal
+cd star_align_reads_pe_minimal
+
+echo ">> Run star_align_reads on PE"
+"$meta_executable" \
+  --input ../reads_R1.fastq \
+  --input_r2 ../reads_R2.fastq \
+  --genomeDir ../index/ \
+  --aligned_reads output.bam \
+  --log log.txt \
+  --outReadsUnmapped Fastx \
+  --unmapped unmapped_r1.bam \
+  --unmapped_r2 unmapped_r2.bam \
+  ${meta_cpus:+---cpus $meta_cpus}
+
+echo ">> Check if output exists"
+assert_file_exists "output.bam"
+assert_file_exists "log.txt"
+assert_file_exists "unmapped_r1.bam"
+assert_file_exists "unmapped_r2.bam"
+
+echo ">> Check if output contents are not empty"
+assert_file_not_empty "output.bam"
+assert_file_not_empty "log.txt"
+assert_file_not_empty "unmapped_r1.bam"
+assert_file_not_empty "unmapped_r2.bam"
+
+echo ">> Check if output contents are correct"
+assert_file_contains "log.txt" "Number of input reads \\| 2"
+assert_file_contains "log.txt" "Number of reads unmapped: too short \\| 1"
+assert_file_contains "log.txt" "Uniquely mapped reads number \\| 1"
+
+cd ..
+
+#########################################################################################
+
+echo "> Test successful"
diff --git a/src/star/star_align_reads/utils/process_params.R b/src/star/star_align_reads/utils/process_params.R
new file mode 100644
index 00000000..ccdc50b3
--- /dev/null
+++ b/src/star/star_align_reads/utils/process_params.R
@@ -0,0 +1,235 @@
+library(tidyverse)
+
+# This script processes the STAR aligner's help file
+# to create a viash argument_groups.yaml file.
+#
+# Input:  `local_file`, the text of STAR's parameter documentation.
+# Output: `yaml_file`, a YAML document with a single `argument_groups` key.
+# Both paths are relative, so the script must be run from the directory that
+# contains `src/`.
+
+local_file <- "src/star/star_align_reads/help.txt"
+yaml_file <- "src/star/star_align_reads/argument_groups.yaml"
+
+# Ascending index range from `from` to `to`; empty when `from > to`.
+# Plain `seq(from, to)` would count downwards instead of returning nothing.
+seq_range <- function(from, to) {
+  if (from <= to) seq(from, to) else integer(0)
+}
+
+param_txt <- readr::read_lines(local_file)
+
+# replace non-ascii characters with their ascii approximations
+param_txt <- iconv(param_txt, "UTF-8", "ASCII//TRANSLIT")
+
+dev_begin <- grep("#####UnderDevelopment_begin", param_txt)
+dev_end <- grep("#####UnderDevelopment_end", param_txt)
+
+# strip development sections: keep the lines before the first begin marker,
+# between each end marker and the next begin marker, and after the last end
+# marker (single-line stretches included)
+nondev_ix <- unlist(map2(
+  c(1, dev_end + 1),
+  c(dev_begin - 1, length(param_txt)),
+  seq_range
+))
+
+param_txt2 <- param_txt[nondev_ix]
+
+# strip comments: lines starting with '#' followed by a non-'#' character.
+# A logical mask is used because `x[-grep(...)]` drops every element when
+# nothing matches.
+param_txt3 <- param_txt2[!grepl("^#[^#]", param_txt2)]
+
+# detect groups: each line starting with "### " opens a new argument group
+group_ix <- grep("^### ", param_txt3)
+
+# parse the help text into one row per argument with the columns
+# group_name, name, default, type and description
+out <- map2_dfr(
+  group_ix,
+  c(group_ix[-1] - 1, length(param_txt3)),
+  function(group_start, group_end) {
+    group_name <- gsub("^### ", "", param_txt3[[group_start]])
+
+    group_txt <- param_txt3[seq_range(group_start + 1, group_end)]
+
+    # an argument starts on every line that does not begin with a space
+    arg_ix <- grep("^[^ ]", group_txt)
+
+    arguments <- map2_dfr(
+      arg_ix,
+      c(arg_ix[-1] - 1, length(group_txt)),
+      function(arg_start, arg_end) {
+        # process name and default
+        first_txt <- group_txt[[arg_start]]
+        first_regex <- "^([^ ]*) +(.*) *$"
+        if (!grepl(first_regex, first_txt)) {
+          stop("Line '", first_txt, "' did not match regex '", first_regex, "'")
+        }
+        name <- gsub(first_regex, "\\1", first_txt)
+        default <- gsub(first_regex, "\\2", first_txt)
+
+        # process type and first description
+        second_txt <- group_txt[[arg_start + 1]]
+        second_regex <- "^ +([^:]*):[ ]+(.*)$"
+        if (!grepl(second_regex, second_txt)) {
+          stop("Line '", second_txt, "' did not match regex '", second_regex, "'")
+        }
+        type <- gsub(second_regex, "\\1", second_txt)
+        desc_start <- str_trim(gsub(second_regex, "\\2", second_txt))
+
+        # process more description (empty when the argument has only two lines)
+        desc_cont1 <- group_txt[seq_range(arg_start + 2, arg_end)]
+
+        desc <-
+          if (sum(str_length(desc_cont1)) == 0) {
+            desc_start
+          } else {
+            # detect margin: the shortest leading run of spaces; fall back to
+            # no margin when no continuation line is indented
+            margins <- str_extract(desc_cont1, "^( +)") %>% na.omit
+            margin <-
+              if (length(margins) > 0) {
+                margins[[which.min(str_length(margins))]]
+              } else {
+                ""
+              }
+            desc_cont2 <- gsub(paste0("^", margin), "", desc_cont1)
+            # lines containing "..." are prefixed with "- " (list-item style)
+            desc_cont3 <- ifelse(grepl("\\.\\.\\.", desc_cont2), paste0("- ", desc_cont2), desc_cont2)
+            desc_cont4 <- str_trim(desc_cont3)
+
+            # construct desc
+            str_trim(paste0(c(desc_start, "", desc_cont4), "\n", collapse = ""))
+          }
+
+        tibble(
+          group_name,
+          name,
+          default,
+          type,
+          description = desc
+        )
+      }
+    )
+
+    arguments
+  }
+)
+
+# todo: manually fix alignEndsProtrude?
+# assigning types
+# STAR type keyword -> viash argument type
+type_map <- c("string" = "string", "int" = "integer", "real" = "double", "double" = "double", "int, string" = "string")
+# arguments whose viash type is overridden to "file"
+file_args <- c("genomeDir", "readFilesIn", "sjdbGTFfile", "genomeFastaFiles", "genomeChainFiles", "readFilesManifest")
+# arguments whose viash type is overridden to "long"
+long_args <- c("limitGenomeGenerateRAM", "limitIObufferSize", "limitOutSAMoneReadBytes", "limitBAMsortRAM")
+# arguments flagged as required in the YAML
+required_args <- c("genomeDir", "readFilesIn")
+
+# converting examples
+# Parse an integer; when as.integer() warns, fall back to a 64-bit integer.
+as_safe_int <- function(x) {
+  tryCatch(
+    as.integer(x),
+    warning = function(e) bit64::as.integer64(x)
+  )
+}
+# Split a string on whitespace that is not inside single or double quotes,
+# then drop one leading and one trailing quote character from each piece.
+safe_split <- function(x) {
+  strsplit(x, "'[^']*'(*SKIP)(*F)|\"[^\"]*\"(*SKIP)(*F)|\\s+", perl = TRUE)[[1]] %>%
+    gsub("^[\"']|[\"']$", "", .)
+}
+# parsers from the raw default string to an example value, keyed by viash
+# type; the plural keys are used for arguments that accept multiple values
+trafos <- list(
+  string = function(x) x,
+  integer = as_safe_int,
+  double = as.numeric,
+  strings = function(x) safe_split(x),
+  integers = function(x) sapply(safe_split(x), as_safe_int),
+  doubles = function(x) as.numeric(safe_split(x))
+)
+# remove arguments that are not relevant for viash
+removed_args <- c("versionGenome", "parametersFiles", "sysShell", "runDirPerm")
+# these settings are defined by the viash component
+manual_args <- c("runThreadN", "outTmpDir", "runMode", "outFileNamePrefix", "readFilesIn")
+
+# make viash-like values
+out2 <- out %>%
+  # remove arguments that are not relevant for viash
+  filter(!name %in% c(removed_args, manual_args)) %>%
+  # remove arguments that are related to a different runmode
+  filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) %>%
+  filter(!grepl("--runMode", group_name) | grepl("--runMode alignReads", group_name)) %>%
+  filter(!grepl("STARsolo", group_name)) %>%
+  mutate(
+    viash_arg = paste0("--", name),
+    # reduce the free-text type to its keyword plus an optional plural "s"
+    type_step1 = type %>%
+      str_replace_all(".*(int, string|string|int|real|double)\\(?(s?).*", "\\1\\2"),
+    viash_type = type_map[gsub("(int, string|string|int|real|double).*", "\\1", type_step1)],
+    # multi-valued: "int, string", a plural keyword, or a type that starts
+    # with "4" or "N" followed by "*" or a space
+    multiple = type_step1 == "int, string" | grepl("s$", type_step1) | grepl("^[4N][\\* ]", type),
+    # "-" and "None" mean "no default"
+    default_step1 = default %>%
+      {ifelse(. %in% c("-", "None"), NA_character_, .)},
+    viash_default =
+      mapply(
+        default_step1,
+        paste0(viash_type, ifelse(multiple, "s", "")),
+        FUN = function(str, typ) trafos[[typ]](str)
+      ),
+    # viash_type = ifelse(sapply(viash_default, bit64::is.integer64), "long", viash_type),
+    # update type
+    viash_type = case_when(
+      name %in% long_args ~ "long",
+      name %in% file_args ~ "file",
+      TRUE ~ viash_type
+    ),
+    # turn longs into character because yaml::write_yaml doesn't handle longs well
+    viash_default = ifelse(sapply(viash_default, bit64::is.integer64), map(viash_default, as.character), viash_default),
+    group_name = gsub(" - .*", "", group_name),
+    required = ifelse(name %in% required_args, TRUE, NA)
+  )
+# diagnostics only: the results below are not assigned and do not affect the
+# generated YAML
+print(out2, n = 200)
+out2 %>% mutate(i = row_number()) %>%
+  # filter(is.na(default_step1) != is.na(viash_default)) %>%
+  select(-group_name, -description)
+
+out2 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description))
+
+# build the nested list written to YAML: one entry per group, each holding
+# its arguments; `example`, `multiple`/`multiple_sep` and `required` are
+# only included when set
+argument_groups <- map(unique(out2$group_name), function(group_name) {
+  args <- out2 %>%
+    filter(group_name == !!group_name) %>%
+    pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, ...) {
+      li <- lst(
+        name = viash_arg,
+        type = viash_type,
+        description = description
+      )
+      if (all(!is.na(viash_default))) {
+        li$example <- viash_default
+      }
+      if (!is.na(multiple) && multiple) {
+        li$multiple <- multiple
+        li$multiple_sep <- ";"
+      }
+      if (!is.na(required) && required) {
+        li$required <- required
+      }
+      li
+    })
+  list(name = group_name, arguments = args)
+})
+
+yaml::write_yaml(list(argument_groups = argument_groups), yaml_file)
From 2db6d2997d5a38a1c11758bb5ce401b3e7ff947c Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Tue, 27 Feb 2024 21:32:39 +0100
Subject: [PATCH 3/3] add more info to the project config

---
 _viash.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/_viash.yaml b/_viash.yaml
index 8e09d947..a72a1ab7 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -2,5 +2,9 @@ name: biobase
 description: |
   A collection of bioinformatics tools for working with sequence data.
 license: MIT
+keywords: [bioinformatics, sequence, alignment, variant calling]
+links:
+  issue_tracker: https://github.com/viash-hub/biobase/issues
+  repository: https://github.com/viash-hub/biobase
 
 viash_version: 0.9.0-RC2
\ No newline at end of file