diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9974802..f2ab0c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,173 +1,37 @@ # Include shared CI include: - - project: "epi2melabs/ci-templates" - file: "wf-containers.yaml" + - project: "epi2melabs/ci-templates" + file: "wf-containers.yaml" variables: - # We'll use the single-file case for these runs - NF_WORKFLOW_OPTS: "--fastq test_data/reads.fastq.gz" - CI_FLAVOUR: "new" # set to "classic" for old-style CI - TEST_CMD: "bash test/run_fastq_ingress_test.sh" - S3_TEST_DATA: "s3://ont-exd-int-s3-euwst1-epi2me-labs/wf-template/cw-1019/test_data" - - -# Remove this block in downstream templates -singularity-run: - tags: [] # no need for big ram -# end - + NF_WORKFLOW_OPTS: + "--fastq test_data/fastq --reference test_data/reference.fasta \ + --threads 2" + CI_FLAVOUR: "new" docker-run: - - # Remove this directive in downstream templates - tags: [] # no need for big ram - - # Define a 1D job matrix to inject a variable named MATRIX_NAME into - # the CI environment, we can use the value of MATRIX_NAME to determine - # which options to apply as part of the rules block below - # NOTE There is a slightly cleaner way to define this matrix to include - # the variables, but it is broken when using long strings! See CW-756 - parallel: - matrix: - # TODO: do we really need to run all from s3? - - MATRIX_NAME: [ - "single-file", "single-file-s3", - "case01", "case01-s3", - "case02", "case02-s3", - "case03", "case03-s3", - "case04", "case04-s3", - "case05", "case05-s3", - "case06", "case06-s3", - "case07", "case07-s3", - "case08", "case08-s3", case08-unclassified, case08-unclassified-s3, - "case09", "case09-s3", - "case10", "case10-s3", - "case11", "case11-s3", - ] - rules: - # NOTE As we're overriding the rules block for the included docker-run - # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run - # being incorrectly scheduled for "detached merge request pipelines" etc. - - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") - when: never - - if: $MATRIX_NAME == "single-file" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/reads.fastq.gz" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/reads.fastq.gz $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "single-file-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/reads.fastq.gz" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/reads.fastq.gz $$PWD/$$CI_PROJECT_NAME - - - if: $MATRIX_NAME == "case01" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case01" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case01 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case01-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case01" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case01 $$PWD/$$CI_PROJECT_NAME - - - if: $MATRIX_NAME == "case02" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case02" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case02 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case02-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case02" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case02 $$PWD/$$CI_PROJECT_NAME - - - if: $MATRIX_NAME == "case03" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case03" - ASSERT_NEXTFLOW_FAILURE: "1" - ASSERT_NEXTFLOW_FAILURE_REXP: "Input directory '.*' cannot contain FASTQ files and sub-directories." - - if: $MATRIX_NAME == "case03-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case03" - ASSERT_NEXTFLOW_FAILURE: "1" - ASSERT_NEXTFLOW_FAILURE_REXP: "Input directory '.*' cannot contain FASTQ files and sub-directories." - - - if: $MATRIX_NAME == "case04" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case04" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case04 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case04-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case04" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case04 $$PWD/$$CI_PROJECT_NAME - - - if: $MATRIX_NAME == "case05" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case05 --sample_sheet test_data/fastq_ingress/case05/sample_sheet.csv" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case05 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case05-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case05 --sample_sheet $S3_TEST_DATA/fastq_ingress/case05/sample_sheet.csv" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case05 $$PWD/$$CI_PROJECT_NAME $S3_TEST_DATA/fastq_ingress/case05/sample_sheet.csv - - - if: $MATRIX_NAME == "case06" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case06 --sample_sheet test_data/fastq_ingress/case06/sample_sheet.csv" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case06 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case06-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case06 --sample_sheet $S3_TEST_DATA/fastq_ingress/case06/sample_sheet.csv" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case06 $$PWD/$$CI_PROJECT_NAME $S3_TEST_DATA/fastq_ingress/case06/sample_sheet.csv - - - if: $MATRIX_NAME == "case07" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case07 --sample_sheet test_data/fastq_ingress/case07/sample_sheet.csv" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case07 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case07-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case07 --sample_sheet $S3_TEST_DATA/fastq_ingress/case07/sample_sheet.csv" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case07 $$PWD/$$CI_PROJECT_NAME $S3_TEST_DATA/fastq_ingress/case07/sample_sheet.csv - - - if: $MATRIX_NAME == "case08" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case08" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case08 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case08-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case08" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case08 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case08-unclassified" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case08 --analyse_unclassified" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case08 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case08-unclassified-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case08 --analyse_unclassified" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case08 $$PWD/$$CI_PROJECT_NAME - - - if: $MATRIX_NAME == "case09" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case09" - AFTER_NEXTFLOW_CMD: $TEST_CMD test_data/fastq_ingress/case09 $$PWD/$$CI_PROJECT_NAME - - if: $MATRIX_NAME == "case09-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case09" - AFTER_NEXTFLOW_CMD: $TEST_CMD $S3_TEST_DATA/fastq_ingress/case09 $$PWD/$$CI_PROJECT_NAME - - - if: $MATRIX_NAME == "case10" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case10" - ASSERT_NEXTFLOW_FAILURE: "1" - ASSERT_NEXTFLOW_FAILURE_REXP: "Input directory '.*' cannot contain more than one level of sub-directories with FASTQ files." - - if: $MATRIX_NAME == "case10-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case10" - ASSERT_NEXTFLOW_FAILURE: "1" - ASSERT_NEXTFLOW_FAILURE_REXP: "Input directory '.*' cannot contain more than one level of sub-directories with FASTQ files." - - - if: $MATRIX_NAME == "case11" - variables: - NF_WORKFLOW_OPTS: "--fastq test_data/fastq_ingress/case11 --sample_sheet test_data/fastq_ingress/case11/sample_sheet.csv" - ASSERT_NEXTFLOW_FAILURE: "1" - ASSERT_NEXTFLOW_FAILURE_REXP: "Invalid sample sheet: values in 'alias' column not unique" - - if: $MATRIX_NAME == "case11-s3" - variables: - NF_WORKFLOW_OPTS: "--fastq $S3_TEST_DATA/fastq_ingress/case11 --sample_sheet $S3_TEST_DATA/fastq_ingress/case11/sample_sheet.csv" - ASSERT_NEXTFLOW_FAILURE: "1" - ASSERT_NEXTFLOW_FAILURE_REXP: "Invalid sample sheet: values in 'alias' column not unique" \ No newline at end of file + # Remove this directive in downstream templates + tags: [large_ram] # no need for big ram + + # Define a 1D job matrix to inject a variable named MATRIX_NAME into + # the CI environment, we can use the value of MATRIX_NAME to determine + # which options to apply as part of the rules block below + # NOTE There is a slightly cleaner way to define this matrix to include + # the variables, but it is broken when using long strings! See CW-756 + parallel: + matrix: + - MATRIX_NAME: ["ref"] + rules: + # NOTE As we're overriding the rules block for the included docker-run + # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run + # being incorrectly scheduled for "detached merge request pipelines" etc. + - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") + when: never + - if: $MATRIX_NAME == "ref" + variables: + NF_WORKFLOW_OPTS: + "--fastq test_data/fastq --reference test_data/reference.fasta \ + --threads 2 --reads_downsampling_size 2000" + NF_PROCESS_FILES: > + main.nf + modules/local/pipeline.nf diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f756c2..1d1d82d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,9 +18,10 @@ repos: additional_dependencies: - epi2melabs - repo: https://github.com/pycqa/flake8 - rev: 3.7.9 + rev: 5.0.4 hooks: - id: flake8 + pass_filenames: false additional_dependencies: - flake8-rst-docstrings - flake8-docstrings @@ -31,4 +32,9 @@ repos: - flake8-builtins - flake8-absolute-import - flake8-print - entry: flake8 bin --import-order-style google --statistics --max-line-length 88 + args: [ + "bin", + "--import-order-style=google", + "--statistics", + "--max-line-length=88", + ] diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b99d3f..6967966 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,6 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [v0.0.1] +## [v0.1.0] First release. diff --git a/LICENSE b/LICENSE index 3ee2c24..bd5e2de 100644 --- a/LICENSE +++ b/LICENSE @@ -1,12 +1,5 @@ -This Source Code Form is subject to the terms of the Mozilla Public -License, v. 2.0. If a copy of the MPL was not distributed with this -file, You can obtain one at http://mozilla.org/MPL/2.0/. - -(c) 2021- Oxford Nanopore Technologies Ltd. - - -Mozilla Public License Version 2.0 -================================== +Oxford Nanopore Technologies PLC. Public License Version 1.0 +============================================================= 1. Definitions -------------- @@ -17,7 +10,7 @@ Mozilla Public License Version 2.0 1.2. "Contributor Version" means the combination of the Contributions of others (if any) used - by a Contributor and that particular Contributor's Contribution. + by a Contributor and that particular Contributor’s Contribution. 1.3. "Contribution" means Covered Software of a particular Contributor. @@ -28,59 +21,46 @@ Mozilla Public License Version 2.0 Form, and Modifications of such Source Code Form, in each case including portions thereof. -1.5. "Incompatible With Secondary Licenses" - means - - (a) that the initial Contributor has attached the notice described - in Exhibit B to the Covered Software; or - - (b) that the Covered Software was made available under the terms of - version 1.1 or earlier of the License, but not also under the - terms of a Secondary License. - -1.6. "Executable Form" +1.5. "Executable Form" means any form of the work other than Source Code Form. -1.7. "Larger Work" +1.6. "Larger Work" means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. -1.8. "License" +1.7. "License" means this document. -1.9. "Licensable" +1.8. "Licensable" means having the right to grant, to the maximum extent possible, whether at the time of the initial grant or subsequently, any and all of the rights conveyed by this License. -1.10. "Modifications" +1.9. "Modifications" means any of the following: - (a) any file in Source Code Form that results from an addition to, - deletion from, or modification of the contents of Covered - Software; or - - (b) any new file in Source Code Form that contains any Covered - Software. + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + (b) any new file in Source Code Form that contains any Covered + Software. -1.11. "Patent Claims" of a Contributor - means any patent claim(s), including without limitation, method, - process, and apparatus claims, in any patent Licensable by such - Contributor that would be infringed, but for the grant of the - License, by the making, using, selling, offering for sale, having - made, import, or transfer of either its Contributions or its - Contributor Version. +1.10. "Research Purposes" + means use for internal research and not intended for or directed + towards commercial advantages or monetary compensation; provided, + however, that monetary compensation does not include sponsored + research of research funded by grants. -1.12. "Secondary License" +1.11 "Secondary License" means either the GNU General Public License, Version 2.0, the GNU Lesser General Public License, Version 2.1, the GNU Affero General Public License, Version 3.0, or any later versions of those licenses. -1.13. "Source Code Form" +1.12. "Source Code Form" means the form of the work preferred for making modifications. -1.14. "You" (or "Your") +1.13. "You" (or "Your") means an individual or a legal entity exercising rights under this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with You. For @@ -96,53 +76,47 @@ Mozilla Public License Version 2.0 2.1. Grants Each Contributor hereby grants You a world-wide, royalty-free, -non-exclusive license: - -(a) under intellectual property rights (other than patent or trademark) - Licensable by such Contributor to use, reproduce, make available, - modify, display, perform, distribute, and otherwise exploit its - Contributions, either on an unmodified basis, with Modifications, or - as part of a Larger Work; and - -(b) under Patent Claims of such Contributor to make, use, sell, offer - for sale, have made, import, and otherwise transfer either its - Contributions or its Contributor Version. +non-exclusive license under Contributor copyrights Licensable by such +Contributor to use, reproduce, make available, modify, display, +perform, distribute, and otherwise exploit solely for Research Purposes +its Contributions, either on an unmodified basis, with Modifications, +or as part of a Larger Work. 2.2. Effective Date The licenses granted in Section 2.1 with respect to any Contribution -become effective for each Contribution on the date the Contributor first -distributes such Contribution. +become effective for each Contribution on the date the Contributor +first distributes such Contribution. 2.3. Limitations on Grant Scope The licenses granted in this Section 2 are the only rights granted under this License. No additional rights or licenses will be implied from the -distribution or licensing of Covered Software under this License. -Notwithstanding Section 2.1(b) above, no patent license is granted by a -Contributor: +distribution or licensing of Covered Software under this License. The +License is incompatible with Secondary Licenses. Notwithstanding +Section 2.1 above, no copyright license is granted: (a) for any code that a Contributor has removed from Covered Software; or -(b) for infringements caused by: (i) Your and any other third party's - modifications of Covered Software, or (ii) the combination of its - Contributions with other software (except as part of its Contributor - Version); or +(b) use of the Contributions or its Contributor Version other than for +Research Purposes only; or -(c) under Patent Claims infringed by Covered Software in the absence of - its Contributions. +(c) for infringements caused by: (i) Your and any other third party’s +modifications of Covered Software, or (ii) the combination of its +Contributions with other software (except as part of its Contributor +Version). -This License does not grant any rights in the trademarks, service marks, -or logos of any Contributor (except as may be necessary to comply with -the notice requirements in Section 3.4). +This License does not grant any rights in the patents, trademarks, +service marks, or logos of any Contributor (except as may be necessary +to comply with the notice requirements in Section 3.4). 2.4. Subsequent Licenses No Contributor makes additional grants as a result of Your choice to distribute the Covered Software under a subsequent version of this -License (see Section 10.2) or under the terms of a Secondary License (if -permitted under the terms of Section 3.3). +License (see Section 10.2) or under the terms of a Secondary License +(if permitted under the terms of Section 3.3). 2.5. Representation @@ -171,8 +145,7 @@ Modifications that You create or to which You contribute, must be under the terms of this License. You must inform recipients that the Source Code Form of the Covered Software is governed by the terms of this License, and how they can obtain a copy of this License. You may not -attempt to alter or restrict the recipients' rights in the Source Code -Form. +attempt to alter or restrict the recipients’ rights in the Source Code Form. 3.2. Distribution of Executable Form @@ -185,22 +158,14 @@ If You distribute Covered Software in Executable Form then: than the cost of distribution to the recipient; and (b) You may distribute such Executable Form under the terms of this - License, or sublicense it under different terms, provided that the - license for the Executable Form does not attempt to limit or alter - the recipients' rights in the Source Code Form under this License. + License. 3.3. Distribution of a Larger Work You may create and distribute a Larger Work under terms of Your choice, provided that You also comply with the requirements of this License for -the Covered Software. If the Larger Work is a combination of Covered -Software with a work governed by one or more Secondary Licenses, and the -Covered Software is not Incompatible With Secondary Licenses, this -License permits You to additionally distribute such Covered Software -under the terms of such Secondary License(s), so that the recipient of -the Larger Work may, at their option, further distribute the Covered -Software under the terms of either this License or such Secondary -License(s). +the Covered Software. The Larger Work may not be a combination of Covered +Software with a work governed by one or more Secondary Licenses. 3.4. Notices @@ -212,16 +177,15 @@ the extent required to remedy known factual inaccuracies. 3.5. Application of Additional Terms -You may choose to offer, and to charge a fee for, warranty, support, -indemnity or liability obligations to one or more recipients of Covered -Software. However, You may do so only on Your own behalf, and not on -behalf of any Contributor. You must make it absolutely clear that any -such warranty, support, indemnity, or liability obligation is offered by -You alone, and You hereby agree to indemnify every Contributor for any -liability incurred by such Contributor as a result of warranty, support, -indemnity or liability terms You offer. You may include additional -disclaimers of warranty and limitations of liability specific to any -jurisdiction. +You may not choose to offer, or charge a fee for use of the Covered +Software or a fee for, warranty, support, indemnity or liability +obligations to one or more recipients of Covered Software. You must +make it absolutely clear that any such warranty, support, indemnity, or +liability obligation is offered by You alone, and You hereby agree to +indemnify every Contributor for any liability incurred by such +Contributor as a result of warranty, support, indemnity or liability +terms You offer. You may include additional disclaimers of warranty and +limitations of liability specific to any jurisdiction. 4. Inability to Comply Due to Statute or Regulation --------------------------------------------------- @@ -240,23 +204,12 @@ recipient of ordinary skill to be able to understand it. -------------- 5.1. The rights granted under this License will terminate automatically -if You fail to comply with any of its terms. However, if You become -compliant, then the rights granted under this License from a particular -Contributor are reinstated (a) provisionally, unless and until such -Contributor explicitly and finally terminates Your grants, and (b) on an -ongoing basis, if such Contributor fails to notify You of the -non-compliance by some reasonable means prior to 60 days after You have -come back into compliance. Moreover, Your grants from a particular -Contributor are reinstated on an ongoing basis if such Contributor -notifies You of the non-compliance by some reasonable means, this is the -first time You have received notice of non-compliance with this License -from such Contributor, and You become compliant prior to 30 days after -Your receipt of the notice. - -5.2. If You initiate litigation against any entity by asserting a patent +if You fail to comply with any of its terms. + +5.2. If You initiate litigation against any entity by asserting an infringement claim (excluding declaratory judgment actions, counter-claims, and cross-claims) alleging that a Contributor Version -directly or indirectly infringes any patent, then the rights granted to +directly or indirectly infringes, then the rights granted to You by any and all Contributors for the Covered Software under Section 2.1 of this License shall terminate. @@ -299,8 +252,10 @@ prior to termination shall survive termination. * and all other commercial damages or losses, even if such party * * shall have been informed of the possibility of such damages. This * * limitation of liability shall not apply to liability for death or * -* personal injury resulting from such party's negligence to the * -* extent applicable law prohibits such limitation. Some * +* personal injury resulting from such party’s negligence to the * +* extent applicable law prohibits such limitation, but in such event, * +* and to the greatest extent permissible, damages will be limited to * +* direct damages not to exceed one hundred dollars. Some * * jurisdictions do not allow the exclusion or limitation of * * incidental or consequential damages, so this exclusion and * * limitation may not apply to You. * @@ -314,7 +269,7 @@ Any litigation relating to this License may be brought only in the courts of a jurisdiction where the defendant maintains its principal place of business and such litigation shall be governed by laws of that jurisdiction, without reference to its conflict-of-law provisions. -Nothing in this Section shall prevent a party's ability to bring +Nothing in this Section shall prevent a party’s ability to bring cross-claims or counter-claims. 9. Miscellaneous @@ -332,10 +287,10 @@ shall not be used to construe this License against a Contributor. 10.1. New Versions -Mozilla Foundation is the license steward. Except as provided in Section -10.3, no one other than the license steward has the right to modify or -publish new versions of this License. Each version will be given a -distinguishing version number. +Oxford Nanopore Technologies PLC. is the license steward. Except as +provided in Section 10.3, no one other than the license steward has the +right to modify or publish new versions of this License. Each version +will be given a distinguishing version number. 10.2. Effect of New Versions @@ -352,19 +307,12 @@ modified version of this License if you rename the license and remove any references to the name of the license steward (except to note that such modified license differs from this License). -10.4. Distributing Source Code Form that is Incompatible With Secondary -Licenses - -If You choose to distribute Source Code Form that is Incompatible With -Secondary Licenses under the terms of this version of the License, the -notice described in Exhibit B of this License must be attached. - Exhibit A - Source Code Form License Notice ------------------------------------------- - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at http://mozilla.org/MPL/2.0/. + This Source Code Form is subject to the terms of the Oxford Nanopore + Technologies PLC. Public License, v. 1.0. Full licence can be found + obtained from support@nanoporetech.com If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE @@ -372,9 +320,3 @@ file in a relevant directory) where a recipient would be likely to look for such a notice. You may add additional accurate notices of copyright ownership. - -Exhibit B - "Incompatible With Secondary Licenses" Notice ---------------------------------------------------------- - - This Source Code Form is "Incompatible With Secondary Licenses", as - defined by the Mozilla Public License, v. 2.0. diff --git a/README.md b/README.md index c12d86a..4fa41a5 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,113 @@ -# Workflow template +# wf-amplicon + + + -This repository contains a [nextflow](https://www.nextflow.io/) workflow -template that can be used as the basis for creating new workflows. -> This workflow is not intended to be used by end users. ## Introduction -This section of documentation typically contains an overview of the workflow in terms of motivation -and bioinformatics methods, listing any key tools or algorithms employed, whilst also describing its -range of use-cases and what a suitable input dataset should look like. +This [Nextflow](https://www.nextflow.io/) workflow provides a simple way to +analyse Oxford Nanopore reads generated from amplicons.
+ +It requires the raw reads and a reference FASTA file containing one sequence per +amplicon. After filtering (based on read length and quality) and trimming, +reads are aligned to the reference using +[minimap2](https://github.com/lh3/minimap2). Variants are then called with +[Medaka](https://github.com/nanoporetech/medaka). Results include an interactive +HTML report and VCF files containing the called variants.
+ +As mentioned above, the reference FASTA file needs to contain one sequence per +amplicon for now. An option to provide a whole-genome reference file and pairs +of primers might be added in the future if requested by users. + + + ## Quickstart -The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such nextflow will need to be installed before attempting -to run the workflow. +The workflow relies on the following dependencies: -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) or -[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either Docker or Singularity is installed. +- [Nextflow](https://www.nextflow.io/) for managing compute and software + resources. +- Either [Docker](https://www.docker.com/products/docker-desktop) or + [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide + isolation of the required software. -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit out website](https://labs.epi2me.io/wfindex). +It is not necessary to clone or download the git repository in order to run the +workflow. For more information on running EPI2ME Labs workflows, [visit our +website](https://labs.epi2me.io/wfindex). **Workflow options** -To obtain the workflow, having installed `nextflow`, users can run: +If you have [Nextflow](https://www.nextflow.io/) installed, you can run the +following to obtain the workflow: ``` -nextflow run epi2me-labs/wf-template --help +nextflow run epi2me-labs/wf-amplicon --help +``` + +This will show the workflow's command line options. + +### Usage + +Basic usage is as follows + +```bash +nextflow run epi2me-labs/wf-amplicon \ + --fastq $input \ + --reference references.fa \ + --threads 4 ``` -to see the options for the workflow. +`$input` can be a single FASTQ file, a directory containing FASTQ files, or a +directory containing barcoded sub-directories which in turn contain FASTQ files. +A sample sheet can be included with `--sample_sheet` and a sample name for an +individual sample with `--sample`. + +Relevant options for filtering of raw reads are + +- `--min_read_length` +- `--max_read_length` +- `--min_read_qual` + +After filtering and trimming with +[Porechop](https://github.com/rrwick/Porechop), reads can optionally be +downsampled. You can control the number of reads to keep per sample with +`--reads_downsampling_size`. + +Haploid variants are then called with +[Medaka](https://github.com/nanoporetech/medaka). You can set the minimum +coverage a variant needs to exceed in order to be included in the results with +`--min_coverage`.
+The workflow selects the appropriate +[Medaka](https://github.com/nanoporetech/medaka) model based on the basecaller +configuration that was used to process the signal data. You can use the +parameter `--basecaller_cfg` to provide this information (e.g. +`dna_r10.4.1_e8.2_400bps_hac`) to the workflow. Alternatively, you can choose +the [Medaka](https://github.com/nanoporetech/medaka) model directly with +`--medaka_model`. + +If you want to use +[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) instead of +[Docker](https://www.docker.com/products/docker-desktop), add `-profile +singularity`. + +### Key outputs + +- Interactive HTML report detailing the results. +- VCF files with variants called by + [Medaka](https://github.com/nanoporetech/medaka). + + -**Workflow outputs** -The primary outputs of the workflow include: -* a simple text file providing a summary of sequencing reads, -* an HTML report document detailing the primary findings of the workflow. ## Useful links -* [nextflow](https://www.nextflow.io/) -* [docker](https://www.docker.com/products/docker-desktop) -* [singularity](https://docs.sylabs.io/guides/latest/user-guide/) +- [Nextflow](https://www.nextflow.io/) +- [Docker](https://www.docker.com/products/docker-desktop) +- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) +- [minimap2](https://github.com/lh3/minimap2) +- [Medaka](https://github.com/nanoporetech/medaka) +- [Porechop](https://github.com/rrwick/Porechop) diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py index 0f7d17b..2bd1779 100755 --- a/bin/workflow_glue/check_sample_sheet.py +++ b/bin/workflow_glue/check_sample_sheet.py @@ -11,7 +11,10 @@ def main(args): barcodes = [] aliases = [] - types = [] + sample_types = [] + allowed_sample_types = [ + "test_sample", "positive_control", "negative_control", "no_template_control" + ] try: with open(args.sample_sheet, "r") as f: @@ -31,45 +34,48 @@ def main(args): barcodes.append(row["barcode"]) except KeyError: sys.stdout.write("'barcode' column missing") - exit() + sys.exit() try: aliases.append(row["alias"]) except KeyError: sys.stdout.write("'alias' column missing") - exit() + sys.exit() try: - types.append(row["type"]) + sample_types.append(row["type"]) except KeyError: pass except Exception as e: sys.stdout.write(f"Parsing error: {e}") - exit() + sys.exit() # check barcode and alias values are unique if len(barcodes) > len(set(barcodes)): sys.stdout.write("values in 'barcode' column not unique") - exit() + sys.exit() if len(aliases) > len(set(aliases)): sys.stdout.write("values in 'alias' column not unique") - exit() + sys.exit() - if types: + if sample_types: # check if "type" column has unexpected values - unexp_type_vals = set(types) - set( - [ - "test_sample", - "positive_control", - "negative_control", - "no_template_control", - ] - ) + unexp_type_vals = set(sample_types) - set(allowed_sample_types) + if unexp_type_vals: sys.stdout.write( f"found unexpected values in 'type' column: {unexp_type_vals}. " - "allowed values are: `['test_sample', 'positive_control', " - "'negative_control', 'no_template_control']`" + f"Allowed values are: {allowed_sample_types}" ) - exit() + sys.exit() + + if args.required_sample_types: + for required_type in args.required_sample_types: + if required_type not in allowed_sample_types: + sys.stdout.write(f"Not an allowed sample type: {required_type}") + sys.exit() + if sample_types.count(required_type) < 1: + sys.stdout.write( + f"Sample sheet requires at least 1 of {required_type}") + sys.exit() logger.info(f"Checked sample sheet {args.sample_sheet}.") @@ -78,4 +84,10 @@ def argparser(): """Argument parser for entrypoint.""" parser = wf_parser("check_sample_sheet") parser.add_argument("sample_sheet", help="Sample sheet to check") + parser.add_argument( + "--required_sample_types", + help="List of required sample types. Each sample type provided must " + "appear at least once in the sample sheet", + nargs="*" + ) return parser diff --git a/bin/workflow_glue/report.py b/bin/workflow_glue/report.py index e5e1f6a..d0e3af1 100755 --- a/bin/workflow_glue/report.py +++ b/bin/workflow_glue/report.py @@ -1,65 +1,390 @@ """Create workflow report.""" -import json +from pathlib import Path -from aplanat.components import fastcat -from aplanat.components import simple as scomponents -from aplanat.report import WFReport -import pandas - -from .util import get_named_logger, wf_parser # noqa: ABS101 +import dominate.tags as html_tags +import dominate.util as dom_util +import ezcharts as ezc +from ezcharts.components import fastcat +from ezcharts.components.ezchart import EZChart +from ezcharts.components.reports import labs +from ezcharts.layout.snippets import Tabs +from ezcharts.layout.snippets.stats import Stats +from ezcharts.layout.snippets.table import DataTable +import pandas as pd +import pysam +from . import report_util as util # noqa: ABS101 -def main(args): - """Run the entry point.""" - logger = get_named_logger("Report") - report = WFReport( - "Workflow Template Sequencing report", "wf-template", - revision=args.revision, commit=args.commit) - - with open(args.metadata) as metadata: - sample_details = [ - { - 'sample': d['alias'], - 'type': d['type'], - 'barcode': d['barcode'] - } for d in json.load(metadata) - ] - - if args.stats: - report.add_section(section=fastcat.full_report(args.stats)) - - section = report.add_section() - section.markdown('## Samples') - section.table(pandas.DataFrame(sample_details)) +from .util import get_named_logger, wf_parser # noqa: ABS101 - report.add_section( - section=scomponents.version_table(args.versions)) - report.add_section( - section=scomponents.params_table(args.params)) +# number of points in the depth-across-amplicon line plots +N_DEPTH_WINDOWS = 100 - # write report - report.write(args.report) - logger.info(f"Report written to {args.report}.") +# set order to sort samples by type (as defined in the sample sheet) +SAMPLE_TYPE_ORDER = { + "test_sample": 0, + "positive_control": 1, + "negative_control": 2, + "no_template_control": 3, +} def argparser(): """Argument parser for entrypoint.""" parser = wf_parser("report") - parser.add_argument("report", help="Report output file") - parser.add_argument("--stats", nargs='*', help="Fastcat per-read stats file(s).") parser.add_argument( - "--metadata", default='metadata.json', - help="sample metadata") + "--report-fname", + required=True, + help="Name of report HTML file.", + ) parser.add_argument( - "--versions", required=True, - help="directory containing CSVs containing name,version.") + # all relevant files will be in sub-directories (one per sample) in this + # directory + "--data", + required=True, + type=Path, + help=( + "Directory with one sub-directory per sample containing files " + "required for the report." + ), + ) parser.add_argument( - "--params", default=None, required=True, - help="A JSON file containing the workflow parameter key/values") + "--reference", + type=Path, + help="FASTA file with reference sequences for the individual amplicon.", + ) parser.add_argument( - "--revision", default='unknown', - help="git branch/tag of the executed workflow") + "--meta-json", + type=Path, + help="JSON file containing the meta data for all samples.", + ) parser.add_argument( - "--commit", default='unknown', - help="git commit of the executed workflow") + "--versions", + help="CSV with program versions.", + required=True, + ) + parser.add_argument( + "--params", + required=True, + help="JSON file containing the workflow parameters.", + ) + parser.add_argument( + "--revision", + default="unknown", + help="git branch/tag of the executed workflow", + ) + parser.add_argument( + "--commit", + default="unknown", + help="git commit of the executed workflow", + ) return parser + + +def main(args): + """Run the entry point.""" + logger = get_named_logger("Report") + + # read the meta data for all files + meta_df = ( + pd.read_json(args.meta_json) + .set_index("alias") + .rename(columns=lambda col: col.capitalize()) + .fillna("-") + ) + + report = labs.LabsReport( + "Workflow Amplicon Sequencing report", "wf-amplicon", args.params, args.versions + ) + + # read data for report (sort by sample type and alias) + datasets = sorted( + [util.ReportDataSet(d) for d in args.data.glob("*")], + key=lambda x: ( + SAMPLE_TYPE_ORDER[meta_df.loc[x.sample_alias, "Type"]], + x.sample_alias, + ), + ) + samples = [d.sample_alias for d in datasets] + # check if any samples are missing any of the required inputs and filter them out + # (we will mention them in the report below); this could only happen if there were + # no reads for that particular sample + bad_datasets = [d for d in datasets if not d.all_inputs_valid] + datasets = [d for d in datasets if d.all_inputs_valid] + ref_seqs = [] + with pysam.FastxFile(args.reference) as ref_file: + for entry in ref_file: + ref_seqs.append(entry.name) + + # intro and "at a glance" stats + with report.add_section("Introduction", "Intro."): + html_tags.p( + """ + This report contains tables and plots to help interpret the results of the + wf-amplicon workflow. The individual sections of the report summarize the + outcomes of the different steps of the workflow (read filtering, mapping + against the reference file containing the amplicon sequences, variant + calling). + """ + ) + html_tags.p("The input data contained:") + # brief summary of how many samples / refs there were + for name, items in zip(["sample", "amplicon"], [samples, ref_seqs]): + if len(items) > 1: + name += "s" + items_str = ", ".join(items[:7]) + (", ..." if len(items) > 7 else "") + dom_util.raw( + f""" + {len(items)} {name}:
+ {items_str}

""" + ) + # mention if there were any datasets with invalid input files + if bad_datasets: + dom_util.raw( + f""" + No valid results were found for the following sample(s) (and these + will be omitted from the rest of the report):
+ {", ".join([d.sample_alias for d in bad_datasets])}

+ """ + ) + html_tags.h5("At a glance") + html_tags.p( + """ + Key results for the individual samples are shown below. You can use the + dropdown menu to select individual samples. + """ + ) + # "at a glance" stats in cards (with one dropdown tab per sample) + tabs = Tabs() + with tabs.add_dropdown_menu(): + for d in datasets: + basic_summary = d.get_basic_summary_stats() + with tabs.add_dropdown_tab(d.sample_alias): + Stats( + columns=3, + items=[ + (f'{int(basic_summary["reads"]):,d}', "Reads"), + (f'{int(basic_summary["bases"]):,d}', "Bases"), + (basic_summary["mean_length"].round(1), "Mean length"), + (basic_summary["mean_quality"].round(1), "Mean quality"), + ( + f'{basic_summary["amplicons"]:g} / {len(ref_seqs)}', + "Amplicons detected", + ), + ( + basic_summary["overall_mean_depth"].round(1), + "Mean coverage across all amplicons", + ), + ( + basic_summary["min_mean_depth"].round(1), + "Smallest mean coverage for any amplicon", + ), + (f'{int(basic_summary["snps"])}', "SNVs"), + (f'{int(basic_summary["indels"])}', "Indels"), + ], + ) + + # raw reads stats + with report.add_section("Preprocessing", "Preproc."): + # `fastcat` omits the reads removed during filtering from the per-read stats + # file, but includes them in the per-file summary. We'll get some basic summary + # stats (number of reads, number of bases, mean qual, etc.) for both sets of + # files and put them into a table + basic_summary_stats = pd.DataFrame( + index=["Raw", "Filtered", "Downsampled, trimmed"], + columns=[ + "reads", + "bases", + "min_read_length", + "max_read_length", + "mean_quality", + ], + dtype=float, + ) + fastcat_per_file_stats = pd.concat((d.fastcat_per_file_stats for d in datasets)) + basic_summary_stats.loc["Raw"] = [ + (total_reads := fastcat_per_file_stats["n_seqs"].sum()), + fastcat_per_file_stats["n_bases"].sum(), + fastcat_per_file_stats["min_length"].min(), + fastcat_per_file_stats["max_length"].max(), + fastcat_per_file_stats.eval("n_seqs * mean_quality").sum() / total_reads, + ] + # now get the same stats for after filtering + fastcat_per_read_stats = pd.concat((d.fastcat_per_read_stats for d in datasets)) + basic_summary_stats.loc["Filtered"] = [ + fastcat_per_read_stats.shape[0], + fastcat_per_read_stats["read_length"].sum(), + fastcat_per_read_stats["read_length"].min(), + fastcat_per_read_stats["read_length"].max(), + fastcat_per_read_stats["mean_quality"].mean(), + ] + # and for after trimming + post_trim_per_file_stats = pd.concat( + (d.post_trim_per_file_stats for d in datasets) + ) + basic_summary_stats.loc["Downsampled, trimmed"] = [ + (total_reads := post_trim_per_file_stats["n_seqs"].sum()), + post_trim_per_file_stats["n_bases"].sum(), + post_trim_per_file_stats["min_length"].min(), + post_trim_per_file_stats["max_length"].max(), + post_trim_per_file_stats.eval("n_seqs * mean_quality").sum() / total_reads, + ] + # some formatting + for col in ("reads", "bases"): + basic_summary_stats[col] = ( + basic_summary_stats[col].astype(int).apply(util.si_format) + ) + for col in ("min_read_length", "max_read_length"): + basic_summary_stats[col] = ( + basic_summary_stats[col].astype(int).apply(lambda x: f"{x:,d}") + ) + basic_summary_stats["mean_quality"] = basic_summary_stats["mean_quality"].round( + 1 + ) + basic_summary_stats.index.name = "Condition" + basic_summary_stats.rename( + columns=lambda col: col.capitalize().replace("_", " "), inplace=True + ) + # show the stats in a DataTable + html_tags.p( + """ + Some basic stats covering the raw reads and the reads remaining after the + initial filtering step (based on length and mean quality) as well as after + downsampling and trimming are illustrated in the table below. + """ + ) + DataTable.from_pandas(basic_summary_stats) + + # `SeqSummary` plots + html_tags.p( + """ + The following plots show the read quality and length distributions as well + as the base yield after filtering (but before downsampling / trimming) for + each sample (use the dropdown menu to view the plots for the + individual samples). + """ + ) + fastcat.SeqSummary( + pd.concat((data.fastcat_per_read_stats for data in datasets)) + ) + + # summarize bamstats results of all samples for the following report sections + bamstats_summary = util.summarize_bamstats(datasets) + + # create summary table with one row per sample + per_sample_summary_table = util.format_per_sample_summary_table( + bamstats_summary, datasets + ) + # add sample meta data + per_sample_summary_table = pd.concat((meta_df, per_sample_summary_table), axis=1) + per_sample_summary_table.index.name = "Sample alias" + + # summary table with one row per amplicon + per_amplicon_summary_table = util.format_per_amplicon_summary_table( + bamstats_summary, datasets + ) + + # section for alignment stats and detected variants summary + with report.add_section("Summary", "Summary"): + html_tags.p( + """ + The two tables below (one per tab) briefly summarize the main results of + mapping the reads to the provided amplicon references and subsequent variant + calling. + """ + ) + tabs = Tabs() + with tabs.add_tab("Per-sample summary"): + DataTable.from_pandas(per_sample_summary_table) + with tabs.add_tab("Per-amplicon summary"): + DataTable.from_pandas(per_amplicon_summary_table) + + html_tags.p( + dom_util.raw( + """ + The following table breaks the results down further (one row per + sample–amplicon combination). + """ + ) + ) + with dom_util.container() as c: + summary_table = DataTable.from_pandas( + util.format_stats_table(bamstats_summary.fillna(0)).reset_index(), + use_index=False, + ) + c.clear() + dom_util.raw(str(summary_table)) + + # depth coverage plots + with report.add_section("Depth of coverage", "Depth"): + html_tags.p( + """ + Coverage along the individual amplicon, (use the dropdown menu to view the + plots for the individual amplicons). + """ + ) + tabs = Tabs() + with tabs.add_dropdown_menu(): + # get a table in long format with the amplicon ID, centre position of the + # depth windows, depth values, and sample alias + per_amplicon_depths = pd.concat( + [ + d.depth.assign( + pos=lambda df: df[["start", "end"]].mean(axis=1), + sample=d.sample_alias, + )[["ref", "pos", "depth", "sample"]] + for d in datasets + ] + ) + for amplicon, depth_df in per_amplicon_depths.groupby("ref"): + with tabs.add_dropdown_tab(amplicon): + plt = ezc.lineplot( + data=depth_df.round(2), + x="pos", + y="depth", + hue="sample", + ) + plt.title = {"text": "Coverage along amplicon"} + plt.xAxis.max = depth_df["pos"].max() + plt.xAxis.name = "Position along amplicon" + plt.yAxis.name = "Sequencing depth" + for s in plt.series: + # only show the line and no circles + s.showSymbol = False + EZChart(plt, "epi2melabs") + + # variant tables + with report.add_section("Variants", "Variants"): + html_tags.p( + """ + Haploid variant calling was performed with Medaka. + """ + ) + # combine all variants into a single dataframe with `[sample, amplicon]` as + # multi-level index + comb_variants = ( + ( + pd.concat( + ( + d.variants.reset_index().assign(sample=d.sample_alias) + for d in datasets + ) + ) + .rename( + columns={ + "amp": "amplicon", + "pos": "position", + "ref": "ref. allele", + "alt": "alt. allele", + "AB": "Allelic balance", + } + ) + .rename(columns=lambda x: x.capitalize()) + ) + .sort_values(["Sample", "Amplicon", "Position"]) + .set_index(["Sample", "Amplicon"]) + ) + DataTable.from_pandas(comb_variants.reset_index(), use_index=False) + + report.write(args.report_fname) + logger.info(f"Report written to '{args.report_fname}'.") diff --git a/bin/workflow_glue/report_util.py b/bin/workflow_glue/report_util.py new file mode 100755 index 0000000..4536524 --- /dev/null +++ b/bin/workflow_glue/report_util.py @@ -0,0 +1,411 @@ +"""Common functions and baseclass for holding report data.""" +import dominate.tags as html_tags +from ezcharts.layout.snippets import Progress +import pandas as pd +import pysam +import si_prefix + + +class ReportDataSet: + """Holds the data required for the report for an individual sample.""" + + def __init__(self, data_dir): + """Parse data for sample found in `data_dir`.""" + self.data_dir = data_dir + self.sample_alias = data_dir.name + # the below will be set to `False` when one or more of the input files are empty + # (i.e. they only contain the CSV header) + self.all_inputs_valid = True + + # read the fastcat per-file stats (these include all reads pre-filtering) + self.fastcat_per_file_stats = self.read_csv( + data_dir / "per-file-stats.tsv", sep="\t" + ) + # read the fastcat per-read stats (these include all reads present after + # filtering and before downsampling or trimming) + self.fastcat_per_read_stats = self.read_csv( + data_dir / "per-read-stats.tsv", sep="\t", index_col="read_id" + ) + # read the post-trimming fastcat per-file stats + self.post_trim_per_file_stats = self.read_csv( + data_dir / "porechopped-per-file-stats.tsv", sep="\t" + ) + + # read bamstats files + self.bamstats = self.read_csv(data_dir / "bamstats.tsv", sep="\t", index_col=0) + self.bamstats_flagstat = self.read_csv( + data_dir / "bamstats-flagstat.tsv", sep="\t", index_col=0 + ) + # get the list of amplicons with at least one primary alignment + self.detected_amplicons = self.bamstats_flagstat.query("primary > 0").index + + # read the depth data + self.depth = self.read_csv(data_dir / "per-window-depth.tsv.gz", sep="\t") + + # read the variants from VCF + self.variants = pd.DataFrame( + # 'AB' (allelic balance) is the fraction of reads supporting the ALT allele + columns=["amp", "pos", "ref", "alt", "type", "AB"] + ).set_index(["amp", "pos"]) + # check that the VCF only contains a single sample + vcf_file = pysam.VariantFile(vcf_path := data_dir / "medaka.annotated.vcf") + if len(vcf_file.header.samples) != 1: + raise ValueError(f"Found multiple samples in {vcf_path}.") + # process variants + for entry in vcf_file.fetch(): + # we should only have one alt allele + if len(entry.alts) != 1: + raise ValueError( + f"Found entry with multiple ALT alleles in {vcf_path}." + ) + self.variants.loc[(entry.chrom, entry.pos), :] = [ + entry.ref, + entry.alts[0], + # the field `entry.alleles_variant_types` contains a tuple `('REF', + # alt-type)` + entry.alleles_variant_types[1], + # divide the number of spanning reads supporting the ALT allele by the + # number of all reads (`SR` contains a tuple with `(ref fwd, ref rev, + # alt1 fwd, alt1 rev`) + round( + (entry.info["SR"][2] + entry.info["SR"][3]) + * 100 + / entry.info["DP"], + 1, + ), + ] + + def __repr__(self): + """Return human-readable string (simply the sample alias).""" + return f"ReportDataSet('{self.sample_alias}')" + + def read_csv(self, file, **kwargs): + """Read a CSV and set `self.all_inputs_valid` to `False` if it's empty. + + This is a wrapper around `pd.read_csv` that sets `self.all_inputs_valid = False` + if the CSV contains only a header line and no data. + """ + df = pd.read_csv(file, **kwargs) + if df.empty: + self.all_inputs_valid = False + return df + + def get_basic_summary_stats(self): + """Collect basic summary stats. + + This is used in the "at a glance" section at the beginning of the report. + """ + basic_summary = pd.Series( + 0, + index=[ + "reads", + "bases", + "mean_length", + "mean_quality", + "amplicons", + "overall_mean_depth", + "min_mean_depth", + "snps", + "indels", + ], + ) + basic_summary["reads"] = self.post_trim_per_file_stats["n_seqs"].sum() + basic_summary["bases"] = self.post_trim_per_file_stats["n_bases"].sum() + basic_summary["mean_quality"] = ( + self.post_trim_per_file_stats.eval("n_seqs * mean_quality").sum() + / basic_summary["reads"] + ) + basic_summary["mean_length"] = self.post_trim_per_file_stats.eval( + "n_bases / n_seqs" + ) + basic_summary["amplicons"] = len(self.detected_amplicons) + # now get the mean depths. we got different window sizes for the different + # amplicons. thus, we'll + # 1. calculate the "summed depths" first (i.e. the sum of window size * mean + # depth for each window) for each detected amplicons + # 2. get the ref lengths and divide the "summed depths" by them + grpd = self.depth.query("ref.isin(@self.detected_amplicons)").groupby("ref") + summed_depths = grpd.apply(lambda df: df.eval("(end - start) * depth").sum()) + ref_lengths = grpd.last()["end"] + mean_depths = summed_depths / ref_lengths + basic_summary["overall_mean_depth"] = summed_depths.sum() / ref_lengths.sum() + basic_summary["min_mean_depth"] = mean_depths.min() + basic_summary["snps"] = self.variants.query("type == 'SNP'").shape[0] + basic_summary["indels"] = self.variants.query("type == 'INDEL'").shape[0] + return basic_summary + + +def custom_progress_bar(value, display_val=None): + """Make a bootstrap progress bar with different styling to be used in tables. + + One thing to keep in mind is that the constructor for `Progress` uses `dominate` + context managers and therefore the progress bar will be added to the enclosing + context regardless of whether this desired or not. + """ + p = Progress( + value_min=0, + value_max=100, + value_now=round(value, 1), + display_val=display_val, + height=30, + ) + # set some style attributes to get a bordered light-grey bar with black font + p.children[0].children[0].children[0]["style"] += ";color:black" + p.children[0].children[0]["style"] += ";background-color:rgb(210, 210, 210)" + p.children[0]["style"] += ";width:120px;border:1px solid black" + return p + + +def summarize_bamstats(datasets): + """Summarize the bamstats data from all the samples. + + The resulting DataFrame contains summed / averaged values for each sample--amplicon + combination. + """ + samples = sorted([d.sample_alias for d in datasets]) + amplicons = sorted( + set([amp for d in datasets for amp in d.bamstats_flagstat.index]) - set("*") + ) + + summary_stats = pd.DataFrame( + 0, + index=pd.MultiIndex.from_product( + [samples, amplicons], names=["sample", "amplicon"] + ), + columns=[ + "reads", + "bases", + "median_read_length", + "mean_cov", + "mean_acc", + "variants", + "indels", + ], + ) + + for d in datasets: + for amplicon, df in d.bamstats.groupby("ref"): + n_reads = df.shape[0] + n_bases = df["read_length"].sum() + med_read_length = df["read_length"].median() + mean_cov = df["ref_coverage"].mean() + mean_acc = df["acc"].mean() + try: + variants = d.variants.loc[amplicon].shape[0] + indels = d.variants.loc[amplicon].query('type == "INDEL"').shape[0] + except KeyError: + variants = indels = 0 + summary_stats.loc[(d.sample_alias, amplicon), :] = [ + n_reads, + n_bases, + med_read_length, + mean_cov, + mean_acc, + variants, + indels, + ] + return summary_stats + + +def si_format(n): + """Use `si-prefix`, but don't add a decimal when `n` is smaller than 1000. + + `si_prefix.si_format()` always adds the decimals specified by `precision`, even when + calling it on a small integer. This wrapper prevents that. + """ + return si_prefix.si_format(n, precision=0 if n < 1000 else 1) + + +def format_stats_table(df): + """Pretty-format a table with bamstats data. + + Individual columns will be formatted differently depending on their column name + (e.g. values in columns named `reads` will be replaced by the HTML code for a + progress bar showing the number of reads as well as the proportion of the total). + The resulting DataFrame is expected to be passed to `DataTable.from_pandas()` to be + added to the report. + + Note that progress bars will implicitly added to the enclosing `dominate` context, + which is something that needs to be taken care of. + """ + total_reads = df["reads"].sum() + total_bases = df["bases"].sum() + + # used later for making sure the table columns containing HTML code for progress + # bars are sortable + num_leading_zeros = len(str(df.shape[0])) + 1 + + # initialise resulting DataFrame + df_for_table = df.copy() + + # check if columns with a certain type of data are present (e.g. read length, mean + # coverage, etc.); formatting of the individual columns depends on column name (e.g. + # for 'reads' there will be a progress bar and the column 'median_read_length' will + # be cast to `int`) + if "reads" in df_for_table.columns: + # progress bars for number of reads column + for n, (idx, n_reads) in enumerate( + df["reads"].astype(int).sort_values().items() + ): + # get percentage of total number of reads + perc = 100 * n_reads / total_reads if total_reads != 0 else 0 + # some number formatting + display_val = f"{si_format(n_reads)} ({perc:.0f}%)" + # add the progress bar to the table cell + df_for_table.loc[idx, "reads"] = str( + # We want the column to be sortable by the actual values even though it + # contains the HTML progress bar elements. A hacky way to achieve this + # is by prepending a hidden span containing the zero-filled index from + # the `enumerate()` above before the progress bar to maintain order. The + # table will be sorted based on this number and ignore whatever is in + # the progress bar. + html_tags.span(str(n).zfill(num_leading_zeros), hidden=True) + ) + str(custom_progress_bar(perc, display_val)) + if "bases" in df_for_table.columns: + # progress bars for number of bases column + for n, (idx, n_bases) in enumerate(df["bases"].sort_values().items()): + perc = 100 * n_bases / total_bases if total_bases != 0 else 0 + display_val = f"{si_format(n_bases)} ({perc:.0f}%)" + df_for_table.loc[idx, "bases"] = str( + # same trick to keep the column sortable as above + html_tags.span(str(n).zfill(num_leading_zeros), hidden=True) + ) + str(custom_progress_bar(perc, display_val)) + if "variants" in df_for_table.columns: + # make sure the input DataFrame got a column with indels as well + if "indels" not in df.columns: + raise ValueError( + "`format_stats_table()` was asked to format a 'variants' column, " + "but there was no accompanying 'indels' column." + ) + # variants summary column (simply showing total number of variants and indels) + df_for_table["variants"] = df.apply( + lambda row: f"{int(row['variants'])} ({int(row['indels'])})", axis=1 + ) + # we don't need the `indels` column + df_for_table.drop(columns="indels", inplace=True) + + # round mean coverage and alignment accuracy + for col in ("mean_cov", "mean_acc"): + if col in df_for_table.columns: + df_for_table[col] = df[col].round(1) + + # median read length, number of amplicons, and number of samples should be ints + for col in ("median_read_length", "amplicons", "samples"): + if col in df_for_table.columns: + df_for_table[col] = df[col].astype(int) + + if "unmapped" in df_for_table.columns: + # progress bars for number of unmapped reads column; same as for "reads" and + # "bases" above + for idx, (sample, (n_reads, n_unmapped)) in enumerate( + df[["reads", "unmapped"]].astype(int).sort_values("unmapped").iterrows() + ): + perc = 100 * n_unmapped / n_reads + display_val = f"{si_format(n_unmapped)} ({perc:.0f}%)" + df_for_table.loc[sample, "unmapped"] = str( + html_tags.span(str(idx).zfill(num_leading_zeros), hidden=True) + ) + str(custom_progress_bar(perc, display_val)) + + # Format column names, replace '*' in the index with 'Unmapped', and return + if df_for_table.index.name is not None: + df_for_table.index.name = df_for_table.index.name.capitalize() + return ( + df_for_table.rename(index={"*": "Unmapped"}) + .rename_axis(index=str.capitalize) + .rename( + columns=lambda col: col.replace("_", " ") + .replace("variants", "variants (indels)") + .replace("mean_acc", "mean acc.") + .replace("mean_cov", "mean cov.") + .capitalize() + ) + ) + + +def format_per_sample_summary_table(summary_stats, datasets): + """Summarize and format bamstats results on a per-sample level. + + `summary_stats` contains summary stats for each sample--amplicon combination (i.e. + it's got a multi-index of `['sample', 'amplicon']`) + """ + # initialise results dataframe with zeros + per_sample_summary_stats = pd.DataFrame( + 0, + index=summary_stats.index.unique("sample"), + columns=[ + "reads", + "bases", + "median_read_length", + "amplicons", + "unmapped", + "variants", + "indels", + ], + ) + # group by sample and aggregate the stats + for sample, df in summary_stats.groupby("sample"): + per_sample_summary_stats.loc[sample, ["reads", "bases"]] = df[ + ["reads", "bases"] + ].sum() + # for median read length we have a look at the whole bamstats DataFrame + # belonging to that sample + (dataset,) = [x for x in datasets if x.sample_alias == sample] + per_sample_summary_stats.loc[sample, "median_read_length"] = dataset.bamstats[ + "read_length" + ].median() + per_sample_summary_stats.loc[sample, "amplicons"] = ( + df.query("reads > 0 and amplicon != '*'") + ).shape[0] + per_sample_summary_stats.loc[sample, "unmapped"] = ( + df.loc[(sample, "*"), "reads"] + if "*" in df.index.get_level_values("amplicon") + else 0 + ) + per_sample_summary_stats.loc[sample, ["variants", "indels"]] = df[ + ["variants", "indels"] + ].sum() + per_sample_summary_stats.fillna(0, inplace=True) + return format_stats_table(per_sample_summary_stats) + + +def format_per_amplicon_summary_table(summary_stats, datasets): + """Summarize and format bamstats results on a per-amplicon level. + + `summary_stats` contains summary stats for each sample--amplicon combination (i.e. + it's got a multi-index of `['sample', 'amplicon']`) + """ + per_amplicon_summary_stats = pd.DataFrame( + 0, + index=summary_stats.index.unique("amplicon"), + columns=[ + "reads", + "bases", + "median_read_length", + "samples", + "mean_cov", + "mean_acc", + ], + ) + # group by amplicon and aggregate the stats + for amplicon, df in summary_stats.groupby("amplicon"): + per_amplicon_summary_stats.loc[amplicon, ["reads", "bases"]] = df[ + ["reads", "bases"] + ].sum() + # for median read length we have to combine all reads belonging to this amplicon + # from the individual per-sample datasets + per_amplicon_summary_stats.loc[amplicon, "median_read_length"] = pd.concat( + (d.bamstats.query(f"ref == '{amplicon}'")["read_length"] for d in datasets) + ).median() + per_amplicon_summary_stats.loc[amplicon, "samples"] = (df["reads"] > 0).sum() + # multiply the mean cov + acc. per sample--amplicon combination by the number of + # reads for that combo and divide by total number of reads for this amplicon to + # get the overall per-amplicon average + per_amplicon_summary_stats.loc[amplicon, ["mean_cov", "mean_acc"]] = ( + df[["mean_cov", "mean_acc"]].multiply(df["reads"], axis="index").sum() + / df["reads"].sum() + ) + per_amplicon_summary_stats.loc[amplicon, ["variants", "indels"]] = df[ + ["variants", "indels"] + ].sum() + per_amplicon_summary_stats.fillna(0, inplace=True) + return format_stats_table(per_amplicon_summary_stats) diff --git a/bin/workflow_glue/resolve_medaka_model.py b/bin/workflow_glue/resolve_medaka_model.py new file mode 100755 index 0000000..d941c9c --- /dev/null +++ b/bin/workflow_glue/resolve_medaka_model.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +"""Map a basecalling model to a Medaka model. + +An unknown basecalling model or basecalling model without an appropriate +Medaka model will explode with a large stderr message and exit non-zero. +A happy basecalling model will print a Medaka model to stdout and exit 0. +""" + +# Delegating this to a Python script seems overkill but allows us to +# expand to more complex logic trivially in future. +# Plus I don't want to write this in Groovy right now. + + +import csv +import os +import sys +import textwrap + +from .util import wf_parser # noqa: ABS101 + + +def exit_obvious_error(header, error_msg, advice, args, width=80): + """Write an obvious looking error to stderr and quit.""" + line = ("-" * width) + '\n' + msg = ( + f"The input basecaller configuration '{args.basecaller_cfg}' does not " + "have a suitable Medaka model " + ) + sys.stderr.write(line) + sys.stderr.write(f"[CRITICAL ERROR] {header}\n\n") + sys.stderr.write(textwrap.fill(msg + error_msg, width)) + sys.stderr.write('\n\n') + sys.stderr.write(textwrap.fill(advice, width)) + sys.stderr.write('\n') + sys.stderr.write(line) + sys.exit(os.EX_DATAERR) + + +def main(args): + """Entry point.""" + with open(args.model_tsv) as tsv: + for row in csv.DictReader(tsv, delimiter='\t'): + if row["basecall_model_name"] == args.basecaller_cfg: + model_type = str(args.model_type) + model = row[model_type] + reason = row["medaka_nomodel_reason"] + if model == "-" or model == "": + # Basecalling model valid but no Medaka model + exit_obvious_error( + header="No appropriate Medaka model.", + error_msg=f"because {reason}.", + advice=( + "It is not possible to run Medaka" + "with this data.\n"), + args=args + ) + break # exit before here but this keeps my intention obvious + else: + # Good model found + sys.stdout.write(model) + break + else: + # No model found (loop not broken) + exit_obvious_error( + header="Unknown basecaller configuration.", + error_msg=( + "because the basecaller configuration has not been recognised." + ), + advice=( + "Check your --basecaller_cfg has been provided correctly. " + ), + ) + + +def argparser(): + """Argument parser for entry point.""" + parser = wf_parser("resolve_medaka_model") + parser.add_argument("model_tsv") + parser.add_argument("basecaller_cfg") + parser.add_argument("model_type") + return parser diff --git a/data/medaka_models.tsv b/data/medaka_models.tsv new file mode 100644 index 0000000..34dbb73 --- /dev/null +++ b/data/medaka_models.tsv @@ -0,0 +1,30 @@ +basecaller basecall_model_name medaka_consensus medaka_variant medaka_nomodel_reason +dorado dna_r10.4.1_e8.2_260bps_hac@v4.0.0 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_260bps_sup@v4.0.0 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_400bps_hac@v4.0.0 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_400bps_sup@v4.0.0 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_260bps_fast@v3.5.2 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_260bps_hac@v3.5.2 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_260bps_sup@v3.5.2 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_400bps_fast@v3.5.2 - - Model not supported by this workflow +dorado dna_r10.4.1_e8.2_400bps_hac@v3.5.2 r1041_e82_400bps_hac_g615 r1041_e82_400bps_hac_variant_g615 - +dorado dna_r10.4.1_e8.2_400bps_sup@v3.5.2 r1041_e82_400bps_sup_g615 r1041_e82_400bps_sup_variant_g615 - +dorado dna_r9.4.1_e8_fast@v3.4 r941_min_fast_g507 r941_min_fast_variant_g507 - +dorado dna_r9.4.1_e8_hac@v3.3 r941_min_hac_g507 r941_min_hac_variant_g507 - +dorado dna_r9.4.1_e8_sup@v3.3 r941_min_sup_g507 r941_min_hac_variant_g507 - +guppy dna_r10.4.1_e8.2_400bps_hac_prom r1041_e82_400bps_hac_g615 r1041_e82_400bps_hac_variant_g615 - +guppy dna_r9.4.1_450bps_hac_prom r941_prom_hac_g507 r941_prom_hac_variant_g507 - +guppy dna_r10.3_450bps_hac r103_hac_g507 r103_hac_variant_g507 - +guppy dna_r10.3_450bps_hac_prom r103_hac_g507 r103_hac_variant_g507 - +guppy dna_r10.4.1_e8.2_260bps_hac r1041_e82_260bps_hac_g615 r1041_e82_260bps_hac_variant_g615 - +guppy dna_r10.4.1_e8.2_260bps_hac_prom r1041_e82_260bps_hac_g615 r1041_e82_260bps_hac_variant_g615 - +guppy dna_r10.4.1_e8.2_400bps_hac r1041_e82_400bps_hac_g615 r1041_e82_400bps_hac_variant_g615 - +guppy dna_r10.4_e8.1_hac - - Model not supported by this workflow +guppy dna_r10.4_e8.1_hac_prom - - Model not supported by this workflow +guppy dna_r10_450bps_hac - - Model not supported by this workflow +guppy dna_r9.4.1_450bps_hac r941_min_hac_g507 r941_min_hac_variant_g507 - +guppy dna_r9.4.1_e8.1_hac r941_min_hac_g507 r941_min_hac_variant_g507 - +guppy dna_r9.4.1_e8.1_hac_prom r941_prom_hac_g507 r941_prom_hac_variant_g507 - +guppy dna_r9.5_450bps - - Model not supported by this workflow +guppy rna_r9.4.1_70bps_hac - - RNA data is not suitable for this workflow +guppy rna_r9.4.1_70bps_hac_prom - - RNA data is not suitable for this workflow diff --git a/docs/header.md b/docs/header.md index b7cb6c3..90d29c3 100644 --- a/docs/header.md +++ b/docs/header.md @@ -1,7 +1,3 @@ -# Workflow template +# wf-amplicon -This repository contains a [nextflow](https://www.nextflow.io/) workflow -template that can be used as the basis for creating new workflows. - -> This workflow is not intended to be used by end users. diff --git a/docs/intro.md b/docs/intro.md index d66cdab..44be5a3 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -1,6 +1,15 @@ ## Introduction -This section of documentation typically contains an overview of the workflow in terms of motivation -and bioinformatics methods, listing any key tools or algorithms employed, whilst also describing its -range of use-cases and what a suitable input dataset should look like. +This [Nextflow](https://www.nextflow.io/) workflow provides a simple way to +analyse Oxford Nanopore reads generated from amplicons.
+It requires the raw reads and a reference FASTA file containing one sequence per +amplicon. After filtering (based on read length and quality) and trimming, +reads are aligned to the reference using +[minimap2](https://github.com/lh3/minimap2). Variants are then called with +[Medaka](https://github.com/nanoporetech/medaka). Results include an interactive +HTML report and VCF files containing the called variants.
+ +As mentioned above, the reference FASTA file needs to contain one sequence per +amplicon for now. An option to provide a whole-genome reference file and pairs +of primers might be added in the future if requested by users. diff --git a/docs/links.md b/docs/links.md index 2722ae3..5182185 100644 --- a/docs/links.md +++ b/docs/links.md @@ -1,5 +1,9 @@ + ## Useful links -* [nextflow](https://www.nextflow.io/) -* [docker](https://www.docker.com/products/docker-desktop) -* [singularity](https://docs.sylabs.io/guides/latest/user-guide/) +- [Nextflow](https://www.nextflow.io/) +- [Docker](https://www.docker.com/products/docker-desktop) +- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) +- [minimap2](https://github.com/lh3/minimap2) +- [Medaka](https://github.com/nanoporetech/medaka) +- [Porechop](https://github.com/rrwick/Porechop) diff --git a/docs/quickstart.md b/docs/quickstart.md index 4cbfd40..0b5dec7 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -1,31 +1,74 @@ ## Quickstart -The workflow uses [nextflow](https://www.nextflow.io/) to manage compute and -software resources, as such nextflow will need to be installed before attempting -to run the workflow. +The workflow relies on the following dependencies: -The workflow can currently be run using either -[Docker](https://www.docker.com/products/docker-desktop) or -[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide isolation of -the required software. Both methods are automated out-of-the-box provided -either Docker or Singularity is installed. +- [Nextflow](https://www.nextflow.io/) for managing compute and software + resources. +- Either [Docker](https://www.docker.com/products/docker-desktop) or + [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide + isolation of the required software. -It is not required to clone or download the git repository in order to run the workflow. -For more information on running EPI2ME Labs workflows [visit out website](https://labs.epi2me.io/wfindex). +It is not necessary to clone or download the git repository in order to run the +workflow. For more information on running EPI2ME Labs workflows, [visit our +website](https://labs.epi2me.io/wfindex). **Workflow options** -To obtain the workflow, having installed `nextflow`, users can run: +If you have [Nextflow](https://www.nextflow.io/) installed, you can run the +following to obtain the workflow: ``` -nextflow run epi2me-labs/wf-template --help +nextflow run epi2me-labs/wf-amplicon --help ``` -to see the options for the workflow. +This will show the workflow's command line options. -**Workflow outputs** +### Usage -The primary outputs of the workflow include: +Basic usage is as follows -* a simple text file providing a summary of sequencing reads, -* an HTML report document detailing the primary findings of the workflow. +```bash +nextflow run epi2me-labs/wf-amplicon \ + --fastq $input \ + --reference references.fa \ + --threads 4 +``` + +`$input` can be a single FASTQ file, a directory containing FASTQ files, or a +directory containing barcoded sub-directories which in turn contain FASTQ files. +A sample sheet can be included with `--sample_sheet` and a sample name for an +individual sample with `--sample`. + +Relevant options for filtering of raw reads are + +- `--min_read_length` +- `--max_read_length` +- `--min_read_qual` + +After filtering and trimming with +[Porechop](https://github.com/rrwick/Porechop), reads can optionally be +downsampled. You can control the number of reads to keep per sample with +`--reads_downsampling_size`. + +Haploid variants are then called with +[Medaka](https://github.com/nanoporetech/medaka). You can set the minimum +coverage a variant needs to exceed in order to be included in the results with +`--min_coverage`.
+The workflow selects the appropriate +[Medaka](https://github.com/nanoporetech/medaka) model based on the basecaller +configuration that was used to process the signal data. You can use the +parameter `--basecaller_cfg` to provide this information (e.g. +`dna_r10.4.1_e8.2_400bps_hac`) to the workflow. Alternatively, you can choose +the [Medaka](https://github.com/nanoporetech/medaka) model directly with +`--medaka_model`. + +If you want to use +[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) instead of +[Docker](https://www.docker.com/products/docker-desktop), add `-profile +singularity`. + +### Key outputs + +- Interactive HTML report detailing the results. +- VCF files with variants called by + [Medaka](https://github.com/nanoporetech/medaka). diff --git a/lib/fastqingress.nf b/lib/fastqingress.nf index 5ec0d84..6fefbce 100644 --- a/lib/fastqingress.nf +++ b/lib/fastqingress.nf @@ -40,14 +40,22 @@ def fastq_ingress(Map arguments) // with fastq) ch_input = get_valid_inputs(margs) } + // `ch_input` might contain elements of `[metamap, null]` if there were entries in + // the sample sheet for which no FASTQ files were found. We put these into an extra + // channel and combine with the result channel before returning. + ch_input = ch_input.branch { meta, path -> + reads_found: path as boolean + no_reads_found: true + } + def ch_result if (margs.fastcat_stats) { // run fastcat regardless of input type - return fastcat(ch_input, margs["fastcat_extra_args"]) + ch_result = fastcat(ch_input.reads_found, margs["fastcat_extra_args"]) } else { // the fastcat stats were not requested --> run fastcat only on directories with // more than one FASTQ file (and not on single files or directories with a // single file) - ch_branched = ch_input.map {meta, path -> + def ch_branched = ch_input.reads_found.map {meta, path -> // find directories with only a single FASTQ file and "unwrap" the file if (path.isDirectory()) { List fq_files = get_fq_files_in_dir(path) @@ -58,20 +66,21 @@ def fastq_ingress(Map arguments) [meta, path] } .branch { meta, path -> // now there can only be two cases: - // (i) single FASTQ file (pass to `mv_or_pigz` later) + // (i) single FASTQ file (pass to `move_or_compress` later) // (ii) dir with multiple fastq files (pass to `fastcat` later) single_file: path.isFile() dir_with_fastq_files: true } // call the respective processes on both branches and return - return fastcat( + ch_result = fastcat( ch_branched.dir_with_fastq_files, margs["fastcat_extra_args"] ).concat( - ch_branched.single_file | mv_or_pigz | map { + ch_branched.single_file | move_or_compress | map { meta, path -> [meta, path, null] } ) } + return ch_result.concat(ch_input.no_reads_found.map { [*it, null] }) } @@ -97,8 +106,14 @@ def watch_path(Map margs) { if (input.isFile()) { error "Input ($input) must be a directory when using `watch_path`." } - // create channel with `watchPath` - ch_watched = Channel.watchPath("$input/**").until { it.name.startsWith('STOP') } + // get existing FASTQ files first (look for relevant files in the top-level dir and + // all sub-dirs) + def ch_existing_input = Channel.fromPath(input) + | concat(Channel.fromPath("$input/*", type: 'dir')) + | map { get_fq_files_in_dir(it) } + | flatten + // now get channel with files found by `watchPath` + def ch_watched = Channel.watchPath("$input/**").until { it.name.startsWith('STOP') } // only keep FASTQ files | filter { for (ext in EXTENSIONS) { @@ -106,36 +121,41 @@ def watch_path(Map margs) { } return false } - // throw error when finding files in top-level dir and sub-directories - | combine(Channel.of([:])) + // merge the channels + ch_watched = ch_existing_input | concat(ch_watched) + // check if input is as expected; start by throwing an error when finding files in + // top-level dir and sub-directories + String prev_input_type + ch_watched | map { - String old_state = it[1]["state"] - String new_state = (it[0].parent == input) ? "top-level" : "subdir" - if (old_state && (old_state != new_state)) { - error "`watchPath` found files in the top-level directory " + + String input_type = (it.parent == input) ? "top-level" : "sub-dir" + if (prev_input_type && (input_type != prev_input_type)) { + error "`watchPath` found FASTQ files in the top-level directory " + "as well as in sub-directories." } + // if file is in a sub-dir, make sure it's not a sub-sub-dir + if ((input_type == "sub-dir") && (it.parent.parent != input)) { + error "`watchPath` found a FASTQ file more than one level of " + + "sub-directories deep ('$it')." + } // we also don't want files in the top-level dir when we got a sample sheet - if ((new_state == "top-level") && margs["sample_sheet"]) { - error "`watchPath` found files in top-level directory even though there " + - "is a sample sheet." + if ((input_type == "top-level") && margs["sample_sheet"]) { + error "`watchPath` found files in top-level directory even though a " + + "sample sheet was provided ('${margs["sample_sheet"]}')." } - it[1]["state"] = new_state - [it[0], it[1]] + prev_input_type = input_type } - | map { it[0] } - if (margs.sample_sheet) { // add metadata from sample sheet (we can't use join here since it does not work // with repeated keys; we therefore need to transform the sample sheet data into // a map with the barcodes as keys) - def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet)) + def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet), margs.required_sample_types) | collect | map { it.collectEntries { [(it["barcode"]): it] } } // now we can use this channel to annotate all files with the corresponding info // from the sample sheet ch_watched = ch_watched - | combine ( ch_sample_sheet ) + | combine(ch_sample_sheet) | map { file_path, sample_sheet_map -> String barcode = file_path.parent.name Map meta = sample_sheet_map[barcode] @@ -166,8 +186,8 @@ def watch_path(Map margs) { } -process mv_or_pigz { - label "wfamplicon" +process move_or_compress { + label params.process_label cpus params.threads input: tuple val(meta), path(input) @@ -183,14 +203,14 @@ process mv_or_pigz { """ } else { """ - cat $input | pigz -p $task.cpus > $out + cat $input | bgzip -@ $task.cpus > $out """ } } process fastcat { - label "wfamplicon" + label params.process_label cpus params.threads input: tuple val(meta), path(input) @@ -203,12 +223,12 @@ process fastcat { """ mkdir $fastcat_stats_outdir fastcat \ - -s $meta.alias \ + -s ${meta["alias"]} \ -r $fastcat_stats_outdir/per-read-stats.tsv \ -f $fastcat_stats_outdir/per-file-stats.tsv \ $extra_args \ $input \ - | pigz -p $task.cpus > $out + | bgzip -@ $task.cpus > $out """ } @@ -227,6 +247,7 @@ Map parse_arguments(Map arguments) { "analyse_unclassified": false, "fastcat_stats": false, "fastcat_extra_args": "", + "required_sample_types": [], "watch_path": false], name: "fastq_ingress") return parser.parse_args(arguments) @@ -248,10 +269,9 @@ def get_valid_inputs(Map margs){ } catch (NoSuchFileException e) { error "Input path $margs.input does not exist." } - boolean dir_has_subdirs = false - boolean dir_has_fastq_files = false - // declare resulting input channel + // declare resulting input channel and other variables needed in the outer scope def ch_input + ArrayList sub_dirs_with_fastq_files // handle case of `input` being a single file if (input.isFile()) { // the `fastcat` process can deal with directories or single file inputs @@ -261,24 +281,26 @@ def get_valid_inputs(Map margs){ // input is a directory --> we accept two cases: (i) a top-level directory with // fastq files and no sub-directories or (ii) a directory with one layer of // sub-directories containing fastq files - dir_has_fastq_files = get_fq_files_in_dir(input) as boolean - // find potential subdirectories (this list can be empty) + boolean dir_has_fastq_files = get_fq_files_in_dir(input) + // find potential sub-directories (and sub-dirs with FASTQ files; note that + // these lists can be empty) ArrayList sub_dirs = file(input.resolve('*'), type: "dir") - dir_has_subdirs = sub_dirs as boolean - // deal with first case (top-lvl dir with fastq files and no sub-directories) - if (dir_has_fastq_files){ - if (dir_has_subdirs) { + sub_dirs_with_fastq_files = sub_dirs.findAll { get_fq_files_in_dir(it) } + // deal with first case (top-lvl dir with FASTQ files and no sub-directories + // containing FASTQ files) + if (dir_has_fastq_files) { + if (sub_dirs_with_fastq_files) { error "Input directory '$input' cannot contain FASTQ " + - "files and sub-directories." + "files and sub-directories with FASTQ files." } ch_input = Channel.of( [create_metamap([alias: margs["sample"] ?: input.baseName]), input]) } else { // deal with the second case (sub-directories with fastq data) --> first // check whether we actually found sub-directories - if (!dir_has_subdirs) { + if (!sub_dirs_with_fastq_files) { error "Input directory '$input' must contain either FASTQ files " + - "or sub-directories." + "or sub-directories containing FASTQ files." } // make sure that there are no sub-sub-directories with FASTQ files and that // the sub-directories actually contain fastq files) @@ -289,12 +311,6 @@ def get_valid_inputs(Map margs){ error "Input directory '$input' cannot contain more " + "than one level of sub-directories with FASTQ files." } - ArrayList sub_dirs_with_fastq_files = sub_dirs.findAll { - get_fq_files_in_dir(it) as boolean - } - if (!sub_dirs_with_fastq_files) { - error "No FASTQ files were found in the sub-directories of '$input'." - } // remove directories called 'unclassified' unless otherwise specified if (!margs.analyse_unclassified) { sub_dirs_with_fastq_files = sub_dirs_with_fastq_files.findAll { @@ -304,21 +320,26 @@ def get_valid_inputs(Map margs){ // filter based on sample sheet in case one was provided if (margs.sample_sheet) { // get channel of entries in the sample sheet - def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet)) - // get the intersection of both channels - def ch_intersection = Channel.fromPath(sub_dirs_with_fastq_files).map { + def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet), margs.required_sample_types) + // get the union of both channels (missing values will be replaced with + // `null`) + def ch_union = Channel.fromPath(sub_dirs_with_fastq_files).map { [it.baseName, it] - }.join(ch_sample_sheet.map{[it.barcode, it]}, remainder: false) - // TODO: we should let the user know if a sample present in the sample - // sheet didn't have a matching sub-directory. We could throw an error - // or print a warning, but ideally we would return an extra channel from - // `fastq_ingress` with a summary about what has been found (in terms of - // sample sheet and sub-dirs) --> we'll do this later - - // put metadata into map and return - ch_input = ch_intersection.map {barcode, path, sample_sheet_entry -> [ - create_metamap(sample_sheet_entry), path - ]} + }.join(ch_sample_sheet.map{[it.barcode, it]}, remainder: true) + // after joining the channels, there are three possible cases: + // (i) valid input path and sample sheet entry are both present + // (ii) there is a sample sheet entry but no corresponding input dir + // --> we'll emit `[metamap-from-sample-sheet-entry, null]` + // (iii) there is a valid path, but the sample sheet entry is missing + // --> drop this entry and print a warning to the log + ch_input = ch_union.map {barcode, path, sample_sheet_entry -> + if (sample_sheet_entry) { + [create_metamap(sample_sheet_entry), path] + } else { + log.warn "Input directory '$barcode' was found, but sample " + + "sheet '$margs.sample_sheet' has no such entry." + } + } } else { ch_input = Channel.fromPath(sub_dirs_with_fastq_files).map { [create_metamap([alias: it.baseName, barcode: it.baseName]), it] @@ -330,7 +351,7 @@ def get_valid_inputs(Map margs){ } // a sample sheet only makes sense in the case of a directory with // sub-directories - if (margs.sample_sheet && !dir_has_subdirs) { + if (margs.sample_sheet && !sub_dirs_with_fastq_files) { error "Sample sheet was provided, but input does not contain " + "sub-directories with FASTQ files." } @@ -376,7 +397,7 @@ ArrayList get_fq_files_in_dir(Path dir) { * @param sample_sheet: path to the sample sheet CSV * @return: channel of maps (with values in sample sheet header as keys) */ -def get_sample_sheet(Path sample_sheet) { +def get_sample_sheet(Path sample_sheet, ArrayList required_sample_types) { // If `validate_sample_sheet` does not return an error message, we can assume that // the sample sheet is valid and parse it. However, because of Nextflow's // asynchronous magic, we might emit values from `.splitCSV()` before the @@ -385,9 +406,9 @@ def get_sample_sheet(Path sample_sheet) { // in STDOUT. Thus, we use the somewhat clunky construct with `concat` and `last` // below. This lets the CSV channel only start to emit once the error checking is // done. - ch_err = validate_sample_sheet(sample_sheet).map { + ch_err = validate_sample_sheet(sample_sheet, required_sample_types).map { // check if there was an error message - if (it) error "Invalid sample sheet: $it" + if (it) error "Invalid sample sheet: ${it}." it } // concat the channel holding the path to the sample sheet to `ch_err` and call @@ -405,13 +426,19 @@ def get_sample_sheet(Path sample_sheet) { * message is emitted. * * @param: path to sample sheet CSV + * @param: list of required sample types (optional) * @return: string (optional) */ process validate_sample_sheet { - label "wfamplicon" - input: path csv + label params.process_label + input: + path csv + val required_sample_types output: stdout + script: + String req_types_arg = required_sample_types ? "--required_sample_types "+required_sample_types.join(" ") : "" """ - workflow-glue check_sample_sheet $csv + workflow-glue check_sample_sheet $csv $req_types_arg """ } + diff --git a/main.nf b/main.nf index 0ac177d..70239dc 100644 --- a/main.nf +++ b/main.nf @@ -3,24 +3,45 @@ import groovy.json.JsonBuilder nextflow.enable.dsl = 2 -include { fastq_ingress } from './lib/fastqingress' -include { clusterReads } from './subworkflows/clustering/vsearch' -include { draftAssembly } from './subworkflows/assembly/flye' +include { fastq_ingress } from "./lib/fastqingress" +include { pipeline as variantCallingPipeline } from "./modules/local/pipeline" + OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE") + process getVersions { label "wfamplicon" cpus 1 - output: - path "versions.txt" + output: path "versions.txt" script: """ - python -c "import pysam; print(f'pysam,{pysam.__version__}')" >> versions.txt + python --version | tr ' ' ',' | sed 's/P/p/' > versions.txt fastcat --version | sed 's/^/fastcat,/' >> versions.txt + python -c "import ezcharts; print(f'ezcharts,{ezcharts.__version__}')" >> versions.txt + python -c "import dominate; print(f'dominate,{dominate.__version__}')" >> versions.txt + python -c "import numpy; print(f'numpy,{numpy.__version__}')" >> versions.txt + python -c "import pandas; print(f'pandas,{pandas.__version__}')" >> versions.txt + python -c "import pysam; print(f'pysam,{pysam.__version__}')" >> versions.txt + python -c "import si_prefix; print(f'si-prefix,{si_prefix.__version__}')" >> versions.txt + seqkit version | tr ' ' ',' >> versions.txt + printf "porechop,%s\\n" \$(porechop --version) >> versions.txt + samtools --version | head -n1 | tr ' ' ',' >> versions.txt + printf "minimap2,%s\\n" \$(minimap2 --version) >> versions.txt """ } +process addMedakaToVersionsFile { + label "medaka" + cpus 1 + input: path "old_versions.txt" + output: path "versions.txt" + script: + """ + medaka --version | tr ' ' ',' >> old_versions.txt + mv old_versions.txt versions.txt + """ +} process getParams { label "wfamplicon" @@ -28,14 +49,79 @@ process getParams { output: path "params.json" script: - String paramsJSON = new JsonBuilder(params).toPrettyString() + String paramsJSON = new JsonBuilder(params).toPrettyString() """ # Output nextflow params object to JSON echo '$paramsJSON' > params.json """ } +process downsampleReads { + label "wfamplicon" + cpus Math.min(params.threads, 3) + input: + tuple val(meta), path("reads.fastq.gz") + val n_reads + output: tuple val(meta), path("downsampled.fastq.gz") + script: + int bgzip_threads = task.cpus == 1 ? 1 : task.cpus - 1 + """ + seqkit sample -2 reads.fastq.gz -n $n_reads \ + | bgzip -@ $bgzip_threads > downsampled.fastq.gz + """ +} + +process porechop { + label "wfamplicon" + cpus Math.min(params.threads, 5) + input: tuple val(meta), path("reads.fastq.gz") + output: + tuple val(meta), path("porechopped.fastq.gz"), emit: seqs + tuple val(meta), path("porechopped-per-file-stats.tsv"), emit: stats + script: + // run fastcat on porechopped reads so that we can include the post-trimming stats + // in the report + """ + fastcat <(porechop -i reads.fastq.gz -t $task.cpus --discard_middle) \ + -s $meta.alias \ + -f porechopped-per-file-stats.tsv \ + | bgzip > porechopped.fastq.gz + """ +} + +process collectFilesInDir { + label "wfamplicon" + cpus 1 + input: tuple val(meta), path("staging_dir/*"), val(dirname) + output: tuple val(meta), path(dirname) + script: + """ + mv staging_dir $dirname + """ +} +process makeReport { + label "wfamplicon" + cpus 1 + input: + path "data/*" + path "reference.fasta" + path "metadata.json" + path "versions.txt" + path "params.json" + output: + path "wf-amplicon-report.html" + script: + """ + workflow-glue report \ + --report-fname wf-amplicon-report.html \ + --data data \ + --reference reference.fasta \ + --meta-json metadata.json \ + --versions versions.txt \ + --params params.json + """ +} // See https://github.com/nextflow-io/nextflow/issues/1636. This is the only way to // publish files from a workflow whilst decoupling the publish from the process steps. @@ -44,6 +130,7 @@ process getParams { process output { // publish inputs to output directory label "wfamplicon" + cpus 1 publishDir ( params.out_dir, mode: "copy", @@ -57,31 +144,88 @@ process output { """ } + // workflow module workflow pipeline { take: - reads + ch_reads main: - software_versions = getVersions() - workflow_params = getParams() - - // the reads have already been filtered by `fastcat` --> cluster next - clustering = clusterReads( - reads.map {it[0..1]}, - params.min_cluster_size, + def software_versions = getVersions() | addMedakaToVersionsFile + def workflow_params = getParams() + + // get reference + Path ref = file(params.reference, checkIfExists: true) + + // put fastcat stats into results channel + def ch_per_sample_results = ch_reads + | map { meta, reads, stats_dir -> [meta, *file(stats_dir.resolve("*"))] } + + // remove fastcat stats from reads channel + ch_reads = ch_reads.map { meta, reads, stats_dir -> [meta, reads] } + + // downsample + if (params.reads_downsampling_size) { + ch_reads = downsampleReads(ch_reads, params.reads_downsampling_size) + } + + // trim adapters + ch_reads = porechop(ch_reads).seqs + + // add the post-trim stats to the results channel + ch_per_sample_results = ch_per_sample_results.join(porechop.out.stats) + + // run variant calling pipeline + variantCallingPipeline(ch_reads, ref) + + // add variant calling results to main results channel + ch_per_sample_results = ch_per_sample_results + | join(variantCallingPipeline.out.mapping_stats) + | join(variantCallingPipeline.out.depth) + | join(variantCallingPipeline.out.variants) + + // collect files for report in directories (one per sample) + def ch_results_for_report = ch_per_sample_results + | map { + meta = it[0] + rest = it[1..-1] + [meta, rest, meta.alias] + } + | collectFilesInDir + | map { meta, dirname -> dirname } + + // get all metadata and combine into single JSON file + def metadata_json = ch_reads + | map { meta, reads -> meta } + | collect + | map { new JsonBuilder(it).toPrettyString() } + | collectFile(name: "metadata.json", newLine: true) + + // create channel with files to publish; the channel will have the shape `[file, + // name of sub-dir to be published in]`. + def ch_to_publish = Channel.empty() + | mix( + software_versions | map { [it, null] }, + workflow_params | map { [it, null] }, + variantCallingPipeline.out.variants + | map { meta, vcf -> [vcf, "$meta.alias/variants"] } + ) + + makeReport( + ch_results_for_report | collect, + ref, + metadata_json, + software_versions, + workflow_params, ) - + ch_to_publish = ch_to_publish + | mix(makeReport.out | map { [it, null] } ) + emit: - workflow_params + combined_results_to_publish = ch_to_publish } -params.min_read_length = 300 -params.max_read_length = 3600 -params.min_read_qual = 8 -params.min_cluster_size = 0.2 - // entrypoint workflow WorkflowMain.initialise(workflow, params, log) workflow { @@ -100,19 +244,17 @@ workflow { "sample":params.sample, "sample_sheet":params.sample_sheet, "analyse_unclassified":params.analyse_unclassified, - "fastcat_stats": false, + "fastcat_stats": true, "fastcat_extra_args": fastcat_extra_args.join(" ")]) - - // looks like this is the most robust way to check if a `param` coming from the - // command line is a number - if ( - params.min_cluster_size instanceof String || - !params.min_cluster_size.toString().isNumber() - ) { - error "`--min_cluster_size` must be a float or integer." - } + // run workflow pipeline(samples) + + // publish results + pipeline.out.combined_results_to_publish + | toList + | flatMap + | output } if (params.disable_ping == false) { diff --git a/modules/local/pipeline.nf b/modules/local/pipeline.nf new file mode 100644 index 0000000..1d090eb --- /dev/null +++ b/modules/local/pipeline.nf @@ -0,0 +1,191 @@ +process alignReads { + label "wfamplicon" + cpus params.threads + input: + tuple val(meta), path("reads.fastq.gz") + path "reference.fasta" + output: + tuple val(meta), path("*.bam"), path("*.bai") + script: + """ + minimap2 -t $task.cpus -ax map-ont reference.fasta reads.fastq.gz \ + | samtools sort -@ $task.cpus -o aligned.sorted.bam - + + samtools index aligned.sorted.bam + """ +} + +process bamstats { + label "wfamplicon" + cpus Math.min(params.threads, 2) + input: tuple val(meta), path("input.bam"), path("input.bam.bai") + output: tuple val(meta), path("bamstats.tsv"), path("bamstats-flagstat.tsv") + script: + """ + bamstats -u input.bam -s $meta.alias -f bamstats-flagstat.tsv -t $task.cpus \ + > bamstats.tsv + """ +} + +process mosdepth { + label "wfamplicon" + cpus Math.min(params.threads, 3) + input: + tuple val(meta), path("input.bam"), path("input.bam.bai"), val(ref_id) + val n_windows + output: tuple val(meta), path("depth.regions.bed.gz") + script: + int mosdepth_extra_threads = task.cpus - 1 + """ + # get window length for reference + ref_length=\$(samtools idxstats input.bam | awk '\$1 == "$ref_id" {print \$2}') + window_length=\$(expr \$ref_length / $n_windows) + + # if the reference is very short, `window_length` might be `0` --> set to `1` + [ "\$window_length" -eq "0" ] && window_length=1 + + # get the depths (we could add `-x`, but it loses a lot of detail from the depth + # curves) + mosdepth -b \$window_length -n -c "$ref_id" depth input.bam + """ +} + +process concatMosdepthResultFiles { + label "wfamplicon" + cpus 1 + input: tuple val(meta), path("depth.*.bed.gz") + output: tuple val(meta), path("per-window-depth.tsv.gz") + script: + """ + # add a header line and concatenate the depth .bed files (note that gzipped data + # can be concatenated just like regular data) + cat <(printf "ref\\tstart\\tend\\tdepth\\n" | gzip) depth.*.bed.gz \ + > per-window-depth.tsv.gz + """ +} + +process sanitizeRefFile { + // some tools can't deal with `:` or `*` in ref FASTA ID lines --> replace them (and + // whitespace) with underscores + label "wfamplicon" + cpus 1 + input: path "reference.fasta" + output: path "reference_sanitized_seqIDs.fasta" + script: + """ + sed '/^>/s/:\\|\\*\\| /_/g' reference.fasta > reference_sanitized_seqIDs.fasta + """ +} + +process lookupMedakaVariantModel { + label "wfamplicon" + input: + path("lookup_table") + val basecall_model + output: + stdout + shell: + ''' + medaka_model=$(workflow-glue resolve_medaka_model \ + lookup_table '!{basecall_model}' "medaka_variant") + echo -n $medaka_model + ''' +} + +process medakaConsensus { + label "medaka" + cpus Math.min(params.threads, 2) + input: + tuple val(meta), path("input.bam"), path("input.bam.bai"), val(reg) + val medaka_model + output: tuple val(meta), path("consensus_probs.hdf") + script: + """ + medaka consensus input.bam consensus_probs.hdf \ + --threads $task.cpus --regions "$reg" --model $medaka_model + """ +} + +process medakaVariant { + label "medaka" + cpus 1 + input: + tuple val(meta), + path("consensus_probs*.hdf"), + path("input.bam"), + path("input.bam.bai") + path "reference.fasta" + val min_coverage + output: + tuple val(meta), path("medaka.annotated.vcf"), emit: filtered + tuple val(meta), path("medaka.annotated.unfiltered.vcf"), emit: unfiltered + """ + medaka variant reference.fasta consensus_probs*.hdf medaka.vcf + medaka tools annotate --dpsp medaka.vcf reference.fasta input.bam \ + medaka.annotated.unfiltered.vcf + + # filter variants + bcftools filter medaka.annotated.unfiltered.vcf \ + -e 'ALT = "." | INFO/DP < $min_coverage' \ + -Ov -o medaka.annotated.vcf + """ +} + +// workflow module +workflow pipeline { + take: + // reads have already been downsampled and trimmed + ch_reads + ref + main: + // sanitize seq IDs in ref FASTA + def ref = sanitizeRefFile(ref) + + // get medaka model (look up or override) + def medaka_model + if (params.medaka_model) { + log.warn "Overriding Medaka model with ${params.medaka_model}." + medaka_model = params.medaka_model + } else { + Path lookup_table = file( + "${projectDir}/data/medaka_models.tsv", checkIfExists: true) + medaka_model = lookupMedakaVariantModel(lookup_table, params.basecaller_cfg) + } + + // align to reference + alignReads(ch_reads, ref) + + // get the seq IDs of the amplicons from the ref file + def ch_amplicon_seqIDs = ref | splitFasta(record: [id: true]) | map { it.id } + + // run medaka consensus (the process will run once for each sample--amplicon + // combination) + def ch_medaka_consensus_probs = medakaConsensus( + alignReads.out | combine(ch_amplicon_seqIDs), + medaka_model + ) | groupTuple + + // get the variants + medakaVariant( + ch_medaka_consensus_probs | join(alignReads.out), + ref, + params.min_coverage + ) + + // get mapping stats for report + bamstats(alignReads.out) + + // run mosdepth on each sample--amplicon combination + mosdepth( + alignReads.out | combine(ch_amplicon_seqIDs), + params.number_depth_windows + ) + // concat the depth files for each sample + mosdepth.out + | groupTuple + | concatMosdepthResultFiles + emit: + mapping_stats = bamstats.out + depth = concatMosdepthResultFiles.out + variants = medakaVariant.out.filtered +} diff --git a/nextflow.config b/nextflow.config index 9d417da..b6eb7a2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,41 +11,60 @@ params { - help = false - version = false + // I/O args fastq = null + reference = null out_dir = "output" sample = null sample_sheet = null + analyse_unclassified = false + + // filtering + downsampling args + min_read_length = 300 + max_read_length = null + min_read_qual = 10.0 + reads_downsampling_size = null + + // mapping + variant calling args + number_depth_windows = 100 + min_coverage = 20 + basecaller_cfg = "dna_r10.4.1_e8.2_400bps_hac" + medaka_model = null + + // misc + help = false + version = false aws_image_prefix = null aws_queue = null disable_ping = false - analyse_unclassified = false + threads = 1 + process_label = "wfamplicon" - process_label = "wftemplate" + // schema etc. monochrome_logs = false validate_params = true show_hidden_params = false - schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' + epi2melabs.outdirParam = "outdir" + schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf,epi2melabs' wf { fastcat_stats = true example_cmd = [ - "--fastq test_data/reads.fastq.gz" + "--fastq test_data/fastq" ] - container_sha = "shad1cd1088e39f20f3cc24c12d9d720950db37ae64" + container_sha = "shaa969a1ee5479f513d9b6d43ff352c4c80bcebccd" agent = null } } manifest { - name = 'epi2me-labs/wf-template' + name = 'epi2me-labs/wf-amplicon' author = 'Oxford Nanopore Technologies' - homePage = 'https://github.com/epi2me-labs/wf-template' - description = 'Template workflow' + homePage = 'https://github.com/epi2me-labs/wf-amplicon' + description = 'Amplicon workflow' mainScript = 'main.nf' nextflowVersion = '>=20.10.0' - version = 'v4.0.0' + version = 'v0.1.0' } executor { @@ -59,8 +78,11 @@ executor { // used by default for "standard" (docker) and singularity profiles, // other profiles may override. process { - withLabel:wftemplate { - container = "ontresearch/wf-template:${params.wf.container_sha}" + withLabel:wfamplicon { + container = "ontresearch/wf-amplicon:${params.wf.container_sha}" + } + withLabel:medaka { + container = "ontresearch/medaka:v1.7.3" } shell = ['/bin/bash', '-euo', 'pipefail'] } @@ -99,8 +121,8 @@ profiles { executor = 'awsbatch' queue = "${params.aws_queue}" memory = '8G' - withLabel:wftemplate { - container = "${params.aws_image_prefix}-wf-template:${params.wf.container_sha}-root" + withLabel:wfamplicon { + container = "${params.aws_image_prefix}-wf-amplicon:${params.wf.container_sha}-root" } shell = ['/bin/bash', '-euo', 'pipefail'] } diff --git a/nextflow_schema.json b/nextflow_schema.json index ff0fd2c..aaac1c0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,10 +1,10 @@ { "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", - "title": "epi2me-labs/wf-template", - "description": "Nextflow workflow template repository.", - "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-template/wf-template-demo.tar.gz", - "url": "https://github.com/epi2me-labs/wf-template", + "title": "epi2me-labs/wf-amplicon", + "description": "Nextflow workflow for analysing Oxford Nanopore reads created by amplicon sequencing.", + "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-amplicon/wf-amplicon-demo.tar.gz", + "url": "https://github.com/epi2me-labs/wf-amplicon", "type": "object", "definitions": { "input": { @@ -24,6 +24,12 @@ "default": false, "description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.", "help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory." + }, + "reference": { + "type": "string", + "format": "file-path", + "description": "Path to a reference FASTA file.", + "help_text": "If provided, the workflow will perform variant calling against the reference. Otherwise, it will cluster reads and create the consensus sequence for each cluster. The reference file should contain one ref. sequence per amplicon." } }, "required": [ @@ -48,6 +54,79 @@ } } }, + "preprocessing_options": { + "title": "Pre-processing Options", + "type": "object", + "fa_icon": "fas fa-arrow-left", + "description": "Parameters for filtering and downsampling reads before running the rest of the workflow.", + "properties": { + "min_read_length": { + "type": "integer", + "default": 300, + "description": "Minimum read length.", + "help_text": "Shorter reads will be removed." + }, + "max_read_length": { + "type": "integer", + "description": "Maximum read length.", + "help_text": "Longer reads will be removed." + }, + "min_read_qual": { + "type": "number", + "default": 10.0, + "description": "Minimum mean read quality.", + "help_text": "Reads with a lower mean quality will be removed." + }, + "reads_downsampling_size": { + "type": "integer", + "description": "Downsample to this number of reads per sample.", + "help_text": "Downsampling is performed after filtering. Set to 0 to disable downsampling." + } + } + }, + "variant_calling_options": { + "title": "Variant calling options", + "type": "object", + "fa_icon": "fas fa-arrow-left", + "description": "Parameters affecting the variant calling process.", + "properties": { + "min_coverage": { + "type": "integer", + "description": "Minimum coverage for variants to keep.", + "help_text": "Only variants covered by more than this number of reads are reported in the resulting VCF file.", + "min": 0, + "default": 20 + }, + "basecaller_cfg": { + "type": "string", + "default": "dna_r10.4.1_e8.2_400bps_hac", + "description": "Name of the basecaller model that processed the signal data; used to select an appropriate Medaka model.", + "help_text": "The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameters. Available models are: 'dna_r10.4.1_e8.2_400bps_hac@v3.5.2', 'dna_r10.4.1_e8.2_400bps_sup@v3.5.2', 'dna_r9.4.1_e8_fast@v3.4', 'dna_r9.4.1_e8_hac@v3.3', 'dna_r9.4.1_e8_sup@v3.3', 'dna_r10.4.1_e8.2_400bps_hac_prom', 'dna_r9.4.1_450bps_hac_prom', 'dna_r10.3_450bps_hac', 'dna_r10.3_450bps_hac_prom', 'dna_r10.4.1_e8.2_260bps_hac', 'dna_r10.4.1_e8.2_260bps_hac_prom', 'dna_r10.4.1_e8.2_400bps_hac', 'dna_r9.4.1_450bps_hac', 'dna_r9.4.1_e8.1_hac', 'dna_r9.4.1_e8.1_hac_prom'.", + "enum": [ + "dna_r10.4.1_e8.2_400bps_hac@v3.5.2", + "dna_r10.4.1_e8.2_400bps_sup@v3.5.2", + "dna_r9.4.1_e8_fast@v3.4", + "dna_r9.4.1_e8_hac@v3.3", + "dna_r9.4.1_e8_sup@v3.3", + "dna_r10.4.1_e8.2_400bps_hac_prom", + "dna_r9.4.1_450bps_hac_prom", + "dna_r10.3_450bps_hac", + "dna_r10.3_450bps_hac_prom", + "dna_r10.4.1_e8.2_260bps_hac", + "dna_r10.4.1_e8.2_260bps_hac_prom", + "dna_r10.4.1_e8.2_400bps_hac", + "dna_r9.4.1_450bps_hac", + "dna_r9.4.1_e8.1_hac", + "dna_r9.4.1_e8.1_hac_prom" + ] + }, + "medaka_model": { + "type": "string", + "description": "The name of the Medaka model to use. This will override the model automatically chosen based on the provided basecaller configuration.", + "help_text": "The workflow will attempt to map the basecaller model (provided with 'basecaller_cfg') used to a suitable Medaka model. You can override this by providing a model with this option instead." + } + } + }, "output": { "title": "Output Options", "type": "object", @@ -68,7 +147,15 @@ "fa_icon": "far fa-question-circle", "description": "Advanced options for configuring processes inside the workflow.", "help_text": "These advanced options do not need to be changed for typical use, but allow fine tuning of workflows for users who want more control over the workflow.", - "properties": {} + "properties": { + "number_depth_windows": { + "type": "integer", + "description": "Number of windows used during depth of coverage calculations.", + "help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained results at the expense of run time. Values of 100-200 should be suitable in the vast majority of cases.", + "min": 10, + "default": 100 + } + } }, "misc": { "title": "Miscellaneous Options", @@ -76,6 +163,12 @@ "description": "Everything else.", "default": "", "properties": { + "threads": { + "type": "integer", + "default": 4, + "description": "Maximum number of CPU threads to use per workflow task.", + "help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the maximum number of CPU threads for such processes. The total CPU resources used by the workflow are constrained by the executor configuration in `nextflow.config`." + }, "disable_ping": { "type": "boolean", "default": false, @@ -103,6 +196,12 @@ { "$ref": "#/definitions/samples" }, + { + "$ref": "#/definitions/preprocessing_options" + }, + { + "$ref": "#/definitions/variant_calling_options" + }, { "$ref": "#/definitions/output" }, @@ -116,9 +215,9 @@ "properties": { "process_label": { "type": "string", - "description": "The main process label for template processes to use by default", + "description": "The main process label for processes to use by default", "hidden": true, - "default": "wf-template" + "default": "wfamplicon" }, "aws_image_prefix": { "type": "string", @@ -140,7 +239,7 @@ } }, "docs": { - "intro": "## Introduction\n\nThis section of documentation typically contains an overview of the workflow in terms of motivation\nand bioinformatics methods, listing any key tools or algorithms employed, whilst also describing its\nrange of use-cases and what a suitable input dataset should look like.\n\n", - "links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [singularity](https://docs.sylabs.io/guides/latest/user-guide/)\n" + "intro": "## Introduction\n\nThis [Nextflow](https://www.nextflow.io/) workflow provides a simple way to\nanalyse Oxford Nanopore reads generated from amplicons.
\n\nIt requires the raw reads and a reference FASTA file containing one sequence per\namplicon. After filtering (based on read length and quality) and trimming,\nreads are aligned to the reference using\n[minimap2](https://github.com/lh3/minimap2). Variants are then called with\n[Medaka](https://github.com/nanoporetech/medaka). Results include an interactive\nHTML report and VCF files containing the called variants.
\n\nAs mentioned above, the reference FASTA file needs to contain one sequence per\namplicon for now. An option to provide a whole-genome reference file and pairs\nof primers might be added in the future if requested by users.\n", + "links": "\n## Useful links\n\n- [Nextflow](https://www.nextflow.io/)\n- [Docker](https://www.docker.com/products/docker-desktop)\n- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/)\n- [minimap2](https://github.com/lh3/minimap2)\n- [Medaka](https://github.com/nanoporetech/medaka)\n- [Porechop](https://github.com/rrwick/Porechop)\n" } } \ No newline at end of file diff --git a/test_data/fastq/barcode01/reads.fastq.gz b/test_data/fastq/barcode01/reads.fastq.gz new file mode 100644 index 0000000..c670fd7 Binary files /dev/null and b/test_data/fastq/barcode01/reads.fastq.gz differ diff --git a/test_data/fastq/barcode02/reads.fastq.gz b/test_data/fastq/barcode02/reads.fastq.gz new file mode 100644 index 0000000..b90fcaf Binary files /dev/null and b/test_data/fastq/barcode02/reads.fastq.gz differ diff --git a/test_data/reference.fasta b/test_data/reference.fasta new file mode 100644 index 0000000..c26e618 --- /dev/null +++ b/test_data/reference.fasta @@ -0,0 +1,4 @@ +>katG::NC_000962.3:2154725-2155670 +GATCTGGCTCTTAAGGCTGGCAATCTCGGCTTCGCCGACGAGGTCGTGGCTGACCGCAGGGACCGGATCCTGCCACAGCAGGGTCTGCTTGGGGACCAGCGGCCCAAGGTATCTCGCAACGGGACCCATGTCTCGGTGGATCAGCTTGTACCAGGCCTTGGCGAACTCGTCGGCCAATTCCTCGGGGTGTTCCAGCCAGCGACGCGTGATCCGCTCATAGATCGGATCCACCCGCAGCGAGAGGTCAGTGGCCAGCATCGTCGGGGAGCGCCCTGGCCCGCCGAACGGGTCCGGGATGGTGCCGGCACCGGCGCCGTCCTTGGCGGTGTATTGCCAAGCGCCAGCAGGGCTCTTCGTCAGCTCCCACTCGTAGCCGTACAGGATCTCGAGGAAACTGTTGTCCCATTTCGTCGGGGTGTTCGTCCATACGACCTCGATGCCGCTGGTGATCGCGTCCTTACCGGTTCCGGTGCCATACGAGCTCTTCCAGCCCAAGCCCATCTGCTCCAGCGGAGCAGCCTCGGGTTCGGGGCCGACCAGATCGGCCGGGCCGGCGCCATGGGTCTTACCGAAAGTGTGACCGCCGACGATCAGCGCCGCTGTTTCGACGTCGTTCATGGCCATGCGCCGAAACGTCTCGCGAATGTCGACCGCCGCGGCCATGGGGTCCGGGTTGCCGTTCGGCCCCTCCGGGTTCACGTAGATCAGCCCCATCTGCACCGCGGCCAGCGGGTTCTCCAGATCCCGCTTACCGCTGTAACGCTCATCGCCGAGCCAGGTGGCTTCCTTGCCCCAATAGACCTCATCGGGCTCCCACTGGTCGACCCGGCCGAAGCCGAACCCGAACGTCTTGAAGCCCATCGATTCCAGCGCGCAGTTGCCGGCGAAAACAATCAGGTCCGCCCATGAGAGCTTCTTGCCGTACTTCTTCTTGACCGGCCACAG +>rpoB::NC_000962.3:760285-761376 +CATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGT