Merge branch 'CW-3907' into 'dev'

bump medaka img [CW-3907] See merge request epi2melabs/workflows/wf-amplicon!84
epi2me-labs · Jul 8, 2024 · b8a2b8f · b8a2b8f
2 parents b91454f + ed9e8a7
commit b8a2b8f
Show file tree

Hide file tree

Showing 25 changed files with 163 additions and 245 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -30,14 +30,18 @@ docker-run:
             "ref",
             "ref-sample_sheet",
             "ref-with-spaces",
-            "filter-all",
+            "ref-filter-all",
             "de-novo",
             "de-novo-spoa-max-len-2000",  # CW-3864
             "ref-no-reads",
             "ref-single-read",
             "de-novo-no-reads",
             "de-novo-single-read",
             "ref-mergeVCFs-single-barcode",  # CW-3611
+            "ref-no-basecall-model",
+            "ref-override-basecaller-cfg",
+            "de-novo-no-basecall-model",
+            "de-novo-override-basecaller-cfg",
           ]
   rules:
     # NOTE As we're overriding the rules block for the included docker-run
@@ -80,7 +84,7 @@ docker-run:
           lib/common.nf
           modules/local/variant-calling.nf
         NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs,catFastqIntoFasta
-    - if: $MATRIX_NAME == "filter-all"
+    - if: $MATRIX_NAME == "ref-filter-all"
       variables:
         NF_WORKFLOW_OPTS: >
           -executor.\$$local.memory 16GB
@@ -175,7 +179,7 @@ docker-run:
           main.nf
           lib/common.nf
           modules/local/de-novo.nf
-        NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs,medakaConsensus,miniasm,mosdepthPerBase,mosdepthWindows,racon,spoa,trimAndQC,catFastqIntoFasta,configure_igv
+        NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs,medakaConsensus,medakaStitch,miniasm,mosdepthPerBase,mosdepthWindows,racon,spoa,trimAndQC,catFastqIntoFasta,configure_igv
         AFTER_NEXTFLOW_CMD: |
           grep 'No reads left after pre-processing' .nextflow.log &&
           grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html
@@ -190,7 +194,7 @@ docker-run:
           main.nf
           lib/common.nf
           modules/local/de-novo.nf
-        NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs,medakaConsensus,miniasm,mosdepthPerBase,mosdepthWindows,racon,spoa,trimAndQC,catFastqIntoFasta,configure_igv
+        NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs,medakaConsensus,medakaStitch,miniasm,mosdepthPerBase,mosdepthWindows,racon,spoa,trimAndQC,catFastqIntoFasta,configure_igv
         AFTER_NEXTFLOW_CMD: "grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html"
 
     - if: $MATRIX_NAME == "ref-mergeVCFs-single-barcode"
@@ -206,3 +210,48 @@ docker-run:
           lib/common.nf
           modules/local/variant-calling.nf
         NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs,catFastqIntoFasta
+
+    - if: $MATRIX_NAME == "ref-no-basecall-model"
+      variables:
+        NF_WORKFLOW_OPTS: >
+          -executor.\$$local.memory 16GB
+          --fastq test_data/fastq-no-basecall-model
+          --reference test_data/reference.fasta
+        NF_IGNORE_PROCESSES: catFastqIntoFasta,concatTSVs,downsampleReads
+        ASSERT_NEXTFLOW_FAILURE: 1
+        ASSERT_NEXTFLOW_FAILURE_REXP: Found no basecall model information in the input data for sample .*\. Please provide it with the `--override_basecaller_cfg` parameter.
+    - if: $MATRIX_NAME == "ref-override-basecaller-cfg"
+      variables:
+        NF_WORKFLOW_OPTS: >
+          -executor.\$$local.memory 16GB
+          --fastq test_data/fastq
+          --reference test_data/reference.fasta
+          --override_basecaller_cfg [email protected]
+          --combine_results
+        NF_PROCESS_FILES: >
+          main.nf
+          lib/common.nf
+          modules/local/variant-calling.nf
+        NF_IGNORE_PROCESSES: downsampleReads,subsetRefFile,concatTSVs,catFastqIntoFasta
+        AFTER_NEXTFLOW_CMD: grep "Overriding basecall model with '[email protected]'." .nextflow.log
+    - if: $MATRIX_NAME == "de-novo-no-basecall-model"
+      variables:
+        NF_WORKFLOW_OPTS: >
+          -executor.\$$local.memory 16GB
+          --fastq test_data/fastq-denovo-no-basecall-model
+        NF_IGNORE_PROCESSES: downsampleReads
+        ASSERT_NEXTFLOW_FAILURE: 1
+        ASSERT_NEXTFLOW_FAILURE_REXP: Found no basecall model information in the input data for sample .*\. Please provide it with the `--override_basecaller_cfg` parameter.
+    - if: $MATRIX_NAME == "de-novo-override-basecaller-cfg"
+      variables:
+        NF_WORKFLOW_OPTS: >
+          -executor.\$$local.memory 16GB
+          --fastq test_data/fastq-denovo-no-basecall-model
+          --override_basecaller_cfg [email protected]
+          --combine_results
+        NF_PROCESS_FILES: >
+          main.nf
+          lib/common.nf
+          modules/local/de-novo.nf
+        NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,racon
+        AFTER_NEXTFLOW_CMD: grep "Overriding basecall model with '[email protected]'." .nextflow.log
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.1.1]
+### Changed
+- Updated Medaka to v1.12.0.
+
+### Removed
+- The `--medaka_model` parameter as the appropriate Medaka model is now automatically determined from the input data.
+- The now-redundant `--basecaller_cfg` parameter, as its value is automatically detected from the input data on a per-sample basis.
+
+### Added
+- `--override_basecaller_cfg` parameter for cases where automatic selection fails or users wish to override the automatic choice.
+
+
 ## [v1.1.0]
 ### Fixed
 - The `miniasm` process requesting too little memory in some cases.

diff --git a/README.md b/README.md
@@ -179,8 +179,6 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
 | min_coverage | integer | Minimum coverage for variants to keep. | Only variants covered by more than this number of reads are reported in the resulting VCF file. | 20 |
-| basecaller_cfg | string | Name of the basecaller model that processed the signal data; used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameters. Available models are: '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', 'dna_r10.4.1_e8.2_400bps_hac_prom', 'dna_r9.4.1_450bps_hac_prom', 'dna_r10.3_450bps_hac', 'dna_r10.3_450bps_hac_prom', 'dna_r10.4.1_e8.2_260bps_hac', 'dna_r10.4.1_e8.2_260bps_hac_prom', 'dna_r10.4.1_e8.2_400bps_hac', 'dna_r9.4.1_450bps_hac', 'dna_r9.4.1_e8.1_hac', 'dna_r9.4.1_e8.1_hac_prom'. | [email protected] |
-| medaka_model | string | The name of the Medaka model to use. This will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecaller model (provided with 'basecaller_cfg') used to a suitable Medaka model. You can override this by providing a model with this option instead. |  |
 
 
 ### De-novo Consensus Options
@@ -207,6 +205,7 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 |--------------------------|------|-------------|------|---------|
 | number_depth_windows | integer | Number of windows used during depth of coverage calculations. | Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained plots at the expense of run time. | 100 |
 | medaka_target_depth_per_strand | integer | Downsample each amplicon to this per-strand depth before running Medaka. | Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance. | 150 |
+| override_basecaller_cfg | string | Override auto-detected basecaller model that processed the signal data; used to select an appropriate Medaka model. | By default, the workflow tries to determine the basecall model from the input data. This parameter can be used to override the detected value (or to provide a model name if none was found in the inputs). However, users should only do this if they know for certain which model was used, as selecting the wrong option might give sub-optimal results. A list of recent models can be found here: https://github.com/nanoporetech/dorado#DNA-models. |  |
 
 
 ### Miscellaneous Options
@@ -299,9 +298,9 @@ After alignment, haploid variants are called with [Medaka](https://github.com/na
 You can set the minimum coverage a variant needs to exceed in order to be included in the results with `--min_coverage`.
 Variants with lower coverage will still be listed in the resulting VCF files, but with `LOW_DEPTH` instead of `PASS` in the `FILTER` column.
 
-The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data.
-You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`).
-Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_model`.
+The workflow automatically selects the appropriate [Medaka model](https://github.com/nanoporetech/medaka#models) based on the basecall model that was used to process the signal data.
+In most cases, the workflow should be able to determine the basecall model from the input data.
+If this is not possible, it can be provided with the `--override_basecaller_cfg` parameter.
 
 #### 4. Use the variants to generate a consensus
 

diff --git a/bin/workflow_glue/resolve_medaka_model.py b/bin/workflow_glue/resolve_medaka_model.py
diff --git a/data/medaka_models.tsv b/data/medaka_models.tsv
diff --git a/docs/06_input_parameters.md b/docs/06_input_parameters.md
@@ -33,8 +33,6 @@
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
 | min_coverage | integer | Minimum coverage for variants to keep. | Only variants covered by more than this number of reads are reported in the resulting VCF file. | 20 |
-| basecaller_cfg | string | Name of the basecaller model that processed the signal data; used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameters. Available models are: '[email protected]', '[email protected]', '[email protected]', '[email protected]', '[email protected]', 'dna_r10.4.1_e8.2_400bps_hac_prom', 'dna_r9.4.1_450bps_hac_prom', 'dna_r10.3_450bps_hac', 'dna_r10.3_450bps_hac_prom', 'dna_r10.4.1_e8.2_260bps_hac', 'dna_r10.4.1_e8.2_260bps_hac_prom', 'dna_r10.4.1_e8.2_400bps_hac', 'dna_r9.4.1_450bps_hac', 'dna_r9.4.1_e8.1_hac', 'dna_r9.4.1_e8.1_hac_prom'. | [email protected] |
-| medaka_model | string | The name of the Medaka model to use. This will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecaller model (provided with 'basecaller_cfg') used to a suitable Medaka model. You can override this by providing a model with this option instead. |  |
 
 
 ### De-novo Consensus Options
@@ -61,6 +59,7 @@
 |--------------------------|------|-------------|------|---------|
 | number_depth_windows | integer | Number of windows used during depth of coverage calculations. | Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained plots at the expense of run time. | 100 |
 | medaka_target_depth_per_strand | integer | Downsample each amplicon to this per-strand depth before running Medaka. | Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance. | 150 |
+| override_basecaller_cfg | string | Override auto-detected basecaller model that processed the signal data; used to select an appropriate Medaka model. | By default, the workflow tries to determine the basecall model from the input data. This parameter can be used to override the detected value (or to provide a model name if none was found in the inputs). However, users should only do this if they know for certain which model was used, as selecting the wrong option might give sub-optimal results. A list of recent models can be found here: https://github.com/nanoporetech/dorado#DNA-models. |  |
 
 
 ### Miscellaneous Options