feat!: versioned config (kircherlab#140)

* no MAD outlier detection, version controlled config, global config removed * getting action linter to run * fix: update argument syntax for version retrieval in GitHub Actions workflow * fix: correct argument syntax for version retrieval in GitHub Actions workflow * fix: correct argument syntax for version retrieval in GitHub Actions workflow * fix: update argument syntax for version retrieval in GitHub Actions workflow * fix: add skip version check option in workflow configuration --------- Co-authored-by: Max Schubach <[email protected]>
visze · Nov 20, 2024 · 9573b66 · 9573b66
1 parent 4b7cc33
commit 9573b66
Show file tree

Hide file tree

Showing 15 changed files with 80 additions and 86 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -50,7 +50,7 @@ jobs:
         with:
           directory: .
           snakefile: workflow/Snakefile
-          args: "--lint --configfile config/example_config.yaml"
+          args: "--lint --configfile config/example_config.yaml --config skip_version_check=True"
   # Testing:
   #   runs-on: ubuntu-latest
   #   needs:

diff --git a/config/example_assignment_bbmap.yaml b/config/example_assignment_bbmap.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: bbmap
       configs:
         min_mapping_quality: 30 # 30 is default for bbmap

diff --git a/config/example_assignment_bwa.yaml b/config/example_assignment_bwa.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: bwa
       configs:
         min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file

diff --git a/config/example_assignment_exact_lazy.yaml b/config/example_assignment_exact_lazy.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: exact # bwa or exact
       configs:
         sequence_length: 171 # sequence length of design excluding adapters.

diff --git a/config/example_assignment_exact_linker.yaml b/config/example_assignment_exact_linker.yaml
@@ -1,13 +1,12 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 20
     BC_rev_comp: true
     linker: TCTAGACCGTCACTAACTAACAGTGGGTACCC
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: exact # bwa or exact
       configs:
         sequence_length: 171 # sequence length of design excluding adapters.

diff --git a/config/example_config.yaml b/config/example_config.yaml
@@ -1,11 +1,10 @@
 ---
-global: # generall configs effecting one or multiple parts
-  assignments:
-    split_number: 1 # number of files fastq should be split for parallelization
+version: "0.3"
 assignments:
   exampleAssignment: # name of an example assignment (can be any string)
     bc_length: 15
     alignment_tool:
+      split_number: 1 # number of files fastq should be split for parallelization
       tool: exact # bbmap, bwa or exact
       configs:
         sequence_length: 171 # sequence length of design excluding adapters.

diff --git a/config/example_count.yaml b/config/example_count.yaml
@@ -1,4 +1,5 @@
 ---
+version: "0.3"
 experiments:
   exampleCount:
     bc_length: 15

diff --git a/docs/config.rst b/docs/config.rst
@@ -4,7 +4,7 @@
 Config File
 =====================
 
-The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`.
+The config file is a YAML file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`version` (version of MPRAsnakeflow used), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`.
 
 .. literalinclude:: ../config/example_config.yaml
    :language: yaml
@@ -14,21 +14,18 @@ The config file is a yaml file that contains the configuration. Different runs c
 Note that the config file is controlled by json schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`.
 
 ----------------
-General settings
+Version settings
 ----------------
 
-The general settings are located in the :code:`global` section. The following settings are possible:
+Set the version of MPRAsnakeflow that this configuration was written for. This is important for future updates. The version is used to check whether the config file is compatible with the current version of the workflow. If the versions are not compatible, the workflow will exit with an error message.
 
 .. literalinclude:: ../workflow/schemas/config.schema.yaml
    :language: yaml
-   :start-after: start_global
+   :start-after: start_version
    :end-before: start_assignments
 
-:assignments:
-    Global parameters that hold for the assignment workflow.
-
-    :split_number:
-        To parallize mapping for assignment the reads are split into :code:`split_number` files. E.g. setting to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when using on a cluster. Running the workflow only on one machine the default value should be used. The default is set to 1. 
+:version:
+    A string like "0.2.0" or "1.2". When the major version is "0", the minor version has to match that of MPRAsnakeflow, e.g. "0.2.0" is compatible with MPRAsnakeflow 0.2.0 as well as 0.2.1 or 0.2.2. When the major version is greater than 0, only the major version has to match that of MPRAsnakeflow, e.g. a config version of "1.2.1" is also compatible with MPRAsnakeflow 1.7 or 1.0.
 
 --------------------
 Assignment workflow
@@ -43,9 +40,12 @@ The assignment workflow is configured in the :code:`assignments` section. The fo
 
 For each assignment you want to process you have to give it a name like :code:`example_assignment`. The name is used to name the output files.
 
+
 :alignment_tool:
     Alignment tool configuration that is used to map the reads to the oligos.
-
+
+    :split_number:
+        To parallelize mapping for the assignment, the reads are split into :code:`split_number` files. E.g. setting it to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when running on a cluster. When running the workflow on a single machine, the default value should be used. The default is set to :code:`1`. (For technical reasons, when multiple assignments are defined, all of them will be set to the maximum value defined in the config.)
     :tool:
         Alignment tool that is used. Currently :code:`bbmap` :code:`bwa`, :code:`exact` are supported. Default is :code:`bbmap`.
     :configs:

diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -37,7 +37,7 @@ MPRAsnakeflow exoists of two subworkflows, :ref:`Assignment` and :ref:`Experimen
 
 3. Set up the config file
 
-The config file is the heart of MPRAsnakflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants).
+The config file is the heart of MPRAsnakeflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`version` (used MPRAsnakeflow version), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow).
 
 See :ref:`Config` for more details about the config file. Here is an example running only the count experiments and using a provided assignment file.
 

diff --git a/resources/assoc_basic/config.yml b/resources/assoc_basic/config.yml
@@ -1,11 +1,10 @@
 ---
-global:
-  assignments:
-    split_number: 30
+version: "0.3"
 assignments:
   assocBasic:
     bc_length: 15
     alignment_tool:
+      split_number: 30
       tool: bbmap
       configs:
         sequence_length: 171

diff --git a/resources/combined_basic/config.yml b/resources/combined_basic/config.yml
@@ -1,11 +1,10 @@
 ---
-global:
-  assignments:
-    split_number: 30
+version: "0.3"
 assignments:
   assocBasic:
     bc_length: 15
     alignment_tool:
+      split_number: 30
       tool: bbmap
       configs:
         sequence_length: 171
@@ -30,7 +29,7 @@ experiments:
       fromWorkflow:
         type: config
         assignment_name: assocBasic
-        assignment_config: configs
+        assignment_config: default
     design_file: design.fa
     configs:
       default: {}
diff --git a/resources/count_basic/config.yml b/resources/count_basic/config.yml
@@ -1,4 +1,5 @@
 ---
+version: "0.3"
 experiments:
   exampleCount:
     bc_length: 15
@@ -13,10 +14,6 @@ experiments:
     design_file: design.fa
     configs:
       default: {}
-      outlierNone:
-        filter:
-          outlier_detection:
-            method: none
       outlierZscore:
         filter:
           outlier_detection:

diff --git a/workflow/rules/assigned_counts.smk b/workflow/rules/assigned_counts.smk
@@ -115,20 +115,12 @@ rule assigned_counts_dna_rna_merge:
             % config["experiments"][wc.project]["configs"][wc.config]["filter"][
                 "outlier_detection"
             ]["method"]
-            if config["experiments"][wc.project]["configs"][wc.config]["filter"][
+            if "method"
+            in config["experiments"][wc.project]["configs"][wc.config]["filter"][
                 "outlier_detection"
-            ]["method"]
-            != "none"
+            ]
             else ""
         ),
-        outlier_mad_bins=lambda wc: "--outlier-ratio-mad-bins %d"
-        % config["experiments"][wc.project]["configs"][wc.config]["filter"][
-            "outlier_detection"
-        ]["mad_bins"],
-        outlier_mad_times=lambda wc: "--outlier-ratio-mad-times %f"
-        % config["experiments"][wc.project]["configs"][wc.config]["filter"][
-            "outlier_detection"
-        ]["times_mad"],
         outlier_zscore_times=lambda wc: "--outlier-rna-zscore-times %f"
         % config["experiments"][wc.project]["configs"][wc.config]["filter"][
             "outlier_detection"
@@ -143,7 +135,7 @@ rule assigned_counts_dna_rna_merge:
         --minRNACounts {params.minRNACounts} --minDNACounts {params.minDNACounts} \
         --assignment {input.association} \
         {params.outlier_detection} --outlier-barcodes {output.removed_bcs} \
-        {params.outlier_mad_bins} {params.outlier_mad_times} {params.outlier_zscore_times} \
+        {params.outlier_zscore_times} \
         --output {output.counts} \
         --bcOutput {output.bc_counts} \
         --statistic {output.statistic} &> {log}

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -23,6 +23,33 @@ if "experiments" in config:
         validate(experiment, schema="../schemas/experiment_file.schema.yaml")
         experiments[project] = experiment
 
+# validate version of config with MPRAsnakeflow version
+
+import re
+
+# Regular expressions to extract the major version, or "0.<minor>" for development (0.x) versions
+pattern_major_version = r"^(\d+)"
+pattern_development_version = r"^(0(\.\d+)?)"
+
+
+def check_version(pattern, version, config_version):
+    # Search for the pattern in the string
+    match_version = re.search(pattern, version)
+
+    match_config = re.search(pattern, config_version)
+
+    # Raise an error if both versions match the pattern but the matched parts differ
+    if match_version and match_config:
+        if match_version.group(1) != match_config.group(1):
+            raise ValueError(
+                f"\033[38;2;255;165;0mVersion mismatch: MPRAsnakeflow version is {version}, but config version is {config_version}\033[0m"
+            )
+
+
+if not config["skip_version_check"]:
+    check_version(pattern_development_version, version, config["version"])
+    check_version(pattern_major_version, version, config["version"])
+
 
 ################################
 #### HELPERS AND EXCEPTIONS ####
@@ -509,14 +536,12 @@ def withoutZeros(project, conf):
 
 
 def getSplitNumber():
-    split = 1
+    splits = []
 
-    if "global" in config:
-        if "assignments" in config["global"]:
-            if "split_number" in config["global"]["assignments"]:
-                split = config["global"]["assignments"]["split_number"]
+    for assignment in config["assignments"]:
+        splits += [config["assignments"][assignment]["alignment_tool"]["split_number"]]
 
-    return split
+    return max(splits)
 
 
 # count.smk specific functions

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -10,21 +10,15 @@ type: object
 # possible entries of the config file
 
 properties:
-  # start_global
-  global:
-    type: object
-    default:
-      assignments:
-        split_number: 1
-    properties:
-      assignments:
-        type: object
-        properties:
-          split_number:
-            type: integer
-            default: 1
-        additionalProperties: false
-    additionalProperties: false
+  # start_version
+  version:
+    description: Version of MPRAsnakeflow
+    type: string
+    pattern: ^(\d+(\.\d+)?(\.\d+)?)|(0\.\d+(\.\d+)?)$
+  skip_version_check:
+    description: Skip version check
+    type: boolean
+    default: false
   # start_assignments
   assignments:
     description: Assignments to run with configurations
@@ -37,6 +31,9 @@ properties:
           alignment_tool:
             type: object
             properties:
+              split_number:
+                type: integer
+                default: 1
               tool:
                 type: string
                 enum:
@@ -336,25 +333,11 @@ properties:
                             type: string
                             enum:
                               - rna_counts_zscore
-                              - ratio_mad
-                              - none
-                            default: rna_counts_zscore
-                          mad_bins:
-                            type: integer
-                            minimum: 1
-                            default: 20
-                          times_mad:
-                            type: number
-                            exclusiveMinimum: 0
-                            default: 5
                           times_zscore:
                             type: number
                             exclusiveMinimum: 0
                             default: 3
                         required:
-                          - method
-                          - mad_bins
-                          - times_mad
                           - times_zscore
                         additionalProperties: false
                         default: {}
@@ -419,4 +402,7 @@ properties:
       additionalProperties: false
 # end_experiments
 additionalProperties: false
-minProperties: 1
+required:
+  - version
+  - skip_version_check
+minProperties: 3