Skip to content

Commit

Permalink
feat!: versioned config (kircherlab#140)
Browse files Browse the repository at this point in the history
* no mad outlier detection, version controled config, global config removed

* getting action linter to run

* fix: update argument syntax for version retrieval in GitHub Actions workflow

* fix: correct argument syntax for version retrieval in GitHub Actions workflow

* fix: correct argument syntax for version retrieval in GitHub Actions workflow

* fix: update argument syntax for version retrieval in GitHub Actions workflow

* fix: add skip version check option in workflow configuration

---------

Co-authored-by: Max Schubach <[email protected]>
  • Loading branch information
visze and Max Schubach authored Nov 20, 2024
1 parent 4b7cc33 commit 9573b66
Show file tree
Hide file tree
Showing 15 changed files with 80 additions and 86 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
with:
directory: .
snakefile: workflow/Snakefile
args: "--lint --configfile config/example_config.yaml"
args: "--lint --configfile config/example_config.yaml --config skip_version_check=True"
# Testing:
# runs-on: ubuntu-latest
# needs:
Expand Down
5 changes: 2 additions & 3 deletions config/example_assignment_bbmap.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
global: # generall configs effecting one or multiple parts
assignments:
split_number: 1 # number of files fastq should be split for parallelization
version: "0.3"
assignments:
exampleAssignment: # name of an example assignment (can be any string)
bc_length: 15
alignment_tool:
split_number: 1 # number of files fastq should be split for parallelization
tool: bbmap
configs:
min_mapping_quality: 30 # 30 is default for bbmap
Expand Down
5 changes: 2 additions & 3 deletions config/example_assignment_bwa.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
global: # generall configs effecting one or multiple parts
assignments:
split_number: 1 # number of files fastq should be split for parallelization
version: "0.3"
assignments:
exampleAssignment: # name of an example assignment (can be any string)
bc_length: 15
alignment_tool:
split_number: 1 # number of files fastq should be split for parallelization
tool: bwa
configs:
min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file
Expand Down
5 changes: 2 additions & 3 deletions config/example_assignment_exact_lazy.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
global: # generall configs effecting one or multiple parts
assignments:
split_number: 1 # number of files fastq should be split for parallelization
version: "0.3"
assignments:
exampleAssignment: # name of an example assignment (can be any string)
bc_length: 15
alignment_tool:
split_number: 1 # number of files fastq should be split for parallelization
tool: exact # bwa or exact
configs:
sequence_length: 171 # sequence length of design excluding adapters.
Expand Down
5 changes: 2 additions & 3 deletions config/example_assignment_exact_linker.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
---
global: # generall configs effecting one or multiple parts
assignments:
split_number: 1 # number of files fastq should be split for parallelization
version: "0.3"
assignments:
exampleAssignment: # name of an example assignment (can be any string)
bc_length: 20
BC_rev_comp: true
linker: TCTAGACCGTCACTAACTAACAGTGGGTACCC
alignment_tool:
split_number: 1 # number of files fastq should be split for parallelization
tool: exact # bwa or exact
configs:
sequence_length: 171 # sequence length of design excluding adapters.
Expand Down
5 changes: 2 additions & 3 deletions config/example_config.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
global: # generall configs effecting one or multiple parts
assignments:
split_number: 1 # number of files fastq should be split for parallelization
version: "0.3"
assignments:
exampleAssignment: # name of an example assignment (can be any string)
bc_length: 15
alignment_tool:
split_number: 1 # number of files fastq should be split for parallelization
tool: exact # bbmap, bwa or exact
configs:
sequence_length: 171 # sequence length of design excluding adapters.
Expand Down
1 change: 1 addition & 0 deletions config/example_count.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
---
version: "0.3"
experiments:
exampleCount:
bc_length: 15
Expand Down
20 changes: 10 additions & 10 deletions docs/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Config File
=====================

The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`.
The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`version` (version of MPRAsnakeflow used), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`.

.. literalinclude:: ../config/example_config.yaml
:language: yaml
Expand All @@ -14,21 +14,18 @@ The config file is a yaml file that contains the configuration. Different runs c
Note that the config file is controlled by json schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`.

----------------
General settings
Version settings
----------------

The general settings are located in the :code:`global` section. The following settings are possible:
Set the version of MPRAsnakeflow that this configuration is written for. This is important for future updates. The version is used to check whether the config file is compatible with the current version of the workflow. If the versions are not compatible, the workflow will exit with an error message.

.. literalinclude:: ../workflow/schemas/config.schema.yaml
:language: yaml
:start-after: start_global
:start-after: start_version
:end-before: start_assignments

:assignments:
Global parameters that hold for the assignment workflow.

:split_number:
To parallize mapping for assignment the reads are split into :code:`split_number` files. E.g. setting to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when using on a cluster. Running the workflow only on one machine the default value should be used. The default is set to 1.
:version:
A string like "0.2.0" or "1.2". When the major version is "0", the minor version has to match MPRAsnakeflow, e.g. "0.2.0" is compatible with MPRAsnakeflow 0.2.0 as well as 0.2.1 or 0.2.2. When the major version is greater than 0, only the major version has to match MPRAsnakeflow, e.g. a config version of "1.2.1" also fits MPRAsnakeflow 1.7 or 1.0.

--------------------
Assignment workflow
Expand All @@ -43,9 +40,12 @@ The assignment workflow is configured in the :code:`assignments` section. The fo

For each assignment you want to process you have to give it a name like :code:`example_assignment`. The name is used to name the output files.


:alignment_tool:
Alignment tool configuration that is used to map the reads to the oligos.


:split_number:
To parallelize mapping for the assignment, the reads are split into :code:`split_number` files. E.g. setting it to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when running on a cluster. When running the workflow on a single machine, the default value should be used. The default is :code:`1`. (For technical reasons, when multiple assignments are defined, all of them will use the maximum :code:`split_number` defined in the config.)
:tool:
Alignment tool that is used. Currently :code:`bbmap` :code:`bwa`, :code:`exact` are supported. Default is :code:`bbmap`.
:configs:
Expand Down
2 changes: 1 addition & 1 deletion docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ MPRAsnakeflow exoists of two subworkflows, :ref:`Assignment` and :ref:`Experimen

3. Set up the config file

The config file is the heart of MPRAsnakflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants).
The config file is the heart of MPRAsnakeflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`version` (used MPRAsnakeflow version), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow).

See :ref:`Config` for more details about the config file. Here is an example running only the count experiments and using a provided assignment file.

Expand Down
5 changes: 2 additions & 3 deletions resources/assoc_basic/config.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
global:
assignments:
split_number: 30
version: "0.3"
assignments:
assocBasic:
bc_length: 15
alignment_tool:
split_number: 30
tool: bbmap
configs:
sequence_length: 171
Expand Down
7 changes: 3 additions & 4 deletions resources/combined_basic/config.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
---
global:
assignments:
split_number: 30
version: "0.3"
assignments:
assocBasic:
bc_length: 15
alignment_tool:
split_number: 30
tool: bbmap
configs:
sequence_length: 171
Expand All @@ -30,7 +29,7 @@ experiments:
fromWorkflow:
type: config
assignment_name: assocBasic
assignment_config: configs
assignment_config: default
design_file: design.fa
configs:
default: {}
5 changes: 1 addition & 4 deletions resources/count_basic/config.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
---
version: "0.3"
experiments:
exampleCount:
bc_length: 15
Expand All @@ -13,10 +14,6 @@ experiments:
design_file: design.fa
configs:
default: {}
outlierNone:
filter:
outlier_detection:
method: none
outlierZscore:
filter:
outlier_detection:
Expand Down
16 changes: 4 additions & 12 deletions workflow/rules/assigned_counts.smk
Original file line number Diff line number Diff line change
Expand Up @@ -115,20 +115,12 @@ rule assigned_counts_dna_rna_merge:
% config["experiments"][wc.project]["configs"][wc.config]["filter"][
"outlier_detection"
]["method"]
if config["experiments"][wc.project]["configs"][wc.config]["filter"][
if "method"
in config["experiments"][wc.project]["configs"][wc.config]["filter"][
"outlier_detection"
]["method"]
!= "none"
]
else ""
),
outlier_mad_bins=lambda wc: "--outlier-ratio-mad-bins %d"
% config["experiments"][wc.project]["configs"][wc.config]["filter"][
"outlier_detection"
]["mad_bins"],
outlier_mad_times=lambda wc: "--outlier-ratio-mad-times %f"
% config["experiments"][wc.project]["configs"][wc.config]["filter"][
"outlier_detection"
]["times_mad"],
outlier_zscore_times=lambda wc: "--outlier-rna-zscore-times %f"
% config["experiments"][wc.project]["configs"][wc.config]["filter"][
"outlier_detection"
Expand All @@ -143,7 +135,7 @@ rule assigned_counts_dna_rna_merge:
--minRNACounts {params.minRNACounts} --minDNACounts {params.minDNACounts} \
--assignment {input.association} \
{params.outlier_detection} --outlier-barcodes {output.removed_bcs} \
{params.outlier_mad_bins} {params.outlier_mad_times} {params.outlier_zscore_times} \
{params.outlier_zscore_times} \
--output {output.counts} \
--bcOutput {output.bc_counts} \
--statistic {output.statistic} &> {log}
Expand Down
37 changes: 31 additions & 6 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,33 @@ if "experiments" in config:
validate(experiment, schema="../schemas/experiment_file.schema.yaml")
experiments[project] = experiment

# validate version of config with MPRAsnakeflow version

import re

# Regular expressions used to extract the part of a version string that is compared.
# Major-version pattern: captures the leading run of digits (e.g. "1" in "1.2.3").
pattern_major_version = r"^(\d+)"
# Development-version pattern: only matches strings that start with "0" and
# captures that "0" plus an optional minor part (e.g. "0.3" in "0.3.1").
pattern_development_version = r"^(0(\.\d+)?)"


def check_version(pattern, version, config_version):
    """Raise if two version strings disagree on the part captured by ``pattern``.

    ``pattern`` is searched in both ``version`` and ``config_version``; the
    first capture group of each match is compared.

    Args:
        pattern: Regular expression with at least one capture group.
        version: Version of the workflow.
        config_version: Version given in the config file.

    Raises:
        ValueError: If ``pattern`` matches both values and the first capture
            groups differ.

    Returns:
        None. If ``pattern`` does not match one (or both) of the values, no
        comparison is made and the function returns silently.
    """
    # Coerce to str so that a non-string value (e.g. a version that was parsed
    # as a number) is compared textually instead of failing inside ``re`` with
    # an unrelated TypeError.
    version = str(version)
    config_version = str(config_version)

    match_version = re.search(pattern, version)
    match_config = re.search(pattern, config_version)

    # Only compare when the pattern applies to both values.
    if not (match_version and match_config):
        return

    if match_version.group(1) != match_config.group(1):
        # The escape sequences colour the message orange on ANSI terminals.
        raise ValueError(
            f"\033[38;2;255;165;0mVersion mismatch: MPRAsnakeflow version is {version}, but config version is {config_version}\033[0m"
        )


if not config["skip_version_check"]:
check_version(pattern_development_version, version, config["version"])
check_version(pattern_major_version, version, config["version"])


################################
#### HELPERS AND EXCEPTIONS ####
Expand Down Expand Up @@ -509,14 +536,12 @@ def withoutZeros(project, conf):


def getSplitNumber():
split = 1
splits = []

if "global" in config:
if "assignments" in config["global"]:
if "split_number" in config["global"]["assignments"]:
split = config["global"]["assignments"]["split_number"]
for assignment in config["assignments"]:
splits += [config["assignments"][assignment]["alignment_tool"]["split_number"]]

return split
return max(splits)


# count.smk specific functions
Expand Down
46 changes: 16 additions & 30 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,15 @@ type: object
# possible entries of the config file

properties:
# start_global
global:
type: object
default:
assignments:
split_number: 1
properties:
assignments:
type: object
properties:
split_number:
type: integer
default: 1
additionalProperties: false
additionalProperties: false
# start_version
version:
  description: Version of MPRAsnakeflow
  type: string
  # One to three dot-separated numeric parts, e.g. "1", "0.3" or "0.3.1".
  # The whole expression sits between ^ and $ so the complete string must
  # match; with a top-level alternation each branch would only be anchored
  # on one side and values such as "1.2abc" would pass validation.
  pattern: '^\d+(\.\d+)?(\.\d+)?$'
skip_version_check:
description: Skip version check
type: boolean
default: false
# start_assignments
assignments:
description: Assignments to run with configurations
Expand All @@ -37,6 +31,9 @@ properties:
alignment_tool:
type: object
properties:
split_number:
type: integer
default: 1
tool:
type: string
enum:
Expand Down Expand Up @@ -336,25 +333,11 @@ properties:
type: string
enum:
- rna_counts_zscore
- ratio_mad
- none
default: rna_counts_zscore
mad_bins:
type: integer
minimum: 1
default: 20
times_mad:
type: number
exclusiveMinimum: 0
default: 5
times_zscore:
type: number
exclusiveMinimum: 0
default: 3
required:
- method
- mad_bins
- times_mad
- times_zscore
additionalProperties: false
default: {}
Expand Down Expand Up @@ -419,4 +402,7 @@ properties:
additionalProperties: false
# end_experiments
additionalProperties: false
minProperties: 1
required:
- version
- skip_version_check
minProperties: 3

0 comments on commit 9573b66

Please sign in to comment.