feat: strand sensitive option (kircherlab#146)

* allowing additional properties for config back compatibility * work * implementing strand_senssitive config with interplay in snakemake file --------- Co-authored-by: Max Schubach <[email protected]> Co-authored-by: Max Schubach <[email protected]>
visze · Dec 9, 2024 · f315aab · f315aab
1 parent 3c91a6a
commit f315aab
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 46 deletions.
diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk
@@ -25,6 +25,7 @@ rule assignment_check_design:
         temp("results/assignment/{assignment}/reference/reference.fa.fxi"),
         touch("results/assignment/{assignment}/design_check.done"),
         ref="results/assignment/{assignment}/reference/reference.fa",
+        ref_tmp=temp("results/assignment/{assignment}/reference/reference.tmp.fa"),
     params:
         start=lambda wc: (
             config["assignments"][wc.assignment]["alignment_tool"]["configs"][
@@ -49,23 +50,38 @@ rule assignment_check_design:
             if config["assignments"][wc.assignment]["design_check"]["fast"]
             else "--slow-string-search"
         ),
-        check_sequence_collitions=lambda wc: (
-            "--perform-sequence-check"
+        sequence_collitions=lambda wc: (
+            "sense_antisense"
             if config["assignments"][wc.assignment]["design_check"][
                 "sequence_collitions"
             ]
-            else "--skip-sequence-check"
+            else "skip"
+        ),
+        attach_sequence=lambda wc: (
+            "--attach-sequence %s %s"
+            % (
+                config["assignments"][wc.assignment]["strand_sensitive"][
+                    "forward_adapter"
+                ],
+                config["assignments"][wc.assignment]["strand_sensitive"][
+                    "reverse_adapter"
+                ],
+            )
+            if config["assignments"][wc.assignment]["strand_sensitive"]["enable"]
+            else ""
         ),
     log:
         log=temp("results/logs/assignment/check_design.{assignment}.log"),
         err="results/assignment/{assignment}/design_check.err",
     shell:
         """
         trap "cat {log.err}" ERR
-        cp {input.design} {output.ref}
-        python {input.script} --input {output.ref} \
+        cp {input.design} {output.ref_tmp}
+        python {input.script} --input {output.ref_tmp} \
+        --output {output.ref} \
         --start {params.start} --length {params.length} \
-        {params.fast_check} {params.check_sequence_collitions} > {log.log} 2> {log.err};
+        {params.fast_check} --sequence-check {params.sequence_collitions} \
+        {params.attach_sequence} > {log.log} 2> {log.err};
         """
 
 
@@ -129,11 +145,27 @@ rule assignment_attach_idx:
             if config["assignments"][wc.assignment]["BC_rev_comp"]
             else ""
         ),
+        attach_sequence=lambda wc: (
+            "--attach-sequence left %s"
+            % (
+                config["assignments"][wc.assignment]["strand_sensitive"][
+                    "forward_adapter"
+                ]
+                if wc.read == "FW"
+                else reverse_complement(
+                    config["assignments"][wc.assignment]["strand_sensitive"][
+                        "reverse_adapter"
+                    ]
+                )
+            )
+            if config["assignments"][wc.assignment]["strand_sensitive"]["enable"]
+            else ""
+        ),
     log:
         temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"),
     shell:
         """
-        python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} | bgzip -c > {output.read} 2> {log}
+        python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} {params.attach_sequence} | bgzip -c > {output.read} 2> {log}
         """
 
 

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -51,6 +51,38 @@ if not config["skip_version_check"]:
     check_version(pattern_major_version, version, config["version"])
 
 
+# modify config based on certain rules
+
+
+def modify_config(config):
+    # update sequence length with when adapters are added via strand sensitive is enabled
+    if "assignments" in config:
+        for assignment in config["assignments"].keys():
+            if config["assignments"][assignment]["strand_sensitive"]["enable"]:
+                add_length = len(
+                    config["assignments"][assignment]["strand_sensitive"][
+                        "forward_adapter"
+                    ]
+                ) + len(
+                    config["assignments"][assignment]["strand_sensitive"][
+                        "reverse_adapter"
+                    ]
+                )
+                if config["assignments"][assignment]["alignment_tool"]["tool"] == "bwa":
+                    config["assignments"][assignment]["alignment_tool"]["configs"][
+                        "sequence_length"
+                    ]["min"] += add_length
+                    config["assignments"][assignment]["alignment_tool"]["configs"][
+                        "sequence_length"
+                    ]["max"] += add_length
+                else:
+                    config["assignments"][assignment]["alignment_tool"]["configs"][
+                        "sequence_length"
+                    ] += add_length
+    return config
+
+
+config = modify_config(config)
 ################################
 #### HELPERS AND EXCEPTIONS ####
 ################################
@@ -536,6 +568,11 @@ def withoutZeros(project, conf):
 # assignment.smk specific functions
 
 
+def reverse_complement(seq):
+    complementary = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
+    return "".join(reversed([complementary[i] for i in seq]))
+
+
 def getSplitNumber():
     splits = [1]
 

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -62,7 +62,6 @@ properties:
                               type: integer
                             max:
                               type: integer
-                          additionalProperties: false
                           required:
                             - min
                             - max
@@ -73,11 +72,9 @@ properties:
                               type: integer
                             max:
                               type: integer
-                          additionalProperties: false
                           required:
                             - min
                             - max
-                      additionalProperties: false
                       required:
                         - sequence_length
                         - alignment_start
@@ -103,7 +100,6 @@ properties:
                         alignment_start:
                           type: integer
                           minimum: 1
-                      additionalProperties: false
                       required:
                         - min_mapping_quality
                         - sequence_length
@@ -125,7 +121,6 @@ properties:
                         alignment_start:
                           type: integer
                           minimum: 1
-                      additionalProperties: false
                       required:
                         - sequence_length
                         - alignment_start
@@ -178,7 +173,6 @@ properties:
               - frac_mismatches_allowed
               - min_dovetailed_overlap
             default: {}
-            additionalProperties: false
           design_file:
             type: string
           design_check:
@@ -194,7 +188,21 @@ properties:
             required:
               - fast
               - sequence_collitions
-            additionalProperties: false
+          strand_sensitive:
+            type: object
+            default: {}
+            properties:
+              enable:
+                type: boolean
+                default: false
+              forward_adapter:
+                type: string
+                pattern: ^[ATCGN]+$
+                default: AGGACCGGATCAACT
+              reverse_adapter:
+                type: string
+                pattern: ^[ATCGN]+$
+                default: TCGGTTCACGCAATG
           configs:
             type: object
             patternProperties:
@@ -213,8 +221,6 @@ properties:
                 required:
                   - min_support
                   - fraction
-                additionalProperties: false
-            additionalProperties: false
             minProperties: 1
         oneOf:
           - required:
@@ -231,8 +237,6 @@ properties:
           - configs
           - alignment_tool
           - NGmerge
-        additionalProperties: false
-    additionalProperties: false
     minProperties: 1
   # start_experiments
   experiments:
@@ -290,7 +294,6 @@ properties:
                         minimum: 1
                 required:
                   - type
-                additionalProperties: false
                 allOf:
                   - if:
                       properties:
@@ -311,7 +314,6 @@ properties:
                     then:
                       required:
                         - assignment_file
-            additionalProperties: false
           configs:
             type: object
             patternProperties:
@@ -339,7 +341,6 @@ properties:
                             default: 3
                         required:
                           - times_zscore
-                        additionalProperties: false
                         default: {}
                       min_dna_counts:
                         type: integer
@@ -349,7 +350,6 @@ properties:
                         type: integer
                         miminum: 0
                         default: 1
-                    additionalProperties: false
                     required:
                       - bc_threshold
                       - min_rna_counts
@@ -372,12 +372,8 @@ properties:
                             minimum: 1
                           seed:
                             type: integer
-                        additionalProperties: false
-                    additionalProperties: false
-                additionalProperties: false
                 required:
                   - filter
-              additionalProperties: false
           variants:
             type: object
             properties:
@@ -399,7 +395,6 @@ properties:
           - demultiplex
           - assignments
           - configs
-      additionalProperties: false
 # end_experiments
 additionalProperties: false
 required:

diff --git a/workflow/scripts/assignment/check_design_file.py b/workflow/scripts/assignment/check_design_file.py
@@ -42,20 +42,44 @@
     help="Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. But in theory a substring can only be present and for more correct, but slower, search use the --slow-string-search.",
 )
 @click.option(
-    "--perform-sequence-check/--skip-sequence-check",
-    "sequence_search",
-    default=True,
-    help="When set to False, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.",
+    '--sequence-check', 
+    'sequence_check', 
+    type=click.Choice(['skip', 'sense_only', 'sense_antisense']), 
+    default='sense_antisense', 
+    help='Choose the type of sequence check. When set to skip, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.'
+)
+@click.option(
+    "--attach-sequence",
+    "attach_sequence",
+    required=False,
+    type=(click.STRING, click.STRING),
+    help="Attach a sequence left and right to each entry of the fasta file."
+)
+@click.option(
+    "--output",
+    "output",
+    required=True,
+    type=click.Path(writable=True),
+    help="Output reference fasqt file with attached sequences (if set). Otherwise just a copy.",
 )
-def cli(input_file, start, length, fast_search, sequence_search):
+def cli(input_file, start, length, fast_search, sequence_check, attach_sequence, output):
 
     seq_dict = dict()
 
     forward_collitions = []
     antisense_collitions = []
 
+    # attach sequence
+    with open(output, 'w') as fasta_file:
+        for name, seq in pyfastx.Fasta(input_file, build_index=False):
+            new_sequence = seq
+            if attach_sequence:
+                new_sequence = attach_sequence[0] + new_sequence + attach_sequence[1]
+            fasta_file.write(f">{name}\n{new_sequence}\n")
+
+
     # read fasta file
-    fa = pyfastx.Fasta(input_file)
+    fa = pyfastx.Fasta(output)
 
     # check duplicated headers
     click.echo("Searching for duplicated headers...")
@@ -76,7 +100,12 @@ def cli(input_file, start, length, fast_search, sequence_search):
         )
         exit(1)
 
-    if sequence_search:
+
+    # read fasta file
+    if attach_sequence:
+        fa = pyfastx.Fasta(output)
+
+    if sequence_check != 'skip':
         # build seq dict
         click.echo("Building sequence dictionary...")
         for i in range(len(fa)):
@@ -100,12 +129,12 @@ def cli(input_file, start, length, fast_search, sequence_search):
             antisense_collition = set()
             if fast_search:
                 forward_collition.update(seq_dict.get(sub_seq_forward, set()))
-                antisense_collition.update(seq_dict.get(sub_seq_antisense, set()))
+                antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) if sequence_check == 'sense_antisense' else None
             else:
                 for seq, names in seq_dict.items():
                     if sub_seq_forward in seq:
                         forward_collition.update(names)
-                    if sub_seq_antisense in seq:
+                    if sequence_check == 'sense_antisense' and sub_seq_antisense in seq:
                         antisense_collition.update(names)
 
             if len(forward_collition) > 1:
@@ -135,16 +164,16 @@ def cli(input_file, start, length, fast_search, sequence_search):
                     "\t".join(forward_collitions[i]),
                     err=True,
                 )
-
-            click.echo(
-                "-----------------ANTISENSE COLLISIONS-----------------",
-                err=True,
-            )
-            for i in range(len(antisense_collitions)):
+            if sequence_check == 'sense_antisense':
                 click.echo(
-                    "\t".join(antisense_collitions[i]),
+                    "-----------------ANTISENSE COLLISIONS-----------------",
                     err=True,
                 )
+                for i in range(len(antisense_collitions)):
+                    click.echo(
+                        "\t".join(antisense_collitions[i]),
+                        err=True,
+                    )
             exit(1)
 
         else: