From f315aabd24ebe6f524b9fb3b008f5af96987775d Mon Sep 17 00:00:00 2001
From: Max <visze@users.noreply.github.com>
Date: Mon, 9 Dec 2024 13:05:20 +0100
Subject: [PATCH] feat: strand sensitive option (#146)

* allowing additional properties for config back compatibility

* work

* implementing strand_senssitive config with interplay in snakemake file

---------

Co-authored-by: Max Schubach <max.schubach@bih-charite.de>
Co-authored-by: Max Schubach <max.schubach@bihealth.de>
---
 workflow/rules/assignment.smk                 | 46 +++++++++++---
 workflow/rules/common.smk                     | 37 +++++++++++
 workflow/schemas/config.schema.yaml           | 35 +++++------
 .../scripts/assignment/check_design_file.py   | 61 ++++++++++++++-----
 workflow/scripts/attachBCToFastQ.py           | 30 ++++++++-
 5 files changed, 163 insertions(+), 46 deletions(-)

diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk
index fe3aaf2b..8785fb00 100644
--- a/workflow/rules/assignment.smk
+++ b/workflow/rules/assignment.smk
@@ -25,6 +25,7 @@ rule assignment_check_design:
         temp("results/assignment/{assignment}/reference/reference.fa.fxi"),
         touch("results/assignment/{assignment}/design_check.done"),
         ref="results/assignment/{assignment}/reference/reference.fa",
+        ref_tmp=temp("results/assignment/{assignment}/reference/reference.tmp.fa"),
     params:
         start=lambda wc: (
             config["assignments"][wc.assignment]["alignment_tool"]["configs"][
@@ -49,12 +50,25 @@ rule assignment_check_design:
             if config["assignments"][wc.assignment]["design_check"]["fast"]
             else "--slow-string-search"
         ),
-        check_sequence_collitions=lambda wc: (
-            "--perform-sequence-check"
+        sequence_collitions=lambda wc: (
+            "sense_antisense"
             if config["assignments"][wc.assignment]["design_check"][
                 "sequence_collitions"
             ]
-            else "--skip-sequence-check"
+            else "skip"
+        ),
+        attach_sequence=lambda wc: (
+            "--attach-sequence %s %s"
+            % (
+                config["assignments"][wc.assignment]["strand_sensitive"][
+                    "forward_adapter"
+                ],
+                config["assignments"][wc.assignment]["strand_sensitive"][
+                    "reverse_adapter"
+                ],
+            )
+            if config["assignments"][wc.assignment]["strand_sensitive"]["enable"]
+            else ""
         ),
     log:
         log=temp("results/logs/assignment/check_design.{assignment}.log"),
@@ -62,10 +76,12 @@ rule assignment_check_design:
     shell:
         """
         trap "cat {log.err}" ERR
-        cp {input.design} {output.ref}
-        python {input.script} --input {output.ref} \
+        cp {input.design} {output.ref_tmp}
+        python {input.script} --input {output.ref_tmp} \
+        --output {output.ref} \
         --start {params.start} --length {params.length} \
-        {params.fast_check} {params.check_sequence_collitions} > {log.log} 2> {log.err};
+        {params.fast_check} --sequence-check {params.sequence_collitions} \
+        {params.attach_sequence} > {log.log} 2> {log.err};
         """
 
 
@@ -129,11 +145,27 @@ rule assignment_attach_idx:
             if config["assignments"][wc.assignment]["BC_rev_comp"]
             else ""
         ),
+        attach_sequence=lambda wc: (
+            "--attach-sequence left %s"
+            % (
+                config["assignments"][wc.assignment]["strand_sensitive"][
+                    "forward_adapter"
+                ]
+                if wc.read == "FW"
+                else reverse_complement(
+                    config["assignments"][wc.assignment]["strand_sensitive"][
+                        "reverse_adapter"
+                    ]
+                )
+            )
+            if config["assignments"][wc.assignment]["strand_sensitive"]["enable"]
+            else ""
+        ),
     log:
         temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"),
     shell:
         """
-        python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} | bgzip -c > {output.read} 2> {log}
+        python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} {params.attach_sequence} | bgzip -c > {output.read} 2> {log}
         """
 
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 3f13d753..53b2360d 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -51,6 +51,38 @@ if not config["skip_version_check"]:
     check_version(pattern_major_version, version, config["version"])
 
 
+# modify config based on certain rules
+
+
+def modify_config(config):
+    # update sequence length with when adapters are added via strand sensitive is enabled
+    if "assignments" in config:
+        for assignment in config["assignments"].keys():
+            if config["assignments"][assignment]["strand_sensitive"]["enable"]:
+                add_length = len(
+                    config["assignments"][assignment]["strand_sensitive"][
+                        "forward_adapter"
+                    ]
+                ) + len(
+                    config["assignments"][assignment]["strand_sensitive"][
+                        "reverse_adapter"
+                    ]
+                )
+                if config["assignments"][assignment]["alignment_tool"]["tool"] == "bwa":
+                    config["assignments"][assignment]["alignment_tool"]["configs"][
+                        "sequence_length"
+                    ]["min"] += add_length
+                    config["assignments"][assignment]["alignment_tool"]["configs"][
+                        "sequence_length"
+                    ]["max"] += add_length
+                else:
+                    config["assignments"][assignment]["alignment_tool"]["configs"][
+                        "sequence_length"
+                    ] += add_length
+    return config
+
+
+config = modify_config(config)
 ################################
 #### HELPERS AND EXCEPTIONS ####
 ################################
@@ -536,6 +568,11 @@ def withoutZeros(project, conf):
 # assignment.smk specific functions
 
 
+def reverse_complement(seq):
+    complementary = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
+    return "".join(reversed([complementary[i] for i in seq]))
+
+
 def getSplitNumber():
     splits = [1]
 
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 1e61d3bd..56b153b6 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -62,7 +62,6 @@ properties:
                               type: integer
                             max:
                               type: integer
-                          additionalProperties: false
                           required:
                             - min
                             - max
@@ -73,11 +72,9 @@ properties:
                               type: integer
                             max:
                               type: integer
-                          additionalProperties: false
                           required:
                             - min
                             - max
-                      additionalProperties: false
                       required:
                         - sequence_length
                         - alignment_start
@@ -103,7 +100,6 @@ properties:
                         alignment_start:
                           type: integer
                           minimum: 1
-                      additionalProperties: false
                       required:
                         - min_mapping_quality
                         - sequence_length
@@ -125,7 +121,6 @@ properties:
                         alignment_start:
                           type: integer
                           minimum: 1
-                      additionalProperties: false
                       required:
                         - sequence_length
                         - alignment_start
@@ -178,7 +173,6 @@ properties:
               - frac_mismatches_allowed
               - min_dovetailed_overlap
             default: {}
-            additionalProperties: false
           design_file:
             type: string
           design_check:
@@ -194,7 +188,21 @@ properties:
             required:
               - fast
               - sequence_collitions
-            additionalProperties: false
+          strand_sensitive:
+            type: object
+            default: {}
+            properties:
+              enable:
+                type: boolean
+                default: false
+              forward_adapter:
+                type: string
+                pattern: ^[ATCGN]+$
+                default: AGGACCGGATCAACT
+              reverse_adapter:
+                type: string
+                pattern: ^[ATCGN]+$
+                default: TCGGTTCACGCAATG
           configs:
             type: object
             patternProperties:
@@ -213,8 +221,6 @@ properties:
                 required:
                   - min_support
                   - fraction
-                additionalProperties: false
-            additionalProperties: false
             minProperties: 1
         oneOf:
           - required:
@@ -231,8 +237,6 @@ properties:
           - configs
           - alignment_tool
           - NGmerge
-        additionalProperties: false
-    additionalProperties: false
     minProperties: 1
   # start_experiments
   experiments:
@@ -290,7 +294,6 @@ properties:
                         minimum: 1
                 required:
                   - type
-                additionalProperties: false
                 allOf:
                   - if:
                       properties:
@@ -311,7 +314,6 @@ properties:
                     then:
                       required:
                         - assignment_file
-            additionalProperties: false
           configs:
             type: object
             patternProperties:
@@ -339,7 +341,6 @@ properties:
                             default: 3
                         required:
                           - times_zscore
-                        additionalProperties: false
                         default: {}
                       min_dna_counts:
                         type: integer
@@ -349,7 +350,6 @@ properties:
                         type: integer
                         miminum: 0
                         default: 1
-                    additionalProperties: false
                     required:
                       - bc_threshold
                       - min_rna_counts
@@ -372,12 +372,8 @@ properties:
                             minimum: 1
                           seed:
                             type: integer
-                        additionalProperties: false
-                    additionalProperties: false
-                additionalProperties: false
                 required:
                   - filter
-              additionalProperties: false
           variants:
             type: object
             properties:
@@ -399,7 +395,6 @@ properties:
           - demultiplex
           - assignments
           - configs
-      additionalProperties: false
 # end_experiments
 additionalProperties: false
 required:
diff --git a/workflow/scripts/assignment/check_design_file.py b/workflow/scripts/assignment/check_design_file.py
index 7227d845..9f8c6d8d 100644
--- a/workflow/scripts/assignment/check_design_file.py
+++ b/workflow/scripts/assignment/check_design_file.py
@@ -42,20 +42,44 @@
     help="Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. But in theory a substring can only be present and for more correct, but slower, search use the --slow-string-search.",
 )
 @click.option(
-    "--perform-sequence-check/--skip-sequence-check",
-    "sequence_search",
-    default=True,
-    help="When set to False, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.",
+    '--sequence-check', 
+    'sequence_check', 
+    type=click.Choice(['skip', 'sense_only', 'sense_antisense']), 
+    default='sense_antisense', 
+    help='Choose the type of sequence check. When set to skip, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.'
+)
+@click.option(
+    "--attach-sequence",
+    "attach_sequence",
+    required=False,
+    type=(click.STRING, click.STRING),
+    help="Attach a sequence left and right to each entry of the fasta file."
+)
+@click.option(
+    "--output",
+    "output",
+    required=True,
+    type=click.Path(writable=True),
+    help="Output reference fasqt file with attached sequences (if set). Otherwise just a copy.",
 )
-def cli(input_file, start, length, fast_search, sequence_search):
+def cli(input_file, start, length, fast_search, sequence_check, attach_sequence, output):
 
     seq_dict = dict()
 
     forward_collitions = []
     antisense_collitions = []
 
+    # attach sequence
+    with open(output, 'w') as fasta_file:
+        for name, seq in pyfastx.Fasta(input_file, build_index=False):
+            new_sequence = seq
+            if attach_sequence:
+                new_sequence = attach_sequence[0] + new_sequence + attach_sequence[1]
+            fasta_file.write(f">{name}\n{new_sequence}\n")
+
+
     # read fasta file
-    fa = pyfastx.Fasta(input_file)
+    fa = pyfastx.Fasta(output)
 
     # check duplicated headers
     click.echo("Searching for duplicated headers...")
@@ -76,7 +100,12 @@ def cli(input_file, start, length, fast_search, sequence_search):
         )
         exit(1)
 
-    if sequence_search:
+
+    # read fasta file
+    if attach_sequence:
+        fa = pyfastx.Fasta(output)
+
+    if sequence_check != 'skip':
         # build seq dict
         click.echo("Building sequence dictionary...")
         for i in range(len(fa)):
@@ -100,12 +129,12 @@ def cli(input_file, start, length, fast_search, sequence_search):
             antisense_collition = set()
             if fast_search:
                 forward_collition.update(seq_dict.get(sub_seq_forward, set()))
-                antisense_collition.update(seq_dict.get(sub_seq_antisense, set()))
+                antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) if sequence_check == 'sense_antisense' else None
             else:
                 for seq, names in seq_dict.items():
                     if sub_seq_forward in seq:
                         forward_collition.update(names)
-                    if sub_seq_antisense in seq:
+                    if sequence_check == 'sense_antisense' and sub_seq_antisense in seq:
                         antisense_collition.update(names)
 
             if len(forward_collition) > 1:
@@ -135,16 +164,16 @@ def cli(input_file, start, length, fast_search, sequence_search):
                     "\t".join(forward_collitions[i]),
                     err=True,
                 )
-
-            click.echo(
-                "-----------------ANTISENSE COLLISIONS-----------------",
-                err=True,
-            )
-            for i in range(len(antisense_collitions)):
+            if sequence_check == 'sense_antisense':
                 click.echo(
-                    "\t".join(antisense_collitions[i]),
+                    "-----------------ANTISENSE COLLISIONS-----------------",
                     err=True,
                 )
+                for i in range(len(antisense_collitions)):
+                    click.echo(
+                        "\t".join(antisense_collitions[i]),
+                        err=True,
+                    )
             exit(1)
 
         else:
diff --git a/workflow/scripts/attachBCToFastQ.py b/workflow/scripts/attachBCToFastQ.py
index 205cd64e..04844da2 100644
--- a/workflow/scripts/attachBCToFastQ.py
+++ b/workflow/scripts/attachBCToFastQ.py
@@ -3,9 +3,17 @@
 import gzip
 
 
-def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False):
+def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False, add_sequence_left=None, add_sequence_right=None):
     for read, bc in zip(read_fastq(read_file), read_fastq(bc_file)):
         seqid_read, seq_read, qual_read = read
+        
+        if add_sequence_left:
+            seq_read = add_sequence_left + seq_read
+            qual_read = 'I' * len(add_sequence_left) + qual_read
+        if add_sequence_right:
+            seq_read = seq_read + add_sequence_right
+            qual_read = qual_read + 'I' * len(add_sequence_right)
+
         seqid_read = seqid_read.split(" ")[0]
         seqid_bc, seq_bc, qual_bc = bc
         seqid_bc = seqid_bc.split(" ")[0]
@@ -31,10 +39,26 @@ def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False):
 @click.option('--reverse-complement',
               "use_reverse_complement",
               is_flag=True)
-def cli(read_file, barcode_file, use_reverse_complement):
+@click.option('--attach-sequence',
+              "attach_sequence",
+              required=False,
+              type=(click.Choice(['left', 'right', 'both']), click.STRING)
+              )
+def cli(read_file, barcode_file, use_reverse_complement, attach_sequence):
     
     with gzip.open(read_file, 'rt') as r_file, gzip.open(barcode_file, 'rt') as bc_file:
-        for seqid, seq, qual in read_sequence_files(r_file, bc_file, use_reverse_complement):
+        inputs = {
+            "read_file":r_file, 
+            "bc_file":bc_file, 
+            "use_BC_reverse_complement": use_reverse_complement
+            }
+        if attach_sequence:
+            if attach_sequence[0] == 'left' or attach_sequence[0] == 'both':
+                inputs['add_sequence_left'] = attach_sequence[1]
+            if attach_sequence[0] == 'right' or attach_sequence[0] == 'both':
+                inputs['add_sequence_right'] = reverse_complement(attach_sequence[1])
+        
+        for seqid, seq, qual in read_sequence_files(**inputs):
             click.echo("@%s\n%s\n+\n%s" % (seqid, seq, qual))
 
 def reverse_complement(seq):