From f315aabd24ebe6f524b9fb3b008f5af96987775d Mon Sep 17 00:00:00 2001 From: Max Date: Mon, 9 Dec 2024 13:05:20 +0100 Subject: [PATCH] feat: strand sensitive option (#146) * allowing additional properties for config back compatibility * work * implementing strand_senssitive config with interplay in snakemake file --------- Co-authored-by: Max Schubach Co-authored-by: Max Schubach --- workflow/rules/assignment.smk | 46 +++++++++++--- workflow/rules/common.smk | 37 +++++++++++ workflow/schemas/config.schema.yaml | 35 +++++------ .../scripts/assignment/check_design_file.py | 61 ++++++++++++++----- workflow/scripts/attachBCToFastQ.py | 30 ++++++++- 5 files changed, 163 insertions(+), 46 deletions(-) diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk index fe3aaf2b..8785fb00 100644 --- a/workflow/rules/assignment.smk +++ b/workflow/rules/assignment.smk @@ -25,6 +25,7 @@ rule assignment_check_design: temp("results/assignment/{assignment}/reference/reference.fa.fxi"), touch("results/assignment/{assignment}/design_check.done"), ref="results/assignment/{assignment}/reference/reference.fa", + ref_tmp=temp("results/assignment/{assignment}/reference/reference.tmp.fa"), params: start=lambda wc: ( config["assignments"][wc.assignment]["alignment_tool"]["configs"][ @@ -49,12 +50,25 @@ rule assignment_check_design: if config["assignments"][wc.assignment]["design_check"]["fast"] else "--slow-string-search" ), - check_sequence_collitions=lambda wc: ( - "--perform-sequence-check" + sequence_collitions=lambda wc: ( + "sense_antisense" if config["assignments"][wc.assignment]["design_check"][ "sequence_collitions" ] - else "--skip-sequence-check" + else "skip" + ), + attach_sequence=lambda wc: ( + "--attach-sequence %s %s" + % ( + config["assignments"][wc.assignment]["strand_sensitive"][ + "forward_adapter" + ], + config["assignments"][wc.assignment]["strand_sensitive"][ + "reverse_adapter" + ], + ) + if config["assignments"][wc.assignment]["strand_sensitive"]["enable"] + else "" ), log: log=temp("results/logs/assignment/check_design.{assignment}.log"), @@ -62,10 +76,12 @@ rule assignment_check_design: shell: """ trap "cat {log.err}" ERR - cp {input.design} {output.ref} - python {input.script} --input {output.ref} \ + cp {input.design} {output.ref_tmp} + python {input.script} --input {output.ref_tmp} \ + --output {output.ref} \ --start {params.start} --length {params.length} \ - {params.fast_check} {params.check_sequence_collitions} > {log.log} 2> {log.err}; + {params.fast_check} --sequence-check {params.sequence_collitions} \ + {params.attach_sequence} > {log.log} 2> {log.err}; """ @@ -129,11 +145,27 @@ rule assignment_attach_idx: if config["assignments"][wc.assignment]["BC_rev_comp"] else "" ), + attach_sequence=lambda wc: ( + "--attach-sequence left %s" + % ( + config["assignments"][wc.assignment]["strand_sensitive"][ + "forward_adapter" + ] + if wc.read == "FW" + else reverse_complement( + config["assignments"][wc.assignment]["strand_sensitive"][ + "reverse_adapter" + ] + ) + ) + if config["assignments"][wc.assignment]["strand_sensitive"]["enable"] + else "" + ), log: temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"), shell: """ - python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} | bgzip -c > {output.read} 2> {log} + python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} {params.attach_sequence} | bgzip -c > {output.read} 2> {log} """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3f13d753..53b2360d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -51,6 +51,38 @@ if not config["skip_version_check"]: check_version(pattern_major_version, version, config["version"]) +# modify config based on certain rules + + +def modify_config(config): + # update sequence length with when adapters are added via strand sensitive is enabled + if "assignments" in config: + for assignment in config["assignments"].keys(): + if config["assignments"][assignment]["strand_sensitive"]["enable"]: + add_length = len( + config["assignments"][assignment]["strand_sensitive"][ + "forward_adapter" + ] + ) + len( + config["assignments"][assignment]["strand_sensitive"][ + "reverse_adapter" + ] + ) + if config["assignments"][assignment]["alignment_tool"]["tool"] == "bwa": + config["assignments"][assignment]["alignment_tool"]["configs"][ + "sequence_length" + ]["min"] += add_length + config["assignments"][assignment]["alignment_tool"]["configs"][ + "sequence_length" + ]["max"] += add_length + else: + config["assignments"][assignment]["alignment_tool"]["configs"][ + "sequence_length" + ] += add_length + return config + + +config = modify_config(config) ################################ #### HELPERS AND EXCEPTIONS #### ################################ @@ -536,6 +568,11 @@ def withoutZeros(project, conf): # assignment.smk specific functions +def reverse_complement(seq): + complementary = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} + return "".join(reversed([complementary[i] for i in seq])) + + def getSplitNumber(): splits = [1] diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 1e61d3bd..56b153b6 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -62,7 +62,6 @@ properties: type: integer max: type: integer - additionalProperties: false required: - min - max @@ -73,11 +72,9 @@ properties: type: integer max: type: integer - additionalProperties: false required: - min - max - additionalProperties: false required: - sequence_length - alignment_start @@ -103,7 +100,6 @@ properties: alignment_start: type: integer minimum: 1 - additionalProperties: false required: - min_mapping_quality - sequence_length @@ -125,7 +121,6 @@ properties: alignment_start: type: integer minimum: 1 - additionalProperties: false required: - sequence_length - alignment_start @@ -178,7 +173,6 @@ properties: - frac_mismatches_allowed - min_dovetailed_overlap default: {} - additionalProperties: false design_file: type: string design_check: @@ -194,7 +188,21 @@ properties: required: - fast - sequence_collitions - additionalProperties: false + strand_sensitive: + type: object + default: {} + properties: + enable: + type: boolean + default: false + forward_adapter: + type: string + pattern: ^[ATCGN]+$ + default: AGGACCGGATCAACT + reverse_adapter: + type: string + pattern: ^[ATCGN]+$ + default: TCGGTTCACGCAATG configs: type: object patternProperties: @@ -213,8 +221,6 @@ properties: required: - min_support - fraction - additionalProperties: false - additionalProperties: false minProperties: 1 oneOf: - required: @@ -231,8 +237,6 @@ properties: - configs - alignment_tool - NGmerge - additionalProperties: false - additionalProperties: false minProperties: 1 # start_experiments experiments: @@ -290,7 +294,6 @@ properties: minimum: 1 required: - type - additionalProperties: false allOf: - if: properties: @@ -311,7 +314,6 @@ properties: then: required: - assignment_file - additionalProperties: false configs: type: object patternProperties: @@ -339,7 +341,6 @@ properties: default: 3 required: - times_zscore - additionalProperties: false default: {} min_dna_counts: type: integer @@ -349,7 +350,6 @@ properties: type: integer miminum: 0 default: 1 - additionalProperties: false required: - bc_threshold - min_rna_counts @@ -372,12 +372,8 @@ properties: minimum: 1 seed: type: integer - additionalProperties: false - additionalProperties: false - additionalProperties: false required: - filter - additionalProperties: false variants: type: object properties: @@ -399,7 +395,6 @@ properties: - demultiplex - assignments - configs - additionalProperties: false # end_experiments additionalProperties: false required: diff --git a/workflow/scripts/assignment/check_design_file.py b/workflow/scripts/assignment/check_design_file.py index 7227d845..9f8c6d8d 100644 --- a/workflow/scripts/assignment/check_design_file.py +++ b/workflow/scripts/assignment/check_design_file.py @@ -42,20 +42,44 @@ help="Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. But in theory a substring can only be present and for more correct, but slower, search use the --slow-string-search.", ) @click.option( - "--perform-sequence-check/--skip-sequence-check", - "sequence_search", - default=True, - help="When set to False, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.", + '--sequence-check', + 'sequence_check', + type=click.Choice(['skip', 'sense_only', 'sense_antisense']), + default='sense_antisense', + help='Choose the type of sequence check. When set to skip, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.' +) +@click.option( + "--attach-sequence", + "attach_sequence", + required=False, + type=(click.STRING, click.STRING), + help="Attach a sequence left and right to each entry of the fasta file." +) +@click.option( + "--output", + "output", + required=True, + type=click.Path(writable=True), + help="Output reference fasqt file with attached sequences (if set). Otherwise just a copy.", ) -def cli(input_file, start, length, fast_search, sequence_search): +def cli(input_file, start, length, fast_search, sequence_check, attach_sequence, output): seq_dict = dict() forward_collitions = [] antisense_collitions = [] + # attach sequence + with open(output, 'w') as fasta_file: + for name, seq in pyfastx.Fasta(input_file, build_index=False): + new_sequence = seq + if attach_sequence: + new_sequence = attach_sequence[0] + new_sequence + attach_sequence[1] + fasta_file.write(f">{name}\n{new_sequence}\n") + + # read fasta file - fa = pyfastx.Fasta(input_file) + fa = pyfastx.Fasta(output) # check duplicated headers click.echo("Searching for duplicated headers...") @@ -76,7 +100,12 @@ def cli(input_file, start, length, fast_search, sequence_search): ) exit(1) - if sequence_search: + + # read fasta file + if attach_sequence: + fa = pyfastx.Fasta(output) + + if sequence_check != 'skip': # build seq dict click.echo("Building sequence dictionary...") for i in range(len(fa)): @@ -100,12 +129,12 @@ def cli(input_file, start, length, fast_search, sequence_search): antisense_collition = set() if fast_search: forward_collition.update(seq_dict.get(sub_seq_forward, set())) - antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) + antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) if sequence_check == 'sense_antisense' else None else: for seq, names in seq_dict.items(): if sub_seq_forward in seq: forward_collition.update(names) - if sub_seq_antisense in seq: + if sequence_check == 'sense_antisense' and sub_seq_antisense in seq: antisense_collition.update(names) if len(forward_collition) > 1: @@ -135,16 +164,16 @@ def cli(input_file, start, length, fast_search, sequence_search): "\t".join(forward_collitions[i]), err=True, ) - - click.echo( - "-----------------ANTISENSE COLLISIONS-----------------", - err=True, - ) - for i in range(len(antisense_collitions)): + if sequence_check == 'sense_antisense': click.echo( - "\t".join(antisense_collitions[i]), + "-----------------ANTISENSE COLLISIONS-----------------", err=True, ) + for i in range(len(antisense_collitions)): + click.echo( + "\t".join(antisense_collitions[i]), + err=True, + ) exit(1) else: diff --git a/workflow/scripts/attachBCToFastQ.py b/workflow/scripts/attachBCToFastQ.py index 205cd64e..04844da2 100644 --- a/workflow/scripts/attachBCToFastQ.py +++ b/workflow/scripts/attachBCToFastQ.py @@ -3,9 +3,17 @@ import gzip -def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False): +def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False, add_sequence_left=None, add_sequence_right=None): for read, bc in zip(read_fastq(read_file), read_fastq(bc_file)): seqid_read, seq_read, qual_read = read + + if add_sequence_left: + seq_read = add_sequence_left + seq_read + qual_read = 'I' * len(add_sequence_left) + qual_read + if add_sequence_right: + seq_read = seq_read + add_sequence_right + qual_read = qual_read + 'I' * len(add_sequence_right) + seqid_read = seqid_read.split(" ")[0] seqid_bc, seq_bc, qual_bc = bc seqid_bc = seqid_bc.split(" ")[0] @@ -31,10 +39,26 @@ def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False): @click.option('--reverse-complement', "use_reverse_complement", is_flag=True) -def cli(read_file, barcode_file, use_reverse_complement): +@click.option('--attach-sequence', + "attach_sequence", + required=False, + type=(click.Choice(['left', 'right', 'both']), click.STRING) + ) +def cli(read_file, barcode_file, use_reverse_complement, attach_sequence): with gzip.open(read_file, 'rt') as r_file, gzip.open(barcode_file, 'rt') as bc_file: - for seqid, seq, qual in read_sequence_files(r_file, bc_file, use_reverse_complement): + inputs = { + "read_file":r_file, + "bc_file":bc_file, + "use_BC_reverse_complement": use_reverse_complement + } + if attach_sequence: + if attach_sequence[0] == 'left' or attach_sequence[0] == 'both': + inputs['add_sequence_left'] = attach_sequence[1] + if attach_sequence[0] == 'right' or attach_sequence[0] == 'both': + inputs['add_sequence_right'] = reverse_complement(attach_sequence[1]) + + for seqid, seq, qual in read_sequence_files(**inputs): click.echo("@%s\n%s\n+\n%s" % (seqid, seq, qual)) def reverse_complement(seq):