diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk index fe3aaf2..8785fb0 100644 --- a/workflow/rules/assignment.smk +++ b/workflow/rules/assignment.smk @@ -25,6 +25,7 @@ rule assignment_check_design: temp("results/assignment/{assignment}/reference/reference.fa.fxi"), touch("results/assignment/{assignment}/design_check.done"), ref="results/assignment/{assignment}/reference/reference.fa", + ref_tmp=temp("results/assignment/{assignment}/reference/reference.tmp.fa"), params: start=lambda wc: ( config["assignments"][wc.assignment]["alignment_tool"]["configs"][ @@ -49,12 +50,25 @@ rule assignment_check_design: if config["assignments"][wc.assignment]["design_check"]["fast"] else "--slow-string-search" ), - check_sequence_collitions=lambda wc: ( - "--perform-sequence-check" + sequence_collitions=lambda wc: ( + "sense_antisense" if config["assignments"][wc.assignment]["design_check"][ "sequence_collitions" ] - else "--skip-sequence-check" + else "skip" + ), + attach_sequence=lambda wc: ( + "--attach-sequence %s %s" + % ( + config["assignments"][wc.assignment]["strand_sensitive"][ + "forward_adapter" + ], + config["assignments"][wc.assignment]["strand_sensitive"][ + "reverse_adapter" + ], + ) + if config["assignments"][wc.assignment]["strand_sensitive"]["enable"] + else "" ), log: log=temp("results/logs/assignment/check_design.{assignment}.log"), @@ -62,10 +76,12 @@ rule assignment_check_design: shell: """ trap "cat {log.err}" ERR - cp {input.design} {output.ref} - python {input.script} --input {output.ref} \ + cp {input.design} {output.ref_tmp} + python {input.script} --input {output.ref_tmp} \ + --output {output.ref} \ --start {params.start} --length {params.length} \ - {params.fast_check} {params.check_sequence_collitions} > {log.log} 2> {log.err}; + {params.fast_check} --sequence-check {params.sequence_collitions} \ + {params.attach_sequence} > {log.log} 2> {log.err}; """ @@ -129,11 +145,27 @@ rule assignment_attach_idx: if config["assignments"][wc.assignment]["BC_rev_comp"] else "" ), + attach_sequence=lambda wc: ( + "--attach-sequence left %s" + % ( + config["assignments"][wc.assignment]["strand_sensitive"][ + "forward_adapter" + ] + if wc.read == "FW" + else reverse_complement( + config["assignments"][wc.assignment]["strand_sensitive"][ + "reverse_adapter" + ] + ) + ) + if config["assignments"][wc.assignment]["strand_sensitive"]["enable"] + else "" + ), log: temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"), shell: """ - python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} | bgzip -c > {output.read} 2> {log} + python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} {params.attach_sequence} | bgzip -c > {output.read} 2> {log} """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3f13d75..53b2360 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -51,6 +51,38 @@ if not config["skip_version_check"]: check_version(pattern_major_version, version, config["version"]) +# modify config based on certain rules + + +def modify_config(config): + # update sequence length with when adapters are added via strand sensitive is enabled + if "assignments" in config: + for assignment in config["assignments"].keys(): + if config["assignments"][assignment]["strand_sensitive"]["enable"]: + add_length = len( + config["assignments"][assignment]["strand_sensitive"][ + "forward_adapter" + ] + ) + len( + config["assignments"][assignment]["strand_sensitive"][ + "reverse_adapter" + ] + ) + if config["assignments"][assignment]["alignment_tool"]["tool"] == "bwa": + config["assignments"][assignment]["alignment_tool"]["configs"][ + "sequence_length" + ]["min"] += add_length + config["assignments"][assignment]["alignment_tool"]["configs"][ + "sequence_length" + ]["max"] += add_length + else: + config["assignments"][assignment]["alignment_tool"]["configs"][ + "sequence_length" + ] += add_length + return config + + +config = modify_config(config) ################################ #### HELPERS AND EXCEPTIONS #### ################################ @@ -536,6 +568,11 @@ def withoutZeros(project, conf): # assignment.smk specific functions +def reverse_complement(seq): + complementary = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} + return "".join(reversed([complementary[i] for i in seq])) + + def getSplitNumber(): splits = [1] diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 1e61d3b..56b153b 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -62,7 +62,6 @@ properties: type: integer max: type: integer - additionalProperties: false required: - min - max @@ -73,11 +72,9 @@ properties: type: integer max: type: integer - additionalProperties: false required: - min - max - additionalProperties: false required: - sequence_length - alignment_start @@ -103,7 +100,6 @@ properties: alignment_start: type: integer minimum: 1 - additionalProperties: false required: - min_mapping_quality - sequence_length @@ -125,7 +121,6 @@ properties: alignment_start: type: integer minimum: 1 - additionalProperties: false required: - sequence_length - alignment_start @@ -178,7 +173,6 @@ properties: - frac_mismatches_allowed - min_dovetailed_overlap default: {} - additionalProperties: false design_file: type: string design_check: @@ -194,7 +188,21 @@ properties: required: - fast - sequence_collitions - additionalProperties: false + strand_sensitive: + type: object + default: {} + properties: + enable: + type: boolean + default: false + forward_adapter: + type: string + pattern: ^[ATCGN]+$ + default: AGGACCGGATCAACT + reverse_adapter: + type: string + pattern: ^[ATCGN]+$ + default: TCGGTTCACGCAATG configs: type: object patternProperties: @@ -213,8 +221,6 @@ properties: required: - min_support - fraction - additionalProperties: false - additionalProperties: false minProperties: 1 oneOf: - required: @@ -231,8 +237,6 @@ properties: - configs - alignment_tool - NGmerge - additionalProperties: false - additionalProperties: false minProperties: 1 # start_experiments experiments: @@ -290,7 +294,6 @@ properties: minimum: 1 required: - type - additionalProperties: false allOf: - if: properties: @@ -311,7 +314,6 @@ properties: then: required: - assignment_file - additionalProperties: false configs: type: object patternProperties: @@ -339,7 +341,6 @@ properties: default: 3 required: - times_zscore - additionalProperties: false default: {} min_dna_counts: type: integer @@ -349,7 +350,6 @@ properties: type: integer miminum: 0 default: 1 - additionalProperties: false required: - bc_threshold - min_rna_counts @@ -372,12 +372,8 @@ properties: minimum: 1 seed: type: integer - additionalProperties: false - additionalProperties: false - additionalProperties: false required: - filter - additionalProperties: false variants: type: object properties: @@ -399,7 +395,6 @@ properties: - demultiplex - assignments - configs - additionalProperties: false # end_experiments additionalProperties: false required: diff --git a/workflow/scripts/assignment/check_design_file.py b/workflow/scripts/assignment/check_design_file.py index 7227d84..9f8c6d8 100644 --- a/workflow/scripts/assignment/check_design_file.py +++ b/workflow/scripts/assignment/check_design_file.py @@ -42,20 +42,44 @@ help="Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. But in theory a substring can only be present and for more correct, but slower, search use the --slow-string-search.", ) @click.option( - "--perform-sequence-check/--skip-sequence-check", - "sequence_search", - default=True, - help="When set to False, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.", + '--sequence-check', + 'sequence_check', + type=click.Choice(['skip', 'sense_only', 'sense_antisense']), + default='sense_antisense', + help='Choose the type of sequence check. When set to skip, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.' +) +@click.option( + "--attach-sequence", + "attach_sequence", + required=False, + type=(click.STRING, click.STRING), + help="Attach a sequence left and right to each entry of the fasta file." +) +@click.option( + "--output", + "output", + required=True, + type=click.Path(writable=True), + help="Output reference fasqt file with attached sequences (if set). Otherwise just a copy.", ) -def cli(input_file, start, length, fast_search, sequence_search): +def cli(input_file, start, length, fast_search, sequence_check, attach_sequence, output): seq_dict = dict() forward_collitions = [] antisense_collitions = [] + # attach sequence + with open(output, 'w') as fasta_file: + for name, seq in pyfastx.Fasta(input_file, build_index=False): + new_sequence = seq + if attach_sequence: + new_sequence = attach_sequence[0] + new_sequence + attach_sequence[1] + fasta_file.write(f">{name}\n{new_sequence}\n") + + # read fasta file - fa = pyfastx.Fasta(input_file) + fa = pyfastx.Fasta(output) # check duplicated headers click.echo("Searching for duplicated headers...") @@ -76,7 +100,12 @@ def cli(input_file, start, length, fast_search, sequence_search): ) exit(1) - if sequence_search: + + # read fasta file + if attach_sequence: + fa = pyfastx.Fasta(output) + + if sequence_check != 'skip': # build seq dict click.echo("Building sequence dictionary...") for i in range(len(fa)): @@ -100,12 +129,12 @@ def cli(input_file, start, length, fast_search, sequence_search): antisense_collition = set() if fast_search: forward_collition.update(seq_dict.get(sub_seq_forward, set())) - antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) + antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) if sequence_check == 'sense_antisense' else None else: for seq, names in seq_dict.items(): if sub_seq_forward in seq: forward_collition.update(names) - if sub_seq_antisense in seq: + if sequence_check == 'sense_antisense' and sub_seq_antisense in seq: antisense_collition.update(names) if len(forward_collition) > 1: @@ -135,16 +164,16 @@ def cli(input_file, start, length, fast_search, sequence_search): "\t".join(forward_collitions[i]), err=True, ) - - click.echo( - "-----------------ANTISENSE COLLISIONS-----------------", - err=True, - ) - for i in range(len(antisense_collitions)): + if sequence_check == 'sense_antisense': click.echo( - "\t".join(antisense_collitions[i]), + "-----------------ANTISENSE COLLISIONS-----------------", err=True, ) + for i in range(len(antisense_collitions)): + click.echo( + "\t".join(antisense_collitions[i]), + err=True, + ) exit(1) else: diff --git a/workflow/scripts/attachBCToFastQ.py b/workflow/scripts/attachBCToFastQ.py index 205cd64..04844da 100644 --- a/workflow/scripts/attachBCToFastQ.py +++ b/workflow/scripts/attachBCToFastQ.py @@ -3,9 +3,17 @@ import gzip -def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False): +def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False, add_sequence_left=None, add_sequence_right=None): for read, bc in zip(read_fastq(read_file), read_fastq(bc_file)): seqid_read, seq_read, qual_read = read + + if add_sequence_left: + seq_read = add_sequence_left + seq_read + qual_read = 'I' * len(add_sequence_left) + qual_read + if add_sequence_right: + seq_read = seq_read + add_sequence_right + qual_read = qual_read + 'I' * len(add_sequence_right) + seqid_read = seqid_read.split(" ")[0] seqid_bc, seq_bc, qual_bc = bc seqid_bc = seqid_bc.split(" ")[0] @@ -31,10 +39,26 @@ def read_sequence_files(read_file, bc_file, use_BC_reverse_complement=False): @click.option('--reverse-complement', "use_reverse_complement", is_flag=True) -def cli(read_file, barcode_file, use_reverse_complement): +@click.option('--attach-sequence', + "attach_sequence", + required=False, + type=(click.Choice(['left', 'right', 'both']), click.STRING) + ) +def cli(read_file, barcode_file, use_reverse_complement, attach_sequence): with gzip.open(read_file, 'rt') as r_file, gzip.open(barcode_file, 'rt') as bc_file: - for seqid, seq, qual in read_sequence_files(r_file, bc_file, use_reverse_complement): + inputs = { + "read_file":r_file, + "bc_file":bc_file, + "use_BC_reverse_complement": use_reverse_complement + } + if attach_sequence: + if attach_sequence[0] == 'left' or attach_sequence[0] == 'both': + inputs['add_sequence_left'] = attach_sequence[1] + if attach_sequence[0] == 'right' or attach_sequence[0] == 'both': + inputs['add_sequence_right'] = reverse_complement(attach_sequence[1]) + + for seqid, seq, qual in read_sequence_files(**inputs): click.echo("@%s\n%s\n+\n%s" % (seqid, seq, qual)) def reverse_complement(seq):