Skip to content

Commit

Permalink
feat: strand sensitive option (kircherlab#146)
Browse files Browse the repository at this point in the history
* allowing additional properties for config back compatibility

* work

* implementing strand_senssitive config with interplay in snakemake file

---------

Co-authored-by: Max Schubach <[email protected]>
Co-authored-by: Max Schubach <[email protected]>
  • Loading branch information
3 people authored Dec 9, 2024
1 parent 3c91a6a commit f315aab
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 46 deletions.
46 changes: 39 additions & 7 deletions workflow/rules/assignment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ rule assignment_check_design:
temp("results/assignment/{assignment}/reference/reference.fa.fxi"),
touch("results/assignment/{assignment}/design_check.done"),
ref="results/assignment/{assignment}/reference/reference.fa",
ref_tmp=temp("results/assignment/{assignment}/reference/reference.tmp.fa"),
params:
start=lambda wc: (
config["assignments"][wc.assignment]["alignment_tool"]["configs"][
Expand All @@ -49,23 +50,38 @@ rule assignment_check_design:
if config["assignments"][wc.assignment]["design_check"]["fast"]
else "--slow-string-search"
),
check_sequence_collitions=lambda wc: (
"--perform-sequence-check"
sequence_collitions=lambda wc: (
"sense_antisense"
if config["assignments"][wc.assignment]["design_check"][
"sequence_collitions"
]
else "--skip-sequence-check"
else "skip"
),
attach_sequence=lambda wc: (
"--attach-sequence %s %s"
% (
config["assignments"][wc.assignment]["strand_sensitive"][
"forward_adapter"
],
config["assignments"][wc.assignment]["strand_sensitive"][
"reverse_adapter"
],
)
if config["assignments"][wc.assignment]["strand_sensitive"]["enable"]
else ""
),
log:
log=temp("results/logs/assignment/check_design.{assignment}.log"),
err="results/assignment/{assignment}/design_check.err",
shell:
"""
trap "cat {log.err}" ERR
cp {input.design} {output.ref}
python {input.script} --input {output.ref} \
cp {input.design} {output.ref_tmp}
python {input.script} --input {output.ref_tmp} \
--output {output.ref} \
--start {params.start} --length {params.length} \
{params.fast_check} {params.check_sequence_collitions} > {log.log} 2> {log.err};
{params.fast_check} --sequence-check {params.sequence_collitions} \
{params.attach_sequence} > {log.log} 2> {log.err};
"""


Expand Down Expand Up @@ -129,11 +145,27 @@ rule assignment_attach_idx:
if config["assignments"][wc.assignment]["BC_rev_comp"]
else ""
),
attach_sequence=lambda wc: (
"--attach-sequence left %s"
% (
config["assignments"][wc.assignment]["strand_sensitive"][
"forward_adapter"
]
if wc.read == "FW"
else reverse_complement(
config["assignments"][wc.assignment]["strand_sensitive"][
"reverse_adapter"
]
)
)
if config["assignments"][wc.assignment]["strand_sensitive"]["enable"]
else ""
),
log:
temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"),
shell:
"""
python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} | bgzip -c > {output.read} 2> {log}
python {input.script} -r {input.read} -b {input.BC} {params.BC_rev_comp} {params.attach_sequence} | bgzip -c > {output.read} 2> {log}
"""


Expand Down
37 changes: 37 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,38 @@ if not config["skip_version_check"]:
check_version(pattern_major_version, version, config["version"])


# modify config based on certain rules


def modify_config(config):
# update sequence length with when adapters are added via strand sensitive is enabled
if "assignments" in config:
for assignment in config["assignments"].keys():
if config["assignments"][assignment]["strand_sensitive"]["enable"]:
add_length = len(
config["assignments"][assignment]["strand_sensitive"][
"forward_adapter"
]
) + len(
config["assignments"][assignment]["strand_sensitive"][
"reverse_adapter"
]
)
if config["assignments"][assignment]["alignment_tool"]["tool"] == "bwa":
config["assignments"][assignment]["alignment_tool"]["configs"][
"sequence_length"
]["min"] += add_length
config["assignments"][assignment]["alignment_tool"]["configs"][
"sequence_length"
]["max"] += add_length
else:
config["assignments"][assignment]["alignment_tool"]["configs"][
"sequence_length"
] += add_length
return config


config = modify_config(config)
################################
#### HELPERS AND EXCEPTIONS ####
################################
Expand Down Expand Up @@ -536,6 +568,11 @@ def withoutZeros(project, conf):
# assignment.smk specific functions


def reverse_complement(seq):
complementary = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
return "".join(reversed([complementary[i] for i in seq]))


def getSplitNumber():
splits = [1]

Expand Down
35 changes: 15 additions & 20 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ properties:
type: integer
max:
type: integer
additionalProperties: false
required:
- min
- max
Expand All @@ -73,11 +72,9 @@ properties:
type: integer
max:
type: integer
additionalProperties: false
required:
- min
- max
additionalProperties: false
required:
- sequence_length
- alignment_start
Expand All @@ -103,7 +100,6 @@ properties:
alignment_start:
type: integer
minimum: 1
additionalProperties: false
required:
- min_mapping_quality
- sequence_length
Expand All @@ -125,7 +121,6 @@ properties:
alignment_start:
type: integer
minimum: 1
additionalProperties: false
required:
- sequence_length
- alignment_start
Expand Down Expand Up @@ -178,7 +173,6 @@ properties:
- frac_mismatches_allowed
- min_dovetailed_overlap
default: {}
additionalProperties: false
design_file:
type: string
design_check:
Expand All @@ -194,7 +188,21 @@ properties:
required:
- fast
- sequence_collitions
additionalProperties: false
strand_sensitive:
type: object
default: {}
properties:
enable:
type: boolean
default: false
forward_adapter:
type: string
pattern: ^[ATCGN]+$
default: AGGACCGGATCAACT
reverse_adapter:
type: string
pattern: ^[ATCGN]+$
default: TCGGTTCACGCAATG
configs:
type: object
patternProperties:
Expand All @@ -213,8 +221,6 @@ properties:
required:
- min_support
- fraction
additionalProperties: false
additionalProperties: false
minProperties: 1
oneOf:
- required:
Expand All @@ -231,8 +237,6 @@ properties:
- configs
- alignment_tool
- NGmerge
additionalProperties: false
additionalProperties: false
minProperties: 1
# start_experiments
experiments:
Expand Down Expand Up @@ -290,7 +294,6 @@ properties:
minimum: 1
required:
- type
additionalProperties: false
allOf:
- if:
properties:
Expand All @@ -311,7 +314,6 @@ properties:
then:
required:
- assignment_file
additionalProperties: false
configs:
type: object
patternProperties:
Expand Down Expand Up @@ -339,7 +341,6 @@ properties:
default: 3
required:
- times_zscore
additionalProperties: false
default: {}
min_dna_counts:
type: integer
Expand All @@ -349,7 +350,6 @@ properties:
type: integer
miminum: 0
default: 1
additionalProperties: false
required:
- bc_threshold
- min_rna_counts
Expand All @@ -372,12 +372,8 @@ properties:
minimum: 1
seed:
type: integer
additionalProperties: false
additionalProperties: false
additionalProperties: false
required:
- filter
additionalProperties: false
variants:
type: object
properties:
Expand All @@ -399,7 +395,6 @@ properties:
- demultiplex
- assignments
- configs
additionalProperties: false
# end_experiments
additionalProperties: false
required:
Expand Down
61 changes: 45 additions & 16 deletions workflow/scripts/assignment/check_design_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,44 @@
help="Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. But in theory a substring can only be present and for more correct, but slower, search use the --slow-string-search.",
)
@click.option(
"--perform-sequence-check/--skip-sequence-check",
"sequence_search",
default=True,
help="When set to False, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.",
'--sequence-check',
'sequence_check',
type=click.Choice(['skip', 'sense_only', 'sense_antisense']),
default='sense_antisense',
help='Choose the type of sequence check. When set to skip, the script will not check for sequence collisions. This is useful when you know collisions but still want to preoceed with the design file.'
)
@click.option(
"--attach-sequence",
"attach_sequence",
required=False,
type=(click.STRING, click.STRING),
help="Attach a sequence left and right to each entry of the fasta file."
)
@click.option(
"--output",
"output",
required=True,
type=click.Path(writable=True),
help="Output reference fasqt file with attached sequences (if set). Otherwise just a copy.",
)
def cli(input_file, start, length, fast_search, sequence_search):
def cli(input_file, start, length, fast_search, sequence_check, attach_sequence, output):

seq_dict = dict()

forward_collitions = []
antisense_collitions = []

# attach sequence
with open(output, 'w') as fasta_file:
for name, seq in pyfastx.Fasta(input_file, build_index=False):
new_sequence = seq
if attach_sequence:
new_sequence = attach_sequence[0] + new_sequence + attach_sequence[1]
fasta_file.write(f">{name}\n{new_sequence}\n")


# read fasta file
fa = pyfastx.Fasta(input_file)
fa = pyfastx.Fasta(output)

# check duplicated headers
click.echo("Searching for duplicated headers...")
Expand All @@ -76,7 +100,12 @@ def cli(input_file, start, length, fast_search, sequence_search):
)
exit(1)

if sequence_search:

# read fasta file
if attach_sequence:
fa = pyfastx.Fasta(output)

if sequence_check != 'skip':
# build seq dict
click.echo("Building sequence dictionary...")
for i in range(len(fa)):
Expand All @@ -100,12 +129,12 @@ def cli(input_file, start, length, fast_search, sequence_search):
antisense_collition = set()
if fast_search:
forward_collition.update(seq_dict.get(sub_seq_forward, set()))
antisense_collition.update(seq_dict.get(sub_seq_antisense, set()))
antisense_collition.update(seq_dict.get(sub_seq_antisense, set())) if sequence_check == 'sense_antisense' else None
else:
for seq, names in seq_dict.items():
if sub_seq_forward in seq:
forward_collition.update(names)
if sub_seq_antisense in seq:
if sequence_check == 'sense_antisense' and sub_seq_antisense in seq:
antisense_collition.update(names)

if len(forward_collition) > 1:
Expand Down Expand Up @@ -135,16 +164,16 @@ def cli(input_file, start, length, fast_search, sequence_search):
"\t".join(forward_collitions[i]),
err=True,
)

click.echo(
"-----------------ANTISENSE COLLISIONS-----------------",
err=True,
)
for i in range(len(antisense_collitions)):
if sequence_check == 'sense_antisense':
click.echo(
"\t".join(antisense_collitions[i]),
"-----------------ANTISENSE COLLISIONS-----------------",
err=True,
)
for i in range(len(antisense_collitions)):
click.echo(
"\t".join(antisense_collitions[i]),
err=True,
)
exit(1)

else:
Expand Down
Loading

0 comments on commit f315aab

Please sign in to comment.