-
Notifications
You must be signed in to change notification settings - Fork 0
/
FindOffTargets.py
188 lines (163 loc) · 8.6 KB
/
FindOffTargets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""Module for finding off-targets"""
import os
import sys
import time
import pandas as pd
from typing import Dict, List
from ActivationCandidate import ActivationCandidate, OffTarget
from OffTargetsAnnotations import add_genomic_regions_to_off_targets, run_bedtools
from ScoringFunctions import moff
def create_crispritz_input_file(list_of_candidates: List, crispritz_path: str) -> str:
"""
:param list_of_candidates: A list of CandidateWithOffTargets objects
:param crispritz_path: A path to the crispys result folder where a folder for crispritz will be created
:return: A path to the input for xxx and will write the input file to xxx
"""
crispritz_infile = os.path.join(crispritz_path, 'crispritz_infile.txt')
out = ""
for candidate in list_of_candidates: # go over each candidate and get the guide sequence
out += f"{candidate.seq}NNN\n"
with open(crispritz_infile, 'w') as f:
f.write(out)
return crispritz_infile
def create_pam_file(pam_file_path: str) -> str:
"""
create the pam file for crispritz (for NGG pam)
:param pam_file_path: path to folder location
:return: pam file path
"""
pam_file = f"{pam_file_path}/pamNGG.txt"
if not os.path.exists(pam_file_path):
os.makedirs(pam_file_path)
with open(pam_file, "w") as f:
f.write("NNNNNNNNNNNNNNNNNNNNNGG 3")
return pam_file
else:
return pam_file
def run_crispritz(list_of_candidates: List, crispritz_script_path: str, output_path: str,
genome_by_chr_path: str,
pam_file_path: str, max_number_of_mismatches: int = 4, threads: int = 1) -> pd.DataFrame:
"""
This function runs crispritz and returns its output
nucleotide codes (e.g. 'NRG' matches both NGG and NAG).
:param crispritz_script_path: path to crispritz python script path.
:param list_of_candidates: A list of CandidateWithOffTargets objects
:param output_path: A path containing the output of CRISPys
:param genome_by_chr_path: path to the folder where the files of each chromosome fasta file.
:param pam_file_path: a path to folder where pam file (created if not exist)
:param max_number_of_mismatches: The maximum number of mismatches allowed between a sgRNA
and a potential off-target.
:param threads: number of threads to use
:return: The output of crispritz as pd.DataFrame, where each row is a potential offtarget.
"""
crispritz_path = f"{output_path}/crispritz"
os.makedirs(crispritz_path, exist_ok=True)
# create crispritz input file
crispritz_infile = create_crispritz_input_file(list_of_candidates, crispritz_path)
pam_file = create_pam_file(pam_file_path)
# run crispritz
os.system(
f"python {crispritz_script_path} search {genome_by_chr_path}/ {pam_file} {crispritz_infile} {crispritz_path}/crispritz -mm {max_number_of_mismatches} -r -th {threads}")
# get results
crispritz_results = pd.read_csv(f"{crispritz_path}/crispritz.targets.txt", sep="\t")
print(f"Number of off-targets: {crispritz_results.shape[0]}")
return crispritz_results
# create dictionary of sequence:candidate
def create_sequence_to_candidate_dict(list_of_candidates):
"""
This function takes a list of sgRNA candidates, and returns a sequence to candidate dictionary.
:param list_of_candidates: a list of sgRNA candidates
:return: a dictionary: sequence -> an ActivationCandidate object where candidate.seq = sequence.
"""
assert all([isinstance(element, ActivationCandidate) for element in list_of_candidates])
sequence_to_candidate_dict = {}
for i, candidate in enumerate(list_of_candidates):
sequence_to_candidate_dict[candidate.seq] = list_of_candidates[i]
return sequence_to_candidate_dict
# add to each "Candidate" its off-targets
def add_crispritz_off_targets(crispritz_results, sequence_to_candidate_dict: Dict[str, ActivationCandidate]):
"""
This function adds all found off-targets to each CandidateWithOffTargets using the crispritz results.
:param crispritz_results: The output of crispritz as a pd datatable, where each row is a potential offtarget.
:param sequence_to_candidate_dict: sequence -> a CandidateWithOffTargets object with the proper sequence
"""
# apply the 'get_off_target' function on each row in the crispritz table results
crispritz_results.apply(get_off_target, args=(sequence_to_candidate_dict,), axis=1)
return
# This function will be used in an apply command' it reads crispritz results and create an offtarget object out of each one
def get_off_target(x, sequence_to_candidate_dict: Dict[str, ActivationCandidate]):
"""
A function to use with apply on crispritz result table
it takes a row of crispritz results and a dictionary of sequence:candidate, and make an OffTarget
object from the crispritz results and add it to the candidate offtargets list.
:param x: a row in crispritz results table
:param sequence_to_candidate_dict: a dictionary of sequence:candidate
"""
candidate = sequence_to_candidate_dict[x['crRNA'][:20]]
off_target = OffTarget(x['DNA'].upper(), x['Chromosome'].split(" ")[0], int(x['Position']), x['Direction'], int(x['Mismatches'])) # TODO fix chromosome names in files
legit_letters = True
for char in off_target.seq:
if char not in {"A", "C", "T", "G"}:
legit_letters = False
break
if legit_letters:
candidate.off_targets_list.append(off_target)
return
def calculate_scores(candidates_list: List[ActivationCandidate]):
"""
Calculate the off-target scores for each off-target of each candidate and store the scores in each off-target's
score attribute.
:param candidates_list: a list of sgRNA candidates
"""
batch_off_targets_list = []
batch_candidates_list = []
for candidate in candidates_list:
batch_candidates_list += [candidate.seq + candidate.pam for _ in range(len(candidate.off_targets_list))]
batch_off_targets_list += [off_target.seq for off_target in candidate.off_targets_list]
t0 = time.perf_counter()
scores = moff(batch_candidates_list, batch_off_targets_list)
t1 = time.perf_counter()
print(f"scoring function for {len(batch_candidates_list)} off-targets ran in {t1 - t0} seconds")
i = 0
for candidate in candidates_list:
if len(candidate.off_targets_list) > 0:
for off in candidate.off_targets_list:
off.score = round(scores[i], 4)
i += 1
# sort the off-targets in the off_targets_list by score from highest to lowest
candidate.sort_off_targets()
def get_off_targets(candidates_list: List[ActivationCandidate], in_path: str, out_path: str, gff_with_proms_path: str):
"""
Find the off-targets for each candidate using crispritz, store them in the candidate's off_targets_list attribute
as a list of OffTarget objects. Then calculates the off-target scores for each off-target of each candidate and
store the scores in each off-target's score attribute.
:param candidates_list: a list of sgRNA candidates
:param in_path: path to directory in which the chromosome FASTA files are stored
:param out_path: output path for the algorithm results
:param gff_with_proms_path:
"""
# run crispritz
off_targets_pd = run_crispritz(candidates_list, sys.exec_prefix + "/bin/crispritz.py", out_path,
in_path + "/chromosomes", out_path + "/pams") # TODO fix the paths
# create a dictionary of sequence -> candidate
sequence_to_candidate_dict = create_sequence_to_candidate_dict(candidates_list)
# add the found off-targets of each candidate to the candidate's off_targets_list
add_crispritz_off_targets(off_targets_pd, sequence_to_candidate_dict)
#
intersect_bed = run_bedtools(candidates_list, out_path+gff_with_proms_path)
# add the genomic regions to the OffTargets attributes
add_genomic_regions_to_off_targets(intersect_bed, sequence_to_candidate_dict)
# remove on-targets
for candidate in candidates_list:
new_off_targets_lst = []
for off in candidate.off_targets_list:
if len(off.genomic_region) == 0:
off.genomic_region = ('chromosome', off.chromosome)
new_off_targets_lst.append(off)
elif off.genomic_region[0] == 'promoter' and off.genomic_region[1] == candidate.gene:
continue
else:
new_off_targets_lst.append(off)
candidate.off_targets_list = new_off_targets_lst
# calculate the off-target scores for each off_target of each candidate
calculate_scores(candidates_list)