-
Notifications
You must be signed in to change notification settings - Fork 51
/
util.py
251 lines (209 loc) · 8.31 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env python
import os,sys
from ctypes import *
EDIT_DISTANCE_MODULE_EXISTS = True
EDIT_DISTANCE_CTYPES_LOADED = False
# try load editdistance built-in module first
# if editdistance built-in module not exist, try to load editdistance/libed.so, which is made by make in AfterQC folder
try:
import editdistance
except ImportError:
EDIT_DISTANCE_MODULE_EXISTS = False
ed_lib_file = os.path.join(sys.path[0], "editdistance/libed.so")
if os.path.exists(ed_lib_file):
try:
ed_ctypes = cdll.LoadLibrary(ed_lib_file)
except Exception:
EDIT_DISTANCE_CTYPES_LOADED = False
else:
EDIT_DISTANCE_CTYPES_LOADED = True
else:
EDIT_DISTANCE_MODULE_EXISTS = True
COMP = {"A" : "T", "T" : "A", "C" : "G", "G" : "C", "a" : "t", "t" : "a", "c" : "g", "g" : "c", "N":"N", "\n":"\n"}
def parseBool(str):
str = str.lower()
if str=="true" or str=="yes" or str=="on":
return True
else:
return False
def complement(base):
return COMP[base]
def qualNum(q):
return ord(q) - 33
def reverseComplement(origin):
length = len(origin)
revCompArr = ['' for x in xrange(length)]
for i in xrange(length):
orig = origin[length - i -1]
if orig in COMP:
revCompArr[i] = COMP[orig]
else:
revCompArr[i] = 'N'
return ''.join(revCompArr)
def reverse(origin):
return origin[::-1]
def hammingDistance(s1, s2):
length = min(len(s1), len(s2))
d = 0
for i in xrange(length):
if s1[i] != s2[i]:
d += 1
return d
#simple edit distance
def editDistance(s1, s2):
# check if editdistance module loaded
if EDIT_DISTANCE_MODULE_EXISTS:
return editdistance.eval(s1, s2)
elif EDIT_DISTANCE_CTYPES_LOADED:
return ed_ctypes.edit_distance(s1, len(s1), s2, len(s2))
m=len(s1)+1
n=len(s2)+1
tbl = [([0] * n) for i in xrange(m)]
for i in xrange(m):tbl[i][0]=i
for j in xrange(n):tbl[0][j]=j
for i in xrange(1, m):
for j in xrange(1, n):
cost = 0 if s1[i-1] == s2[j-1] else 1
tbl[i][j] = min(tbl[i][j-1]+1, tbl[i-1][j]+1, tbl[i-1][j-1]+cost)
return tbl[i][j]
def distance_threshold(overlap_len):
return min(3, overlap_len/10.0)
def overlap(r1, r2):
return overlap_hm(r1, r2)
def overlap_ed(r1, r2):
len1 = len(r1)
len2 = len(r2)
reverse_r2 = reverseComplement(r2)
overlapped = False
overlap_len = 0
offset = 0
distance = 0
offset_0_is_min = True
# a match of less than 10 is considered as unconfident
while offset < len1-10 and overlapped==False:
# the overlap length of r1 & r2 when r2 is move right for offset
overlap_len = min(len1-offset, len2)
# remind that Julia is a 1-based coordination system
distance = editDistance(r1[offset : offset+overlap_len], reverse_r2[0 : overlap_len])
threshold = distance_threshold(overlap_len)
if distance <= threshold:
# now we find a good candidate
# we verify it by moving r2 one more base to see if the distance is getting longer
# if yes, then current is the best match, otherwise it's not
while offset < len1-10:
next_offset = offset + 1
next_overlap_len = min(len1-next_offset, len2)
next_distance = editDistance(r1[next_offset : next_offset+next_overlap_len], reverse_r2[0 : next_overlap_len])
if distance <= next_distance:
overlapped = True
break
else:
offset_0_is_min = False
offset = next_offset
distance = next_distance
overlap_len = next_overlap_len
break
else:
offset += max(1, (distance - int(threshold))/2 )
if offset_0_is_min:
# in this case, the adapter is sequenced since TEMPLATE_LEN < SEQ_LEN
# check if distance can get smaller if offset goes negative
# this only happens when insert DNA is shorter than sequencing read length, and some adapter/primer is sequenced but not trimmed cleanly
# we go reversely
offset = 0
while offset > -(len2-10):
# the overlap length of r1 & r2 when r2 is move right for offset
overlap_len = min(len1, len2- abs(offset))
distance = editDistance(r1[0:overlap_len], reverse_r2[-offset : -offset + overlap_len])
threshold = distance_threshold(overlap_len)
if distance <= threshold:
while offset > -(len2-10):
next_offset = offset - 1
next_overlap_len = min(len1, len2- abs(next_offset))
next_distance = editDistance(r1[0:next_overlap_len], reverse_r2[-next_offset : -next_offset + next_overlap_len])
if distance <= next_distance:
return (offset, overlap_len, distance)
else:
distance = next_distance
overlap_len = next_overlap_len
offset = next_offset
else:
offset -= max(1, (distance - int(threshold))/2 )
elif overlapped:
return (offset, overlap_len, distance)
return (0,0,0)
# calculate overlap by hamming distance
def overlap_hm(r1, r2):
len1 = len(r1)
len2 = len(r2)
reverse_r2 = reverseComplement(r2)
limit_distance = 3
overlap_require = 30
complete_compare_require = 50
overlap_len = 0
offset = 0
# forward
# a match of less than 10 is considered as unconfident
while offset < len1-overlap_require:
# the overlap length of r1 & r2 when r2 is move right for offset
overlap_len = min(len1-offset, len2)
diff = 0
for i in xrange(overlap_len):
if r1[offset + i] != reverse_r2[i]:
diff += 1
if diff >= limit_distance and i < complete_compare_require:
break
if diff < limit_distance or (diff >= limit_distance and i>complete_compare_require):
return (offset, overlap_len, diff)
offset += 1
# reverse
# in this case, the adapter is sequenced since TEMPLATE_LEN < SEQ_LEN
# check if distance can get smaller if offset goes negative
# this only happens when insert DNA is shorter than sequencing read length, and some adapter/primer is sequenced but not trimmed cleanly
# we go reversely
offset = 0
while offset > -(len2-overlap_require):
# the overlap length of r1 & r2 when r2 is move right for offset
overlap_len = min(len1, len2- abs(offset))
diff = 0
for i in xrange(overlap_len):
if r1[i] != reverse_r2[-offset + i]:
diff += 1
if diff >= limit_distance and i < complete_compare_require:
break
if diff < limit_distance or (diff >= limit_distance and i>complete_compare_require):
return (offset, overlap_len, diff)
offset -= 1
# not matched
return (0,0,0)
def overlap_hm_cpp(r1, r2):
len1 = len(r1)
len2 = len(r2)
reverse_r2 = reverseComplement(r2)
limit_distance = 3
overlap_require = 30
complete_compare_require = 50
ret = ed_ctypes.seek_overlap(r1, len1, reverse_r2, len2, limit_distance, overlap_require, complete_compare_require)
offset = ret >> 8
diff = ret & (0xFF)
if ret == 0x7FFFFFFF:
return (0,0,0)
elif offset >=0:
overlap_len = min(len1-offset, len2)
else:
overlap_len = min(len1, len2- abs(offset))
return (offset, overlap_len, diff)
def changeString(str, pos, val):
lst = list(str)
lst[pos] = val
return ''.join(lst)
if __name__ == "__main__":
r1 = "CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGCCGCTGGAGGTCTCCC"
r2 = "ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGCCCGTAGGCGCGGCTCCC"
r1 = "CAGCGCCTACGGGCCCCTTTTTCTGCGCGACCGCGTGGCTGTGGGCGCGGATGCCTTTGAGCGCGGTGACTTCTCACTGCGTATCGAGC"
r2 = "ACCTCCAGCGGCTCGATACGCAGTGAGAAGTCACCGCGCTCAAAGGCATCCGCGCCCACAGCCACGCGGTCGCGCAGAAAAAGGGGTCC"
print(overlap_ed(r1, r2))
print(overlap_hm(r1, r2))
print(overlap_hm_cpp(r1, r2))
for i in xrange(10000):
overlap_ed(r1, r2)