-
Notifications
You must be signed in to change notification settings - Fork 0
/
ctx-explorer.py
executable file
·583 lines (509 loc) · 30.3 KB
/
ctx-explorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Łukasz Szafron
@email: [email protected]
"""
appname = "CTX-Explorer"
app_version = "1.0"
import subprocess, re, os, gzip, sys, getopt, statistics
from itertools import groupby, chain
from termcolor import colored
def usage():
print(
"\nWelcome to the",colored(appname,"green")+" app.\n\n"
"The following options are available:\n\n"
"\t-h, --help:\t\tprints this help message.\n"
"\t-t, --threads:\t\tdefines the number of CPU threads that are to be used (default: 1).\n"
"\t-T, --tmpdir:\t\tdefines the directory where temporary files are to be stored (default: current dir).\n"
"\t-q, --qual:\t\tdefines the minimal Phred quality score of mappings to be used (default: 20).\n"
"\t-b, --bamfile:\t\tthe name of a BAM file to be used",colored("(MANDATORY)", "green")+".\n"
"\t-I, --intractx:\t\tindicates if intra-chromosomal translocations should be evaluated (default: 'no').",colored("This option significantly increases memory usage.","green")+"\n"
"\t-1, --chrom1:\t\tthe name of the first chromosome (optional) involved in a translocation (e.g., 'chr1').\n"
"\t-2, --chrom2:\t\tthe name of the second chromosome (optional) involved in a translocation (e.g., 'chr2').\n"
"\t-l, --tlen:\t\tminimal distance (in nucleotides) between the first and the second read forming a read pair (default: 1,000,000).\n"
"\t-i, --insert:\t\tthe maximum insert size used for identification of CTX-supporting reads and hit pairs (default: median insert size * 2).\n"
"\t-n, --nohits:\t\tminimal number of hits for a translocation to be stored in the final report (default: 2).\n"
"\t-N, --nohits_sec:\tminimal number of hits per the second chromosome for a translocation to be stored in the final report (default: 2).\n"
"\t-s, --min_size:\t\tminimal size of a hit group for a translocation to be stored in the final report (default: 2).\n"
"\t-d, --no_filter:\tspecifies whether the filtering by the number of supporting reads should be turned off (default: 'no').\n"
"\t-p, --prefix:\t\tspecifies a prefix of a file in which the final report will be saved (default: 'output').\n"
"\t-g, --gzipped:\t\tindicates whether the final report file should be gzipped (default: 'no').\n"
"\t-v, --version:\t\tprints the version of this program.\n"
)
# The next two lines are for debugging purposes only and should be commented in the final program.
#option_list = ["-b", "/workspace/lukasz/RUNS/BGRYG/MAPPINGS/GRCh38.chr.only/HISAT2/BGRYG-95.sorted.bam", "-t","6","-T","/tmpdir/lukasz","--prefix","outfile3","-1","chr8", "-2", "chr14"]
#opts, args = getopt.getopt(option_list, "ht:T:q:b:I1:2:l:i:n:N:s:dp:g", ["help","threads=","tmpdir=","qual=","bamfile=","intractx","chrom1=","chrom2=","tlen=","insert=","nohits=","nohits_sec=","min_size=","no_filter","prefix=","gzipped"])
try:
opts, args = getopt.getopt(sys.argv[1:], "ht:T:q:b:I1:2:l:i:n:N:s:dp:gv", ["help","threads=","tmpdir=","qual=","bamfile=","intractx","chrom1=","chrom2=","tlen=","insert=","nohits=","nohits_sec=","min_size=","no_filter","prefix=","gzipped", "version"])
if len(opts) == 0:
usage()
sys.exit()
for o, a in opts:
if o in ("-h", "--help"):
usage()
sys.exit()
elif o in ("-t", "--threads"):
threads = a
elif o in ("-T", "--tmpdir"):
tmpdir = a
elif o in ("-q", "--qual"):
qual = a
elif o in ("-b", "--bamfile"):
bamfile = a
elif o in ("-I", "--intractx"):
intractx = True
elif o in ("-1", "--chrom1"):
chr1 = a
elif o in ("-2", "--chrom2"):
chr2 = a
elif o in ("-l", "--tlen"):
tlen = a
elif o in ("-i", "--insert"):
insert = a
elif o in ("-n", "--nohits"):
nohits = a
elif o in ("-N", "--nohits_sec"):
nohits_secchr = a
elif o in ("-s","--min_size"):
group_size = a
elif o in ("-d","--no_filter"):
no_filter = True
elif o in ("-p", "--prefix"):
prefix = a
elif o in ("-g", "--gzipped"):
compressed_output = True
elif o in ("-v", "--version"):
print(appname, " version: ", app_version, sep = "")
sys.exit()
else:
assert False, "Unhandled option: "+o
except getopt.GetoptError as err:
# print help information and exit:
print("\n"+colored(str(err),"red")) # will print something like "option -a not recognized"
usage()
sys.exit(2)
try:
threads
except:
threads='1'
try:
tmpdir
except:
tmpdir = os.getcwd()
try:
qual
except:
qual = 20
try:
bamfile
except:
usage()
raise(Exception("The bam file was not provided. The program will exit now."))
try:
intractx
except:
intractx = False
try:
tlen
except:
tlen = 1e6
print("Calculating basic BAM file statistics...")
try:
insert
all_reads = subprocess.check_output(["samtools","view","-@", threads,"-c", bamfile]).decode().strip()
print("Maximum insert size defined by the user is:",colored("{}","green").format(insert),"bp.")
except:
bamfile_stats = subprocess.check_output(["bamtools","stats","-insert","-in", bamfile])
bamfile_stats = bamfile_stats.strip().decode().splitlines()
all_reads = bamfile_stats[4].split(":")[1].strip()
insert_median = round(float(bamfile_stats[-1].split(":")[1].strip()))
insert = 2*insert_median
print("Median insert size equals:", colored("{}","green").format(insert_median),"bp, so the maximum insert size is set to:", colored("{}","green").format(insert),"bp.")
try:
nohits
except:
nohits = 2
try:
nohits_secchr
except:
nohits_secchr = 2
try:
group_size
except:
group_size = 2
try:
no_filter
except:
no_filter = False
try:
prefix
except:
prefix = "output"
try:
compressed_output
except:
compressed_output = False
if intractx:
filterflag = "-F12"
print(colored("Intra-chromosomal translocations will be included in the analysis.","green"))
else:
filterflag = "-F14"
print(colored("Intra-chromosomal translocations will be excluded from the analysis.","green"))
bamfile_header = subprocess.check_output(["samtools","view","-@",threads,"-H",bamfile]).decode()
print("Getting coordinates for all mapped reads...")
mapped_lines = subprocess.check_output(["samtools","view","-@",threads, filterflag, bamfile]).decode().splitlines()
len_mapped_lines = len(mapped_lines)
def sel_element(lst, *indices):
return (lst[i] for i in indices)
mapped_lines_coords = ['\t'.join(list(sel_element(value.split("\t"),1,2,3,6,5,8,0))) for value in mapped_lines] # keep all bitwise flags, chromosome names, mapping position coordinates, CIGAR strings, TLENs and read names.
del(mapped_lines)
def sort_chroms(value):
return (value.split("\t")[1],value.split("\t")[3])
mapped_lines_coords.sort(key=sort_chroms) # sort coordinates by chromosomes
def group_chroms(value):
return value.split("\t")[1]+"\t"+value.split("\t")[3]
mapped_lines_coords = [(name,list(group)) for name,group in groupby(mapped_lines_coords,key=group_chroms)] # group coordinates by chrom. names.
print("Filtering out unmapped and low-quality reads...")
filtered_lines = subprocess.check_output(["samtools","view","-@",threads, filterflag,"-q",str(qual),bamfile]).decode().splitlines()
len_filtered_lines = len(filtered_lines)
print("CIGAR-based filtering of the reads...")
filtered_lines = [value for value in filtered_lines if re.search(pattern="\d+(S|H)", string=value.split("\t")[5])] # keep only mappings with clippings in the CIGAR strings.
filtered_lines = [value for value in filtered_lines if not re.search(pattern="^\d+(S|H).*\d+(S|H)$", string=value.split("\t")[5])] # keep only mappings with one overhang.
len_cigar_filtered = len(filtered_lines)
print("Defining the regions of potential chromosomal translocations...")
filtered_lines = ['\t'.join(flowvalue.split(sep="\t")[0:10]) for flowvalue in filtered_lines] # keep only the first 10 mandatory fields of the bam file (1-10).
for index,flowvalue in enumerate(filtered_lines): # Separates matchings from overhangs.
if re.search(pattern="^\d+S.*$", string=flowvalue.split(sep="\t")[5]):
shift = int(flowvalue.split("\t")[5].split("S")[0])
filtered_lines[index] = flowvalue+"\t"+flowvalue.split("\t")[9][shift:]+"\t"+flowvalue.split("\t")[9][:shift]+"\t"+str(shift)
elif re.search(pattern="^.*\d+S$", string=flowvalue.split(sep="\t")[5]):
shift = int(re.findall(pattern="\d+",string=flowvalue.split(sep="\t")[5].split("S")[0])[-1])
filtered_lines[index] = flowvalue+"\t"+flowvalue.split("\t")[9][:-shift]+"\t"+flowvalue.split("\t")[9][-shift:]+"\t"+str(shift)
elif re.search(pattern="\d+H", string=flowvalue.split(sep="\t")[5]):
filtered_lines[index] = flowvalue+"\t"+flowvalue.split("\t")[9]+"\t\t0"
else: raise Exception("CIGAR string analysis failed.")
filtered_lines.sort(key=lambda x:x.split("\t")[2]) # sort reads by chr1
def group_chr(value):
return value.split(sep="\t")[2]
filtered_lines = [[name,list(group)] for name,group in groupby(filtered_lines,key=group_chr)] # group by chr1
locations = [[] for chrom in filtered_lines]
for i1,v1 in enumerate(filtered_lines):
chrom = v1[0]
for i_flowvalue,flowvalue in enumerate(v1[1]):
if flowvalue.split(sep="\t")[2] == chrom:
locations[i1].append([v for v in sel_element(flowvalue.split(sep="\t"),3,5)])
else:
raise Exception("Values assignment error occurred.")
if re.search(pattern="^\d+(S|H).*$", string=locations[i1][i_flowvalue][1]):
locations[i1][i_flowvalue].append(locations[i1][i_flowvalue][0])
elif re.search(pattern="^.*\d+(S|H)$", string=locations[i1][i_flowvalue][1]):
shift = sum(shift for shift in list(map(int,re.findall(string=''.join(re.findall(pattern="(\d+M|\d+D|\d+N|\d+P)", string=locations[i1][i_flowvalue][1])),pattern="\d+")))) # Defines the exact ctx position.
locations[i1][i_flowvalue].append(int(locations[i1][i_flowvalue][0])+shift)
else:
raise Exception("Determination of chromosomal translocation position failed.")
for i1,v1 in enumerate(filtered_lines):
for i_flowvalue,flowvalue in enumerate(v1[1]):
if flowvalue.split(sep="\t")[3] == locations[i1][i_flowvalue][0] and flowvalue.split(sep="\t")[5] == locations[i1][i_flowvalue][1]:
filtered_lines[i1][1][i_flowvalue] = flowvalue+"\t"+str(locations[i1][i_flowvalue][2]) # add CTX location at the end of the read.
else:
raise Exception("Values assignment error occurred.")
del(locations)
def loc_sort(value):
return int(value.split(sep="\t")[13]) # sort by ctx
for chrom in filtered_lines:
chrom[1].sort(key=loc_sort)
del(chrom)
print("Calculating the number of hits for each translocation...")
for i1,v1 in enumerate(filtered_lines):
filtered_lines[i1][1] = [[name,list(group)] for name,group in groupby(v1[1],key=lambda x:x.split(sep="\t")[13])] # group by ctx
filtered_lines = [v2 for v1 in filtered_lines for v2 in v1[1] if len(v2[1]) >= int(nohits)] # filter by nohits
def chrom_regroup(lst):
for value in lst[1]:
return value.split("\t")[2]
filtered_lines = [[name,list(group)] for name,group in groupby(filtered_lines,key=chrom_regroup)] # regroup data by chromosomes
for i1,v1 in enumerate(filtered_lines):
for i2,v2 in enumerate(v1[1]):
count = len(v2[1])
for index,value in enumerate(v2[1]):
filtered_lines[i1][1][i2][1][index] = "\t".join([value,str(count)]) # add total number of hits for each ctx.
counter = 0
for v1 in filtered_lines:
for v2 in v1[1]:
counter = counter + len(v2[1])
len_nohits = counter
for i1,v1 in enumerate(filtered_lines):
for i2,v2 in enumerate(v1[1]):
filtered_lines[i1][1][i2][1].sort(key=lambda x:x.split("\t")[6]) # sort by chr2
for i1,v1 in enumerate(filtered_lines):
for i2,v2 in enumerate(v1[1]):
filtered_lines[i1][1][i2][1] = [[name,list(group)] for name,group in groupby(filtered_lines[i1][1][i2][1],key=lambda x:x.split("\t")[6])] # group by chr2
final_hits = []
for i1,v1 in enumerate(filtered_lines):
final_hits.append([v3 for v2 in v1[1] for v3 in v2[1] if len(v3[1]) >= int(nohits_secchr)]) # filter reads by secchr
del(filtered_lines)
final_hits = list(chain.from_iterable(final_hits)) # flatten the list
final_hits = [[name,list(group)] for name,group in groupby(final_hits,key=chrom_regroup)] # regroup data by chrom1.
def ctx_regroup(lst):
for value in lst[1]:
return value.split("\t")[13]
for i1,v1 in enumerate(final_hits):
final_hits[i1][1] = [[name,list(group)] for name,group in groupby(v1[1],key=ctx_regroup)] # regroup by ctx position
for i1,v1 in enumerate(final_hits):
for i2,v2 in enumerate(v1[1]):
for i3,v3 in enumerate(v2[1]):
count = str(len(v3[1]))
for index,value in enumerate(v3[1]):
final_hits[i1][1][i2][1][i3][1][index] = "\t".join([value,count]) # add number of hits for each ctx per secchr.
counter = 0
for v1 in final_hits:
for v2 in v1[1]:
for v3 in v2[1]:
counter = counter + len(v3[1])
len_nohits_secchr = counter
final_hits = [value for v1 in final_hits for v2 in v1[1] for v3 in v2[1] for value in v3[1] if (int(value.split("\t")[8]) >= int(tlen) or int(value.split("\t")[8]) <= -int(tlen)) or (int(value.split("\t")[8]) == 0 and value.split("\t")[6] != "=")] # filter reads by tlen and flatten the list.
len_tlen = len(final_hits)
final_hits.sort(key=lambda x:(x.split("\t")[2],x.split("\t")[13])) # sort hits by chr1 and ctx position.
final_hits = [(name,list(group)) for name,group in groupby(final_hits,key=lambda x:(x.split("\t")[2],x.split("\t")[13]))] # group hits by chr1 and ctx position.
final_hits = ["\t".join([v2,str(len(v1[1]))]) for v1 in final_hits for v2 in v1[1] if len(v1[1]) >= int(group_size)] # add sizes of the hit groups after tlen filtering to each hit, filter hits by the group size and then flatten the list.
len_group_size = len(final_hits)
final_hits = [[name,list(group)] for name,group in groupby(final_hits,key=lambda x:x.split("\t")[2])] # regroup hits by chrom1.
for i1,v1 in enumerate(final_hits):
final_hits[i1][1] = [[name,list(group)] for name,group in groupby(v1[1],key=lambda x:x.split("\t")[13])] # regroup hits by ctx.
for i1,v1 in enumerate(final_hits):
for i2,v2 in enumerate(v1[1]):
final_hits[i1][1][i2][1] = [[name,list(group)] for name,group in groupby(v2[1],key=lambda x:x.split("\t")[6])] # regroup hits by chrom2.
print("Defining the number of reads supporting each translocation...")
forward_flags = [65, 67, 129, 131, 97, 99, 161, 163, 321, 323, 385, 387, 353, 355, 417, 419, 1089, 1091, 1153, 1155, 1121, 1123, 1185, 1187, 1345, 1347, 1409, 1411, 1377, 1379, 1441, 1443, 2113, 2115, 2177, 2179, 2145, 2147, 2209, 2211, 2369, 2371, 2433, 2435, 2401, 2403, 2465, 2467, 3137, 3139, 3201, 3203, 3169, 3171, 3233, 3235, 3393, 3395, 3457, 3459, 3425, 3427, 3489, 3491]
reverse_flags = [81, 83, 145, 147, 113, 115, 177, 179, 337, 339, 401, 403, 369, 371, 433, 435, 1105, 1107, 1169, 1171, 1137, 1139, 1201, 1203, 1361, 1363, 1425, 1427, 1393, 1395, 1457, 1459, 2129, 2131, 2193, 2195, 2161, 2163, 2225, 2227, 2385, 2387, 2449, 2451, 2417, 2419, 2481, 2483, 3153, 3155, 3217, 3219, 3185, 3187, 3249, 3251, 3409, 3411, 3473, 3475, 3441, 3443, 3505, 3507]
primary_alignments = [65, 67, 129, 131, 97, 99, 161, 163, 81, 83, 145, 147, 113, 115, 177, 179, 1089, 1091, 1153, 1155, 1121, 1123, 1185, 1187, 1105, 1107, 1169, 1171, 1137, 1139, 1201, 1203, 2113, 2115, 2177, 2179, 2145, 2147, 2209, 2211, 2129, 2131, 2193, 2195, 2161, 2163, 2225, 2227, 3137, 3139, 3201, 3203, 3169, 3171, 3233, 3235, 3153, 3155, 3217, 3219, 3185, 3187, 3249, 3251]
secondary_alignments = [321, 323, 385, 387, 353, 355, 417, 419, 337, 339, 401, 403, 369, 371, 433, 435, 1345, 1347, 1409, 1411, 1377, 1379, 1441, 1443, 1361, 1363, 1425, 1427, 1393, 1395, 1457, 1459, 2369, 2371, 2433, 2435, 2401, 2403, 2465, 2467, 2385, 2387, 2449, 2451, 2417, 2419, 2481, 2483, 3393, 3395, 3457, 3459, 3425, 3427, 3489, 3491, 3409, 3411, 3473, 3475, 3441, 3443, 3505, 3507]
def supporting_reads(lst,ctx):
counter = 0
for value in lst[1]:
if lst[0].split("\t")[1] != "=": # ctx on different chromosomes
if int(value.split("\t")[0]) in forward_flags: # forward bitwise flags
if not re.search(pattern="^\d+(S|H).*$",string=value.split("\t")[4]):
shift = sum(shift for shift in list(map(int,re.findall(string=''.join(re.findall(pattern="(\d+M|\d+D|\d+N|\d+P)",string=value.split("\t")[4])),pattern="\d+")))) # Defines the exact end position of the read +1.
if int(value.split("\t")[2])+shift <= int(ctx) and int(value.split("\t")[2])+shift >= int(ctx) - int(insert):
counter += 1
else:
if int(value.split("\t")[2]) == int(ctx):
counter += 1
elif int(value.split("\t")[0]) in reverse_flags: # reverse bitwise flags
if not re.search(pattern="^.*\d+(S|H)$",string=value.split("\t")[4]):
if int(value.split("\t")[2]) >= int(ctx) and int(value.split("\t")[2]) <= int(ctx) + int(insert):
counter += 1
else:
shift = sum(shift for shift in list(map(int,re.findall(string=''.join(re.findall(pattern="(\d+M|\d+D|\d+N|\d+P)",string=value.split("\t")[4])),pattern="\d+"))))
if int(value.split("\t")[2])+shift == int(ctx):
counter += 1
else:
raise Exception("Bitwise flags evaluation failed.")
else: # ctx on the same chromosome
if int(value.split("\t")[0]) in forward_flags and (int(value.split("\t")[5]) >= int(tlen) or int(value.split("\t")[5]) <= -int(tlen)): # forward bitwise flags
if not re.search(pattern="^\d+(S|H).*$",string=value.split("\t")[4]):
shift = sum(shift for shift in list(map(int,re.findall(string=''.join(re.findall(pattern="(\d+M|\d+D|\d+N|\d+P)",string=value.split("\t")[4])),pattern="\d+")))) # Defines the exact end position of the read.
if int(value.split("\t")[2])+shift <= int(ctx) and int(value.split("\t")[2])+shift >= int(ctx) - int(insert):
counter += 1
else:
if int(value.split("\t")[2]) == int(ctx):
counter += 1
elif int(value.split("\t")[0]) in reverse_flags and (int(value.split("\t")[5]) >= int(tlen) or int(value.split("\t")[5]) <= -int(tlen)): # reverse bitwise flags
if not re.search(pattern="^.*\d+(S|H)$",string=value.split("\t")[4]):
if int(value.split("\t")[2]) >= int(ctx) and int(value.split("\t")[2]) <= int(ctx) + int(insert):
counter += 1
else:
shift = sum(shift for shift in list(map(int,re.findall(string=''.join(re.findall(pattern="(\d+M|\d+D|\d+N|\d+P)",string=value.split("\t")[4])),pattern="\d+"))))
if int(value.split("\t")[2])+shift == int(ctx):
counter += 1
elif int(value.split("\t")[0]) not in forward_flags and int(value.split("\t")[0]) not in reverse_flags:
raise Exception("Bitwise flags evaluation failed.")
return counter
for i1,v1 in enumerate(final_hits):
for i2,v2 in enumerate(v1[1]):
for i3,v3 in enumerate(v2[1]):
for c1 in mapped_lines_coords:
if v1[0] == c1[0].split("\t")[0] and v3[0] == c1[0].split("\t")[1]:
sup_reads = supporting_reads(lst=c1,ctx=v2[0])
final_hits[i1][1][i2][1][i3][1] = ["\t".join([hit,str(sup_reads)]) for hit in v3[1]] # add number of supporting reads to each hit.
break
final_hits = [value for v1 in final_hits for v2 in v1[1] for v3 in v2[1] for value in v3[1]] # flatten the list.
def hit_sort(value): # sort hits by hit group size, chr1, ctx, no_hits, no_hits_secchr, the number of supporting reads, and overhang length in the reverse order.
return (int(value.split(sep="\t")[16]),value.split(sep="\t")[2],list(map(int,value.split(sep="\t")[13:16])),int(value.split(sep="\t")[17]),int(value.split(sep="\t")[12]))
final_hits.sort(key=hit_sort,reverse=True)
for i1,v1 in enumerate(final_hits):
l = v1.split("\t")
l[-1],l[-2] = l[-2],l[-1]
final_hits[i1]= "\t".join(l) # Reorder the hits placing the hit group size in the last column.
if no_filter is False:
final_hits = [v for v in final_hits if int(v.split("\t")[16]) > int(v.split("\t")[15])] # keep only those hits where the number of supporting reads is higher than nohits_secchr.
len_final_filter = len(final_hits)
final_hits = [value for value in final_hits if (int(value.split("\t")[1]) in forward_flags and re.search(pattern="^.*\d+(S|H)$",string=value.split("\t")[5])) or (int(value.split("\t")[1]) in reverse_flags and re.search(pattern="^\d+(S|H).*$",string=value.split("\t")[5]))] # keep only those hits where CIGAR strings support the ctx position.
no_hits_with_supp_cigars = len(final_hits)
final_hits = [list(group) for name,group in groupby(final_hits, key=lambda x:(x.split("\t")[2],x.split("\t")[13]))] # regroup hits by chr1 and ctx.
no_final_hit_groups = len(final_hits)
final_hits = ["\t".join([str(i1+1),value]) for i1,v1 in enumerate(final_hits) for value in v1] # index and flatten the list.
print("Discovering hit pairs...")
for i1,v1 in enumerate(final_hits):
if v1.split("\t")[7] == "=":
l1 = v1.split("\t")
l1[7] = l1[3]
final_hits[i1] = "\t".join(l1) # replace '=' in the RNEXT field with the chromosome name.
final_hits = [[name,list(group)] for name,group in groupby(final_hits,key=lambda x:(x.split("\t")[0],x.split("\t")[3],x.split("\t")[14],x.split("\t")[7]))] # regroup hits by hit index, chr1, ctx position and chr2.
for i1,v1 in enumerate(final_hits): # discover hit pairs
pnext_list = []
for coords in mapped_lines_coords:
coords_list = coords[0].split("\t")
if coords_list[1] == "=":
coords_list[1] = coords_list[0] # replace "=" with the chromosome name
if v1[0][3] == coords_list[0] and v1[0][1] == coords_list[1]:
for value1 in v1[1]:
value1 = value1.split("\t")
if int(value1[2]) in primary_alignments:
for value2 in coords[1]:
value2 = value2.split("\t")
if value1[1] == value2[6] and value1[8] == value2[2]:
if int(value2[0]) in forward_flags: # forward bitwise flags
shift = sum(shift for shift in list(map(int,re.findall(string=''.join(re.findall(pattern="(\d+M|\d+D|\d+N|\d+P)",string=value2[4])),pattern="\d+"))))-1 # Defines the exact end position of the mate read.
pnext = int(value2[2]) + shift
elif int(value2[0]) in reverse_flags: # reverse bitwise flags
pnext = int(value2[2])
else:
raise Exception("Bitwise flags evaluation failed.")
pnext_list.append(pnext)
elif int(value1[2]) in secondary_alignments:
pnext_list.append(int(value1[8]))
else:
raise Exception("Unable to determine whether the alignment is primary.")
if len(pnext_list) == 0:
raise Exception("PNEXT evaluation failed.")
pnext_median = statistics.median(pnext_list)
pairs = []
for v2 in final_hits:
if v2[0][1] == v1[0][3] and v1[0][1] == v2[0][3]:
sec_idx = v2[0][0]
distance = (pnext_median-int(v2[0][2]))
counter_forward = 0
counter_reverse = 0
for value3 in v2[1]:
value3 = value3.split("\t")
if int(value3[2]) in forward_flags and re.search(pattern="^.*\d+(S|H)$",string=value3[6]):
counter_forward += 1
elif int(value3[2]) in reverse_flags and re.search(pattern="^\d+(S|H).*$",string=value3[6]):
counter_reverse += 1
else:
raise Exception("Identification of CTX orientation failed.")
if counter_forward > 0 and counter_reverse == 0:
if distance >= -int(insert) and distance < 0: # Distance has to be lower than 0, since CTX position is increased by 1.
res = ":".join([sec_idx,str(round(distance))])
pairs.append(res)
elif counter_reverse > 0 and counter_forward == 0:
if distance >= 0 and distance <= int(insert): # Distance may equal 0, since CTX position is not increased by 1.
res = ":".join([sec_idx,str(round(distance))])
pairs.append(res)
else:
if abs(distance) <= int(insert):
res = ":".join([sec_idx,str(round(distance)),"abs"])
pairs.append(res)
if len(pairs) == 0:
pairs.append("NA")
final_hits[i1][1] = ["\t".join([hit,"; ".join(pairs)]) for hit in v1[1]]
del(mapped_lines_coords)
final_hits = [value for v1 in final_hits for value in v1[1]] # flatten the list
for index,value in enumerate(final_hits): # reorder each hit by placing paired hits next to the hit number.
l = value.split("\t")
l.insert(1,l[-1])
l.pop()
final_hits[index] = "\t".join(l)
# Hit_index is generated to determine the number of hit pairs.
def count_hit_pairs(lst):
hit_index = ["\t".join(hit.split("\t")[0:2]) for hit in lst if hit.split("\t")[1] != "NA"]
hit_index = [[name,list(group)] for name,group in groupby(hit_index,key=lambda x:x.split("\t")[0])]
for i1,v1 in enumerate(hit_index):
for value in v1[1]:
value = value.split("\t")
value.pop(0)
hit_index[i1][1] = value
for i1,v1 in enumerate(hit_index):
for value in v1[1]:
hit_index[i1][1] = re.findall(pattern="\d+\:[^a]", string=value)
for i1,v1 in enumerate(hit_index):
for i2,v2 in enumerate(v1[1]):
hit_index[i1][1][i2] = v2.split(":")[0]
hit_index = list(set(["\t".join([v1[0],v2]) for v1 in hit_index for v2 in v1[1]]))
counter = 0
for v1 in hit_index:
for v2 in hit_index:
if v1.split("\t")[0] == v2.split("\t")[1] and v1.split("\t")[1] == v2.split("\t")[0]:
counter += 1
break
return int(counter/2)
no_final_hit_pairs = count_hit_pairs(final_hits)
for i1,v1 in enumerate(final_hits):
if re.search(pattern="^\d+(S|H).*$",string=v1.split("\t")[7]):
final_hits[i1] = v1
elif re.search(pattern="^.*\d+(S|H)$",string=v1.split("\t")[7]):
v1 = v1.split("\t")
v1[15] = str(int(v1[15])-1) # Subtract 1 to get the exact CTX position.
final_hits[i1] = '\t'.join(v1)
else:
raise Exception("CIGAR string evaluation failed.")
try:
chr1, chr2
print("Finding all hits for the specified pair of chromosomes...")
final_hits_chr = [value for value in final_hits if (value.split("\t")[4] == chr1 and value.split("\t")[8] == chr2) or (value.split("\t")[4] == chr2 and value.split("\t")[8] == chr1)] # keep only mappings for the selected pair of chromosomes
no_final_hit_groups_chr = len(set([hit.split("\t")[0] for hit in final_hits_chr]))
no_final_hit_pairs_chr = count_hit_pairs(final_hits_chr)
except:
None
header = "HIT_NO\tPAIRED_HITS\tQNAME\tFLAG\tRNAME\tPOS\tMAPQ\tCIGAR\tRNEXT\tPNEXT\tTLEN\tSEQ\tSEQ_MATCHING\tSEQ_OVERHANG\tOVERHANG_LENGTH\tCTX_POS\tNO_HITS\tNO_HITS_PER_SEC_CHR\tNO_SUPP_READS\tHIT_GROUP_SIZE"
if compressed_output:
suffix = "_"+appname+"_results.tsv.gz"
else:
suffix = "_"+appname+"_results.tsv"
outfile = prefix+suffix
print("Saving the results to a file: "+colored(outfile,"green"))
def print_output(out):
out.write(appname+" analysis report\n\n")
out.write("User-defined options used:\n")
for opt,arg in opts:
print(opt,arg, file=out, sep="\t")
out.write("\nSummary:\n")
print("Total number of reads:",str(all_reads),sep="\t",file=out)
print("Number of reads after initial filtering:",str(len_mapped_lines),sep="\t",file=out)
print("Number of reads after filtering out low-quality reads:",str(len_filtered_lines),sep="\t",file=out)
print("Number of read groups after the CIGAR-based filtering:", str(len_cigar_filtered),sep="\t",file=out)
print("Number of hits surviving the first filter (nohits):",str(len_nohits),sep="\t",file=out)
print("Number of hits surviving the second filter (nohits_secchr):",str(len_nohits_secchr),sep="\t",file=out)
print("Number of hits after the TLEN filtering:",str(len_tlen),sep="\t",file=out)
print("Number of hits after filtering by the hit group size:",str(len_group_size),sep="\t",file=out)
print("Number of hits after filtering by the number of supporting reads:",str(len_final_filter),sep="\t",file=out)
print("Number of hits after filtering out hits with overhangs non-adjacent to the CTX:",str(no_hits_with_supp_cigars),sep="\t",file=out)
print("Number of hit groups:",str(no_final_hit_groups),sep="\t",file=out)
print("Number of hit pairs:",str(no_final_hit_pairs),sep="\t",file=out)
try:
chr1, chr2
print("Number of hit groups after chromosome-based filtering:",str(no_final_hit_groups_chr),sep="\t",file=out)
print("Number of hit pairs after chromosome-based filtering:",str(no_final_hit_pairs_chr),sep="\t",file=out)
except:
None
out.write("\nSamtools header:\n")
out.write(bamfile_header)
try:
chr1, chr2
print("\n"+appname+" results ("+chr1+", "+chr2+"):",file=out)
out.write(header+"\n")
out.writelines('\n'.join(final_hits_chr)+"\n")
except:
None
print("\n"+appname+" results (full table):",file=out)
out.write(header+"\n")
out.writelines('\n'.join(final_hits))
if compressed_output:
with gzip.open(outfile, mode="wt") as out:
print_output(out)
else:
with open(outfile, mode="wt") as out:
print_output(out)
print(colored("The analysis is complete.", "green"))