forked from Kennedy-Lab-UW/Duplex-Sequencing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DCS_family_size_plotter.py
84 lines (67 loc) · 2.5 KB
/
DCS_family_size_plotter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import matplotlib.pyplot as plt
import sys
from collections import defaultdict
from argparse import ArgumentParser
import numpy as np
parser=ArgumentParser()
parser.add_argument("--tagfile", action="store", dest="tagfile", required=True)
parser.add_argument("--name", action="store", dest="name", required=True)
o = parser.parse_args()
bam_line = sys.stdin
tag_list = set()
good_fam_size_dict = defaultdict(lambda:0)
failed_fam_size_dict = defaultdict(lambda:0)
total_fam_size_dict = defaultdict(lambda:0)
for line in bam_line:
tag_list.add(line.strip('\n').split('\t')[0])
tag_file = open(o.tagfile, 'r')
total_reads = 0
for line in tag_file:
splitline = line.strip('\n').split('\t')
total_reads += int(splitline[1])
if splitline[0].split(':')[0] in tag_list:
good_fam_size_dict[int(splitline[1])] += 1
total_fam_size_dict[int(splitline[1])] += 1
else:
if int(splitline[1]) > 2:
failed_fam_size_dict[int(splitline[1])] += 1
total_fam_size_dict[int(splitline[1])] += 1
for family_size in total_fam_size_dict.keys():
total_fam_size_dict[family_size] *= int(family_size)
for family_size in good_fam_size_dict.keys():
good_fam_size_dict[family_size] *= int(family_size)
for family_size in failed_fam_size_dict.keys():
failed_fam_size_dict[family_size] *= int(family_size)
tag_file.close()
x_value = range(1,200)
good_y_value = []
failed_y_value = []
total_y_value = []
for family_size in x_value:
if family_size in good_fam_size_dict.keys():
good_y_value.append(float(good_fam_size_dict[family_size])/float(total_reads))
else:
good_y_value.append(0)
if family_size in failed_fam_size_dict.keys():
failed_y_value.append(float(failed_fam_size_dict[family_size])/float(total_reads))
else:
failed_y_value.append(0)
if family_size in total_fam_size_dict.keys():
total_y_value.append(float(total_fam_size_dict[family_size])/float(total_reads))
else:
total_y_value.append(0)
failed_y = np.array(failed_y_value)
good_y = np.array(good_y_value)
plt.bar(x_value, failed_y, alpha=0.3, bottom=good_y)
plt.bar(x_value, good_y, color='r')
plt.xlabel('Family Size')
plt.ylabel('Proportion of Total Reads')
plt.savefig(o.name + '_combined_fam_size.png', bbox_inches='tight')
#plt.bar(failed_x_value, failed_y_value)
#plt.xlabel('Family Size')
#plt.ylabel('Proportion of Total Reads')
#plt.savefig('failed_reads_fam_size.png', bbox_inches='tight')
#plt.bar(x_value, total_y_value)
#plt.xlabel('Family Size')
#plt.ylabel('Proportion of Total Reads')
#plt.savefig(o.name + '_total_reads_fam_size.png', bbox_inches='tight')