-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_segments_ribd.py
46 lines (43 loc) · 1.42 KB
/
remove_segments_ribd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy as np
import sys
import gzip
thresh = 2.8
dt = np.dtype('uint64')
if __name__ == '__main__':
match_addr = sys.argv[1]
chr = sys.argv[2]
remove_addr = sys.argv[3]
output_addr = sys.argv[4]
thresh = float(sys.argv[5])
remove_list = []
print("Reading Remove List")
with open(remove_addr) as remove_file:
for line in remove_file:
temp_chr = line.strip(" ").split()[0]
#if temp_chr > chr :
# break
if temp_chr == chr:
remove_list.append( ( int(line.strip(" ").split()[1]), int(line.strip(" ").split()[2]) ) )
count = 0
first = 0
last = 0
flagged = False
print("Removing lines...")
with gzip.open(match_addr, 'rb') as match_file:
with gzip.open(output_addr,'wb') as output_file:
for line in match_file:
data = line.decode().strip().split('\t')
first = int(data[5])
last = int(data[6])
if float(data[-1]) < thresh:
continue
flagged = False
for item in remove_list:
if first <= item[1] and item[0] <=last :
count += 1
flagged = True
break
if not flagged:
output_file.write(line)
print("DONE!")
print("Total " + str(count)+ " tracts removed.")