-
Notifications
You must be signed in to change notification settings - Fork 0
/
load.py
75 lines (62 loc) · 1.67 KB
/
load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import ast
#For the comparison during alignment, ensure uppercase in peak strings
#takes a generated test data file and outputs a list of peaks (lists)
def process_Generated(testData):
    """Load a generated test-data file into a list of peak records.

    Every non-blank line is stripped, split on tabs, and its first four
    fields are converted in place:

    * field 0 -- upper-cased string
    * field 1 -- int
    * field 2 -- int
    * field 3 -- Python literal parsed with ``ast.literal_eval``

    Any additional fields are left as the raw strings read from the file.
    Blank (whitespace-only) lines are skipped.

    testData: path of the tab-separated file to read.

    Returns a list holding one list per data line, in file order.

    Raises IndexError if a data line has fewer than four fields, and
    ValueError (or SyntaxError from ``ast.literal_eval``) if a field
    cannot be converted.
    """
    data = []
    # The context manager closes the file even if a row fails to parse.
    with open(testData) as source:
        for line in source:
            stripped = line.strip()
            if not stripped:
                # A trailing empty line would otherwise raise IndexError.
                continue
            datum = stripped.split('\t')
            datum[0] = datum[0].upper()
            datum[1] = int(datum[1])
            datum[2] = int(datum[2])
            datum[3] = ast.literal_eval(datum[3])
            data.append(datum)
    return data
#takes an ENCODE narrowPeak file and outputs a list of lists (peaks)
def process_DNase(narrowPeak):
    """Load a tab-separated narrowPeak file into a list of peak records.

    Every non-blank line is stripped, split on tabs, and converted in
    place: fields 1, 2, 4 and 6 become ``int``; fields 7, 8 and 9 become
    ``float``.  Fields 0, 3 and 5 stay as strings.  Blank
    (whitespace-only) lines are skipped.

    NOTE(review): field 6 is parsed with ``int``; if an input file holds a
    non-integer value in that column this raises ValueError -- confirm
    against the narrowPeak files actually used.

    narrowPeak: path of the file to read.

    Returns a list holding one list per data line, in file order.

    Raises IndexError if a data line has fewer than ten fields, and
    ValueError if a numeric field cannot be converted.
    """
    int_columns = (1, 2, 4, 6)
    float_columns = (7, 8, 9)
    data = []
    # The context manager closes the file even if a row fails to parse.
    with open(narrowPeak) as source:
        for line in source:
            stripped = line.strip()
            if not stripped:
                # A trailing empty line would otherwise raise IndexError.
                continue
            datum = stripped.split('\t')
            for column in int_columns:
                datum[column] = int(datum[column])
            for column in float_columns:
                datum[column] = float(datum[column])
            data.append(datum)
    return data
#takes an ENCODE chromosome file and outputs a string of its sequence
def process_chrom(chromosome):
    """Read a chromosome file and return its sequence as a single string.

    The first line of the file is discarded (presumably a FASTA-style
    header; the caller in this file passes a ``.fa`` path -- confirm).
    Every remaining line is stripped of surrounding whitespace and the
    pieces are concatenated in file order.

    chromosome: path of the file to read.

    Returns the concatenated sequence; an empty string when the file has
    no lines after the first.
    """
    # The context manager closes the file once the sequence is assembled.
    with open(chromosome) as raw:
        raw.readline()  # drop the first line
        # A single join avoids re-copying the growing string on every line.
        return ''.join(line.strip() for line in raw)
def get_datum(data, i):
    """Return the record stored at position ``i`` of ``data``."""
    record = data[i]
    return record
def get_chrom(datum):
    """Return field 1 of a record.

    NOTE(review): both loaders in this file convert field 1 to ``int``,
    so this does not return a name-like string.  If the chromosome name
    is meant, it presumably lives in field 0 -- confirm the intended
    index against callers before changing it.
    """
    second_field = datum[1]
    return second_field
def get_sequence(datum, seq):
    """Return the slice of ``seq`` bounded by fields 1 and 2 of ``datum``.

    Field 1 is used as the slice start and field 2 as the (exclusive)
    slice end.
    """
    start, end = datum[1], datum[2]
    return seq[start:end]
def get_seqs(data, chromosome):
    """Return the ``get_sequence`` slice of ``chromosome`` for every record.

    The result is a list with one entry per record in ``data``, in the
    same order.
    """
    return [get_sequence(record, chromosome) for record in data]
def test():
    """Manual smoke test: print the sequence under every chr21 peak.

    Loads ``data/chr21.fa`` and ``data/ENCFF001WIR.narrowPeak_chr21.np``
    (paths relative to the working directory), slices the chromosome at
    each peak and prints one slice per line.  Returns None.
    """
    chrom = process_chrom('data/chr21.fa')
    data = process_DNase('data/ENCFF001WIR.narrowPeak_chr21.np')
    peaks = get_seqs(data, chrom)
    for peak in peaks:
        # Parenthesised single-argument form parses on Python 3 and prints
        # the same text under the Python 2 print statement.
        print(peak)
# Import-time trace output.  The parenthesised form parses on Python 3 and
# prints the identical list repr under the Python 2 print statement.
print(['load_test - odd python attributes'])
# Uncomment to run the manual smoke test (requires the files under data/).
#test()
print(['load_test2'])