-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathBatchIterator.py
129 lines (105 loc) · 4.31 KB
/
BatchIterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import numpy as np
import random
class SimpleDataIterator:
    """Serve shuffled mini-batches of variable-length sequences.

    Args:
        df: Sequence of items; ``len(item)`` is taken as each item's length.
        T: Stored on the instance; this class does not read it.
        MARK: Stored on the instance; this class does not read it.
        DIFF: Stored on the instance; this class does not read it.

    Attributes:
        size: Number of items in the dataset.
        length: ``len(item)`` for every item, in the current shuffled order.
        cursor: Index of the first item of the next batch.
        epochs: Number of times the data has been reshuffled by
            ``next_batch`` because it ran out of items.
    """

    def __init__(self, df, T, MARK, DIFF=False):
        # Shallow copy: shuffling must not reorder the caller's own list.
        self.df = list(df)
        self.T = T
        self.MARK = MARK
        self.DIFF = DIFF
        self.size = len(self.df)
        self.epochs = 0
        # shuffle() also initialises self.length and self.cursor.
        self.shuffle()

    def shuffle(self):
        """Shuffle the items in place, refresh lengths and rewind the cursor."""
        random.shuffle(self.df)
        self.length = [len(item) for item in self.df]
        self.cursor = 0

    def next_batch(self, n):
        """Return the next ``n`` items and their lengths.

        When fewer than ``n`` items remain, the epoch counter is incremented
        and the data is reshuffled before the batch is drawn, so the leftover
        items of the previous pass are skipped.

        Args:
            n: Requested batch size.

        Returns:
            A tuple ``(items, lengths)`` of two lists. Both hold ``n``
            entries unless ``n`` exceeds the dataset size, in which case
            they hold every item.
        """
        # A batch covers indices [cursor, cursor + n); it fits only while
        # cursor + n <= size. (The previous "cursor + n - 1 > size" test let
        # a short batch of n - 1 items through.)
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        start, stop = self.cursor, self.cursor + n
        self.cursor = stop
        return self.df[start:stop], self.length[start:stop]
class PaddedDataIterator(SimpleDataIterator):
    """Serve shuffled mini-batches as one padded ``float32`` array.

    Every sequence in a batch is right-padded with ``self.T`` up to the
    length of the longest sequence in that batch. With ``self.MARK`` set the
    array has two channels and each sequence must be assignable to a
    ``(length, 2)`` slice; otherwise it has a single channel. With
    ``self.DIFF`` set, channel 0 is replaced by its first differences along
    the sequence axis (the first step keeps its original value).
    """

    def next_batch(self, n):
        """Return the next padded batch and the true sequence lengths.

        Args:
            n: Requested batch size.

        Returns:
            A tuple ``(x, seqlen)`` where ``x`` has shape
            ``(batch, maxlen, channels)`` and ``seqlen`` is an array of the
            unpadded lengths. ``batch`` equals ``n`` unless ``n`` exceeds the
            dataset size, in which case it is the dataset size.

        Raises:
            ValueError: If no sequence is available for the batch (empty
                dataset or ``n == 0``).
        """
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        start, stop = self.cursor, self.cursor + n
        res = self.df[start:stop]
        seqlen = self.length[start:stop]
        self.cursor = stop
        if not seqlen:
            raise ValueError("cannot build a batch from an empty selection")
        return self._pad_batch(res, seqlen), np.asarray(seqlen)

    def _pad_batch(self, res, seqlen):
        """Pad ``res`` to a dense array and optionally difference channel 0."""
        # Size the array from the sequences actually drawn, not from the
        # requested n, so an oversized request cannot index past seqlen.
        rows = len(seqlen)
        maxlen = max(seqlen)
        channels = 2 if self.MARK else 1
        # Padding value is T (in every channel), not 0.
        x = np.ones([rows, maxlen, channels], dtype=np.float32) * self.T
        for row, (item, length) in enumerate(zip(res, seqlen)):
            if self.MARK:
                x[row, :length, :] = item
            else:
                x[row, :length, 0] = item
        if self.DIFF:
            first = x[:, :, 0:1]
            # Keep the first value, then consecutive differences.
            deltas = np.concatenate(
                [first[:, 0:1, :], np.diff(first, axis=1)], axis=1
            )
            # Remaining channels (none when MARK is off) pass through.
            x = np.concatenate([deltas, x[:, :, 1:]], axis=2)
        return x
class BucketedDataIterator:
    """Serve padded mini-batches drawn from length-sorted buckets.

    The items are sorted by ``len(item)`` and cut into ``num_buckets``
    contiguous buckets of ``len(df) // num_buckets`` items each, so every
    batch holds sequences of similar length. The ``len(df) % num_buckets``
    longest items do not fit into any bucket and are never served.

    Args:
        df: Sequence of items; ``len(item)`` is taken as each item's length.
        T: Padding value written after the end of each sequence.
        MARK: If true, batches have two channels and each item must be
            assignable to a ``(length, 2)`` slice; otherwise one channel.
        DIFF: If true, channel 0 is replaced by its first differences along
            the sequence axis (the first step keeps its original value).
        num_buckets: Number of buckets to split the sorted data into.

    Attributes:
        size: Number of items per bucket.
        dfs: One list of items per bucket.
        lengths: One list of item lengths per bucket, aligned with ``dfs``.
        cursor: Array holding the next read position of every bucket.
        epochs: Number of reshuffles triggered by ``next_batch``.
    """

    def __init__(self, df, T, MARK, DIFF=False, num_buckets=5):
        # sorted() is stable, so equal-length items keep their input order.
        self.df = sorted(df, key=len)
        self.length = [len(item) for item in self.df]
        self.T = T
        self.MARK = MARK
        self.DIFF = DIFF
        # Integer division: size is used as a slice bound below.
        self.size = len(self.df) // num_buckets
        self.dfs = [
            self.df[bucket * self.size:(bucket + 1) * self.size]
            for bucket in range(num_buckets)
        ]
        self.lengths = [[len(item) for item in part] for part in self.dfs]
        self.num_buckets = num_buckets
        # cursor[i] is the read position inside the i-th bucket.
        self.cursor = np.zeros(num_buckets, dtype=int)
        self.shuffle()
        self.epochs = 0

    def shuffle(self):
        """Shuffle every bucket independently and rewind all cursors."""
        # Items never move between buckets, so the coarse ordering by length
        # is kept while the order inside each bucket is randomised.
        for bucket in range(self.num_buckets):
            random.shuffle(self.dfs[bucket])
            self.lengths[bucket] = [len(item) for item in self.dfs[bucket]]
            self.cursor[bucket] = 0

    def next_batch(self, n):
        """Return a padded batch from one randomly chosen bucket.

        If any bucket has fewer than ``n`` unread items left, the epoch
        counter is incremented and all buckets are reshuffled first.

        Args:
            n: Requested batch size.

        Returns:
            A tuple ``(x, seqlen)`` where ``x`` is a ``float32`` array of
            shape ``(batch, maxlen, channels)`` and ``seqlen`` is an array of
            the unpadded lengths. ``batch`` equals ``n`` unless ``n`` exceeds
            the bucket size, in which case it is the bucket size.

        Raises:
            ValueError: If the chosen bucket yields no sequence (empty
                buckets or ``n == 0``).
        """
        # A batch covers [cursor, cursor + n) and fits while
        # cursor + n <= size; the previous "+ 1" reshuffled one batch early.
        if np.any(self.cursor + n > self.size):
            self.epochs += 1
            self.shuffle()
        bucket = np.random.randint(0, self.num_buckets)
        start = int(self.cursor[bucket])
        stop = start + n
        res = self.dfs[bucket][start:stop]
        seqlen = self.lengths[bucket][start:stop]
        self.cursor[bucket] += n
        if not seqlen:
            raise ValueError("cannot build a batch from an empty selection")
        return self._pad_batch(res, seqlen), np.asarray(seqlen)

    def _pad_batch(self, res, seqlen):
        """Pad ``res`` to a dense array and optionally difference channel 0."""
        # Size the array from the sequences actually drawn, not from the
        # requested n, so an oversized request cannot index past seqlen.
        rows = len(seqlen)
        maxlen = max(seqlen)
        channels = 2 if self.MARK else 1
        # Padding value is T (in every channel), not 0.
        x = np.ones([rows, maxlen, channels], dtype=np.float32) * self.T
        for row, (item, length) in enumerate(zip(res, seqlen)):
            if self.MARK:
                x[row, :length, :] = item
            else:
                x[row, :length, 0] = item
        if self.DIFF:
            first = x[:, :, 0:1]
            # Keep the first value, then consecutive differences.
            deltas = np.concatenate(
                [first[:, 0:1, :], np.diff(first, axis=1)], axis=1
            )
            # Remaining channels (none when MARK is off) pass through.
            x = np.concatenate([deltas, x[:, :, 1:]], axis=2)
        return x