-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandom_sampler.py
53 lines (44 loc) · 1.22 KB
/
random_sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from tqdm import tqdm
import numpy as np
import csv
def _reservoir_rows(rows, n):
    """Return a uniform random sample of at most ``n`` items from ``rows``.

    Single-pass reservoir sampling (Algorithm R): the first ``n`` items fill
    the reservoir; after that, the k-th item of the stream (1-based) replaces
    a randomly chosen slot with probability ``n / k``.  At most ``n`` items
    are held in memory, however long ``rows`` is.
    """
    reservoir = []
    for seen, row in enumerate(rows, start=1):
        if len(reservoir) < n:
            reservoir.append(row)
            continue
        # randint's upper bound is exclusive, so slot is uniform over
        # [0, seen).  Only a draw that lands inside the reservoir replaces
        # an entry, which is what makes the final sample uniform.
        slot = np.random.randint(0, seen)
        if slot < n:
            reservoir[slot] = row
    return reservoir


def sample(n, src_path="..\\data.csv", out_path="sampled_csv.csv"):
    """Write a uniform random sample of ``n`` data rows of a CSV file.

    The first row of ``src_path`` is treated as the header.  It is never
    sampled; it is printed and written as the first row of ``out_path``.
    The remaining rows are streamed once through a reservoir sampler, so
    only ``n`` rows are kept in memory.  If the input has fewer than ``n``
    data rows, all of them are written in their original order.

    Args:
        n: Maximum number of data rows to keep.
        src_path: CSV file to read (UTF-8).
        out_path: CSV file to create or overwrite (UTF-8).

    Raises:
        ValueError: If ``n`` is negative or the input has no header row.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {!r}".format(n))

    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(src_path, "r", encoding="utf-8", newline="") as src:
        # tqdm wraps the file object, so progress counts physical lines.
        reader = csv.reader(tqdm(src))
        fields = next(reader, None)
        if not fields:
            raise ValueError("{!r} has no header row".format(src_path))
        # Blank lines parse to empty lists; they are not data rows.
        reservoir = _reservoir_rows((row for row in reader if row), n)

    # The last output column has always been emitted under the name
    # "jobHash".  NOTE(review): presumably this was a workaround for the
    # trailing newline left on the last header field by a plain
    # str.split(","); kept so the output header does not change.
    fields[-1] = "jobHash"
    print(fields)

    width = len(fields)
    with open(out_path, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(fields)
        for row in reservoir:
            # Fit every row to the header width: surplus cells are dropped
            # and short rows are padded with empty strings.
            writer.writerow(row[:width] + [""] * (width - len(row)))
if __name__ == "__main__":
    # Script entry point: draw a sample of half a million rows.
    n = 500 * 1000
    sample(n)