-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbinning.py
130 lines (115 loc) · 3.79 KB
/
binning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import pandas as pd
import numpy as np
import os
import gc
# Puts all ratings into one of 30 bins
num_bins = 30
# Min and max date as specified in the README
minBin = 1.0
maxBin = 2243.0
# Width of a single bin, in the same units as the "Date Number" column.
binSize = (maxBin - minBin) / num_bins


def binDataframe(df):
    """Add an integer "bin" column derived from the "Date Number" column.

    The interval [minBin, maxBin] is divided into ``num_bins`` equal-width
    bins numbered 0 .. num_bins - 1.  A value that lies exactly on the edge
    between two bins is placed in the upper one, and ``maxBin`` itself is
    placed in the last bin.  Rows whose value is outside [minBin, maxBin]
    (or is NaN) get the sentinel -1.

    Args:
        df: pandas DataFrame with a numeric "Date Number" column.  It is
            modified in place (the "bin" column is created or overwritten).

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if ``df`` has no "Date Number" column.
    """
    dates = df["Date Number"].to_numpy()

    # Lower edge of every bin.  side="right" sends a value sitting exactly on
    # an edge to the bin that starts there, so one pass over the data
    # replaces a separate boolean scan of the whole frame for each bin.
    lower_edges = minBin + np.arange(num_bins) * binSize
    bin_index = np.searchsorted(lower_edges, dates, side="right") - 1

    # Bound the valid range by minBin/maxBin directly so the top edge does
    # not depend on floating-point accumulation; NaN compares False here and
    # therefore falls through to -1.
    in_range = (dates >= minBin) & (dates <= maxBin)

    df["bin"] = np.where(in_range, bin_index, -1).astype(np.int64)
    return df
# Add the "bin" column to each of the six "real_*" data sets and write every
# CSV back to the path it was read from.  The sets are processed in the order
# mu probe, mu qual, mu train, um probe, um qual, um train.
#
# Only one DataFrame is kept alive at a time: the frame is dropped before
# gc.collect() so the collection can actually reclaim it before the next
# (possibly large) file is loaded.
#
# NOTE(review): to_csv() is called with its defaults, so the DataFrame index
# is written as an extra leading column; re-running the script on its own
# output will add another such column each time -- confirm whether
# index=False is wanted.
for ordering in ('mu', 'um'):
    for subset in ('probe', 'qual', 'train'):
        print('Loading {} {} set...'.format(ordering, subset))
        csv_path = os.path.join(
            'data', 'real_{}_{}.csv'.format(ordering, subset)
        )
        binned = binDataframe(pd.read_csv(csv_path))
        binned.to_csv(csv_path)
        del binned
        gc.collect()
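# Everything below is a disabled (commented-out) variant of the pass above
# for the mu_*/um_* probe, val, hidden, qual and train CSVs.
#
# # MU Data Sets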
# print('Loading mu probe set...')
# mu_probe = pd.read_csv(os.path.join('data', 'mu_probe.csv'))
# mu_probe = binDataframe(mu_probe)
# mu_probe.to_csv("data/mu_probe.csv")
# gc.collect()
#
# print('Loading mu val set...')
# mu_val = pd.read_csv(os.path.join('data', 'mu_val.csv'))
# mu_val = binDataframe(mu_val)
# mu_val.to_csv("data/mu_val.csv")
# gc.collect()
#
# print('Loading mu hidden set...')
# mu_hidden = pd.read_csv(os.path.join('data', 'mu_hidden.csv'))
# mu_hidden = binDataframe(mu_hidden)
# mu_hidden.to_csv("data/mu_hidden.csv")
# gc.collect()
#
# print('Loading mu qual set...')
# mu_qual = pd.read_csv(os.path.join('data', 'mu_qual.csv'))
# mu_qual = binDataframe(mu_qual)
# mu_qual.to_csv("data/mu_qual.csv")
# gc.collect()
#
# print('Loading mu train set...')
# mu_train = pd.read_csv(os.path.join('data', 'mu_train.csv'))
# mu_train = binDataframe(mu_train)
# mu_train.to_csv("data/mu_train.csv")
# gc.collect()
#
# # UM Data sets
# print('Loading um probe set...')
# um_probe = pd.read_csv(os.path.join('data', 'um_probe.csv'))
# um_probe = binDataframe(um_probe)
# um_probe.to_csv("data/um_probe.csv")
# gc.collect()
#
# print('Loading um val set...')
# um_val = pd.read_csv(os.path.join('data', 'um_val.csv'))
# um_val = binDataframe(um_val)
# um_val.to_csv("data/um_val.csv")
# gc.collect()
#
# print('Loading um hidden set...')
# um_hidden = pd.read_csv(os.path.join('data', 'um_hidden.csv'))
# um_hidden = binDataframe(um_hidden)
# um_hidden.to_csv("data/um_hidden.csv")
# gc.collect()
#
# print('Loading um qual set...')
# um_qual = pd.read_csv(os.path.join('data', 'um_qual.csv'))
# um_qual = binDataframe(um_qual)
# um_qual.to_csv("data/um_qual.csv")
# gc.collect()
#
# print('Loading um train set...')
# um_train = pd.read_csv(os.path.join('data', 'um_train.csv'))
# um_train = binDataframe(um_train)
# um_train.to_csv("data/um_train.csv")
# gc.collect()