# new_dataset_creation.py
# Source repository is a fork of bchidamb/Jess-s_Angels.
import pandas as pd
import numpy as np
import gc
# Number of label-4 rows held out as the probe set for each ordering.
probe_size = 100000


def _save_ratings(frame, stem):
    """Write ``frame`` to ``<stem>.csv`` and ``<stem>.txt``.

    The .csv keeps every column, the header row and the row index.
    The .txt is space-separated with no header and no index, and omits
    the "Date Number" column.
    """
    frame.to_csv(stem + ".csv")
    frame.drop(["Date Number"], axis=1).to_csv(
        stem + ".txt", header=False, index=False, sep=' '
    )


def _write_probe_split(prefix, n_probe):
    """Split one ordering of the ratings into a probe set and a training set.

    Reads ``<prefix>/all.dta`` (space-separated, no header row, four columns)
    and ``<prefix>/all.idx`` (a single column of integer labels). The row
    positions of the .idx frame are used to select rows of the .dta frame.

    ``n_probe`` rows labelled 4 are drawn at random without replacement and
    become the probe set. The training set is every row labelled 1, 2 or 3
    plus the label-4 rows that were not drawn. Rows carrying any other label
    end up in neither output.

    Outputs are written under ``data/`` as ``real_<prefix>_probe.*`` and
    ``real_<prefix>_train.*``; the size of each set is printed.

    Args:
        prefix: Directory holding ``all.dta`` / ``all.idx``; also used in the
            output file names and progress messages.
        n_probe: Number of label-4 rows to hold out.

    Raises:
        ValueError: from ``np.random.choice`` when fewer than ``n_probe``
            rows are labelled 4.
    """
    ratings = pd.read_csv(f"{prefix}/all.dta", sep=' ', header=None)
    ratings.columns = ["User Number", "Movie Number", "Date Number", "Rating"]
    print(f"Full {prefix} data")

    labels = pd.read_table(f"{prefix}/all.idx", header=None)
    labels.columns = ["Index"]
    print(f"Full {prefix} indexing")

    def rows_labelled(value):
        # Row positions whose label equals ``value``, as a plain list.
        return labels.loc[labels["Index"] == value].index.tolist()

    probe_pool = rows_labelled(4)
    # NOTE(review): no seed is set in this script, so the split changes from
    # run to run, and each call draws its own sample -- the mu and um probe
    # sets are therefore presumably not the same ratings. Confirm intended.
    probe_rows = np.random.choice(probe_pool, n_probe, replace=False)
    probe = ratings.loc[probe_rows]
    _save_ratings(probe, f"data/real_{prefix}_probe")
    print(len(probe))

    # Rows are selected in list order, so the training output is grouped by
    # label (1, 2, 3, then the leftover 4s) rather than in input-file order.
    train_rows = rows_labelled(1)
    train_rows += rows_labelled(2)
    train_rows += rows_labelled(3)
    train_rows += list(set(probe_pool) - set(probe_rows))
    train = ratings.loc[train_rows]
    _save_ratings(train, f"data/real_{prefix}_train")
    print(len(train))


###
# MU data
###
_write_probe_split("mu", probe_size)
# The mu frames were local to the call above and are no longer referenced,
# so they can be reclaimed before the um data is loaded.
gc.collect()
###
# um data
###
_write_probe_split("um", probe_size)