-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_msd.py
104 lines (87 loc) · 3.71 KB
/
get_msd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import hdf5_getters
import pandas as pd
import numpy as np
import json
import pprint
import sys
# --- Configuration ----------------------------------------------------------
# CSV of songs to enrich; its first column is used as the index.
deezer_path = "./data/deezer/deezer-spotify.csv"
# Root directory of the MSD .h5 files.
msd_path = "../datasets/msd/data"
# Names of the CSV columns holding the MSD identifiers.
attr_track = "MSD_track_id"
attr_song = "MSD_sng_id"

songdata = pd.read_csv(deezer_path, header=0, index_col=0)

# An optional first command-line argument limits processing to the first N
# rows.  The value is clamped to the number of rows available so that the
# extraction loop can never index past the end of `songdata`.
requested = int(sys.argv[1]) if len(sys.argv) > 1 else len(songdata)
count = max(0, min(requested, len(songdata)))

# Output columns: column name (without the "MSD_" prefix) -> one value per
# processed row.  Every cell starts out as "" and is filled in per track.
msdata = {}

# Scalar fields, each read through the matching `hdf5_getters.get_<name>`.
attributes = [
    "key",
    "key_confidence",
    "loudness",
    "mode",
    "mode_confidence",
    "tempo",
    "time_signature",
    "time_signature_confidence",
]
for a in attributes:
    msdata[a] = [""] * count

# Array fields as (getter name, number of components).  They are stored as
# summary statistics rather than raw values: one set of columns for a
# one-component field, one set per component otherwise.
array_attributes = [
    ("segments_loudness_max", 1),
    ("segments_loudness_max_time", 1),
    ("segments_timbre", 12),
]
for a, n in array_attributes:
    prefixes = [a] if n == 1 else [a + "_" + str(i) for i in range(n)]
    for prefix in prefixes:
        # Standard deviation is not stored; variance is.
        for suffix in ("avg", "var", "min", "max", "med"):
            msdata[prefix + "_" + suffix] = [""] * count
# (column suffix, NaN-aware NumPy reducer) pairs, in the column order used for
# every array attribute.
_SEGMENT_STATS = (
    ("avg", np.nanmean),
    ("var", np.nanvar),
    ("min", np.nanmin),
    ("max", np.nanmax),
    ("med", np.nanmedian),
)


def _read_field(h5, name):
    """Call ``hdf5_getters.get_<name>`` for song index 0 of the open file *h5*."""
    return getattr(hdf5_getters, "get_" + name)(h5, 0)


def _stat_prefixes(name, width):
    """Return the ``msdata`` key prefixes of *name*, one per component."""
    if width == 1:
        return [name]
    return [name + "_" + str(j) for j in range(width)]


def _summarise(values):
    """Reduce *values* to a ``{suffix: float}`` dict of summary statistics."""
    return {suffix: float(reducer(values)) for suffix, reducer in _SEGMENT_STATS}


for i in range(count):
    trackid = songdata.iloc[i][attr_track]
    # The .h5 files are nested in directories named after the 3rd, 4th and
    # 5th characters of the track id.
    h5path = "{}/{}/{}/{}/{}.h5".format(msd_path, trackid[2], trackid[3], trackid[4], trackid)
    h5 = hdf5_getters.open_h5_file_read(h5path)
    try:
        # Scalar fields are copied as-is; a failing getter yields None.
        for a in attributes:
            try:
                msdata[a][i] = _read_field(h5, a)
            except Exception:
                msdata[a][i] = None
                print("{:>6}: Error finding {}".format(i, a), file=sys.stderr)

        # Array fields are reduced to summary statistics per component.
        for a, n in array_attributes:
            try:
                res = _read_field(h5, a)
                # Transposing lets each of the n components be addressed as
                # one row.
                rows = [res] if n == 1 else np.transpose(res)
                summaries = [_summarise(rows[j]) for j in range(n)]
            except Exception:
                # Compute everything before storing anything, so a failure
                # blanks all statistics of this attribute consistently.
                blank = {suffix: None for suffix, _ in _SEGMENT_STATS}
                summaries = [blank] * n
                print("{:>6}: Error finding {}".format(i, a), file=sys.stderr)
            for prefix, summary in zip(_stat_prefixes(a, n), summaries):
                for suffix, value in summary.items():
                    msdata[prefix + "_" + suffix][i] = value
    finally:
        # Release the file handle even if something unexpected propagates.
        h5.close()
# Attach the collected values as "MSD_"-prefixed columns and write the result.
# Only the first `count` rows were processed, and every list in `msdata` has
# exactly that many entries, so the output is restricted to those rows.
# Assigning the lists to the full frame would raise a length-mismatch
# ValueError whenever a row limit was given.
songdata = songdata.iloc[:max(count, 0)].assign(
    **{"MSD_" + a: values for a, values in msdata.items()}
)
songdata.to_csv(path_or_buf="data/deezer/deezer-msd.csv")