precomputation.py
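'''
Precomputation pipeline: loads the scraped artist, Twitter, and Spotify CSVs,
clusters artists on their Spotify audio-feature means with k-means, pairs each
popular artist (>= 100k Twitter followers) with the lower-follower candidates
in its cluster, scores every pair by feature distance and follower-based
novelty, and exports the lookup tables and scores as JSON, once per setting in
NUM_PER_CLUSTER_LIST.
'''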
import math
import json
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Timestamped directory produced by the scraping run.
LOG_DIR = 'log_2021-12-06_04:51:47/'
# Sweep over the target number of popular artists per cluster.
NUM_PER_CLUSTER_LIST = [1, 2, 4, 8, 16]
# Exponent applied to the similarity term of the candidate score.
SCORE_WEIGHT = 3
# artists: maps twitter ids (column 1) to spotify ids (column 2)
with open(LOG_DIR + 'artist.csv') as a:
    artists = pd.read_csv(a, header=None)
# following: edge list of (follower twitter id, followed twitter id)
with open(LOG_DIR + 'following.csv') as f:
    following = pd.read_csv(f, header=None)
# spotify: per-artist name, genres, and audio-feature columns
with open(LOG_DIR + 'spotify_artist.csv') as s:
    spotify = pd.read_csv(s, header=None)
means_cols = []
# audio-feature mean columns, up to duration
for i in range(5, 27, 2):
    # omit key (column 9)
    if i == 9:
        continue
    means_cols.append(i)
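# means_cols is now [5, 7, 11, 13, 15, 17, 19, 21, 23, 25]: the columns
# holding each artist's mean value for the continuous audio features.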
# twitter
with open(LOG_DIR + 'twitter_user.csv') as t:
    twitter = pd.read_csv(t, header=None)
# sanity checks: flag feature columns containing nan, inf, or overflow values
for col in means_cols:
    if np.isnan(spotify.iloc[:, col]).values.any():
        print('Column {} has nan'.format(col))
    if np.isinf(spotify.iloc[:, col]).values.any():
        print('Column {} has inf'.format(col))
    if (spotify.iloc[:, col] >= np.finfo('float64').max).any():
        print('Column {} has large value'.format(col))
#print('nan in spotify {}'.format(np.isnan(pd.DataFrame(np.nan_to_num(spotify.iloc[:, means_cols]))).values.any()))
#print('done with checks')
# spotify info dictionary, keyed by spotify id
s_info = {}
sids = set()
tids = set()
# spotify id key, add twitter id
for i, row in artists.iterrows():
    s_info[row[2]] = {'tid': int(row[1])}
    sids.add(row[2])
    tids.add(int(row[1]))
print('{} unique spotify ids'.format(len(sids)))
print('{} unique twitter ids'.format(len(tids)))
# add spotify name, genres, and feature means for each mapped artist
for i, row in spotify.iterrows():
    sid = row[0]
    if sid in s_info:
        s_info[sid]['spotify name'] = row[1]
        # the genres column is a stringified list, e.g. "['pop', 'dance pop']"
        genres = []
        for g in row.iloc[2].replace('[', '').replace(']', '').replace("'", "").split(','):
            genres.append(g.strip())
        s_info[sid]['genres'] = genres
        means = []
        for col in means_cols:
            # cast to plain float so the dictionary stays JSON-serializable
            means.append(float(row.iloc[col]))
        s_info[sid]['means'] = means
# twitter info dictionary, keyed by twitter id
t_info = {}
# add username, name, follower and following counts
for i, row in twitter.iterrows():
    # int() casts keep the counts JSON-serializable
    t_info[int(row[0])] = {'username': row[1], 'name': row[2],
                           'followers count': int(row[5]),
                           'following count': int(row[6]),
                           'followers': [], 'following': []}
# fill follower/following id lists from the edge list
for i, row in following.iterrows():
    t_info[int(row[0])]['following'].append(int(row[1]))
    t_info[int(row[1])]['followers'].append(int(row[0]))
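# note: the indexing above assumes both endpoints of every edge in
# following.csv also appear in twitter_user.csv; an id missing from the
# scrape would raise a KeyError here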
# artists and followers count table
fcount_rows = []
for sid in s_info:
    tid = s_info[sid]['tid']
    fcount_rows.append({'sid': sid, 'tid': tid,
                        'followers count': t_info[tid]['followers count']})
df_artists_fcounts = pd.DataFrame(fcount_rows, columns=['sid', 'tid', 'followers count'])
# popular vs candidate threshold: 100k twitter followers
popular = df_artists_fcounts[df_artists_fcounts.iloc[:, 2] >= 100000]
popular_sids = set(popular.iloc[:, 0])
print('{} popular artists'.format(len(popular_sids)))
candidates = df_artists_fcounts[df_artists_fcounts.iloc[:, 2] < 100000]
candidates_sids = set(candidates.iloc[:, 0])
print('{} candidate artists'.format(len(candidates_sids)))
for num_per_cluster in NUM_PER_CLUSTER_LIST:
    print('Running for {} artists per cluster'.format(num_per_cluster))
    # spotify id plus feature means only, for clustering
    cols_id_means = [0] + means_cols
    spotify_means = spotify.iloc[:, cols_id_means]
    # keep only artists present in the artist mapping
    spotify_means = spotify_means[spotify_means.iloc[:, 0].isin(set(s_info))].reset_index(drop=True)
    # nan_to_num zero-fills any missing feature values before clustering
    spotify_means_clust = pd.DataFrame(np.nan_to_num(spotify_means.drop(0, axis=1)))
    # clustering: pick k so each cluster holds ~num_per_cluster popular artists on average
    num_clust = math.floor(popular.shape[0] / num_per_cluster)
    print('{} clusters'.format(num_clust))
    clusters = KMeans(n_clusters=num_clust, init='k-means++').fit(spotify_means_clust)
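    # note: the features are clustered unscaled, so columns with large ranges
    # (e.g. the duration means) dominate the Euclidean distances k-means uses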
    # record each artist's cluster label
    for i, row in spotify_means.iterrows():
        # .item() converts the numpy label to a plain int for JSON export
        s_info[row.iloc[0]]['cluster'] = clusters.labels_[i].item()
    # sid and cluster group table
    cluster_rows = []
    for artist in s_info:
        cluster_rows.append({'sid': artist, 'cluster': s_info[artist]['cluster']})
    df_artists_clusters = pd.DataFrame(cluster_rows, columns=['sid', 'cluster'])
    # group by cluster
    clusters_groups = df_artists_clusters.groupby('cluster')
    # map each popular artist to the candidate artists in its cluster
    popular_candidates = {}
    for psid in popular_sids:
        popular_candidates[psid] = []
    for clust in range(num_clust):
        g = clusters_groups.get_group(clust)
        p = []
        c = []
        for sid in g.loc[:, 'sid']:
            if sid in popular_sids:
                p.append(sid)
            elif sid in candidates_sids:
                c.append(sid)
            else:
                print('neither')
        for psid in p:
            for csid in c:
                popular_candidates[psid].append(csid)
    # calculate a score for every (popular, candidate) pair
    candidates_scores = {}
    min_similarity = float('inf')
    max_similarity = float('-inf')
    for psid in popular_candidates:
        candidates_scores[psid] = []
        for csid in popular_candidates[psid]:
            # 'similarity' is the Euclidean distance between feature-mean vectors,
            # so larger values mean less similar artists
            similarity = np.linalg.norm(
                np.array(s_info[psid]['means']) - np.array(s_info[csid]['means'])).item()
            ctid = s_info[csid]['tid']
            ptid = s_info[psid]['tid']
            cf = t_info[ctid]['followers count']
            pf = t_info[ptid]['followers count']
            if cf == 0:
                popularity = 0
            else:
                # candidate popularity relative to the popular artist, on a log scale
                popularity = math.log(cf) / math.log(pf)
            novelty = 1 - popularity
            score = (similarity ** SCORE_WEIGHT) * novelty
            min_similarity = min(min_similarity, similarity)
            max_similarity = max(max_similarity, similarity)
            candidates_scores[psid].append((csid, score))
    print(min_similarity, max_similarity)
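    # worked example with hypothetical numbers: for a feature distance of 2.0,
    # a candidate with 1,000 followers, and a popular artist with 100,000,
    # popularity = log(1000) / log(100000) = 0.6, novelty = 0.4, and
    # score = 2.0 ** 3 * 0.4 = 3.2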
    # spotify name -> spotify ids lookup
    s_name_find = {}
    for sid in s_info:
        name = s_info[sid]['spotify name']
        if name in s_name_find:
            s_name_find[name].append(sid)
        else:
            s_name_find[name] = [sid]
    # twitter username -> spotify ids lookup
    t_uname_find = {}
    for sid in s_info:
        tid = s_info[sid]['tid']
        uname = t_info[tid]['username']
        if uname in t_uname_find:
            t_uname_find[uname].append(sid)
        else:
            t_uname_find[uname] = [sid]
    # export dictionaries and scores as json files
    out_dir = 'precomp_diff_clusters/precomp_{}/'.format(num_per_cluster)
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + 's_name_find.json', 'w') as snf_file:
        json.dump(s_name_find, snf_file)
    with open(out_dir + 't_uname_find.json', 'w') as tuf_file:
        json.dump(t_uname_find, tuf_file)
    with open(out_dir + 's_info.json', 'w') as si_file:
        json.dump(s_info, si_file)
    with open(out_dir + 't_info.json', 'w') as ti_file:
        json.dump(t_info, ti_file)
    with open(out_dir + 'candidates_scores.json', 'w') as cs_file:
        json.dump(candidates_scores, cs_file)
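# Hypothetical downstream usage sketch (not part of this script): the exported
# scores can be reloaded and ranked; tuples round-trip through JSON as
# two-element lists, and some_popular_sid stands in for a real spotify id.
#
#   with open('precomp_diff_clusters/precomp_4/candidates_scores.json') as f:
#       scores = json.load(f)
#   ranked = sorted(scores[some_popular_sid], key=lambda t: t[1], reverse=True)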
'''
# Justin Bieber
# twitter id
jb_tid = twitter.iloc[0, 0]
# followers count
jb_count = twitter.iloc[0, 5]
# manual count from following table
jb_fcount = 0
for i, row in following.iterrows():
    if row[1] == jb_tid:
        jb_fcount += 1
'''