# functions.py
# Helper functions for "A Comprehensive Analysis of Netflix User Behavior Through Hashing and Clustering Techniques"
# (HazemTarek1080, forked from marinazanoni/ADM-HW4)
import pandas as pd
import numpy as np
import random

def min_hash_signatures(matrix, perms, num_buckets):
    """
    Calculate min-hash signatures for users based on their likes in a binary matrix.

    Parameters:
    - matrix (numpy.ndarray): Binary matrix representing user likes (genres x users).
    - perms (int): Number of permutations for generating min-hash signatures.
    - num_buckets (int): Number of buckets to hash signatures into.

    Returns:
    - signature_matrix (numpy.ndarray): Matrix containing min-hash signatures.
    - buckets (dict): Dictionary mapping bucket index to a list of user indices.
    """
    num_genres, num_users = matrix.shape

    # Initialize the signature matrix with infinity values
    signature_matrix = np.full((perms, num_users), np.inf)

    # Generate random hash functions as permutations of the genre (row) indices
    hash_functions = [np.random.permutation(num_genres) for _ in range(perms)]

    # Initialize an empty dictionary to store buckets
    buckets = {i: [] for i in range(num_buckets)}

    # Iterate through each row (genre) in the binary matrix
    for genre_index in range(num_genres):
        # Find the users who like this genre
        genre_likes = matrix[genre_index, :]
        users_in_genre = np.where(genre_likes == 1)[0]

        # Update the min-hash signatures of those users
        for hash_index, hash_function in enumerate(hash_functions):
            signature_matrix[hash_index, users_in_genre] = np.minimum(
                signature_matrix[hash_index, users_in_genre],
                hash_function[genre_index]
            )

    # Map each user's min-hash signature to a bucket
    for user_index in range(num_users):
        bucket_index = hash(tuple(signature_matrix[:, user_index])) % num_buckets
        buckets[bucket_index].append(user_index)  # Add the user to the corresponding bucket

    return signature_matrix, buckets
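
# Illustrative usage sketch (hypothetical sizes, not part of the original pipeline):
# build a small random genre-by-user binary matrix and inspect the signatures and buckets.
def _demo_min_hash_signatures():
    rng = np.random.default_rng(42)
    demo_matrix = rng.integers(0, 2, size=(20, 8))  # 20 genres, 8 users
    signatures, buckets = min_hash_signatures(demo_matrix, perms=50, num_buckets=4)
    print(signatures.shape)                                      # (50, 8)
    print({b: users for b, users in buckets.items() if users})   # non-empty buckets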

def find_similar_users(target_user_id, unique_users, signatures):
    """
    Find the two most similar users to a target user based on the Jaccard similarity
    estimated from their min-hash signatures.

    Parameters:
    - target_user_id (int): ID of the target user.
    - unique_users (list): List of unique user IDs.
    - signatures (numpy.ndarray): Matrix containing min-hash signatures.

    Returns:
    - most_similar_users (list): List of tuples containing user IDs and their estimated Jaccard similarity.
    """
    # Index of the target user
    target_index = unique_users.index(target_user_id)

    # Estimate the Jaccard similarity with all other users
    similarities = []
    for user_index, other_user_id in enumerate(unique_users):
        # Skip the target user
        if user_index != target_index:
            # The fraction of matching min-hash values estimates the Jaccard similarity
            similarity = np.mean(signatures[:, target_index] == signatures[:, user_index])
            # Append the user ID and similarity to the list
            similarities.append((other_user_id, similarity))

    # Sort the list by similarity in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Return the two most similar users
    most_similar_users = similarities[:2]
    return most_similar_users
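
# Illustrative usage sketch (made-up user IDs): reuse the signatures produced by
# min_hash_signatures and look up the two users most similar to a target user.
def _demo_find_similar_users():
    rng = np.random.default_rng(0)
    demo_matrix = rng.integers(0, 2, size=(20, 5))   # 20 genres, 5 users
    signatures, _ = min_hash_signatures(demo_matrix, perms=100, num_buckets=4)
    unique_users = [101, 102, 103, 104, 105]         # one hypothetical ID per column
    print(find_similar_users(103, unique_users, signatures))  # two (user_id, estimated_similarity) tuples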

def get_top5_movies(user_id, df, user_label):
    """
    Get the top 5 movies clicked by a user.

    Parameters:
    - user_id (int): ID of the target user.
    - df (pandas.DataFrame): DataFrame containing user clicks data.
    - user_label (str): Label for the target user.

    Returns:
    - top_5_movies_titles (pandas.DataFrame): DataFrame with the top 5 movie titles, movie IDs, clicks, and user label.
    """
    # Filter the DataFrame to the target user
    user_df = df[df['user_id'] == user_id]

    # Get the 5 movies that appear most often in the user's clicks (movie_id -> click count)
    top_5_movies = user_df['movie_id'].value_counts()[:5]

    # Get the titles of the top 5 movies
    top_5_movies_titles = df[df['movie_id'].isin(top_5_movies.index)][['title', 'movie_id']].copy()

    # Remove duplicates
    top_5_movies_titles.drop_duplicates(inplace=True)

    # Remove the movies with title 'NOT AVAILABLE'
    top_5_movies_titles = top_5_movies_titles[top_5_movies_titles['title'] != 'NOT AVAILABLE']

    # Add the number of clicks as a column, aligned by movie_id
    top_5_movies_titles['clicks'] = top_5_movies_titles['movie_id'].map(top_5_movies)

    # Reset the index
    top_5_movies_titles.reset_index(drop=True, inplace=True)

    # Add the user column with the value of user_label for all rows
    top_5_movies_titles['user'] = user_label

    return top_5_movies_titles
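
# Illustrative usage sketch on a tiny hypothetical clicks DataFrame
# (the column names 'user_id', 'movie_id', 'title' match what get_top5_movies expects).
def _demo_get_top5_movies():
    demo_clicks = pd.DataFrame({
        'user_id':  [1, 1, 1, 1, 2],
        'movie_id': [10, 10, 20, 30, 10],
        'title':    ['Movie A', 'Movie A', 'Movie B', 'NOT AVAILABLE', 'Movie A'],
    })
    print(get_top5_movies(1, demo_clicks, user_label='user A'))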

def recommend_movies(user_A_movies, user_B_movies):
    """
    Recommend movies based on common clicks between two users.

    Parameters:
    - user_A_movies (pandas.DataFrame): DataFrame of movies clicked by user A.
    - user_B_movies (pandas.DataFrame): DataFrame of movies clicked by user B.

    Returns:
    - recommended_titles (list): List of recommended movie titles.
    """
    # Merge user_A_movies and user_B_movies to find common movies
    common_movies_df = pd.merge(user_A_movies, user_B_movies, on="movie_id", how="inner")

    # If there are common movies, recommend based on total clicks
    if not common_movies_df.empty:
        # Calculate total clicks for each common movie
        common_movies_df["total_clicks"] = common_movies_df["clicks_x"] + common_movies_df["clicks_y"]

        # Sort movies based on total clicks (descending order)
        sorted_movies_df = common_movies_df.sort_values(by="total_clicks", ascending=False)

        # Extract up to five recommended movies
        recommended_movies_df = sorted_movies_df.head(5)[["title_x", "movie_id", "total_clicks"]].rename(columns={"title_x": "title"})
    else:
        # Otherwise, recommend the most clicked movies from each user
        clicks_A = user_A_movies.sort_values(by="clicks", ascending=False).head(5)
        clicks_B = user_B_movies.sort_values(by="clicks", ascending=False).head(5)

        # Concatenate and deduplicate the recommended movies
        recommended_movies_df = pd.concat([clicks_A, clicks_B]).drop_duplicates(subset="movie_id").head(5)

    recommended_titles = list(recommended_movies_df["title"])
    return recommended_titles
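
# Illustrative usage sketch: two hypothetical users, shaped like the output of
# get_top5_movies, who share one clicked movie.
def _demo_recommend_movies():
    user_A_movies = pd.DataFrame({'title': ['Movie A', 'Movie B'], 'movie_id': [10, 20],
                                  'clicks': [3, 1], 'user': ['user A', 'user A']})
    user_B_movies = pd.DataFrame({'title': ['Movie A', 'Movie C'], 'movie_id': [10, 30],
                                  'clicks': [2, 4], 'user': ['user B', 'user B']})
    print(recommend_movies(user_A_movies, user_B_movies))  # ['Movie A']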

def extract_starting_centroids(rdd, k):
    # Return k samples extracted from our dataset (in RDD form) without replacement
    return rdd.takeSample(False, k)

def euclidean_squared_distance(x, y):
    # Compute the square of the Euclidean distance between two points
    x = np.array(x, dtype=np.float64)
    y = np.array(y, dtype=np.float64)
    return (np.linalg.norm(x - y)) ** 2

# Function used to check the convergence of the k-means algorithm: returns 0 if and only if the centroids
# of two consecutive iterations coincide (absolute differences avoid cancellation between coordinates)
checkconvergencefun = lambda l1, l2: sum(np.sum(np.abs(el1 - el2)) for (el1, el2) in zip(l1, l2))

def Kmeans(rdd_data, centroids, k, it):
    niter = it

    # MAP PHASE: assign to each point a key-value pair -> key: the label of the cluster it is assigned to; value: (point, 1)
    rddmap = rdd_data.map(lambda point: (np.argmin([euclidean_squared_distance(point, centroid) for centroid in centroids]), (point, 1)))

    # REDUCE PHASE: for each key (cluster) we get (sum of the points in the cluster, number of points in the cluster)
    rddreduce = rddmap.reduceByKey(lambda t1, t2: (np.add(t1[0], t2[0]), t1[1] + t2[1]))

    # COMPUTE THE NEW CENTROIDS: new centroid = mean of the points in the cluster = (sum of the points) / (number of points);
    # sortByKey keeps the centroid list aligned with the cluster labels used in the map phase
    new_centroids = rddreduce.mapValues(lambda t: t[0] / t[1]).sortByKey().map(lambda t: t[1])
    new_centr_list = new_centroids.collect()

    # CHECK CONVERGENCE: stop when the centroids no longer change between iterations
    if checkconvergencefun([np.array(centroid) for centroid in centroids], new_centr_list) == 0:
        # Compute the clusters
        clusters = rddmap.groupByKey().sortByKey().mapValues(lambda iterable: [t[0] for t in list(iterable)]).collect()
        # List of keys corresponding to the cluster each point is assigned to (ordered by appearance of the points in the dataset)
        clusters_idx = rddmap.map(lambda t: t[0]).collect()
        return clusters, clusters_idx, new_centr_list, niter
    else:
        # Update the number of iterations
        niter += 1
        # Recursive call with the updated centroids
        return Kmeans(rdd_data, new_centr_list, k, niter)
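
# Illustrative usage sketch (assumes a running SparkContext named `sc`, which this module does not create):
# cluster a few 2-D points with centroids initialised by extract_starting_centroids.
def _demo_kmeans(sc):
    points = [np.array(p) for p in [(0.0, 0.0), (0.1, 0.2), (5.0, 5.0), (5.2, 4.9)]]
    rdd_points = sc.parallelize(points)
    start = extract_starting_centroids(rdd_points, 2)
    clusters, clusters_idx, final_centroids, niter = Kmeans(rdd_points, start, 2, 0)
    print(clusters_idx, niter)  # cluster label per point and the number of iterations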

def get_centroids_Kmeans_pp(data, k):
    n = len(data)
    data = data.values

    # Initialize the list of centroid indices by choosing the first one uniformly at random and saving it
    # (randrange(n) keeps the index inside the valid range 0..n-1)
    centroids_indices = [random.randrange(n)]
    centroids = [np.array(data[centroids_indices[-1], :])]

    # Compute the remaining k-1 centroids
    for _ in range(1, k):
        # List of squared distances from the nearest already-chosen centroid
        distances_nc = []
        for point in data:
            # Find the nearest centroid among the ones already chosen and save its distance from the point
            # (we can directly use the squared distance because squaring is increasing on [0, +inf), so we save computational cost)
            distances = np.array([euclidean_squared_distance(point, centroid) for centroid in centroids]).flatten()
            distances_nc.append(np.min(distances))  # save the smallest distance
        # Choose the new centroid with probability proportional to its squared distance from the nearest chosen centroid
        centroids.append(random.choices(data, weights=distances_nc, k=1)[0])

    return centroids
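
# Illustrative usage sketch: pick k = 2 starting centroids from a tiny hypothetical pandas DataFrame.
def _demo_kmeans_pp_init():
    demo_df = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.2], 'y': [0.0, 0.2, 5.0, 4.9]})
    starting_centroids = get_centroids_Kmeans_pp(demo_df, 2)
    print(starting_centroids)  # two rows of the DataFrame, far apart with high probability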

def kmeans_plusplus(datardd, datapddf, k):
    # Initialize the centroids for the first iteration with the k-means++ seeding above
    centroids_pp = get_centroids_Kmeans_pp(datapddf, k)
    # Use the previous clustering (k-means) algorithm to compute the clusters
    return Kmeans(datardd, centroids_pp, k, 0)
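
# Illustrative usage sketch of the full pipeline (again assuming an existing SparkContext `sc`):
# the same points are supplied both as an RDD (for the Spark k-means) and as a pandas DataFrame
# (for the k-means++ seeding).
def _demo_kmeans_plusplus(sc):
    demo_df = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.2], 'y': [0.0, 0.2, 5.0, 4.9]})
    rdd_points = sc.parallelize([np.array(row) for row in demo_df.values])
    clusters, clusters_idx, centroids, niter = kmeans_plusplus(rdd_points, demo_df, 2)
    print(clusters_idx)  # e.g. [0, 0, 1, 1] (one label per point, in input order)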