re-wrote func defs & doc strings
Kinjuriu committed May 17, 2023
1 parent a0dcf98 commit b0ae82b
Showing 5 changed files with 296 additions and 52 deletions.
120 changes: 120 additions & 0 deletions models/collaborative_filtering.py
@@ -0,0 +1,120 @@
from surprise import KNNBasic
import pandas as pd
from surprise.model_selection import GridSearchCV
from surprise import accuracy

def train_item_based_model(trainset, testset, sim_options={'name': 'cosine', 'user_based': False}):
    """
    Trains an item-based collaborative filtering recommendation system using the KNNBasic algorithm.

    Parameters
    ----------
    trainset: surprise.Trainset
        The trainset used to fit the KNNBasic algorithm.
    testset: list of tuples
        The testset used to predict the ratings.
    sim_options: dict, optional (default={'name': 'cosine', 'user_based': False})
        The similarity options for the KNNBasic algorithm.
        Available options are 'cosine' and 'msd'.

    Returns
    -------
    predictions: list of surprise.Prediction objects
        The predictions made by the KNNBasic algorithm on the testset.
    """
    # Define the KNNBasic algorithm
    algo = KNNBasic(sim_options=sim_options, verbose=False)

    # Fit the KNNBasic algorithm on the trainset
    algo.fit(trainset)

    # Predict the ratings on the testset
    predictions = algo.test(testset)

    return predictions

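For context, a minimal usage sketch (illustrative only, not part of this commit): it assumes a MovieLens-style ratings file at a hypothetical path data/ratings.csv with userId, movieId and rating columns on a 0.5-5 scale, and builds the train/test split with surprise's Reader and Dataset helpers.

import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Hypothetical input file and column names -- adjust to the project's data.
ratings = pd.read_csv("data/ratings.csv")

# Build a surprise Dataset from the (user, item, rating) columns.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Hold out 20% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Item-based KNN with cosine similarity (the function's default sim_options).
predictions = train_item_based_model(trainset, testset)
accuracy.rmse(predictions)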
def tune_hyperparameters(data, param_grid={'k': [20, 30, 40], 'min_k': [3, 6, 9],
                                           'sim_options': {'name': ['msd', 'cosine'],
                                                           'user_based': [False]}
                                           }, cv=3):
    """
    Performs hyperparameter tuning on a KNNBasic algorithm using grid search cross-validation.

    Parameters
    ----------
    data: surprise.Dataset
        The dataset used to perform the grid search cross-validation.
    param_grid: dict, optional (default={'k': [20, 30, 40], 'min_k': [3, 6, 9],
                                         'sim_options': {'name': ['msd', 'cosine'],
                                                         'user_based': [False]}})
        The parameter grid to search through.
    cv: int, optional (default=3)
        The number of folds to use for cross-validation.

    Returns
    -------
    best_score: float
        The best root mean squared error (RMSE) score found through the grid search cross-validation.
    best_params: dict
        The combination of parameters that gave the best RMSE score.
    results_df: pandas.DataFrame
        The dataframe containing the results from the grid search cross-validation.
    """
    # Perform grid search cross-validation
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=cv, n_jobs=-1)
    grid_obj.fit(data)

    # Get the best RMSE score and best parameters
    best_score = grid_obj.best_score['rmse']
    best_params = grid_obj.best_params['rmse']

    # Collect the full cross-validation results
    results_df = pd.DataFrame.from_dict(grid_obj.cv_results)

    return best_score, best_params, results_df

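A quick illustrative call, assuming data is a surprise Dataset built as in the earlier sketch:

# Grid-search the KNN neighbourhood size and similarity metric, then inspect the results.
best_rmse, best_params, results_df = tune_hyperparameters(data)
print(f"Best RMSE: {best_rmse:.4f} with parameters {best_params}")
print(results_df.head())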
def train_recommendation_model(trainset, testset, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1):
    """
    Train a recommendation model using grid search cross-validation.

    Parameters:
        trainset (surprise.Dataset): dataset used for the grid search and for fitting the final model
        testset (list of tuples): testset for the recommendation system
        param_grid (dict): Hyperparameters to tune
        measures (list): Evaluation metrics, default ['rmse', 'mae']
        cv (int): Number of folds for cross-validation, default 3
        n_jobs (int): Number of parallel jobs, default -1

    Returns:
        best_model (surprise.KNNBasic): Optimized recommendation model
        results_df (pd.DataFrame): Results of the grid search cross-validation
        best_params (dict): Optimal hyperparameters
    """
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=measures, cv=cv, n_jobs=n_jobs)
    grid_obj.fit(trainset)

    best_params = grid_obj.best_params['rmse']
    best_model = KNNBasic(sim_options={'name': best_params['sim_options']['name'],
                                       'user_based': best_params['sim_options']['user_based']},
                          k=best_params['k'],
                          min_k=best_params['min_k'],
                          verbose=False)

    # KNNBasic.fit expects a Trainset, so build one from the full dataset
    best_model.fit(trainset.build_full_trainset())

    results_df = pd.DataFrame.from_dict(grid_obj.cv_results)

    return best_model, results_df, best_params

def evaluate_recommendation_model(recommendation_model, testset, accuracy):
    """
    Evaluate a recommendation model using Root Mean Squared Error (RMSE).

    Parameters:
        recommendation_model (object): Trained recommendation model
        testset (list of tuples): testset for the recommendation system
        accuracy (module): Accuracy module from surprise
    Returns:
        rmse (float): RMSE of the recommendation model
    """
    predictions = recommendation_model.test(testset)
    rmse = accuracy.rmse(predictions)

    return rmse

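Putting the last two helpers together, an illustrative end-to-end run might look like the following; the data and testset objects are assumed from the earlier sketch, and the hyperparameter grid mirrors the defaults in tune_hyperparameters.

param_grid = {'k': [20, 30, 40], 'min_k': [3, 6, 9],
              'sim_options': {'name': ['msd', 'cosine'], 'user_based': [False]}}

# Grid-search on the full Dataset, refit the best KNN model, then score it on the held-out testset.
best_model, results_df, best_params = train_recommendation_model(data, testset, param_grid)
rmse = evaluate_recommendation_model(best_model, testset, accuracy)
print(best_params, rmse)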
109 changes: 109 additions & 0 deletions models/matrix_factorization.py
@@ -0,0 +1,109 @@
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV

def train_svd(trainset):
    """
    Train an SVD algorithm on a trainset.

    Parameters:
        trainset (surprise.Trainset): The trainset to train the SVD algorithm on.
    Returns:
        surprise.SVD: The trained SVD algorithm.
    """
    algo_svd = SVD()
    algo_svd.fit(trainset)
    return algo_svd

def predict_ratings(testset, algo_svd):
    """
    Predict ratings using a trained SVD algorithm on a testset.

    Parameters:
        testset (list of tuples): The testset to predict the ratings on.
        algo_svd (surprise.SVD): The trained SVD algorithm.
    Returns:
        list: List of predictions.
    """
    predictions = algo_svd.test(testset)
    return predictions

def compute_rmse(predictions):
    """
    Compute RMSE (Root Mean Squared Error) on predictions.

    Parameters:
        predictions (list): List of predictions.
    Returns:
        float: RMSE.
    """
    rmse = accuracy.rmse(predictions)
    return rmse

def grid_search_svd(data):
    """
    Perform hyperparameter tuning for an SVD algorithm using GridSearchCV.

    Parameters:
        data (surprise.Dataset): The dataset to perform hyperparameter tuning on.
    Returns:
        surprise.model_selection.GridSearchCV: The fitted grid search object.
    """
    param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.2, 0.4, 0.6]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    gs.fit(data)
    return gs

def get_best_score_rmse(gs):
    """
    Get the best RMSE score from a grid search object.

    Parameters:
        gs (surprise.model_selection.GridSearchCV): The grid search object.
    Returns:
        float: Best RMSE score.
    """
    best_score_rmse = gs.best_score['rmse']
    return best_score_rmse

def get_best_params_rmse(gs):
    """
    Get the combination of parameters that gave the best RMSE score from a grid search object.

    Parameters:
        gs (surprise.model_selection.GridSearchCV): The grid search object.
    Returns:
        dict: Combination of parameters that gave the best RMSE score.
    """
    best_params_rmse = gs.best_params['rmse']
    return best_params_rmse

def build_final_model(trainset, best_params):
    """
    Builds the final SVD model using the best hyperparameters from the grid search.

    Parameters:
        trainset (surprise.Trainset): The training data for the SVD model
        best_params (dict): A dictionary of the best hyperparameters found during the grid search
    Returns:
        svd_algo_optimized (surprise.SVD): The optimized SVD model
    """
    # Building the optimized SVD model using the best hyperparameters
    svd_algo_optimized = SVD(n_epochs=best_params['n_epochs'],
                             lr_all=best_params['lr_all'],
                             reg_all=best_params['reg_all'])

    # Training the algorithm on the trainset
    svd_algo_optimized.fit(trainset)

    return svd_algo_optimized

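An illustrative way to chain these helpers (the data, trainset and testset objects are assumed to be built with surprise's Dataset and train_test_split, as in the collaborative-filtering sketch above):

# Baseline SVD model on the train/test split.
algo_svd = train_svd(trainset)
baseline_rmse = compute_rmse(predict_ratings(testset, algo_svd))

# Hyperparameter search, then refit an optimized model on the same trainset.
gs = grid_search_svd(data)
print(get_best_score_rmse(gs), get_best_params_rmse(gs))

svd_optimized = build_final_model(trainset, get_best_params_rmse(gs))
optimized_rmse = compute_rmse(predict_ratings(testset, svd_optimized))
print(f"baseline RMSE: {baseline_rmse:.4f}, tuned RMSE: {optimized_rmse:.4f}")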
7 changes: 7 additions & 0 deletions src/recommender_system/main.py
@@ -47,6 +47,13 @@ def read_csv(filepath: str) -> pd.DataFrame:
# Create a dataframe with the average ratings and rating counts
final_rating = pd.DataFrame({'avg_rating': avg_rating, 'rating_count': rating_count})

if __name__ == "__main__":
    cleaned_ratings = load_data()

    print(recommend_movies_based_on_popularity(cleaned_ratings, 5, 50))
    print(recommend_movies_based_on_popularity(cleaned_ratings, 5, 100))

    recommend_movies_based_on_similarity(cleaned_ratings, 5, 4)

# Recommend top 5 movies with 50 minimum interactions based on popularity
print(list(get_top_movies(final_rating, 5, 50)))
76 changes: 29 additions & 47 deletions src/recommender_system/pre_processing.py
@@ -1,51 +1,33 @@
# Data pre-processing
import pandas as pd

from numpy import unique


# PREVIEW
def preProcessing(ratings):
ratings.info() # Checking the info of the dataset
#There are 100,004 observations and 4 columns in the data

#Drop columns that are not numeric
# Get numerical columns only from the dataframe
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
ratings = ratings.select_dtypes(include=numerics)

#zero-variance predictors
counts = ratings.nunique()
to_del = [i for i,v in enumerate(counts) if v == 1]
print(to_del)
# drop useless columns
ratings.drop(to_del, axis=1, inplace=True)

# counts = ratings.nunique()
# #Few-value columns
# # record columns to delete
# to_del = [i for i,v in enumerate(counts) if (float(v)/ratings.shape[0]*100) < 1]
# print("before")
# print(to_del)
# print("after")
# # drop useless columns
# if(len(to_del)>0):
# ratings.drop(to_del, axis=1, inplace=True)

#data deduplication
# delete duplicate rows
# ratings.drop_duplicates(inplace=True)

#Missing values
#checking for null values
# print(ratings.isnull().sum().sum()) # zero null values
import numpy as np

def pre_process_data(ratings: pd.DataFrame) -> pd.DataFrame:
    """
    Pre-process the ratings dataframe.

    Parameters
    ----------
    ratings : pd.DataFrame
        The input ratings dataframe.

    Returns
    -------
    pd.DataFrame
        The pre-processed dataframe.
    """
    # Keep only numeric columns
    ratings = ratings.select_dtypes(include=["int16", "int32", "int64", "float16", "float32", "float64"])

    # Drop columns with only one unique value
    to_del = [i for i, v in enumerate(ratings.nunique()) if v == 1]
    ratings.drop(ratings.columns[to_del], axis=1, inplace=True)

    # Replace empty values with NaN
    sum_of_null_values = ratings.isnull().sum().sum()

    if sum_of_null_values > 0:
        ratings.replace("", np.nan, regex=False, inplace=True)

    # Drop the timestamp column
    ratings.drop(["timestamp"], axis=1, inplace=True)

    return ratings

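A small illustrative call (not part of this diff), assuming the MovieLens-style ratings file used elsewhere in this project; the path and column names are assumptions.

if __name__ == "__main__":
    raw_ratings = pd.read_csv("data/ratings.csv")  # columns: userId, movieId, rating, timestamp
    cleaned = pre_process_data(raw_ratings)
    print(cleaned.head())
    print(cleaned.shape)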
36 changes: 31 additions & 5 deletions src/recommender_system/recommender.py
@@ -1,10 +1,6 @@
# build a similarity-based recommendation system using cosine similarity
# use KNN to find similar users, which are the nearest neighbors to the given userid


# Installing surprise library
#!pip install surprise

# Performance metrics in surprise
from surprise import accuracy
# This class will be used to parse a file containing ratings; data should be structured as user ; item ; rating
@@ -58,4 +54,34 @@ def get_recommendations(data, user_id, top_n, algo):
    # Sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n]  # returning the top n highest predicted rating movies for this user

def perform_tuning_and_rmse(data):
    """
    This function performs hyperparameter tuning for the KNN algorithm and
    returns the best rmse score and the combination of hyperparameters that gave the best score.
    """
    param_grid = {
        'k': [20, 30, 40],
        'min_k': [3, 6, 9],
        'sim_options': {
            'name': ['msd', 'cosine'],
            'user_based': [True]
        }
    }
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    grid_obj.fit(data)
    best_rmse_score = grid_obj.best_score['rmse']
    best_params = grid_obj.best_params['rmse']
    return best_rmse_score, best_params

def build_final_model(trainset, testset, sim_options):
    """
    This function builds the final KNN model with optimal hyperparameters and returns
    the rmse score on the test set.
    """
    similarity_algo_optimized_user = KNNBasic(sim_options=sim_options, k=40, min_k=6, verbose=False)
    similarity_algo_optimized_user.fit(trainset)
    predictions = similarity_algo_optimized_user.test(testset)
    rmse = accuracy.rmse(predictions)
    return rmse

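For orientation, an illustrative call sequence (not part of this commit): it assumes KNNBasic and GridSearchCV are imported in the non-expanded part of this module, and that data, trainset and testset are surprise objects built as in the other sketches.

# User-based tuning on the full dataset, then a final model evaluated on the held-out testset.
best_rmse, best_params = perform_tuning_and_rmse(data)
print("cross-validated RMSE:", best_rmse, "best params:", best_params)

test_rmse = build_final_model(trainset, testset, best_params['sim_options'])
print("test RMSE:", test_rmse)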