-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 5 changed files with 296 additions and 52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
from surprise import KNNBasic | ||
import pandas as pd | ||
from sklearn.model_selection import GridSearchCV | ||
from surprise import accuracy | ||
|
||
def train_item_based_model(trainset, testset, sim_options=None):
    """
    Train an item-based collaborative filtering model with KNNBasic and predict on a testset.

    Parameters
    ----------
    trainset: surprise.Trainset
        The trainset used to fit the KNNBasic algorithm.
    testset: list of tuples
        The testset used to predict the ratings.
    sim_options: dict, optional (default={'name': 'cosine', 'user_based': False})
        The similarity options for the KNNBasic algorithm, e.g. the similarity
        measure under 'name' (such as 'cosine' or 'msd') and the 'user_based' flag.

    Returns
    -------
    predictions: list of surprise.Prediction objects
        The predictions made by the KNNBasic algorithm on the testset.
    """
    # Build the default per call so no mutable dict is shared between calls.
    if sim_options is None:
        sim_options = {'name': 'cosine', 'user_based': False}

    # Define the KNNBasic algorithm and fit it on the trainset
    algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainset)

    # Predict the ratings on the testset
    return algo.test(testset)
|
||
def tune_hyperparameters(data, param_grid=None, cv=3):
    """
    Tune a KNNBasic algorithm with grid search cross-validation.

    Parameters
    ----------
    data: surprise.Dataset
        The dataset used to perform the grid search cross-validation.
    param_grid: dict, optional (default={'k': [20, 30, 40], 'min_k': [3, 6, 9],
                                         'sim_options': {'name': ['msd', 'cosine'],
                                                         'user_based': [False]}})
        The parameter grid to search through.
    cv: int, optional (default=3)
        The number of folds to use for cross-validation.

    Returns
    -------
    best_score: float
        The best root mean squared error (RMSE) score found through the grid search cross-validation.
    best_params: dict
        The combination of parameters that gave the best RMSE score.
    results_df: pandas.DataFrame
        The dataframe containing the results from the grid search cross-validation.
    """
    # The `measures` argument and the `best_score` / `best_params` dicts used
    # below belong to surprise's GridSearchCV, not scikit-learn's, so import
    # the surprise one locally.
    from surprise.model_selection import GridSearchCV

    # Build the default grid per call so no mutable dict is shared between calls.
    if param_grid is None:
        param_grid = {
            'k': [20, 30, 40],
            'min_k': [3, 6, 9],
            'sim_options': {'name': ['msd', 'cosine'], 'user_based': [False]},
        }

    # Perform grid search cross-validation
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=cv, n_jobs=-1)
    grid_obj.fit(data)

    # Get the best RMSE score and best parameters
    best_score = grid_obj.best_score['rmse']
    best_params = grid_obj.best_params['rmse']

    # Tabulate the per-combination results promised by the docstring
    results_df = pd.DataFrame.from_dict(grid_obj.cv_results)

    return best_score, best_params, results_df
|
||
def train_recommendation_model(trainset, testset, param_grid, measures=None, cv=3, n_jobs=-1):
    """
    Train recommendation model using grid search cross-validation

    Parameters:
        trainset (Dataset): data for the grid search and for the final fit
        testset (Dataset): testset for the recommendation system (not used here;
            kept so existing callers keep working)
        param_grid (dict): Hyperparameters to tune
        measures (list): Evaluation metrics, default ['rmse', 'mae']; must
            include 'rmse', which is used to pick the best parameters
        cv (int): Number of folds for cross-validation, default 3
        n_jobs (int): Number of parallel jobs, default -1

    Returns:
        best_model (object): Optimized recommendation model
        results_df (pd.DataFrame): Results of the grid search cross-validation
        best_params (dict): Optimal hyperparameters
    """
    # `measures` and `best_params[...]` belong to surprise's GridSearchCV,
    # not scikit-learn's, so import the surprise one locally.
    from surprise.model_selection import GridSearchCV

    # Build the default per call so no mutable list is shared between calls.
    if measures is None:
        measures = ['rmse', 'mae']

    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=measures, cv=cv, n_jobs=n_jobs)
    grid_obj.fit(trainset)

    best_params = grid_obj.best_params['rmse']

    # Forward every tuned hyperparameter (including all sim_options entries)
    # so the final model really matches the best combination found.
    best_model = KNNBasic(**{**best_params, 'verbose': False})

    # NOTE(review): the grid search consumes a Dataset while algo.fit() needs a
    # Trainset; convert when the object offers build_full_trainset().
    if hasattr(trainset, 'build_full_trainset'):
        fit_data = trainset.build_full_trainset()
    else:
        fit_data = trainset
    best_model.fit(fit_data)

    results_df = pd.DataFrame.from_dict(grid_obj.cv_results)

    return best_model, results_df, best_params
|
||
def evaluate_recommendation_model(recommendation_model, testset, accuracy):
    """
    Evaluate recommendation model using Root Mean Squared Error (RMSE)

    Parameters:
        recommendation_model (object): Trained recommendation model exposing `test(testset)`
        testset (Dataset): testset for the recommendation system
        accuracy (module): Accuracy module from surprise (anything exposing `rmse(predictions)`)

    Returns:
        rmse (float): RMSE of the recommendation model
    """
    predictions = recommendation_model.test(testset)
    return accuracy.rmse(predictions)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import pandas as pd | ||
from surprise import SVD | ||
from surprise import Dataset | ||
from surprise import accuracy | ||
from surprise.model_selection import GridSearchCV | ||
|
||
def train_svd(trainset):
    """
    Train a SVD algorithm with default hyperparameters on a trainset.

    Parameters:
        trainset (surprise.Trainset): The trainset to train the SVD algorithm on.

    Returns:
        surprise.SVD: The trained SVD algorithm.
    """
    algo_svd = SVD()
    algo_svd.fit(trainset)
    return algo_svd
|
||
def predict_ratings(testset, algo_svd):
    """
    Predict ratings using a trained SVD algorithm on a testset.

    Parameters:
        testset (list of tuples): The testset to predict the ratings on.
        algo_svd (surprise.SVD): The trained SVD algorithm.

    Returns:
        list: List of predictions, as returned by `algo_svd.test(testset)`.
    """
    return algo_svd.test(testset)
|
||
def compute_rmse(predictions):
    """
    Compute RMSE (Root Mean Squared Error) on predictions.

    Parameters:
        predictions (list): List of predictions.

    Returns:
        float: RMSE, as computed by `surprise.accuracy.rmse`.
    """
    return accuracy.rmse(predictions)
|
||
def grid_search_svd(data, param_grid=None, measures=('rmse', 'mae'), cv=3, n_jobs=-1):
    """
    Perform hyperparameter tuning for a SVD algorithm using GridSearchCV.

    Parameters:
        data (surprise.Dataset): The dataset to perform hyperparameter tuning on.
        param_grid (dict, optional): Grid of SVD hyperparameters to search. Defaults to
            {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.2, 0.4, 0.6]}.
        measures (sequence of str, optional): Evaluation metrics, default ('rmse', 'mae').
        cv (int, optional): Number of cross-validation folds, default 3.
        n_jobs (int, optional): Number of parallel jobs, default -1.

    Returns:
        surprise.model_selection.GridSearchCV: The grid search object.
    """
    # Build the default grid per call so no mutable dict is shared between calls.
    if param_grid is None:
        param_grid = {
            'n_epochs': [10, 20, 30],
            'lr_all': [0.001, 0.005, 0.01],
            'reg_all': [0.2, 0.4, 0.6],
        }

    gs = GridSearchCV(SVD, param_grid, measures=list(measures), cv=cv, n_jobs=n_jobs)
    gs.fit(data)
    return gs
|
||
def get_best_score_rmse(gs):
    """
    Get the best RMSE score from a grid search object.

    Parameters:
        gs (surprise.model_selection.GridSearchCV): The grid search object.

    Returns:
        float: Best RMSE score, i.e. `gs.best_score['rmse']`.
    """
    return gs.best_score['rmse']
|
||
def get_best_params_rmse(gs):
    """
    Get the combination of parameters that gave the best RMSE score from a grid search object.

    Parameters:
        gs (surprise.model_selection.GridSearchCV): The grid search object.

    Returns:
        dict: Combination of parameters that gave the best RMSE score,
            i.e. `gs.best_params['rmse']`.
    """
    return gs.best_params['rmse']
|
||
def build_final_model(trainset, best_params):
    """
    Builds the final SVD model using the best hyperparameters from the grid search.

    Parameters:
        trainset (surprise.Trainset): The training data for the SVD model
        best_params (dict): A dictionary of the best hyperparameters found during the
            grid search (e.g. 'n_epochs', 'lr_all', 'reg_all'); every entry is
            forwarded to the SVD constructor

    Returns:
        svd_algo_optimized (surprise.SVD): The optimized, fitted SVD model
    """
    # Forward all tuned hyperparameters so none of them is silently dropped.
    svd_algo_optimized = SVD(**best_params)

    # Training the algorithm on the trainset
    svd_algo_optimized.fit(trainset)

    return svd_algo_optimized
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,33 @@ | ||
# Data pre-processing | ||
import pandas as pd | ||
|
||
from numpy import unique | ||
|
||
|
||
# PREVIEW | ||
def preProcessing(ratings):
    """
    Preview and pre-process the ratings dataframe.

    Prints the dataframe info and the positions of the dropped constant
    columns, keeps only the numeric columns, drops columns with a single
    unique value and drops the 'timestamp' column when present.

    Parameters
    ----------
    ratings : pd.DataFrame
        The input ratings dataframe.

    Returns
    -------
    pd.DataFrame
        The pre-processed dataframe.
    """
    # Checking the info of the dataset
    # (the original author noted 100,004 observations and 4 columns)
    ratings.info()

    # Get numerical columns only from the dataframe
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    ratings = ratings.select_dtypes(include=numerics)

    # Zero-variance predictors: positions of columns with a single unique value
    counts = ratings.nunique()
    to_del = [i for i, v in enumerate(counts) if v == 1]
    print(to_del)

    # drop() expects labels, so translate the positions into column names first
    ratings = ratings.drop(columns=ratings.columns[to_del])

    # Replace empty values with NaN when the frame already contains nulls
    sum_of_null_values = ratings.isnull().sum().sum()
    if sum_of_null_values > 0:
        ratings = ratings.replace("", np.nan, regex=False)

    # Dropping the timestamp column; it may be absent or already removed above
    ratings = ratings.drop(columns=["timestamp"], errors="ignore")
    return ratings
import numpy as np | ||
|
||
def pre_process_data(ratings: pd.DataFrame) -> pd.DataFrame:
    """
    Pre-process the ratings dataframe.

    Keeps only the numeric columns, drops columns with a single unique value
    and drops the 'timestamp' column when present.

    Parameters
    ----------
    ratings : pd.DataFrame
        The input ratings dataframe.

    Returns
    -------
    pd.DataFrame
        The pre-processed dataframe.
    """
    # Keep only numeric columns
    numeric_dtypes = ["int16", "int32", "int64", "float16", "float32", "float64"]
    processed = ratings.select_dtypes(include=numeric_dtypes)

    # Drop columns with only one unique value
    unique_counts = processed.nunique()
    constant_columns = unique_counts[unique_counts == 1].index
    processed = processed.drop(columns=constant_columns)

    # Replace empty values with NaN when the frame already contains nulls
    if processed.isnull().sum().sum() > 0:
        processed = processed.replace("", np.nan, regex=False)

    # Drop timestamp column; it may be absent, non-numeric, or already removed
    # above as a constant column, so a missing label must not raise.
    processed = processed.drop(columns=["timestamp"], errors="ignore")

    return processed
# ratings.info() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters