re-wrote func defs & doc strings
Kinjuriu committed May 17, 2023
1 parent a0dcf98 commit b0ae82b
Showing 5 changed files with 296 additions and 52 deletions.
120 changes: 120 additions & 0 deletions models/collaborative_filtering.py
@@ -0,0 +1,120 @@
from surprise import KNNBasic
import pandas as pd
from surprise.model_selection import GridSearchCV
from surprise import accuracy

def train_item_based_model(trainset, testset, sim_options={'name': 'cosine', 'user_based': False}):
    """
    Trains an item-based collaborative filtering recommendation system using the KNNBasic algorithm.

    Parameters
    ----------
    trainset: surprise.Trainset
        The trainset used to fit the KNNBasic algorithm.
    testset: list of tuples
        The testset used to predict the ratings.
    sim_options: dict, optional (default={'name': 'cosine', 'user_based': False})
        The similarity options for the KNNBasic algorithm.
        Available options are 'cosine' and 'msd'.

    Returns
    -------
    predictions: list of surprise.Prediction objects
        The predictions made by the KNNBasic algorithm on the testset.
    """
    # Define the KNNBasic algorithm
    algo = KNNBasic(sim_options=sim_options, verbose=False)

    # Fit the KNNBasic algorithm on the trainset
    algo.fit(trainset)

    # Predict the ratings on the testset
    predictions = algo.test(testset)

    return predictions

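For context, a minimal usage sketch (illustrative only, not part of this commit): it assumes a MovieLens-style ratings file at a hypothetical path data/ratings.csv with userId, movieId and rating columns on a 0.5-5 scale, and builds the train/test split with surprise's Reader and Dataset helpers.

import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Hypothetical input file and column names -- adjust to the project's data.
ratings = pd.read_csv("data/ratings.csv")

# Build a surprise Dataset from the (user, item, rating) columns.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Hold out 20% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Item-based KNN with cosine similarity (the function's default sim_options).
predictions = train_item_based_model(trainset, testset)
accuracy.rmse(predictions)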
def tune_hyperparameters(data, param_grid={'k': [20, 30, 40], 'min_k': [3, 6, 9],
                                           'sim_options': {'name': ['msd', 'cosine'],
                                                           'user_based': [False]}
                                           }, cv=3):
    """
    Performs hyperparameter tuning on a KNNBasic algorithm using grid search cross-validation.

    Parameters
    ----------
    data: surprise.Dataset
        The dataset used to perform the grid search cross-validation.
    param_grid: dict, optional (default={'k': [20, 30, 40], 'min_k': [3, 6, 9],
                                         'sim_options': {'name': ['msd', 'cosine'],
                                                         'user_based': [False]}})
        The parameter grid to search through.
    cv: int, optional (default=3)
        The number of folds to use for cross-validation.

    Returns
    -------
    best_score: float
        The best root mean squared error (RMSE) score found through the grid search cross-validation.
    best_params: dict
        The combination of parameters that gave the best RMSE score.
    results_df: pandas.DataFrame
        The dataframe containing the results from the grid search cross-validation.
    """
    # Perform grid search cross-validation
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=cv, n_jobs=-1)
    grid_obj.fit(data)

    # Get the best RMSE score and best parameters
    best_score = grid_obj.best_score['rmse']
    best_params = grid_obj.best_params['rmse']

    # Collect the full cross-validation results
    results_df = pd.DataFrame.from_dict(grid_obj.cv_results)

    return best_score, best_params, results_df

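A quick illustrative call, assuming data is a surprise Dataset built as in the earlier sketch:

# Grid-search the KNN neighbourhood size and similarity metric, then inspect the results.
best_rmse, best_params, results_df = tune_hyperparameters(data)
print(f"Best RMSE: {best_rmse:.4f} with parameters {best_params}")
print(results_df.head())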
def train_recommendation_model(trainset, testset, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1):
    """
    Train a recommendation model using grid search cross-validation.

    Parameters:
        trainset (surprise.Dataset): dataset used for the grid search and for fitting the final model
        testset (list of tuples): testset for the recommendation system
        param_grid (dict): Hyperparameters to tune
        measures (list): Evaluation metrics, default ['rmse', 'mae']
        cv (int): Number of folds for cross-validation, default 3
        n_jobs (int): Number of parallel jobs, default -1

    Returns:
        best_model (surprise.KNNBasic): Optimized recommendation model
        results_df (pd.DataFrame): Results of the grid search cross-validation
        best_params (dict): Optimal hyperparameters
    """
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=measures, cv=cv, n_jobs=n_jobs)
    grid_obj.fit(trainset)

    best_params = grid_obj.best_params['rmse']
    best_model = KNNBasic(sim_options={'name': best_params['sim_options']['name'],
                                       'user_based': best_params['sim_options']['user_based']},
                          k=best_params['k'],
                          min_k=best_params['min_k'],
                          verbose=False)

    # KNNBasic.fit expects a Trainset, so build one from the full dataset
    best_model.fit(trainset.build_full_trainset())

    results_df = pd.DataFrame.from_dict(grid_obj.cv_results)

    return best_model, results_df, best_params

def evaluate_recommendation_model(recommendation_model, testset, accuracy):
    """
    Evaluate a recommendation model using Root Mean Squared Error (RMSE).

    Parameters:
        recommendation_model (object): Trained recommendation model
        testset (list of tuples): testset for the recommendation system
        accuracy (module): Accuracy module from surprise
    Returns:
        rmse (float): RMSE of the recommendation model
    """
    predictions = recommendation_model.test(testset)
    rmse = accuracy.rmse(predictions)

    return rmse

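Putting the last two helpers together, an illustrative end-to-end run might look like the following; the data and testset objects are assumed from the earlier sketch, and the hyperparameter grid mirrors the defaults in tune_hyperparameters.

param_grid = {'k': [20, 30, 40], 'min_k': [3, 6, 9],
              'sim_options': {'name': ['msd', 'cosine'], 'user_based': [False]}}

# Grid-search on the full Dataset, refit the best KNN model, then score it on the held-out testset.
best_model, results_df, best_params = train_recommendation_model(data, testset, param_grid)
rmse = evaluate_recommendation_model(best_model, testset, accuracy)
print(best_params, rmse)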
109 changes: 109 additions & 0 deletions models/matrix_factorization.py
@@ -0,0 +1,109 @@
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV

def train_svd(trainset):
    """
    Train an SVD algorithm on a trainset.

    Parameters:
        trainset (surprise.Trainset): The trainset to train the SVD algorithm on.
    Returns:
        surprise.SVD: The trained SVD algorithm.
    """
    algo_svd = SVD()
    algo_svd.fit(trainset)
    return algo_svd

def predict_ratings(testset, algo_svd):
    """
    Predict ratings using a trained SVD algorithm on a testset.

    Parameters:
        testset (list of tuples): The testset to predict the ratings on.
        algo_svd (surprise.SVD): The trained SVD algorithm.
    Returns:
        list: List of predictions.
    """
    predictions = algo_svd.test(testset)
    return predictions

def compute_rmse(predictions):
    """
    Compute RMSE (Root Mean Squared Error) on predictions.

    Parameters:
        predictions (list): List of predictions.
    Returns:
        float: RMSE.
    """
    rmse = accuracy.rmse(predictions)
    return rmse

def grid_search_svd(data):
    """
    Perform hyperparameter tuning for an SVD algorithm using GridSearchCV.

    Parameters:
        data (surprise.Dataset): The dataset to perform hyperparameter tuning on.
    Returns:
        surprise.model_selection.GridSearchCV: The fitted grid search object.
    """
    param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01], 'reg_all': [0.2, 0.4, 0.6]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    gs.fit(data)
    return gs

def get_best_score_rmse(gs):
    """
    Get the best RMSE score from a grid search object.

    Parameters:
        gs (surprise.model_selection.GridSearchCV): The grid search object.
    Returns:
        float: Best RMSE score.
    """
    best_score_rmse = gs.best_score['rmse']
    return best_score_rmse

def get_best_params_rmse(gs):
    """
    Get the combination of parameters that gave the best RMSE score from a grid search object.

    Parameters:
        gs (surprise.model_selection.GridSearchCV): The grid search object.
    Returns:
        dict: Combination of parameters that gave the best RMSE score.
    """
    best_params_rmse = gs.best_params['rmse']
    return best_params_rmse

def build_final_model(trainset, best_params):
    """
    Builds the final SVD model using the best hyperparameters from the grid search.

    Parameters:
        trainset (surprise.Trainset): The training data for the SVD model
        best_params (dict): A dictionary of the best hyperparameters found during the grid search
    Returns:
        svd_algo_optimized (surprise.SVD): The optimized SVD model
    """
    # Building the optimized SVD model using the best hyperparameters
    svd_algo_optimized = SVD(n_epochs=best_params['n_epochs'],
                             lr_all=best_params['lr_all'],
                             reg_all=best_params['reg_all'])

    # Training the algorithm on the trainset
    svd_algo_optimized.fit(trainset)

    return svd_algo_optimized

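An illustrative way to chain these helpers (the data, trainset and testset objects are assumed to be built with surprise's Dataset and train_test_split, as in the collaborative-filtering sketch above):

# Baseline SVD model on the train/test split.
algo_svd = train_svd(trainset)
baseline_rmse = compute_rmse(predict_ratings(testset, algo_svd))

# Hyperparameter search, then refit an optimized model on the same trainset.
gs = grid_search_svd(data)
print(get_best_score_rmse(gs), get_best_params_rmse(gs))

svd_optimized = build_final_model(trainset, get_best_params_rmse(gs))
optimized_rmse = compute_rmse(predict_ratings(testset, svd_optimized))
print(f"baseline RMSE: {baseline_rmse:.4f}, tuned RMSE: {optimized_rmse:.4f}")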
7 changes: 7 additions & 0 deletions src/recommender_system/main.py
@@ -47,6 +47,13 @@ def read_csv(filepath: str) -> pd.DataFrame:
# Create a dataframe with the average ratings and rating counts
final_rating = pd.DataFrame({'avg_rating': avg_rating, 'rating_count': rating_count})

if __name__ == "__main__":
    cleaned_ratings = load_data()

    print(recommend_movies_based_on_popularity(cleaned_ratings, 5, 50))
    print(recommend_movies_based_on_popularity(cleaned_ratings, 5, 100))

    recommend_movies_based_on_similarity(cleaned_ratings, 5, 4)

# Recommend top 5 movies with 50 minimum interactions based on popularity
print(list(get_top_movies(final_rating, 5, 50)))
76 changes: 29 additions & 47 deletions src/recommender_system/pre_processing.py
@@ -1,51 +1,33 @@
# Data pre-processing
import pandas as pd

from numpy import unique


# PREVIEW
def preProcessing(ratings):
ratings.info() # Checking the info of the dataset
#There are 100,004 observations and 4 columns in the data

#Drop columns that are not numeric
# Get numerical columns only from the dataframe
numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
ratings = ratings.select_dtypes(include=numerics)

#zero-variance predictors
counts = ratings.nunique()
to_del = [i for i,v in enumerate(counts) if v == 1]
print(to_del)
# drop useless columns
ratings.drop(to_del, axis=1, inplace=True)

# counts = ratings.nunique()
# #Few-value columns
# # record columns to delete
# to_del = [i for i,v in enumerate(counts) if (float(v)/ratings.shape[0]*100) < 1]
# print("before")
# print(to_del)
# print("after")
# # drop useless columns
# if(len(to_del)>0):
# ratings.drop(to_del, axis=1, inplace=True)

#data deduplication
# delete duplicate rows
# ratings.drop_duplicates(inplace=True)

#Missing values
#checking for null values
# print(ratings.isnull().sum().sum()) # zero null values
import numpy as np

def pre_process_data(ratings: pd.DataFrame) -> pd.DataFrame:
    """
    Pre-process the ratings dataframe.

    Parameters
    ----------
    ratings : pd.DataFrame
        The input ratings dataframe.

    Returns
    -------
    pd.DataFrame
        The pre-processed dataframe.
    """
    # Keep only numeric columns
    ratings = ratings.select_dtypes(include=["int16", "int32", "int64", "float16", "float32", "float64"])

    # Drop columns with only one unique value
    to_del = [i for i, v in enumerate(ratings.nunique()) if v == 1]
    ratings.drop(ratings.columns[to_del], axis=1, inplace=True)

    # Replace empty values with NaN
    sum_of_null_values = ratings.isnull().sum().sum()

    if sum_of_null_values > 0:
        ratings.replace("", np.nan, regex=False, inplace=True)

    # Drop the timestamp column
    ratings.drop(["timestamp"], axis=1, inplace=True)

    return ratings

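A small illustrative call (not part of this diff), assuming the MovieLens-style ratings file used elsewhere in this project; the path and column names are assumptions.

if __name__ == "__main__":
    raw_ratings = pd.read_csv("data/ratings.csv")  # columns: userId, movieId, rating, timestamp
    cleaned = pre_process_data(raw_ratings)
    print(cleaned.head())
    print(cleaned.shape)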
36 changes: 31 additions & 5 deletions src/recommender_system/recommender.py
@@ -1,10 +1,6 @@
# build a similarity-based recommendation system using cosine similarity
# use KNN to find similar users, which are the nearest neighbors to the given userid


# Installing surprise library
#!pip install surprise

# Performance metrics in surprise
from surprise import accuracy
# This class will be used to parse a file containing ratings; data should be structured as user ; item ; rating
@@ -58,4 +54,34 @@ def get_recommendations(data, user_id, top_n, algo):
    # Sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n]  # returning the top n highest predicted rating movies for this user

def perform_tuning_and_rmse(data):
    """
    This function performs hyperparameter tuning for the KNN algorithm and
    returns the best rmse score and the combination of hyperparameters that gave the best score.
    """
    param_grid = {
        'k': [20, 30, 40],
        'min_k': [3, 6, 9],
        'sim_options': {
            'name': ['msd', 'cosine'],
            'user_based': [True]
        }
    }
    grid_obj = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    grid_obj.fit(data)
    best_rmse_score = grid_obj.best_score['rmse']
    best_params = grid_obj.best_params['rmse']
    return best_rmse_score, best_params

def build_final_model(trainset, testset, sim_options):
    """
    This function builds the final KNN model with optimal hyperparameters and returns
    the rmse score on the test set.
    """
    similarity_algo_optimized_user = KNNBasic(sim_options=sim_options, k=40, min_k=6, verbose=False)
    similarity_algo_optimized_user.fit(trainset)
    predictions = similarity_algo_optimized_user.test(testset)
    rmse = accuracy.rmse(predictions)
    return rmse

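For orientation, an illustrative call sequence (not part of this commit): it assumes KNNBasic and GridSearchCV are imported in the non-expanded part of this module, and that data, trainset and testset are surprise objects built as in the other sketches.

# User-based tuning on the full dataset, then a final model evaluated on the held-out testset.
best_rmse, best_params = perform_tuning_and_rmse(data)
print("cross-validated RMSE:", best_rmse, "best params:", best_params)

test_rmse = build_final_model(trainset, testset, best_params['sim_options'])
print("test RMSE:", test_rmse)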