diff --git a/notebooks/main.ipynb b/notebooks/main.ipynb
index b9db6ae..ddede84 100644
--- a/notebooks/main.ipynb
+++ b/notebooks/main.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 2,
    "metadata": {
     "metadata": {}
    },
@@ -18,10 +18,10 @@
    {
     "data": {
      "text/plain": [
-      ""
+      ""
     ]
    },
-    "execution_count": 9,
+    "execution_count": 2,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -55,7 +55,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -88,6 +88,17 @@
    "print(\"Num validation interactions\", validation_data.to_numpy().sum())"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "training_data.to_pandas().to_pickle(\"training_data.pkl\")\n",
+   "test_data.to_pandas().to_pickle(\"test_data.pkl\")\n",
+   "validation_data.to_pandas().to_pickle(\"validation_data.pkl\")"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": 11,
@@ -103,7 +114,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -127,7 +138,7 @@
    "\n",
    "    results = []\n",
    "\n",
-    "    embedding_dimensions = (4,8,10,16,32,64)\n",
+    "    embedding_dimensions = (4,8,16,32)\n",
    "    # for every embedding\n",
    "    for embedding_dimension in embedding_dimensions:\n",
    "\n",
@@ -617,22 +628,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 13,
   "metadata": {
    "metadata": {}
   },
   "outputs": [
    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[20], line 22\u001b[0m\n\u001b[1;32m 9\u001b[0m tie \u001b[38;5;241m=\u001b[39m TechniqueInferenceEngine(\n\u001b[1;32m 10\u001b[0m training_data\u001b[38;5;241m=\u001b[39mtraining_data,\n\u001b[1;32m 11\u001b[0m validation_data\u001b[38;5;241m=\u001b[39mvalidation_data,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m enterprise_attack_filepath\u001b[38;5;241m=\u001b[39menterprise_attack_filepath,\n\u001b[1;32m 16\u001b[0m )\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# mse = tie.fit_with_validation(\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# learning_rate=[0.001, 0.005, 0.01, 0.02, 0.05],\u001b[39;00m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# num_iterations=[500 * 512],\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# regularization_coefficient=[0, 0.0001, 0.001, 0.01],\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m mse \u001b[38;5;241m=\u001b[39m \u001b[43mtie\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbest_hyperparameters\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMean Squared Error\u001b[39m\u001b[38;5;124m\"\u001b[39m, mse)\n\u001b[1;32m 24\u001b[0m precision \u001b[38;5;241m=\u001b[39m tie\u001b[38;5;241m.\u001b[39mprecision(k\u001b[38;5;241m=\u001b[39mk)\n",
-      "File \u001b[0;32m~/code/technique-inference-engine/models/tie.py:122\u001b[0m, in \u001b[0;36mTechniqueInferenceEngine.fit\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model to the data.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03m The MSE of the prediction matrix, as determined by the test set.\u001b[39;00m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# train\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_training_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_sparse_tensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m mean_squared_error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_model\u001b[38;5;241m.\u001b[39mevaluate(\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_test_data\u001b[38;5;241m.\u001b[39mto_sparse_tensor(), method\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prediction_method\n\u001b[1;32m 126\u001b[0m )\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkrep()\n",
-      "File \u001b[0;32m~/code/technique-inference-engine/models/recommender/bpr_recommender.py:244\u001b[0m, in \u001b[0;36mBPRRecommender.fit\u001b[0;34m(self, data, learning_rate, epochs, regularization_coefficient)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_U[u, :] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m (\n\u001b[1;32m 239\u001b[0m sigmoid_derivative \u001b[38;5;241m*\u001b[39m d_w \u001b[38;5;241m-\u001b[39m (regularization_coefficient \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_U[u, :])\n\u001b[1;32m 240\u001b[0m )\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[i, :] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m (\n\u001b[1;32m 242\u001b[0m sigmoid_derivative \u001b[38;5;241m*\u001b[39m d_hi \u001b[38;5;241m-\u001b[39m (regularization_coefficient \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[i, :])\n\u001b[1;32m 243\u001b[0m )\n\u001b[0;32m--> 244\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[j, :] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m (\n\u001b[1;32m 245\u001b[0m sigmoid_derivative \u001b[38;5;241m*\u001b[39m d_hj \u001b[38;5;241m-\u001b[39m (regularization_coefficient \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[j, :])\n\u001b[1;32m 246\u001b[0m )\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sample user 0.0012049674987792969\n",
+      "u and i 8.883323907852173\n",
+      "j 34.49376893043518\n",
+      "1 2.0999958776653285e-07\n",
+      "2 1.2779525031083565e-06\n",
+      "3 8.259746333628572e-14\n",
+      "Mean Squared Error 1.0243713\n",
+      "Precision 0.010736048749198205\n",
+      "Recall 0.11750631136337891\n",
+      "Normalized Discounted Cumulative Gain 0.07381812089582988\n"
     ]
    }
   ],
@@ -640,7 +654,7 @@
    "# hyperparameters\n",
    "embedding_dimension = 4\n",
    "k = 20\n",
-    "best_hyperparameters = {'regularization_coefficient': 0.0001, 'epochs': 2, 'learning_rate': 0.0001}\n",
+    "best_hyperparameters = {'regularization_coefficient': 0.0001, 'epochs': 25, 'learning_rate': 0.0001}\n",
    "# best_hyperparameters[\"epochs\"] = 20*training_data.m*training_data.n\n",
    "\n",
    "model = BPRRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)\n",
@@ -650,7 +664,7 @@
    "    validation_data=validation_data,\n",
    "    test_data=test_data,\n",
    "    model=model,\n",
-    "    prediction_method=PredictionMethod.COSINE,\n",
+    "    prediction_method=PredictionMethod.DOT,\n",
    "    enterprise_attack_filepath=enterprise_attack_filepath,\n",
    ")\n",
    "# mse = tie.fit_with_validation(\n",
@@ -1170,7 +1184,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.8"
+   "version": "3.11.9"
  }
 },
 "nbformat": 4,
diff --git a/src/tie/engine.py b/src/tie/engine.py
index ed1042b..0a22355 100644
--- a/src/tie/engine.py
+++ b/src/tie/engine.py
@@ -111,7 +111,11 @@ def fit(self, **kwargs) -> float:
             The MSE of the prediction matrix, as determined by the test set.
         """
         # train
-        self._model.fit(self._training_data.to_sparse_tensor(), **kwargs)
+        self._model.fit(
+            self._training_data.to_sparse_tensor(),
+            self._validation_data.to_sparse_tensor(),
+            **kwargs,
+        )
 
         mean_squared_error = self._model.evaluate(
             self._test_data.to_sparse_tensor(), method=self._prediction_method
diff --git a/src/tie/recommender/bpr_recommender.py b/src/tie/recommender/bpr_recommender.py
index e7370e6..8d90b38 100644
--- a/src/tie/recommender/bpr_recommender.py
+++ b/src/tie/recommender/bpr_recommender.py
@@ -6,8 +6,9 @@
 from tie.constants import PredictionMethod
 from tie.utils import calculate_predicted_matrix
 
-
+import time
 from .recommender import Recommender
+import matplotlib.pyplot as plt
 
 
 class BPRRecommender(Recommender):
@@ -101,40 +102,59 @@ def _sample_dataset(
         m, n = data.shape
 
+        start = time.time()
         sample_user_probability = self._calculate_sample_user_probability(data)
-        # repeat for each of n items
-        num_items_per_user = np.sum(data, axis=1).astype(float)
-        assert not np.any(np.isnan(num_items_per_user))
-        num_items_per_user[num_items_per_user == 0.0] = np.nan
-        assert num_items_per_user.shape == (m,)  # m users
-        sample_item_probability = np.nan_to_num(
-            data / np.expand_dims(num_items_per_user, axis=1)
-        )
-
-        joint_user_item_probability = (
-            np.expand_dims(sample_user_probability, axis=1) * sample_item_probability
-        )
-        assert joint_user_item_probability.shape == (m, n)
+        end = time.time()
+        print("sample user", end - start)
 
-        flattened_probability = joint_user_item_probability.flatten("C")
-        u_i = np.random.choice(
-            np.arange(m * n), size=(num_samples,), p=flattened_probability
+        start_2 = time.time()
+        # repeat for each of n items
+        # num_items_per_user = np.sum(data, axis=1).astype(float)
+        # assert not np.any(np.isnan(num_items_per_user))
+        # num_items_per_user[num_items_per_user == 0.0] = np.nan
+        # assert num_items_per_user.shape == (m,)  # m users
+        # sample_item_probability = np.nan_to_num(
+        #     data / np.expand_dims(num_items_per_user, axis=1)
+        # )
+
+        # joint_user_item_probability = (
+        #     np.expand_dims(sample_user_probability, axis=1) * sample_item_probability
+        # )
+        # assert joint_user_item_probability.shape == (m, n)
+
+        # flattened_probability = joint_user_item_probability.flatten("C")
+        # u_i = np.random.choice(
+        #     np.arange(m * n), size=(num_samples,), p=flattened_probability
+        # )
+
+        # all_u = u_i // n
+        # all_i = u_i % n
+        # assert (all_i < 611).all()
+        all_u = np.random.choice(
+            m, size=num_samples, replace=True, p=sample_user_probability
         )
-
-        all_u = u_i // n
-        all_i = u_i % n
-        assert (all_i < 611).all()
+        end_2 = time.time()
+        print("u and i", end_2 - start_2)
 
+        start_3 = time.time()
         non_observations = 1 - data
 
         unique_users, counts = np.unique(all_u, return_counts=True)
         value_to_count = dict(zip(unique_users, counts))
 
+        u_to_i = {}
         u_to_j = {}
         # for each u
         for u, count in value_to_count.items():
+            potential_i = data[u, :]
+            all_i_for_user = np.random.choice(
+                n, size=count, replace=True, p=potential_i / np.sum(potential_i)
+            )
+            u_to_i[u] = all_i_for_user.tolist()
+
             # get
             potential_j = non_observations[u, :]
@@ -144,12 +164,19 @@
 
             u_to_j[u] = all_j_for_user.tolist()
 
+        all_i = []
         all_j = []
         for u in all_u:
+            i = u_to_i[u].pop()
+            all_i.append(i)
             j = u_to_j[u].pop()
             all_j.append(j)
 
+        end_3 = time.time()
+
+        print("j", end_3 - start_3)
+
         assert len(all_u) == len(all_j) == len(all_i)
 
         return all_u, all_i, all_j
@@ -182,6 +209,7 @@ def _predict_for_single_entry(self, u, i) -> float:
     def fit(
         self,
         data: tf.SparseTensor,
+        test_data: tf.SparseTensor,
         learning_rate: float,
         epochs: int,
         regularization_coefficient: float,
@@ -211,22 +239,37 @@ def fit(
 
         all_u, all_i, all_j = self._sample_dataset(data, num_samples=num_iterations)
 
+        losses = []
         # initialize theta - done - init
         # repeat
+        elapsed_1 = 0
+        elapsed_2 = 0
+        elapsed_3 = 0
         for iteration_count in range(num_iterations):
             # draw u, i, j from D_s
+            start = time.time()
             u = all_u[iteration_count]
             i = all_i[iteration_count]
             j = all_j[iteration_count]
 
-            assert data[u, i] == 1
-            assert data[u, j] == 0
+            # u, i, j = all_samples[iteration_count]
+
+            # assert data[u, i] == 1
+            # assert data[u, j] == 0
+            end = time.time()
+            elapsed_1 += end - start
+
+            start_2 = time.time()
             # theta = theta + alpha * (e^(-x) sigma(x) d/dtheta x + lambda theta)
             x_ui = self._predict_for_single_entry(u, i)
             x_uj = self._predict_for_single_entry(u, j)
 
             x_uij = x_ui - x_uj
+            end_2 = time.time()
+            elapsed_2 += end_2 - start_2
+
+            start_3 = time.time()
 
             sigmoid_derivative = (math.e ** (-x_uij)) / (1 + math.e ** (-x_uij))
 
             d_w = self._V[i, :] - self._V[j, :]
@@ -244,9 +287,20 @@
             self._V[j, :] += learning_rate * (
                 sigmoid_derivative * d_hj - (regularization_coefficient * self._V[j, :])
             )
+            end_3 = time.time()
+
+            elapsed_3 = end_3 - start_3
+
+            if iteration_count % len(data) == 0:
+                losses.append(self.evaluate(test_data))
+        print("1", elapsed_1 / num_iterations)
+        print("2", elapsed_2 / num_iterations)
+        print("3", elapsed_3 / num_iterations)
 
         # return theta
         # set in rep
+        plt.plot(list(range(len(losses))), losses)
+        plt.show()
 
     def evaluate(
         self,