train bpr
Turner committed Jul 20, 2024
1 parent 6e4fa04 commit 924ecb1
Showing 3 changed files with 115 additions and 43 deletions.
54 changes: 34 additions & 20 deletions notebooks/main.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 2,
"metadata": {
"metadata": {}
},
@@ -18,10 +18,10 @@
{
"data": {
"text/plain": [
"<module 'tie.recommender' from '/Users/mcarenzo/Desktop/CTID/technique-inference-engine/src/tie/recommender/__init__.py'>"
"<module 'tie.recommender' from '/Users/mjturner/code/technique-inference-engine/src/tie/recommender/__init__.py'>"
]
},
"execution_count": 9,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -55,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -88,6 +88,17 @@
"print(\"Num validation interactions\", validation_data.to_numpy().sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"training_data.to_pandas().to_pickle(\"training_data.pkl\")\n",
"test_data.to_pandas().to_pickle(\"test_data.pkl\")\n",
"validation_data.to_pandas().to_pickle(\"validation_data.pkl\")"
]
},
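The added cell above pickles the train/validation/test splits so later sessions can reuse them without re-splitting. A minimal sketch of reloading them with pandas (file names match the cell above; paths are assumed relative to the notebook):

```python
import pandas as pd

# Reload the splits persisted by the cell above (hypothetical follow-up, not in the notebook).
training_df = pd.read_pickle("training_data.pkl")
test_df = pd.read_pickle("test_data.pkl")
validation_df = pd.read_pickle("validation_data.pkl")

print(training_df.shape, validation_df.shape, test_df.shape)
```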
{
"cell_type": "code",
"execution_count": 11,
@@ -103,7 +114,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -127,7 +138,7 @@
"\n",
" results = []\n",
"\n",
" embedding_dimensions = (4,8,10,16,32,64)\n",
" embedding_dimensions = (4,8,16,32)\n",
" # for every embedding\n",
" for embedding_dimension in embedding_dimensions:\n",
"\n",
@@ -617,30 +628,33 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 13,
"metadata": {
"metadata": {}
},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[20], line 22\u001b[0m\n\u001b[1;32m 9\u001b[0m tie \u001b[38;5;241m=\u001b[39m TechniqueInferenceEngine(\n\u001b[1;32m 10\u001b[0m training_data\u001b[38;5;241m=\u001b[39mtraining_data,\n\u001b[1;32m 11\u001b[0m validation_data\u001b[38;5;241m=\u001b[39mvalidation_data,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m enterprise_attack_filepath\u001b[38;5;241m=\u001b[39menterprise_attack_filepath,\n\u001b[1;32m 16\u001b[0m )\n\u001b[1;32m 17\u001b[0m \u001b[38;5;66;03m# mse = tie.fit_with_validation(\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# learning_rate=[0.001, 0.005, 0.01, 0.02, 0.05],\u001b[39;00m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# num_iterations=[500 * 512],\u001b[39;00m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# regularization_coefficient=[0, 0.0001, 0.001, 0.01],\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m mse \u001b[38;5;241m=\u001b[39m \u001b[43mtie\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbest_hyperparameters\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMean Squared Error\u001b[39m\u001b[38;5;124m\"\u001b[39m, mse)\n\u001b[1;32m 24\u001b[0m precision \u001b[38;5;241m=\u001b[39m tie\u001b[38;5;241m.\u001b[39mprecision(k\u001b[38;5;241m=\u001b[39mk)\n",
"File \u001b[0;32m~/code/technique-inference-engine/models/tie.py:122\u001b[0m, in \u001b[0;36mTechniqueInferenceEngine.fit\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model to the data.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[38;5;124;03m The MSE of the prediction matrix, as determined by the test set.\u001b[39;00m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# train\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_training_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_sparse_tensor\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m mean_squared_error \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_model\u001b[38;5;241m.\u001b[39mevaluate(\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_test_data\u001b[38;5;241m.\u001b[39mto_sparse_tensor(), method\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prediction_method\n\u001b[1;32m 126\u001b[0m )\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_checkrep()\n",
"File \u001b[0;32m~/code/technique-inference-engine/models/recommender/bpr_recommender.py:244\u001b[0m, in \u001b[0;36mBPRRecommender.fit\u001b[0;34m(self, data, learning_rate, epochs, regularization_coefficient)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_U[u, :] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m (\n\u001b[1;32m 239\u001b[0m sigmoid_derivative \u001b[38;5;241m*\u001b[39m d_w \u001b[38;5;241m-\u001b[39m (regularization_coefficient \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_U[u, :])\n\u001b[1;32m 240\u001b[0m )\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[i, :] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m (\n\u001b[1;32m 242\u001b[0m sigmoid_derivative \u001b[38;5;241m*\u001b[39m d_hi \u001b[38;5;241m-\u001b[39m (regularization_coefficient \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[i, :])\n\u001b[1;32m 243\u001b[0m )\n\u001b[0;32m--> 244\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[j, :] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learning_rate \u001b[38;5;241m*\u001b[39m (\n\u001b[1;32m 245\u001b[0m sigmoid_derivative \u001b[38;5;241m*\u001b[39m d_hj \u001b[38;5;241m-\u001b[39m (regularization_coefficient \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_V[j, :])\n\u001b[1;32m 246\u001b[0m )\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
"name": "stdout",
"output_type": "stream",
"text": [
"sample user 0.0012049674987792969\n",
"u and i 8.883323907852173\n",
"j 34.49376893043518\n",
"1 2.0999958776653285e-07\n",
"2 1.2779525031083565e-06\n",
"3 8.259746333628572e-14\n",
"Mean Squared Error 1.0243713\n",
"Precision 0.010736048749198205\n",
"Recall 0.11750631136337891\n",
"Normalized Discounted Cumulative Gain 0.07381812089582988\n"
]
}
],
"source": [
"# hyperparameters\n",
"embedding_dimension = 4\n",
"k = 20\n",
"best_hyperparameters = {'regularization_coefficient': 0.0001, 'epochs': 2, 'learning_rate': 0.0001}\n",
"best_hyperparameters = {'regularization_coefficient': 0.0001, 'epochs': 25, 'learning_rate': 0.0001}\n",
"# best_hyperparameters[\"epochs\"] = 20*training_data.m*training_data.n\n",
"\n",
"model = BPRRecommender(m=training_data.m, n=training_data.n, k=embedding_dimension)\n",
@@ -650,7 +664,7 @@
" validation_data=validation_data,\n",
" test_data=test_data,\n",
" model=model,\n",
" prediction_method=PredictionMethod.COSINE,\n",
" prediction_method=PredictionMethod.DOT,\n",
" enterprise_attack_filepath=enterprise_attack_filepath,\n",
")\n",
"# mse = tie.fit_with_validation(\n",
@@ -1170,7 +1184,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,
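The new cell output above reports Precision, Recall, and Normalized Discounted Cumulative Gain at k = 20. For reference, a self-contained NumPy sketch of the standard per-user precision@k / recall@k computation; this follows the textbook definitions and is not code from the repository:

```python
import numpy as np

def precision_recall_at_k(scores: np.ndarray, held_out: np.ndarray, k: int = 20):
    """Average per-user precision@k and recall@k.

    scores:   (m, n) predicted scores for every user/item pair.
    held_out: (m, n) binary matrix of held-out positive interactions.
    """
    top_k = np.argsort(-scores, axis=1)[:, :k]           # indices of the k highest-scored items
    hits = np.take_along_axis(held_out, top_k, axis=1)   # 1 where a top-k item is a held-out positive
    positives = held_out.sum(axis=1)
    mask = positives > 0                                  # skip users with nothing held out
    precision = hits[mask].sum(axis=1) / k
    recall = hits[mask].sum(axis=1) / positives[mask]
    return precision.mean(), recall.mean()
```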
6 changes: 5 additions & 1 deletion src/tie/engine.py
@@ -111,7 +111,11 @@ def fit(self, **kwargs) -> float:
The MSE of the prediction matrix, as determined by the test set.
"""
# train
self._model.fit(self._training_data.to_sparse_tensor(), **kwargs)
self._model.fit(
self._training_data.to_sparse_tensor(),
self._validation_data.to_sparse_tensor(),
**kwargs,
)

mean_squared_error = self._model.evaluate(
self._test_data.to_sparse_tensor(), method=self._prediction_method
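With this change, `TechniqueInferenceEngine.fit` forwards the validation split as a second positional tensor to the recommender's `fit` (BPRRecommender receives it under the parameter name `test_data`). A hypothetical stand-in recommender illustrating that contract; nothing below is the repository's actual `Recommender` base class:

```python
import numpy as np
import tensorflow as tf

class TinyRecommender:
    """Illustrative stand-in for the new fit(training, validation, ...) contract."""

    def __init__(self, m: int, n: int, k: int):
        rng = np.random.default_rng(0)
        self._U = rng.normal(size=(m, k))  # report embeddings
        self._V = rng.normal(size=(n, k))  # technique embeddings

    def evaluate(self, held_out: tf.SparseTensor) -> float:
        dense = tf.sparse.to_dense(held_out).numpy()
        return float(np.mean((dense - self._U @ self._V.T) ** 2))

    def fit(self, data: tf.SparseTensor, validation_data: tf.SparseTensor, epochs: int = 5):
        losses = []
        for _ in range(epochs):
            ...  # gradient updates on `data` would go here
            losses.append(self.evaluate(validation_data))  # track held-out loss per epoch
        return losses
```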
98 changes: 76 additions & 22 deletions src/tie/recommender/bpr_recommender.py
@@ -6,8 +6,9 @@

from tie.constants import PredictionMethod
from tie.utils import calculate_predicted_matrix

import time
from .recommender import Recommender
import matplotlib.pyplot as plt


class BPRRecommender(Recommender):
@@ -101,40 +102,59 @@ def _sample_dataset(

m, n = data.shape

start = time.time()
sample_user_probability = self._calculate_sample_user_probability(data)

# repeat for each of n items
num_items_per_user = np.sum(data, axis=1).astype(float)
assert not np.any(np.isnan(num_items_per_user))
num_items_per_user[num_items_per_user == 0.0] = np.nan
assert num_items_per_user.shape == (m,) # m users
sample_item_probability = np.nan_to_num(
data / np.expand_dims(num_items_per_user, axis=1)
)

joint_user_item_probability = (
np.expand_dims(sample_user_probability, axis=1) * sample_item_probability
)
assert joint_user_item_probability.shape == (m, n)
end = time.time()
print("sample user", end - start)

flattened_probability = joint_user_item_probability.flatten("C")
u_i = np.random.choice(
np.arange(m * n), size=(num_samples,), p=flattened_probability
start_2 = time.time()
# repeat for each of n items
# num_items_per_user = np.sum(data, axis=1).astype(float)
# assert not np.any(np.isnan(num_items_per_user))
# num_items_per_user[num_items_per_user == 0.0] = np.nan
# assert num_items_per_user.shape == (m,) # m users
# sample_item_probability = np.nan_to_num(
# data / np.expand_dims(num_items_per_user, axis=1)
# )

# joint_user_item_probability = (
# np.expand_dims(sample_user_probability, axis=1) * sample_item_probability
# )
# assert joint_user_item_probability.shape == (m, n)

# flattened_probability = joint_user_item_probability.flatten("C")
# u_i = np.random.choice(
# np.arange(m * n), size=(num_samples,), p=flattened_probability
# )

# all_u = u_i // n
# all_i = u_i % n
# assert (all_i < 611).all()
all_u = np.random.choice(
m, size=num_samples, replace=True, p=sample_user_probability
)

all_u = u_i // n
all_i = u_i % n
assert (all_i < 611).all()
end_2 = time.time()
print("u and i", end_2 - start_2)

start_3 = time.time()
non_observations = 1 - data

unique_users, counts = np.unique(all_u, return_counts=True)
value_to_count = dict(zip(unique_users, counts))

u_to_i = {}
u_to_j = {}

# for each u
for u, count in value_to_count.items():
potential_i = data[u, :]
all_i_for_user = np.random.choice(
n, size=count, replace=True, p=potential_i / np.sum(potential_i)
)
u_to_i[u] = all_i_for_user.tolist()

# get
potential_j = non_observations[u, :]

@@ -144,12 +164,19 @@

u_to_j[u] = all_j_for_user.tolist()

all_i = []
all_j = []

for u in all_u:
i = u_to_i[u].pop()
all_i.append(i)
j = u_to_j[u].pop()
all_j.append(j)

end_3 = time.time()

print("j", end_3 - start_3)

assert len(all_u) == len(all_j) == len(all_i)

return all_u, all_i, all_j
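The replaced code built a full m x n joint probability and drew flattened indices; the new code draws users first (weighted by `sample_user_probability`), then a positive item i and an unobserved item j per user. A compressed standalone sketch of that two-stage scheme; `user_probability` is assumed normalized, and every sampled user is assumed to have at least one observed and one unobserved item:

```python
import numpy as np

def sample_bpr_triples(data: np.ndarray, user_probability: np.ndarray, num_samples: int, rng=None):
    """Draw (u, i, j) BPR triples: u by user_probability, i uniformly from u's
    observed items, j uniformly from u's unobserved items."""
    rng = rng or np.random.default_rng()
    m, n = data.shape
    users = rng.choice(m, size=num_samples, replace=True, p=user_probability)
    positives, negatives = [], []
    for u in users:
        observed = np.flatnonzero(data[u])         # items this user interacted with
        unobserved = np.flatnonzero(data[u] == 0)  # everything else
        positives.append(rng.choice(observed))
        negatives.append(rng.choice(unobserved))
    return users, np.array(positives), np.array(negatives)
```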
@@ -182,6 +209,7 @@ def _predict_for_single_entry(self, u, i) -> float:
def fit(
self,
data: tf.SparseTensor,
test_data: tf.SparseTensor,
learning_rate: float,
epochs: int,
regularization_coefficient: float,
@@ -211,22 +239,37 @@ def fit(

all_u, all_i, all_j = self._sample_dataset(data, num_samples=num_iterations)

losses = []
# initialize theta - done - init
# repeat
elapsed_1 = 0
elapsed_2 = 0
elapsed_3 = 0
for iteration_count in range(num_iterations):
# draw u, i, j from D_s
start = time.time()
u = all_u[iteration_count]
i = all_i[iteration_count]
j = all_j[iteration_count]

assert data[u, i] == 1
assert data[u, j] == 0
# u, i, j = all_samples[iteration_count]

# assert data[u, i] == 1
# assert data[u, j] == 0

end = time.time()
elapsed_1 += end - start

start_2 = time.time()
# theta = theta + alpha * (e^(-x) sigma(x) d/dtheta x + lambda theta)
x_ui = self._predict_for_single_entry(u, i)
x_uj = self._predict_for_single_entry(u, j)
x_uij = x_ui - x_uj

end_2 = time.time()
elapsed_2 += end_2 - start_2

start_3 = time.time()
sigmoid_derivative = (math.e ** (-x_uij)) / (1 + math.e ** (-x_uij))

d_w = self._V[i, :] - self._V[j, :]
@@ -244,9 +287,20 @@
self._V[j, :] += learning_rate * (
sigmoid_derivative * d_hj - (regularization_coefficient * self._V[j, :])
)
end_3 = time.time()

elapsed_3 = end_3 - start_3

if iteration_count % len(data) == 0:
losses.append(self.evaluate(test_data))

print("1", elapsed_1 / num_iterations)
print("2", elapsed_2 / num_iterations)
print("3", elapsed_3 / num_iterations)
# return theta
# set in rep
plt.plot(list(range(len(losses))), losses)
plt.show()

def evaluate(
self,
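For reference, the SGD step in the `fit` hunk above is the standard BPR update with dot-product scores x_ui = U_u · V_i; written out, with alpha the learning rate, lambda the regularization coefficient, and sigma the logistic function:

```latex
\hat{x}_{uij} = \hat{x}_{ui} - \hat{x}_{uj}, \qquad
\sigma(-\hat{x}_{uij}) = \frac{e^{-\hat{x}_{uij}}}{1 + e^{-\hat{x}_{uij}}}

U_u \leftarrow U_u + \alpha \bigl( \sigma(-\hat{x}_{uij}) \, (V_i - V_j) - \lambda U_u \bigr)
V_i \leftarrow V_i + \alpha \bigl( \sigma(-\hat{x}_{uij}) \, U_u - \lambda V_i \bigr)
V_j \leftarrow V_j + \alpha \bigl( \sigma(-\hat{x}_{uij}) \, (-U_u) - \lambda V_j \bigr)
```

This matches the code's `sigmoid_derivative` term and the three in-place updates to `self._U[u, :]`, `self._V[i, :]`, and `self._V[j, :]`.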
