forked from lancedb/vectordb-recipes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
89 lines (73 loc) · 2.36 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import lancedb
import numpy as np
import pandas as pd
global data
data = []
global table
table = None
def get_recommendations(title):
pd_data = pd.DataFrame(data)
# Table Search
result = (
table.search(pd_data[pd_data["title"] == title]["vector"].values[0])
.limit(5)
.to_df()
)
# Get IMDB links
links = pd.read_csv(
"./ml-latest-small/links.csv",
header=0,
names=["movie id", "imdb id", "tmdb id"],
converters={"imdb id": str},
)
ret = result["title"].values.tolist()
# Loop to add links
for i in range(len(ret)):
link = links[links["movie id"] == result["id"].values[i]]["imdb id"].values[0]
link = "https://www.imdb.com/title/tt" + link
ret[i] = [ret[i], link]
return ret
if __name__ == "__main__":
# Load and prepare data
ratings = pd.read_csv(
"./ml-latest-small/ratings.csv",
header=None,
names=["user id", "movie id", "rating", "timestamp"],
)
ratings = ratings.drop(columns=["timestamp"])
ratings = ratings.drop(0)
ratings["rating"] = ratings["rating"].values.astype(np.float32)
ratings["user id"] = ratings["user id"].values.astype(np.int32)
ratings["movie id"] = ratings["movie id"].values.astype(np.int32)
reviewmatrix = ratings.pivot(
index="user id", columns="movie id", values="rating"
).fillna(0)
# SVD
matrix = reviewmatrix.values
u, s, vh = np.linalg.svd(matrix, full_matrices=False)
vectors = np.rot90(np.fliplr(vh))
print(vectors.shape)
# Metadata
movies = pd.read_csv(
"./ml-latest-small/movies.csv", header=0, names=["movie id", "title", "genres"]
)
movies = movies[movies["movie id"].isin(reviewmatrix.columns)]
data = []
for i in range(len(movies)):
data.append(
{
"id": movies.iloc[i]["movie id"],
"title": movies.iloc[i]["title"],
"vector": vectors[i],
"genre": movies.iloc[i]["genres"],
}
)
print(pd.DataFrame(data))
# Connect to LanceDB
db = lancedb.connect("./data/test-db")
try:
table = db.create_table("movie_set", data=data)
except:
table = db.open_table("movie_set")
print(get_recommendations("Moana (2016)"))
print(get_recommendations("Rogue One: A Star Wars Story (2016)"))