anime_rec_sys.py
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
import streamlit as st
import joblib
import os
# Load the anime data and ratings data
# ratings_path = 'animelist.csv'
data_path = 'processed_anime.csv'
# anime_ratings = pd.read_csv(ratings_path)
tfv_file = 'tfv.joblib'
sig_file = 'sig.joblib'
indices_file = 'indices.joblib'
@st.cache_data
def load_csv():
    anime_data = pd.read_csv(data_path)
    return anime_data
anime_data = load_csv()
if os.path.exists(tfv_file) and os.path.exists(sig_file) and os.path.exists(indices_file):
    print("Found existing joblib files, loading them...")
    tfv_matrix = joblib.load(tfv_file)
    sig = joblib.load(sig_file)
    indices = joblib.load(indices_file)
    print("Files loaded successfully!")
else:
    print("Files not found, generating them..")
    genres_str = anime_data['Genres'].str.split(',').astype(str)
    # Initialize the TfidfVectorizer with various parameters
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                          strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          stop_words='english')
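    # With these settings, terms seen in fewer than 3 rows are ignored (min_df=3),
    # any run of word characters counts as a token, uni-, bi- and tri-grams are
    # extracted (ngram_range=(1, 3)), accents are normalized away, and English
    # stop words are dropped.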
    # Use the TfidfVectorizer to transform the genres_str into a sparse matrix
    tfv_matrix = tfv.fit_transform(genres_str)
    # Compute the sigmoid kernel
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
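    # sigmoid_kernel(X, X) returns an n x n array whose (i, j) entry is
    # tanh(gamma * <x_i, x_j> + coef0), a bounded similarity score between the
    # genre vectors of anime i and anime j (scikit-learn defaults:
    # gamma=1/n_features, coef0=1).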
    # Create a Pandas Series object where the index is the anime names and the values are the indices in anime_data
    indices = pd.Series(anime_data.index, index=anime_data['Name'])
    # Drop duplicate names (the values are already unique row positions) so that
    # indices[title] always returns a single position
    indices = indices[~indices.index.duplicated()]
    joblib.dump(tfv_matrix, tfv_file)
    joblib.dump(sig, sig_file)
    joblib.dump(indices, indices_file)
    print("Files generated successfully!")
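# Note: sig is a dense (n_anime x n_anime) float array, so the cached joblib file
# and the in-memory footprint grow quadratically with the size of the catalogue.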
# Define the give_rec function to recommend anime based on similarity to input title
def give_rec(title, sig=sig):
    # Get the index corresponding to the anime title
    idx = indices[title]
    # Get the pairwise similarity scores
    sig_scores = list(enumerate(sig[idx]))
    # Sort the anime based on similarity scores
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the top 10 most similar anime, excluding the input anime itself
    anime_indices = [i[0] for i in sig_scores[1:11]]
    # Create a dataframe of the top 10 recommended anime
    top_anime = pd.DataFrame({
        'Anime name': anime_data['Name'].iloc[anime_indices].values,
        'Rating': anime_data['Score'].iloc[anime_indices].values
    })
    return top_anime
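# Example usage (hypothetical title; any value present in anime_data['Name']
# works the same way):
#   give_rec('Cowboy Bebop')  # -> DataFrame of the 10 most similar titles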
# Set up the Streamlit app
st.title('Anime Recommender System')
options = anime_data['Name'].tolist()
options.append('Type name here..')
# Create a select box where the user can pick (or search for) an anime title
user_input = st.selectbox(
    'Enter the name of an anime you like:', options=options, index=len(options)-1)
if user_input == options[-1]:
    pass
# When the user submits an input, call the give_rec function and display the output
elif user_input:
    try:
        recommendations = give_rec(user_input)
        st.write(f"Recommended anime similar to {user_input}:")
        st.table(recommendations)
    except KeyError:
        st.write(f"Sorry, {user_input} is not in our database.")
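# To run the app locally (assuming Streamlit is installed and
# processed_anime.csv sits next to this script):
#   streamlit run anime_rec_sys.py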