# Generate_csv_data.py

# This script only generates the CSV files for the user and movie data; it does not contain the main code for the recommendation system.
# Please run the AI_Lab_Project_mainfile.py file to see the recommendation system.

import pandas as pd
import random

# Genre tags used by the movie catalogue in this script.
# 'Biography' and 'War' are appended at the end (so the positions of the
# earlier entries stay unchanged) because the movie list tags titles with them.
genres = [
    'Action', 'Drama', 'Sci-Fi', 'Animation', 'Crime', 'Comedy',
    'Romance', 'Horror', 'Thriller', 'Fantasy', 'Adventure',
    'Documentary', 'Musical', 'Biography', 'War',
]

# Movie catalogue: each entry is (title, comma-separated genre tags).
movie_titles = [
    ("Inception", "Sci-Fi,Action,Thriller"),
    ("The Godfather", "Crime,Drama"),
    ("Toy Story", "Animation,Comedy"),
    ("The Shawshank Redemption", "Drama,Thriller"),
    ("The Dark Knight", "Action,Crime,Drama"),
    ("Pulp Fiction", "Crime,Drama,Comedy"),
    ("The Avengers", "Action,Sci-Fi,Adventure"),
    ("Forrest Gump", "Drama,Romance"),
    ("Finding Nemo", "Animation,Adventure,Comedy"),
    ("The Conjuring", "Horror,Thriller"),
    ("Coco", "Animation,Musical"),
    ("Avatar", "Action,Adventure,Fantasy"),
    ("Interstellar", "Sci-Fi,Drama,Adventure"),
    ("The Social Dilemma", "Documentary"),
    ("Frozen", "Animation,Fantasy,Musical"),
    ("Mad Max: Fury Road", "Action,Adventure,Thriller"),
    ("Parasite", "Drama,Thriller"),
    ("Get Out", "Horror,Thriller"),
    ("La La Land", "Romance,Musical"),
    ("The Lion King", "Animation,Adventure,Musical"),
    ("The Matrix", "Action,Sci-Fi"),
    ("Spirited Away", "Animation,Fantasy"),
    ("Jurassic Park", "Adventure,Sci-Fi"),
    ("The Grand Budapest Hotel", "Comedy,Drama"),
    ("Black Panther", "Action,Adventure,Fantasy"),
    ("The Silence of the Lambs", "Crime,Thriller"),
    ("Titanic", "Romance,Drama"),
    ("Shrek", "Animation,Comedy,Fantasy"),
    ("The Irishman", "Crime,Drama"),
    ("Moana", "Animation,Adventure,Musical"),
    ("The Witch", "Horror,Thriller"),
    ("A Beautiful Mind", "Drama,Biography"),
    ("The Pursuit of Happyness", "Drama,Biography"),
    ("1917", "Action,Drama,War"),
    ("The Revenant", "Adventure,Drama,Thriller"),
]

# One record per movie; IDs are assigned from 1 upwards in catalogue order.
movies_data = [
    {'movieId': movie_id, 'title': name, 'genre': tags}
    for movie_id, (name, tags) in enumerate(movie_titles, start=1)
]
movies_df = pd.DataFrame(movies_data)

# Generating a larger user rating dataset: one row per (user, movie) rating.
users_data = []
num_users = 100  # Number of synthetic users to generate
num_movies = len(movies_df)  # Movie IDs are taken to run from 1 to this value

for user_id in range(1, num_users + 1):
    # Draw the count separately for every user so that each user rates 10-20
    # movies; drawing it once before the loop gives all users the same count.
    # The count is capped at the catalogue size because random.sample raises
    # ValueError when asked for more items than the population contains.
    num_ratings_per_user = min(random.randint(10, 20), num_movies)
    # Sampling without replacement: a user never rates the same movie twice.
    rated_movies = random.sample(range(1, num_movies + 1), num_ratings_per_user)
    for movie_id in rated_movies:
        users_data.append({
            'userId': user_id,
            'movieId': movie_id,
            # Ratings range between 1.0 and 5.0, rounded to one decimal place
            'rating': round(random.uniform(1.0, 5.0), 1),
        })

users_df = pd.DataFrame(users_data)

# Write both tables to CSV (row index omitted) using relative file names.
for frame, csv_name in ((users_df, 'users.csv'), (movies_df, 'movies.csv')):
    frame.to_csv(csv_name, index=False)

print("Both user and movie data has been generated and saved to users.csv and movies.csv, respectively.")