diff --git "a/\"I don't know, what do you feel like\" Proposal" "b/\"I don't know, what do you feel like\" Proposal" new file mode 100644 index 0000000000..96d240b81b --- /dev/null +++ "b/\"I don't know, what do you feel like\" Proposal" @@ -0,0 +1,107 @@ +Intelligent Dining Decision Assistant Application + +Group Members: +Pauline Brunet, NetID: pbrunet2 (Captain) +Catherine Orlando, NetID: co24 +Austin Harmon, NetID: austin31 +Mitchell Kopczyk, NetID: kopczyk2 +Louis Hamilton, NetID: louisch3 + +Topic Chosen: +Building an application that aids indecisive users in making informed dining choices by providing personalized restaurant recommendations and insights. + +Problem and Relevance to Theme and Class: +Choosing a restaurant can be a complex decision-making process, made even more challenging due to the vast amount of online information available. Our application intends to make this process more manageable and intelligent. The chosen topic aligns with our Text Information Systems class by incorporating concepts such as semantic analysis, relevance, probabilistic ranking, and collaborative filtering to improve the accuracy and relevance of restaurant recommendations and insights. + +Datasets, Algorithms, or Techniques Planned: + + Datasets: Utilize publicly available restaurant APIs for obtaining detailed information, reviews, and ratings. + + Algorithms: Our current algorithm for relevance (R) is R = (Number of unique query terms in document / Number of query terms) * 100 + + Techniques and Concepts from the Course: + Semantic Analysis: Understanding user queries and reviews to generate relevant restaurant recommendations. + Bag of Words Representation & Vector Space Model: For text representation and understanding the context of user preferences and restaurant information. + Probabilistic Relevance Ranking for Text Retrieval: Ensuring that the most suitable restaurant recommendations are shown to the users. 
+ Collaborative Filtering: Using user-item interactions for personalizing restaurant recommendations. + Evaluation Metrics (Precision, Recall, MAP): To evaluate the performance and relevance of our recommendations. + +Demonstration and Programming Language: +We plan to conduct user-based evaluations to receive feedback on the relevance and usefulness of our recommendations, as well as the overall user experience of our application. + +Programming Language: +We intend to use languages such as Python for backend development and potentially JavaScript along with suitable frameworks for frontend development. + +Application Outcome: +Every time a user enters a query and clicks the “Find Restaurants” button, the system will generate a +Python list that consists of a collection of restaurants that are relevant to the query. The generation of +this list is made possible by an inverted index. Next to each restaurant in the results, the city, state, and +a relevance score will be displayed. The relevance score is calculated based on the following formula: +Relevance = (Number of unique query terms in document / Number of query terms) * 100 +The only exception to the relevance formula is when certain stop words are entered, such as “and” and “the”. +Stop words are ignored during the relevance calculation. + +In this scenario, each restaurant name displayed in the results represents a document. Each document is +a text file that contains the name of the restaurant on the first line. The city and state are included on the second line. +The next lines contain information about the restaurant, which is mostly the food items offered by the restaurant. +When the program is not running, more restaurant text files can be added to the main folder and the program will incorporate +these files into the inverted index when it starts. + +There is no limit to the number of queries a user can enter. 
While the program is actively running, the +user can keep entering queries and all results will be maintained. Each new result list will +be displayed above the previous results. If the collection of result lists exceeds the window +height, the user will be able to scroll down to the bottom of the results. + +The user has the option to filter the results by specifying their location with a city and state name. +If the user decides to enter their location, the program will still return the original results, +except the results that correspond to a different location will be replaced with the text “Location Filter Applied”. +The location filter will only allow the user to see the results that correspond to the specified location. The user will still be +able to view the count of the other search results for different locations, which could encourage the user to try their +query with a different location, or remove the entire filter. For the most effective search, the user should specify their +location in both the query and the location filter. + +Workload Justification: +1. Initial Project Proposal Draft (Completed) + - Conducted research and brainstorming. + - Drafted the initial proposal outlining the project's goals and methodology. + - Total hours spent: 10 hours (2 hours/student) + +2. Project Proposal Edit and Proofread + - Revised, refined, and proofread the proposal for clarity. + - Ensured that the proposal met all project requirements and guidelines. + - Total hours spent: 5 hours (1 hour/student) + +3. First Program Draft Development (Completed) + - Developed the initial version of the program, focusing on core functionalities. + - Conducted basic testing and debugging. + - Total hours spent: 30 hours (6 hours/student) + +4. Second Program Draft Development + - Improved and expanded upon the initial code. + - Conducted further testing, identifying areas for improvement and optimization. + - Total hours spent: 25 hours (5 hours/student) + +5. 
Final Program Draft Development + - Conducted final refinements to optimize the code. + - Carried out comprehensive testing and debugging to ensure functionality and reliability. + - Total hours spent: 20 hours (4 hours/student) + +6. Initial Progress Report + - Compiled and documented the project's progress, identifying completed tasks. + - Total hours spent: 2 hours (0.4 hours/student) + +7. Progress Report Edit and Proofread + - Reviewed and revised the progress report, ensuring accuracy and clarity. + - Total hours spent: 2 hours (0.4 hours/student) + +8. Program Documentation Development + - Developed detailed documentation explaining the program’s code and functionalities. + - Ensured that the documentation is clear and understandable. + - Total hours spent: 3 hours (0.6 hours/student) + +9. Program Presentation + - Prepared a cohesive presentation to showcase the project’s development process and final product. + - Practiced the presentation to ensure smooth delivery. + - Total hours spent: 3 hours (0.6 hours/student) + +Total Workload: 100 hours diff --git a/Documentation and Final Report.pdf b/Documentation and Final Report.pdf new file mode 100644 index 0000000000..1b87670346 Binary files /dev/null and b/Documentation and Final Report.pdf differ diff --git a/PDF_Proposal.pdf b/PDF_Proposal.pdf new file mode 100644 index 0000000000..dec04a049e Binary files /dev/null and b/PDF_Proposal.pdf differ diff --git a/Presentation b/Presentation new file mode 100644 index 0000000000..65d33a750e --- /dev/null +++ b/Presentation @@ -0,0 +1,2 @@ +Link to the presentation: https://drive.google.com/file/d/1uc--E565vLbPTn34vjsqmvqeWMpUogwa/view?usp=drive_link + diff --git a/Project Progress Report Submission for Grading.pdf b/Project Progress Report Submission for Grading.pdf new file mode 100644 index 0000000000..14d87e38df Binary files /dev/null and b/Project Progress Report Submission for Grading.pdf differ diff --git a/README.md b/README.md deleted file mode 100644 
# NOTE: patch metadata that shared these source lines with the code, kept verbatim:
#   index a7b40d2cc8..0000000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@
#   -# CourseProject
#   -
#   -Please fork this repository and paste the github link of your fork on Microsoft CMT. Detailed instructions are on Coursera under Week 1: Course Project Overview/Week 9 Activities.
#   diff --git a/ReadmeF.pdf b/ReadmeF.pdf new file mode 100644 index 0000000000..c434dbb25e Binary files /dev/null and b/ReadmeF.pdf differ
#   diff --git a/restaurant_search.py b/restaurant_search.py new file mode 100644 index 0000000000..5326dd9010 --- /dev/null +++ b/restaurant_search.py @@ -0,0 +1,171 @@
"""Restaurant search: a BM25-ranked inverted index with a small Tk front end.

Each restaurant is one text file: the name on the first line, the city and
state on the second, and free text (mostly menu items) after that.
"""

import math
import os
import re
import sys
from collections import Counter, defaultdict

try:
    import tkinter as tk
except ImportError:
    # Tk is only needed for the GUI; the search engine itself stays usable
    # (and importable) on machines without a Tk installation.
    tk = None

# Whole-word stop list. Terms are compared as complete tokens, never as
# substrings, so "sandwich" is not mangled by the presence of "and".
STOP_WORDS = frozenset({"and", "the"})

# Characters trimmed from both ends of a token before it is indexed.
_PUNCTUATION = ".,;:!?\"'()[]"

# Used by location matching: everything except ASCII letters is discarded.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def _tokenize(text):
    """Return the indexable terms of *text*, in order.

    Terms are lower-cased, split on whitespace, trimmed of surrounding
    punctuation, and dropped when empty or listed in ``STOP_WORDS``.
    """
    terms = []
    for raw in text.lower().split():
        term = raw.strip(_PUNCTUATION)
        if term and term not in STOP_WORDS:
            terms.append(term)
    return terms


class RestaurantSearch:
    """Inverted index over restaurant text files with Okapi BM25 ranking."""

    # BM25 tuning constants (same values the original implementation used).
    BM25_K1 = 1.6   # term-frequency saturation
    BM25_B = 0.75   # strength of document-length normalisation
    BM25_K3 = 1.0   # saturation for terms repeated inside the query

    def __init__(self, data_dir=None):
        """Load and index every restaurant file.

        Args:
            data_dir: Folder holding the restaurant text files. Defaults to
                the ``restaurants`` folder next to this module.
        """
        # term -> {doc_id: number of occurrences of the term in that document}
        self.inverted_index = defaultdict(dict)
        self.documents = []
        self.doc_lengths = {}
        self.total_doc_len = 0.0
        if data_dir is None:
            data_dir = os.path.join(os.path.dirname(__file__), "restaurants")
        self.data_dir = data_dir
        self.load_and_index_files()
        self.document_count = len(self.documents)
        # An empty folder is a valid (if useless) corpus; avoid dividing by 0.
        if self.document_count:
            self.avg_doc_len = self.total_doc_len / self.document_count
        else:
            self.avg_doc_len = 0.0

    def load_and_index_files(self):
        """Read every regular file in the data folder and index its content.

        Exits the process (``SystemExit``) when the folder does not exist.
        """
        if not os.path.isdir(self.data_dir):
            print("Error restaurant folder not found")
            sys.exit(1)

        # Sorted so that document ids do not depend on directory order.
        for filename in sorted(os.listdir(self.data_dir)):
            path = os.path.join(self.data_dir, filename)
            if not os.path.isfile(path):
                # Sub-folders cannot be opened as text files.
                continue
            with open(path, "r", encoding="utf-8", errors="replace") as file:
                content = file.read()
            self.documents.append(content)
            self.index_document(content)

    def index_document(self, content):
        """Add *content* to the inverted index.

        The document must already be the last entry of ``self.documents``;
        its position there is used as the document id.
        """
        doc_id = len(self.documents) - 1
        # Document length counts every whitespace-separated token, stop words
        # included, as the original length statistic did.
        raw_token_count = len(content.split())
        self.doc_lengths[doc_id] = raw_token_count
        self.total_doc_len += raw_token_count

        for term, count in Counter(_tokenize(content)).items():
            postings = self.inverted_index[term]
            postings[doc_id] = postings.get(doc_id, 0) + count

    def search(self, query, location):
        """Rank the documents matching *query*.

        Args:
            query: Free-text query; stop words and unknown terms are ignored.
            location: Optional location filter (empty string for none).

        Returns:
            ``((rows, location_match_count), total_results)`` where ``rows``
            are the formatted result lines that pass the location filter and
            ``total_results`` counts every matching document regardless of
            location.
        """
        query_counts = Counter(
            term for term in _tokenize(query) if term in self.inverted_index
        )
        qt_df = {term: len(self.inverted_index[term]) for term in query_counts}

        candidates = set()
        for term in query_counts:
            candidates.update(self.inverted_index[term])

        rank_dict = {
            doc_id: self.bm25(doc_id, query_counts, len(candidates), qt_df)
            for doc_id in candidates
        }
        # Best score first; ties are broken by document id for stable output.
        ranked_results = sorted(rank_dict, key=lambda d: (-rank_dict[d], d))

        formatted = self.format_results(ranked_results, rank_dict, location)
        return formatted, len(ranked_results)

    def _title_and_location(self, doc_id):
        """Return the first two lines of a document, padding with ``""``."""
        lines = self.documents[doc_id].split("\n", 2)
        title = lines[0]
        loc = lines[1] if len(lines) > 1 else ""
        return title, loc

    def format_results(self, ranked_results, rank_dict, location):
        """Build the display rows for the ranked documents.

        Documents whose location does not match *location* (when given) are
        skipped. Relevance is shown as a percentage of the best score among
        the rows that remain.

        Returns:
            ``(rows, location_match_count)``.
        """
        matches = []
        for doc_id in ranked_results:
            title, loc = self._title_and_location(doc_id)
            if location and not self.match_location(loc, location):
                continue
            matches.append((doc_id, title, loc))

        top_score = max((rank_dict[doc_id] for doc_id, _, _ in matches), default=0)

        output = []
        for doc_id, title, loc in matches:
            share = rank_dict[doc_id] / top_score if top_score > 0 else 0.0
            relevance = "{:.3f}".format(round(share * 100, 3))
            output.append(f"{title}, LOC: {loc}, relevance: {relevance}%")
        return output, len(matches)

    def bm25(self, doc_id, query_counts, num, qt_df):
        """Return the Okapi BM25 score of one document for the query.

        Args:
            doc_id: Document to score.
            query_counts: term -> occurrences of the term in the query.
            num: Unused; kept so existing callers keep working.
            qt_df: term -> number of documents containing the term.
        """
        k1, b, k3 = self.BM25_K1, self.BM25_B, self.BM25_K3
        avg_len = self.avg_doc_len if self.avg_doc_len else 1.0
        length_norm = (1.0 - b) + b * self.doc_lengths[doc_id] / avg_len

        score = 0.0
        for term, query_freq in query_counts.items():
            doc_freq_in_doc = self.inverted_index.get(term, {}).get(doc_id)
            if not doc_freq_in_doc:
                continue
            df = qt_df[term]
            # Inverse document frequency (always positive because of the 1.0 +).
            idf = math.log(1.0 + (self.document_count - df + 0.5) / (df + 0.5))
            # Term frequency with document-length normalisation.
            tf = ((k1 + 1.0) * doc_freq_in_doc) / (k1 * length_norm + doc_freq_in_doc)
            # Dampens the effect of repeating a term inside the query.
            qtf = ((k3 + 1.0) * query_freq) / (k3 + query_freq)
            score += tf * idf * qtf
        return score

    def match_location(self, a, b):
        """Return True when *a* and *b* contain the same letters.

        Case, spaces, digits and punctuation are ignored, so
        ``"Urbana, IL"`` matches ``"urbana il"``.
        """
        return _NON_LETTERS.sub("", a).lower() == _NON_LETTERS.sub("", b).lower()


class GUI:
    """Tk window with a query box, a location box and a results pane."""

    def __init__(self, master):
        """Build the widgets inside *master* and load the search engine."""
        self.master = master
        master.title('The Restaurant Selector')
        master.geometry("850x600")

        self.search_engine = RestaurantSearch()

        tk.Label(master, text="What do you feel like eating today?", font=("Times", 21)).pack()
        self.query_entry = tk.Text(master, width=40, height=5)
        self.query_entry.pack()

        tk.Label(
            master,
            text="(Optional) Please enter a city name and the first two letters of the state. e.g., Urbana IL",
            font=("Times", 8),
        ).pack()
        self.location_entry = tk.Entry(master, width=20)
        self.location_entry.pack()

        self.results_text = tk.Text(master, width=95, height=24)
        self.results_text.pack()

        tk.Button(master, text="Find Restaurants", command=self.perform_search).pack()

    def perform_search(self):
        """Run the query from the entry widgets and show the results."""
        query = self.query_entry.get("1.0", "end").strip()
        location = self.location_entry.get().strip()
        (results, location_match_count), total_results = self.search_engine.search(query, location)

        self.results_text.delete("1.0", tk.END)
        self.results_text.insert("1.0", f"{total_results} total results returned\n")
        if location:
            self.results_text.insert(
                tk.END,
                f"Location Filter Applied\n{location_match_count} relevant to your specified location\n",
            )
        else:
            self.results_text.insert(tk.END, "No Location Filter Applied\n")
        for result in results:
            self.results_text.insert(tk.END, result + "\n")


def main():
    """Start the graphical application."""
    if tk is None:
        sys.exit("tkinter is required to run the graphical interface")
    root = tk.Tk()
    # The button's command keeps the GUI instance alive for the main loop.
    GUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()

# NOTE: trailing patch metadata that shared the last source line, kept verbatim:
#   diff --git a/restaurants.zip b/restaurants.zip new file mode 100644 index 0000000000..1edda16de8 Binary files /dev/null and b/restaurants.zip differ