diff --git a/app/public/manifest.json b/app/public/manifest.json deleted file mode 100644 index 080d6c7..0000000 --- a/app/public/manifest.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "short_name": "React App", - "name": "Create React App Sample", - "icons": [ - { - "src": "favicon.ico", - "sizes": "64x64 32x32 24x24 16x16", - "type": "image/x-icon" - }, - { - "src": "logo192.png", - "type": "image/png", - "sizes": "192x192" - }, - { - "src": "logo512.png", - "type": "image/png", - "sizes": "512x512" - } - ], - "start_url": ".", - "display": "standalone", - "theme_color": "#000000", - "background_color": "#ffffff" -} diff --git a/reddit_crawler.ipynb b/reddit_crawler.ipynb deleted file mode 100644 index 63fee4b..0000000 --- a/reddit_crawler.ipynb +++ /dev/null @@ -1,209 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lad wrote a Python script to download Alexa voice recordings, he didn't expect this email.\n", - "This post has:\n", - "I redesign the Python logo to make it more modern\n", - "Automate the boring stuff with python - tinder\n", - "Just finished programming and building my own smart mirror in python, wrote all of the code myself and implemented my own voice control and facial recognition features\n" - ] - } - ], - "source": [ - "import praw\n", - "import os\n", - "import json\n", - "import time\n", - "\n", - "from supabase import create_client, Client\n", - "\n", - "# Supabase setup\n", - "url: str = os.environ.get('SUPABASE_WATCHDB_URL')\n", - "key: str = os.environ.get('SUPABASE_WATCHDB_SERVICE_ROLE_KEY')\n", - "supabase: Client = create_client(url, key)\n", - "\n", - "# Reddit API Credentials\n", - "client_id = os.environ.get('REDDIT_APP_ID')\n", - "client_secret = os.environ.get('REDDIT_APP_KEY')\n", - "user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)'\n", - "\n", - "# Initialize PRAW with your credentials\n", - "reddit = praw.Reddit(client_id=client_id,\n", - " client_secret=client_secret,\n", - " user_agent=user_agent)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-02-06 17:53:19,214:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 409 Conflict\"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Skipping insertion for post_id=1akq7lk as it already exists.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-02-06 17:53:19,456:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:19,743:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:20,017:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:20,316:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:20,715:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:20,948:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:21,310:INFO - HTTP Request: POST 
https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:21,584:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", - "2024-02-06 17:53:21,863:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n" - ] - } - ], - "source": [ - "# The subreddit you want to scrape\n", - "subreddit = reddit.subreddit('watchexchange')\n", - "\n", - "# Initialize a list to store data before saving to disk\n", - "data_list = []\n", - "\n", - "# Fetch the top posts from the subreddit\n", - "top_posts = subreddit.top(time_filter=\"hour\", limit=10) # Adjust limit as needed\n", - "\n", - "\n", - "# Push the data collected to Supabase\n", - "for post in top_posts:\n", - " post.comments.replace_more(limit=25) # Load all comments\n", - " # comments = [{'userid': comment.author.name, 'comment': comment.body} for comment in post.comments.list()]\n", - " comments = ' | '.join([f\"{comment.author.name}: {comment.body}\" for comment in post.comments.list()])\n", - "\n", - " post_data = {\n", - " 'post_id': post.id,\n", - " 'author_id': post.author.name,\n", - " 'title': post.title,\n", - " 'url': post.url,\n", - " 'comments': comments\n", - " }\n", - " \n", - " try:\n", - " # Attempt to insert post_data into your Supabase table\n", - " data_insert_response = supabase.table('rqueue').insert(post_data).execute()\n", - " except Exception as e:\n", - " if 'duplicate key value violates unique constraint \"rqueue_pkey\"' in str(e):\n", - " print(f\"Skipping insertion for post_id={post_data['post_id']} as it already exists.\")\n", - " else:\n", - " raise\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-02-06 18:18:58,543:INFO - HTTP Request: GET https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue?select=%2A&processed=eq.False&limit=1 \"HTTP/1.1 200 OK\"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "An error occurred: \n", - "\n", - "You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.\n", - "\n", - "You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. \n", - "\n", - "Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`\n", - "\n", - "A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742\n", - "\n" - ] - } - ], - "source": [ - "import openai\n", - "import json\n", - "import jsonschema\n", - "\n", - "# Load your OpenAI API key from an environment variable or directly\n", - "openai_api_key = os.environ.get('OPENAI_API_CHRONO_KEY')\n", - "\n", - "openai.api_key = openai_api_key\n", - "\n", - "with open('query_schema.json') as f:\n", - " output_schema_str = f.read()\n", - "\n", - "# Fetch data from Supabase queue\n", - "try:\n", - " queue_data = supabase.table('rqueue').select('*').eq('processed', False).limit(1).execute()\n", - " if len(queue_data.data) > 1:\n", - " raise Exception(\"lll\")\n", - " if queue_data.data:\n", - " for item in queue_data.data:\n", - " # Convert the item to JSON string\n", - " item_json = json.dumps(item)\n", - "\n", - " prompt = f\"Given the data: {item_json}, construct a JSON object that adheres to the specified output schema. 
Here is the output schema: {output_schema_str}\"\n", - " try:\n", - " response = openai.Completion.create(\n", - " model=\"gpt-3.5-turbo-0125\",\n", - " prompt=prompt,\n", - " temperature=0.5, # Adjust as needed\n", - " max_tokens=1024, # Adjust based on your expected output size\n", - " top_p=1.0,\n", - " frequency_penalty=0.0,\n", - " presence_penalty=0.0\n", - " )\n", - " print(response.choices[0].text)\n", - " except Exception as e:\n", - " print(f\"An error occurred: {e}\")\n", - "\n", - " else:\n", - " print(\"No data found in the queue.\")\n", - "except Exception as e:\n", - " print(f\"Failed to fetch data from Supabase: {e}\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/app/.gitignore b/src/app/.gitignore similarity index 100% rename from app/.gitignore rename to src/app/.gitignore diff --git a/app/README.md b/src/app/README.md similarity index 100% rename from app/README.md rename to src/app/README.md diff --git a/app/package-lock.json b/src/app/package-lock.json similarity index 100% rename from app/package-lock.json rename to src/app/package-lock.json diff --git a/app/package.json b/src/app/package.json similarity index 100% rename from app/package.json rename to src/app/package.json diff --git a/app/public/index.html b/src/app/public/index.html similarity index 100% rename from app/public/index.html rename to src/app/public/index.html diff --git a/app/public/robots.txt b/src/app/public/robots.txt similarity index 100% rename from app/public/robots.txt rename to src/app/public/robots.txt diff --git a/app/src/App.css b/src/app/src/App.css similarity index 100% rename from app/src/App.css rename to src/app/src/App.css diff --git a/app/src/App.js b/src/app/src/App.js similarity index 100% rename from app/src/App.js rename to src/app/src/App.js diff --git a/app/src/App.test.js b/src/app/src/App.test.js similarity index 100% rename from app/src/App.test.js rename to src/app/src/App.test.js diff --git a/app/src/FilterInput.js b/src/app/src/FilterInput.js similarity index 100% rename from app/src/FilterInput.js rename to src/app/src/FilterInput.js diff --git a/app/src/NavBar.js b/src/app/src/NavBar.js similarity index 100% rename from app/src/NavBar.js rename to src/app/src/NavBar.js diff --git a/app/src/SharedDataContext.js b/src/app/src/SharedDataContext.js similarity index 100% rename from app/src/SharedDataContext.js rename to src/app/src/SharedDataContext.js diff --git a/app/src/WatchCard.js b/src/app/src/WatchCard.js similarity index 100% rename from app/src/WatchCard.js rename to src/app/src/WatchCard.js diff --git a/app/src/WatchDetailsModal.js b/src/app/src/WatchDetailsModal.js similarity index 100% rename from app/src/WatchDetailsModal.js rename to src/app/src/WatchDetailsModal.js diff --git a/app/src/WatchGrid.js b/src/app/src/WatchGrid.js similarity index 100% rename from app/src/WatchGrid.js rename to src/app/src/WatchGrid.js diff --git a/app/src/index.css b/src/app/src/index.css similarity index 100% rename from app/src/index.css rename to src/app/src/index.css diff --git a/app/src/index.js b/src/app/src/index.js similarity index 100% rename from app/src/index.js rename to 
src/app/src/index.js diff --git a/app/src/reportWebVitals.js b/src/app/src/reportWebVitals.js similarity index 100% rename from app/src/reportWebVitals.js rename to src/app/src/reportWebVitals.js diff --git a/app/src/setupTests.js b/src/app/src/setupTests.js similarity index 100% rename from app/src/setupTests.js rename to src/app/src/setupTests.js diff --git a/app/src/supabaseClient.js b/src/app/src/supabaseClient.js similarity index 100% rename from app/src/supabaseClient.js rename to src/app/src/supabaseClient.js diff --git a/app/src/watches.json b/src/app/src/watches.json similarity index 100% rename from app/src/watches.json rename to src/app/src/watches.json diff --git a/src/data_collection/gpt_formatter.py b/src/data_collection/gpt_formatter.py new file mode 100644 index 0000000..575db56 --- /dev/null +++ b/src/data_collection/gpt_formatter.py @@ -0,0 +1,65 @@ +import argparse +import os +import json +from openai import OpenAI +from supabase import create_client, Client +from schema_validator import validate_schema + +def process_queue(supabase_url, supabase_key, openai_key): + # Supabase setup + supabase: Client = create_client(supabase_url, supabase_key) + + # Set the API key for OpenAI + client = OpenAI( + api_key=openai_key + ) + + with open('query_schema.json') as f: + output_schema_str = f.read() + + # Fetch data from Supabase queue + try: + queue_data = supabase.table('rqueue').select('*').eq('processed', False).limit(1).execute() + if queue_data.data: # Only proceed when the queue returned an unprocessed row + for item in queue_data.data: + relevant_data = {key: item[key] for key in ["author_id", "title", "url", "comments"]} + item_json = json.dumps(relevant_data) + prompt = f"Given the data: {item_json}, construct a JSON object that adheres to the specified output schema. 
Output schema: {output_schema_str}" + try: + response = client.chat.completions.create( + model="gpt-3.5-turbo-0125", + response_format={ "type": "json_object" }, + messages=[{"role": "system", "content": "You are a helpful assistant that outputs valid JSON.>"}, + {"role": "user", "content": prompt}], + ) + try: + response_json = json.loads(response.choices[0].message.content) + except Exception as e: + print("Error in openai response: ", e) + + try: + validated_response = validate_schema(response_json) + try: + # supabase.table("watches").insert([validated_response]).execute() + supabase.table("rqueue").update({"processed": True}).eq("post_id", item["post_id"]).execute() + except Exception as e: + print(f"Failed to push to supabase (watches): {e}") + except Exception as e: + print(f"current response could not be validated: {e}") + + except Exception as e: + print(f"An OpenAI error occurred: {e}") + + + except Exception as e: + print(f"Failed to fetch data from Supabase (rqueue): {e}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process queue items and format them using OpenAI") + parser.add_argument("--supabase_url", required=True, help="Supabase project URL") + parser.add_argument("--supabase_key", required=True, help="Supabase service role key") + parser.add_argument("--openai_key", required=True, help="OpenAI API key") + + args = parser.parse_args() + + process_queue(args.supabase_url, args.supabase_key, args.openai_key) diff --git a/query_schema.json b/src/data_collection/query_schema.json similarity index 100% rename from query_schema.json rename to src/data_collection/query_schema.json diff --git a/src/data_collection/reddit_crawler.ipynb b/src/data_collection/reddit_crawler.ipynb new file mode 100644 index 0000000..0c0d28f --- /dev/null +++ b/src/data_collection/reddit_crawler.ipynb @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lad wrote a Python script to download Alexa voice recordings, he didn't expect this email.\n", + "This post has:\n", + "I redesign the Python logo to make it more modern\n", + "Automate the boring stuff with python - tinder\n", + "Just finished programming and building my own smart mirror in python, wrote all of the code myself and implemented my own voice control and facial recognition features\n" + ] + } + ], + "source": [ + "import praw\n", + "import os\n", + "import json\n", + "import time\n", + "\n", + "from supabase import create_client, Client\n", + "\n", + "# Supabase setup\n", + "url: str = os.environ.get('SUPABASE_WATCHDB_URL')\n", + "key: str = os.environ.get('SUPABASE_WATCHDB_SERVICE_ROLE_KEY')\n", + "supabase: Client = create_client(url, key)\n", + "\n", + "# Reddit API Credentials\n", + "client_id = os.environ.get('REDDIT_APP_ID')\n", + "client_secret = os.environ.get('REDDIT_APP_KEY')\n", + "user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)'\n", + "\n", + "# Initialize PRAW with your credentials\n", + "reddit = praw.Reddit(client_id=client_id,\n", + " client_secret=client_secret,\n", + " user_agent=user_agent)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-06 17:53:19,214:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 409 Conflict\"\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Skipping insertion for post_id=1akq7lk as it already exists.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-06 17:53:19,456:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:19,743:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:20,017:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:20,316:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:20,715:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:20,948:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:21,310:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:21,584:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", + "2024-02-06 17:53:21,863:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n" + ] + } + ], + "source": [ + "# The subreddit you want to scrape\n", + "subreddit = reddit.subreddit('watchexchange')\n", + "\n", + "# Initialize a list to store data before saving to disk\n", + "data_list = []\n", + "\n", + "# Fetch the top posts from the subreddit\n", + "top_posts = subreddit.top(time_filter=\"hour\", limit=10) # Adjust limit as needed\n", + "\n", + "\n", + "# Push the data collected to Supabase\n", + "for post in top_posts:\n", + " post.comments.replace_more(limit=25) # Load all comments\n", + " # comments = [{'userid': comment.author.name, 'comment': comment.body} for comment in post.comments.list()]\n", + " comments = ' | '.join([f\"{comment.author.name}: {comment.body}\" for comment in post.comments.list()])\n", + "\n", + " post_data = {\n", + " 'post_id': post.id,\n", + " 'author_id': post.author.name,\n", + " 'title': post.title,\n", + " 'url': post.url,\n", + " 'comments': comments\n", + " }\n", + " \n", + " try:\n", + " # Attempt to insert post_data into your Supabase table\n", + " data_insert_response = supabase.table('rqueue').insert(post_data).execute()\n", + " except Exception as e:\n", + " if 'duplicate key value violates unique constraint \"rqueue_pkey\"' in str(e):\n", + " print(f\"Skipping insertion for post_id={post_data['post_id']} as it already exists.\")\n", + " else:\n", + " raise\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "from jsonschema import validate\n", + "from jsonschema.exceptions import ValidationError\n", + "\n", + "\n", + "def filter_invalid_entries(data, schema):\n", + " \"\"\"\n", + " Removes entries from the data that do not match the schema's type requirements.\n", + " \n", + " :param data: The JSON object to filter.\n", + " :param schema: The schema defining the expected types.\n", + " :return: A new dictionary with only the valid entries according to the schema.\n", + " \"\"\"\n", + " valid_data = {}\n", + " for key, value in data.items():\n", + " expected_type = schema.get(\"properties\", {}).get(key, {}).get(\"type\", None)\n", + " if expected_type:\n", + " if expected_type == 
\"number\" and value is not None:\n", + " print(key,value)\n", + " try:\n", + " converted_value = int(value)\n", + " except ValueError:\n", + " try:\n", + " converted_value = float(value)\n", + " except ValueError:\n", + " continue\n", + " valid_data[key] = converted_value\n", + " elif expected_type == \"string\" and isinstance(value, str):\n", + " valid_data[key] = value\n", + " return valid_data\n", + "\n", + "\n", + "\n", + "def validate_schema(data: dict):\n", + " # This code validates the schema of the provided JSON data\n", + " # and drops any entries that do not match the schema, effectively\n", + " # filtering out incorrect data.\n", + "\n", + " # Load the schema\n", + " with open('watch_schema.json', 'r') as file:\n", + " schema = json.load(file)\n", + "\n", + " json_data = filter_invalid_entries(data, schema)\n", + "\n", + " # Validate the JSON, on failure throw exception\n", + " validate(instance=json_data, schema=schema)\n", + " return json_data \n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Brand': 'Breitling', 'Model': 'SuperOcean II 36', 'Reference Number': 'A17312', 'Case Material': 'Not specified', 'Case Diameter': None, 'Case Thickness': None, 'Lug Width': None, 'Lug-to-Lug': None, 'Dial Color': 'Not specified', 'Crystal Type': 'Not specified', 'Water Resistance': 'Not specified', 'Movement': 'Not specified', 'Caliber': 'Not specified', 'Movement Type': 'Not specified', 'Power Reserve': 'Not specified', 'Bracelet/Strap Material': 'Not specified', 'Clasp Type': 'Not specified', 'Product Weight': 'Not specified', 'Features': 'Completely full kit with watch roll. 2016 year.', 'Price': 1699, 'Availability': 'Zelle, Venmo, or WU accepted', 'Photo URL': 'https://imgur.com/a/Quq7MB9', 'Merchant Name': 'Pristine_Courage_535', 'Product URL': 'https://www.reddit.com/r/Watchexchange/comments/1akq7lk/wts_breitling_superocean_a17312_white_full_kit/'}\n", + "Price 1699\n" + ] + }, + { + "data": { + "text/plain": [ + "{'Brand': 'Breitling',\n", + " 'Model': 'SuperOcean II 36',\n", + " 'Reference Number': 'A17312',\n", + " 'Case Material': 'Not specified',\n", + " 'Dial Color': 'Not specified',\n", + " 'Crystal Type': 'Not specified',\n", + " 'Water Resistance': 'Not specified',\n", + " 'Movement': 'Not specified',\n", + " 'Caliber': 'Not specified',\n", + " 'Movement Type': 'Not specified',\n", + " 'Power Reserve': 'Not specified',\n", + " 'Bracelet/Strap Material': 'Not specified',\n", + " 'Clasp Type': 'Not specified',\n", + " 'Product Weight': 'Not specified',\n", + " 'Features': 'Completely full kit with watch roll. 
2016 year.',\n", + " 'Price': 1699,\n", + " 'Availability': 'Zelle, Venmo, or WU accepted',\n", + " 'Photo URL': 'https://imgur.com/a/Quq7MB9',\n", + " 'Merchant Name': 'Pristine_Courage_535',\n", + " 'Product URL': 'https://www.reddit.com/r/Watchexchange/comments/1akq7lk/wts_breitling_superocean_a17312_white_full_kit/'}" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response_json = json.loads(response.choices[0].message.content)\n", + "print(response_json)\n", + "validated_response = validate_schema(response_json)\n", + "validated_response" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-07 11:37:51,075:INFO - HTTP Request: GET https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue?select=%2A&processed=eq.False&limit=1 \"HTTP/1.1 200 OK\"\n", + "2024-02-07 11:37:51,152:INFO - HTTP Request: PATCH https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue?post_id=eq.1akqglv \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "data=[{'created_at': '2024-02-07T01:53:19.464226+00:00', 'post_id': '1akqglv', 'author_id': 'liuserr', 'title': '[WTS] Sinn 356 on OEM Leather strap', 'url': 'https://i.redd.it/18ictee7d2hc1.jpeg', 'comments': 'AutoModerator: * u/liuserr has 149 Transactions and [this 7-day post history](https://www.reddit.com/r/Watchexchange/wiki/user_post_history/liuserr).\\n* [Click send on this message](https://www.reddit.com/message/compose/?to=WatchExBot&subject=Feedback_Check&message=u/liuserr) to get a detailed look at this user\\'s transaction history. \\n * If the above link doesn\\'t work, send a message containing the username (starting with u/) to the bot, `WatchExBot`\\n* This post: \"[[WTS] Sinn 356 on OEM Leather strap](https://www.reddit.com/r/Watchexchange/comments/1akqglv/wts_sinn_356_on_oem_leather_strap/)\" \\n\\n---\\n\\n**This post may be removed without notice if it does not follow the rules!**\\n\\n* **Pictures** are required. Do you have pictures? \\n* A **timestamp** is required. Do you have a ***handwritten timestamp*** under your item(s)? \\n* A **price** or trade value is required. Do you have a price? \\n \\nIf you make a mistake in your post, ***do not delete and repost.*** Message the mods *first* or your post may be removed and you\\'ll have to wait 7 days to repost.\\n\\n---\\n\\n[Click here](https://www.reddit.com/r/Watchexchange/comments/ihs99w/meta_new_flairbased_feedback/) to learn how to leave feedback that updates your transaction flair.\\n\\n**Avoid banned or unqualified users; do not deal with anyone unless they comment on this thread.** \\n \\nIf you believe you have been scammed, please [fill out this form](https://www.reddit.com/message/compose/?to=r/Watchexchange&subject=I%20Think%20I%20Have%20Been%20Scammed&message=*%20Username%20of%20the%20person%20scamming%20me:%20%0D%0A%0D%0A*%20Link%20to%20the%20thread%20where%20the%20transaction%20originated:%20%0D%0A%0D%0A*%20Link%20to%20screenshots%20uploaded%20to%20imgur%20of%20my%20conversations%20with%20the%20scammer:%20%0D%0A%0D%0A*%20An%20explanation%20of%20the%20situation:%20).\\n\\nThe presence of this message does **not** indicate a need to message the moderators. \\n\\n\\n*I am a bot, and this action was performed automatically. 
Please [contact the moderators of this subreddit](/message/compose/?to=/r/Watchexchange) if you have any questions or concerns.* | liuserr: For sale is a Sinn 356 that comes on an unworn leather strap. I\\'m selling it because a chronograph is a little too frail for everyday wear. The watch comes on a sandblasted case with very little signs of wear. I believe the day is slightly misaligned, but not by a terrible amount if so as I can barely tell when scrolling through the days.\\n\\nThe watch comes on its OEM black leather strap that has not been worn - there are no creases on the strap holes. It also comes with the inner and outer boxes, instruction manual, and warranty card dated: Jan 2020.\\n\\nModel: Sinn 356 Movement: SW-500 Case Size: 38.5mm Case Thickness: 15mm Lug Width: 20mm Lug to lug: 46mm Water resistance: 100 meters\\n\\nAlbum: https://imgur.com/a/Lbafeue \\nTimestamp: https://i.imgur.com/9OYeF6O.jpg\\n\\nPrice: $1700 shipped CONUS via Zelle, wire transfer, or Paypal F&F for established members. Face to face in the Bay Area is welcomed.\\n\\nNo trades please.', 'processed': False}] count=None\n", + "Case Diameter 38.5\n", + "Case Thickness 15\n", + "Lug Width 20\n", + "Lug-to-Lug 46\n", + "Price 1700\n" + ] + } + ], + "source": [ + "from openai import OpenAI\n", + "import os\n", + "import json\n", + "\n", + "# Set the API key\n", + "client = OpenAI(\n", + " api_key= os.environ.get('OPENAI_API_CHRONO_KEY')\n", + ")\n", + "\n", + "with open('query_schema.json') as f:\n", + " output_schema_str = f.read()\n", + "\n", + "# Fetch data from Supabase queue\n", + "try:\n", + " queue_data = supabase.table('rqueue').select('*').eq('processed', False).limit(1).execute()\n", + " if len(queue_data.data) < 2: # Fixed to check for non-empty data\n", + " for item in queue_data.data:\n", + " relevant_data = {key: item[key] for key in [\"author_id\", \"title\", \"url\", \"comments\"]}\n", + " item_json = json.dumps(relevant_data)\n", + " prompt = f\"Given the data: {item_json}, construct a JSON object that adheres to the specified output schema. 
Output schema: {output_schema_str}\"\n", + " try:\n", + " response = client.chat.completions.create( \n", + " model=\"gpt-3.5-turbo-0125\", \n", + " response_format={ \"type\": \"json_object\" },\n", + " messages=[{\"role\": \"system\", \"content\": \"You are a helpful assistant that outputs valid JSON.>\"}, \n", + " {\"role\": \"user\", \"content\": prompt}],\n", + " )\n", + " try:\n", + " response_json = json.loads(response.choices[0].message.content)\n", + " except Exception as e:\n", + " print(\"Error in openai response: \", e)\n", + "\n", + " try:\n", + " validated_response = validate_schema(response_json)\n", + " try:\n", + " # supabase.table(\"watches\").insert([validated_response]).execute()\n", + " supabase.table(\"rqueue\").update({\"processed\": True}).eq(\"post_id\", item[\"post_id\"]).execute()\n", + " except Exception as e:\n", + " print(f\"Failed to push to supabase (watches): {e}\")\n", + " except Exception as e:\n", + " print(f\"current response could not be validated: {e}\")\n", + " \n", + " except Exception as e:\n", + " print(f\"An OpenAI error occurred: {e}\")\n", + " \n", + "\n", + "except Exception as e:\n", + " print(f\"Failed to fetch data from Supabase (rqueue): {e}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/data_collection/reddit_crawler.py b/src/data_collection/reddit_crawler.py new file mode 100644 index 0000000..297aba4 --- /dev/null +++ b/src/data_collection/reddit_crawler.py @@ -0,0 +1,62 @@ +import praw +import os +import json +import time +import argparse + +from supabase import create_client, Client + +def main(time_filter, post_limit, comments_limit): + # Supabase setup + url: str = os.environ.get('SUPABASE_WATCHDB_URL') + key: str = os.environ.get('SUPABASE_WATCHDB_SERVICE_ROLE_KEY') + supabase: Client = create_client(url, key) + + # Reddit API Credentials + client_id = os.environ.get('REDDIT_APP_ID') + client_secret = os.environ.get('REDDIT_APP_KEY') + user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)' + + # Initialize PRAW with your credentials + reddit = praw.Reddit(client_id=client_id, + client_secret=client_secret, + user_agent=user_agent) + + # The subreddit you want to scrape + subreddit = reddit.subreddit('watchexchange') + + # Fetch the top posts from the subreddit + top_posts = subreddit.top(time_filter=time_filter, limit=post_limit) + + # Push the data collected to Supabase + for post in top_posts: + post.comments.replace_more(limit=comments_limit) # Load all comments + comments = ' | '.join([f"{comment.author.name}: {comment.body}" for comment in post.comments.list()]) + + post_data = { + 'post_id': post.id, + 'author_id': post.author.name, + 'title': post.title, + 'url': post.url, + 'comments': comments + } + + try: + # Attempt to insert post_data into your Supabase table + data_insert_response = supabase.table('rqueue').insert(post_data).execute() + except Exception as e: + if 'duplicate key value violates unique constraint "rqueue_pkey"' in str(e): + print(f"Skipping insertion for post_id={post_data['post_id']} as 
it already exists.") + else: + raise + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Reddit Crawler for Subreddit Posts") + parser.add_argument("--time_filter", help="Time filter for posts", default="hour") + parser.add_argument("--post_limit", help="Limit of posts to fetch", type=int, default=10) + parser.add_argument("--comments_limit", help="Limit of comments to fetch for each post", type=int, default=25) + + args = parser.parse_args() + + main(args.time_filter, args.post_limit, args.comments_limit) + diff --git a/src/data_collection/schema_validator.py b/src/data_collection/schema_validator.py new file mode 100644 index 0000000..49b44f3 --- /dev/null +++ b/src/data_collection/schema_validator.py @@ -0,0 +1,47 @@ +from jsonschema import validate +from jsonschema.exceptions import ValidationError + + +def filter_invalid_entries(data, schema): + """ + Removes entries from the data that do not match the schema's type requirements. + + :param data: The JSON object to filter. + :param schema: The schema defining the expected types. + :return: A new dictionary with only the valid entries according to the schema. + """ + valid_data = {} + for key, value in data.items(): + expected_type = schema.get("properties", {}).get(key, {}).get("type", None) + if expected_type: + if expected_type == "number" and value is not None: + print(key,value) + try: + converted_value = int(value) + except ValueError: + try: + converted_value = float(value) + except ValueError: + continue + valid_data[key] = converted_value + elif expected_type == "string" and isinstance(value, str): + valid_data[key] = value + return valid_data + + + +def validate_schema(data: dict): + # This code validates the schema of the provided JSON data + # and drops any entries that do not match the schema, effectively + # filtering out incorrect data. + + # Load the schema + with open('watch_schema.json', 'r') as file: + schema = json.load(file) + + json_data = filter_invalid_entries(data, schema) + + # Validate the JSON, on failure throw exception + validate(instance=json_data, schema=schema) + return json_data + diff --git a/test.ipynb b/src/data_collection/test.ipynb similarity index 100% rename from test.ipynb rename to src/data_collection/test.ipynb diff --git a/src/data_collection/watch_schema.json b/src/data_collection/watch_schema.json new file mode 100644 index 0000000..25f32ee --- /dev/null +++ b/src/data_collection/watch_schema.json @@ -0,0 +1,34 @@ +{ + "type": "object", + "properties": { + "Brand": {"type": "string"}, + "Model": {"type": "string"}, + "Reference Number": {"type": "string"}, + "Case Material": {"type": "string"}, + "Case Diameter": {"type": "number"}, + "Case Thickness": {"type": "number"}, + "Lug Width": {"type": "number"}, + "Lug-to-Lug": {"type": "number"}, + "Dial Color": {"type": "string"}, + "Crystal Type": {"type": "string"}, + "Water Resistance": {"type": "string"}, + "Movement": {"type": "string"}, + "Caliber": {"type": "string"}, + "Movement Type": {"type": "string"}, + "Power Reserve": {"type": "string"}, + "Bracelet/Strap Material": {"type": "string"}, + "Clasp Type": {"type": "string"}, + "Product Weight": {"type": "string"}, + "Features": {"type": "string"}, + "Price": {"type": "number"}, + "Availability": {"type": "string"}, + "Photo URL": {"type": "string"}, + "Merchant Name": {"type": "string"}, + "Product URL": {"type": "string"} + }, + "required": [ + "Brand", + "Model" + ], + "additionalProperties": false +} \ No newline at end of file