-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
363 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,209 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Lad wrote a Python script to download Alexa voice recordings, he didn't expect this email.\n", | ||
"This post has:\n", | ||
"I redesign the Python logo to make it more modern\n", | ||
"Automate the boring stuff with python - tinder\n", | ||
"Just finished programming and building my own smart mirror in python, wrote all of the code myself and implemented my own voice control and facial recognition features\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import praw\n", | ||
"import os\n", | ||
"import json\n", | ||
"import time\n", | ||
"\n", | ||
"from supabase import create_client, Client\n", | ||
"\n", | ||
"# Supabase setup\n", | ||
"# Index os.environ directly: a missing variable then fails right here with a KeyError naming it,\n", | ||
"# rather than handing None to create_client despite the str annotations.\n", | ||
"url: str = os.environ['SUPABASE_WATCHDB_URL']\n", | ||
"key: str = os.environ['SUPABASE_WATCHDB_SERVICE_ROLE_KEY']\n", | ||
"supabase: Client = create_client(url, key)\n", | ||
"\n", | ||
"# Reddit API Credentials (same fail-fast lookup as the Supabase settings above)\n", | ||
"client_id = os.environ['REDDIT_APP_ID']\n", | ||
"client_secret = os.environ['REDDIT_APP_KEY']\n", | ||
"user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)'\n", | ||
"\n", | ||
"# Initialize PRAW with your credentials\n", | ||
"reddit = praw.Reddit(\n", | ||
"    client_id=client_id,\n", | ||
"    client_secret=client_secret,\n", | ||
"    user_agent=user_agent,\n", | ||
")\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"2024-02-06 17:53:19,214:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 409 Conflict\"\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Skipping insertion for post_id=1akq7lk as it already exists.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"2024-02-06 17:53:19,456:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:19,743:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:20,017:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:20,316:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:20,715:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:20,948:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:21,310:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:21,584:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n", | ||
"2024-02-06 17:53:21,863:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# The subreddit you want to scrape\n", | ||
"subreddit = reddit.subreddit('watchexchange')\n", | ||
"\n", | ||
"# Initialize a list to store data before saving to disk\n", | ||
"data_list = []\n", | ||
"\n", | ||
"\n", | ||
"def _author_name(item):\n", | ||
"    \"\"\"Return the username of a post/comment author, or '[deleted]' when the author is None.\"\"\"\n", | ||
"    # PRAW sets author to None for deleted accounts, so item.author.name would raise AttributeError\n", | ||
"    return item.author.name if item.author is not None else '[deleted]'\n", | ||
"\n", | ||
"\n", | ||
"# Fetch the top posts from the subreddit\n", | ||
"top_posts = subreddit.top(time_filter=\"hour\", limit=10) # Adjust limit as needed\n", | ||
"\n", | ||
"# Push the data collected to Supabase\n", | ||
"for post in top_posts:\n", | ||
"    # Expand at most 25 'load more comments' placeholders (use limit=None to expand all of them)\n", | ||
"    post.comments.replace_more(limit=25)\n", | ||
"    # Flatten the comment tree into a single 'author: body | author: body' string\n", | ||
"    comments = ' | '.join(\n", | ||
"        f\"{_author_name(comment)}: {comment.body}\" for comment in post.comments.list()\n", | ||
"    )\n", | ||
"\n", | ||
"    post_data = {\n", | ||
"        'post_id': post.id,\n", | ||
"        'author_id': _author_name(post),\n", | ||
"        'title': post.title,\n", | ||
"        'url': post.url,\n", | ||
"        'comments': comments\n", | ||
"    }\n", | ||
"\n", | ||
"    try:\n", | ||
"        # Attempt to insert post_data into your Supabase table\n", | ||
"        data_insert_response = supabase.table('rqueue').insert(post_data).execute()\n", | ||
"    except Exception as e:\n", | ||
"        # A primary-key conflict means the post is already queued; anything else is a real failure\n", | ||
"        if 'duplicate key value violates unique constraint \"rqueue_pkey\"' in str(e):\n", | ||
"            print(f\"Skipping insertion for post_id={post_data['post_id']} as it already exists.\")\n", | ||
"        else:\n", | ||
"            raise\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"2024-02-06 18:18:58,543:INFO - HTTP Request: GET https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue?select=%2A&processed=eq.False&limit=1 \"HTTP/1.1 200 OK\"\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"An error occurred: \n", | ||
"\n", | ||
"You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.\n", | ||
"\n", | ||
"You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. \n", | ||
"\n", | ||
"Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`\n", | ||
"\n", | ||
"A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import openai\n", | ||
"import json\n", | ||
"import jsonschema\n", | ||
"\n", | ||
"# Load your OpenAI API key from an environment variable or directly\n", | ||
"openai_api_key = os.environ.get('OPENAI_API_CHRONO_KEY')\n", | ||
"\n", | ||
"openai.api_key = openai_api_key\n", | ||
"\n", | ||
"# The output schema is pasted verbatim into the prompt, so keep it as raw text\n", | ||
"with open('query_schema.json') as f:\n", | ||
"    output_schema_str = f.read()\n", | ||
"\n", | ||
"# Fetch data from Supabase queue\n", | ||
"# Only the fetch sits inside this try, so the message below is printed for fetch failures only.\n", | ||
"queue_rows = None\n", | ||
"try:\n", | ||
"    queue_data = supabase.table('rqueue').select('*').eq('processed', False).limit(1).execute()\n", | ||
"    queue_rows = queue_data.data\n", | ||
"except Exception as e:\n", | ||
"    print(f\"Failed to fetch data from Supabase: {e}\")\n", | ||
"\n", | ||
"if queue_rows:\n", | ||
"    for item in queue_rows:\n", | ||
"        # Convert the item to JSON string\n", | ||
"        item_json = json.dumps(item)\n", | ||
"\n", | ||
"        prompt = f\"Given the data: {item_json}, construct a JSON object that adheres to the specified output schema. Here is the output schema: {output_schema_str}\"\n", | ||
"        try:\n", | ||
"            # openai>=1.0 removed openai.Completion; gpt-3.5-turbo models are served by the\n", | ||
"            # chat completions endpoint, which takes a list of messages instead of a prompt.\n", | ||
"            response = openai.chat.completions.create(\n", | ||
"                model=\"gpt-3.5-turbo-0125\",\n", | ||
"                messages=[{\"role\": \"user\", \"content\": prompt}],\n", | ||
"                temperature=0.5, # Adjust as needed\n", | ||
"                max_tokens=1024, # Adjust based on your expected output size\n", | ||
"                top_p=1.0,\n", | ||
"                frequency_penalty=0.0,\n", | ||
"                presence_penalty=0.0\n", | ||
"            )\n", | ||
"            # Chat responses carry the generated text in message.content rather than .text\n", | ||
"            print(response.choices[0].message.content)\n", | ||
"        except Exception as e:\n", | ||
"            print(f\"An error occurred: {e}\")\n", | ||
"elif queue_rows is not None:\n", | ||
"    # The fetch succeeded but returned no unprocessed rows\n", | ||
"    print(\"No data found in the queue.\")\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "base", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import webtranspose as webt\n", | ||
"import os\n", | ||
"import json " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# # Crawl\n", | ||
"# crawl = webt.Crawl(\n", | ||
"# 'https://www.seikowatches.com/us-en/watchfinder?page=100',\n", | ||
"# allowed_urls=[],\n", | ||
"# banned_urls=[],\n", | ||
"# max_pages=1,\n", | ||
"# render_js=True,\n", | ||
"# api_key=os.environ['WEBTRANSPOSE_API_KEY']\n", | ||
"# )\n", | ||
"\n", | ||
"# crawl.queue_crawl() # async\n", | ||
"\n", | ||
"# # crawl.crawl() # sync\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the extraction schema; the with-block closes the file handle once it has been parsed\n", | ||
"with open('query_schema.json') as schema_file:\n", | ||
"    query = json.load(schema_file)\n", | ||
"\n", | ||
"scraper = webt.Scraper(\n", | ||
"    query,\n", | ||
"    scraper_id='scrape single seiko page',\n", | ||
"    name='seiko_page_scraper',\n", | ||
"    render_js=True,\n", | ||
"    api_key=os.environ['WEBTRANSPOSE_API_KEY'], # optional, if you want to run on cloud\n", | ||
"    # proxy=True, # optional, if you want to use proxy\n", | ||
")\n", | ||
"\n", | ||
"# out_json = scraper.scrape('https://www.seikowatches.com/us-en/products/presage/spb441j1')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Sample watch record (Seiko SPB441J1), one field per line\n", | ||
"test_json = {\n", | ||
"    'Brand': 'Seiko',\n", | ||
"    'Model': 'SPB441J1',\n", | ||
"    'Reference Number': 'SPB441J1',\n", | ||
"    'Case Material': 'Stainless steel (super hard coating)',\n", | ||
"    'Case Diameter': 35.0,\n", | ||
"    'Case Thickness': 12.3,\n", | ||
"    'Lug Width': 11,\n", | ||
"    'Lug-to-Lug': 37.0,\n", | ||
"    'Dial Color': 'Enamel',\n", | ||
"    'Crystal Type': 'Box shaped sapphire crystal',\n", | ||
"    'Water Resistance': '5 bar',\n", | ||
"    'Movement': '6R5H',\n", | ||
"    'Caliber': '6R5H',\n", | ||
"    'Movement Type': 'Automatic with manual winding',\n", | ||
"    'Bracelet/Strap Material': 'Calfskin',\n", | ||
"    'Product Weight': '59.0g',\n", | ||
"    'Price': 1900,\n", | ||
"    'Photo URL': 'https://www.seikowatches.com/us-en/-/media/Images/Product--Image/All/Seiko/2023/10/16/11/57/SPB441J1/SPB441J1.png?mh=1000&mw=1000&hash=FED3D8AEDD94CFCC97DDA5671B158436',\n", | ||
"    'Merchant Name': 'Seiko Watch Corporation',\n", | ||
"    'Product URL': 'https://www.seikowatches.com/us-en/products/presage/spb441j1',\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"2024-02-06 14:25:20,007:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/watches \"HTTP/1.1 409 Conflict\"\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"{'code': '23505', 'details': 'Key (\"Brand\", \"Reference Number\")=(Seiko, SPB441J1) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint \"watches_pkey\"'}\n", | ||
"data=[{'Brand': 'Seiko', 'Model': 'SPB441J1', 'Reference Number': 'SPB441J1', 'Case Material': 'Stainless steel (super hard coating)', 'Case Diameter': 35, 'Case Thickness': 12.3, 'Lug Width': 11, 'Lug-to-Lug': 37, 'Dial Color': 'Enamel', 'Crystal Type': 'Box shaped sapphire crystal', 'Water Resistance': '5 bar', 'Movement': '6R5H', 'Caliber': '6R5H', 'Movement Type': 'Automatic with manual winding', 'Power Reserve': None, 'Bracelet/Strap Material': 'Calfskin', 'Clasp Type': None, 'Product Weight': '59.0g', 'Features': None, 'Price': 1900, 'Availability': None, 'Photo URL': 'https://www.seikowatches.com/us-en/-/media/Images/Product--Image/All/Seiko/2023/10/16/11/57/SPB441J1/SPB441J1.png?mh=1000&mw=1000&hash=FED3D8AEDD94CFCC97DDA5671B158436', 'Merchant Name': 'Seiko Watch Corporation', 'Product URL': 'https://www.seikowatches.com/us-en/products/presage/spb441j1'}] count=None\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from supabase import create_client, Client\n", | ||
"\n", | ||
"# Supabase project details\n", | ||
"SUPABASE_URL = os.environ['SUPABASE_WATCHDB_URL']\n", | ||
"SUPABASE_KEY = os.environ['SUPABASE_WATCHDB_SERVICE_ROLE_KEY']\n", | ||
"\n", | ||
"supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)\n", | ||
"\n", | ||
"# Your JSON data\n", | ||
"data = [\n", | ||
"    # Add your JSON objects here, e.g.,\n", | ||
"    test_json\n", | ||
"]\n", | ||
"\n", | ||
"# Push data to the 'watches' table\n", | ||
"try:\n", | ||
"    response = supabase.table(\"watches\").insert(data).execute()\n", | ||
"except Exception as e:\n", | ||
"    # response is not assigned when the insert raises, so only the error is reported here;\n", | ||
"    # printing response in this branch would show a stale value from an earlier run or raise NameError.\n", | ||
"    print(e)\n", | ||
"else:\n", | ||
"    # Reached only when the insert succeeded\n", | ||
"    print(response)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "base", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |