Skip to content

Commit

Permalink
intermediate crawl/db testing
Browse files Browse the repository at this point in the history
  • Loading branch information
aapatni committed Feb 7, 2024
1 parent 7962507 commit 68f37ba
Show file tree
Hide file tree
Showing 2 changed files with 363 additions and 0 deletions.
209 changes: 209 additions & 0 deletions reddit_crawler.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lad wrote a Python script to download Alexa voice recordings, he didn't expect this email.\n",
"This post has:\n",
"I redesign the Python logo to make it more modern\n",
"Automate the boring stuff with python - tinder\n",
"Just finished programming and building my own smart mirror in python, wrote all of the code myself and implemented my own voice control and facial recognition features\n"
]
}
],
"source": [
"import praw\n",
"import os\n",
"import json\n",
"import time\n",
"\n",
"from supabase import create_client, Client\n",
"\n",
"# Supabase setup\n",
"# Index os.environ directly so that a missing variable raises a KeyError naming\n",
"# it right here, instead of handing None to the client constructors below.\n",
"url: str = os.environ['SUPABASE_WATCHDB_URL']\n",
"key: str = os.environ['SUPABASE_WATCHDB_SERVICE_ROLE_KEY']\n",
"supabase: Client = create_client(url, key)\n",
"\n",
"# Reddit API Credentials\n",
"client_id = os.environ['REDDIT_APP_ID']\n",
"client_secret = os.environ['REDDIT_APP_KEY']\n",
"user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)'\n",
"\n",
"# Initialize PRAW with your credentials\n",
"reddit = praw.Reddit(\n",
"    client_id=client_id,\n",
"    client_secret=client_secret,\n",
"    user_agent=user_agent,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-06 17:53:19,214:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 409 Conflict\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Skipping insertion for post_id=1akq7lk as it already exists.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-06 17:53:19,456:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:19,743:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:20,017:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:20,316:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:20,715:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:20,948:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:21,310:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:21,584:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n",
"2024-02-06 17:53:21,863:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue \"HTTP/1.1 201 Created\"\n"
]
}
],
"source": [
"# The subreddit you want to scrape\n",
"subreddit = reddit.subreddit('watchexchange')\n",
"\n",
"\n",
"def author_name(item):\n",
"    \"\"\"Return the username of the author of a post or comment.\n",
"\n",
"    PRAW sets `author` to None when the account has been deleted, so fall\n",
"    back to a placeholder instead of raising AttributeError on `.name`.\n",
"    \"\"\"\n",
"    return item.author.name if item.author is not None else '[deleted]'\n",
"\n",
"\n",
"# Fetch the top posts from the subreddit\n",
"top_posts = subreddit.top(time_filter=\"hour\", limit=10)  # Adjust limit as needed\n",
"\n",
"# Push the data collected to Supabase\n",
"for post in top_posts:\n",
"    # Expand at most 25 'load more comments' placeholders (limit=None would expand all of them)\n",
"    post.comments.replace_more(limit=25)\n",
"    comments = ' | '.join(\n",
"        f\"{author_name(comment)}: {comment.body}\" for comment in post.comments.list()\n",
"    )\n",
"\n",
"    post_data = {\n",
"        'post_id': post.id,\n",
"        'author_id': author_name(post),\n",
"        'title': post.title,\n",
"        'url': post.url,\n",
"        'comments': comments,\n",
"    }\n",
"\n",
"    try:\n",
"        # Attempt to insert post_data into your Supabase table\n",
"        supabase.table('rqueue').insert(post_data).execute()\n",
"    except Exception as e:\n",
"        # A primary-key conflict means this post is already queued; anything else is a real failure\n",
"        if 'duplicate key value violates unique constraint \"rqueue_pkey\"' in str(e):\n",
"            print(f\"Skipping insertion for post_id={post_data['post_id']} as it already exists.\")\n",
"        else:\n",
"            raise\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-06 18:18:58,543:INFO - HTTP Request: GET https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/rqueue?select=%2A&processed=eq.False&limit=1 \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"An error occurred: \n",
"\n",
"You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.\n",
"\n",
"You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. \n",
"\n",
"Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`\n",
"\n",
"A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742\n",
"\n"
]
}
],
"source": [
"import openai\n",
"import json\n",
"import jsonschema\n",
"\n",
"# Load your OpenAI API key from an environment variable or directly\n",
"openai_api_key = os.environ.get('OPENAI_API_CHRONO_KEY')\n",
"\n",
"openai.api_key = openai_api_key\n",
"\n",
"with open('query_schema.json') as f:\n",
"    output_schema_str = f.read()\n",
"\n",
"# Fetch data from Supabase queue; limit(1) means at most one row is handled per run.\n",
"# Only the fetch sits inside the try, so the failure message below is accurate.\n",
"try:\n",
"    queue_data = supabase.table('rqueue').select('*').eq('processed', False).limit(1).execute()\n",
"except Exception as e:\n",
"    print(f\"Failed to fetch data from Supabase: {e}\")\n",
"    queue_rows = []\n",
"else:\n",
"    queue_rows = queue_data.data or []\n",
"    if not queue_rows:\n",
"        print(\"No data found in the queue.\")\n",
"\n",
"for item in queue_rows:\n",
"    # Convert the item to JSON string\n",
"    item_json = json.dumps(item)\n",
"\n",
"    prompt = f\"Given the data: {item_json}, construct a JSON object that adheres to the specified output schema. Here is the output schema: {output_schema_str}\"\n",
"    try:\n",
"        # openai>=1.0.0 removed openai.Completion, and gpt-3.5-turbo is a chat\n",
"        # model, so send the prompt as a single user message to chat completions.\n",
"        response = openai.chat.completions.create(\n",
"            model=\"gpt-3.5-turbo-0125\",\n",
"            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
"            temperature=0.5,  # Adjust as needed\n",
"            max_tokens=1024,  # Adjust based on your expected output size\n",
"            top_p=1.0,\n",
"            frequency_penalty=0.0,\n",
"            presence_penalty=0.0,\n",
"        )\n",
"        print(response.choices[0].message.content)\n",
"    except Exception as e:\n",
"        print(f\"An error occurred: {e}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
154 changes: 154 additions & 0 deletions test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import webtranspose as webt\n",
"import os\n",
"import json "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# # Crawl\n",
"# crawl = webt.Crawl(\n",
"# 'https://www.seikowatches.com/us-en/watchfinder?page=100',\n",
"# allowed_urls=[],\n",
"# banned_urls=[],\n",
"# max_pages=1,\n",
"# render_js=True,\n",
"# api_key=os.environ['WEBTRANSPOSE_API_KEY']\n",
"# )\n",
"\n",
"# crawl.queue_crawl() # async\n",
"\n",
"# # crawl.crawl() # sync\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the extraction schema inside a context manager so the file handle is closed\n",
"with open('query_schema.json') as schema_file:\n",
"    query = json.load(schema_file)\n",
"\n",
"scraper = webt.Scraper(\n",
"    query,\n",
"    scraper_id='scrape single seiko page',\n",
"    name='seiko_page_scraper',\n",
"    render_js=True,\n",
"    api_key=os.environ['WEBTRANSPOSE_API_KEY'],  # optional, if you want to run on cloud\n",
"    # proxy=True,  # optional, if you want to use proxy\n",
")\n",
"\n",
"# out_json = scraper.scrape('https://www.seikowatches.com/us-en/products/presage/spb441j1')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Sample record for a single Seiko watch, kept as a hand-written fixture\n",
"test_json = {\n",
"    # Identification\n",
"    'Brand': 'Seiko',\n",
"    'Model': 'SPB441J1',\n",
"    'Reference Number': 'SPB441J1',\n",
"    # Case and dial\n",
"    'Case Material': 'Stainless steel (super hard coating)',\n",
"    'Case Diameter': 35.0,\n",
"    'Case Thickness': 12.3,\n",
"    'Lug Width': 11,\n",
"    'Lug-to-Lug': 37.0,\n",
"    'Dial Color': 'Enamel',\n",
"    'Crystal Type': 'Box shaped sapphire crystal',\n",
"    'Water Resistance': '5 bar',\n",
"    # Movement\n",
"    'Movement': '6R5H',\n",
"    'Caliber': '6R5H',\n",
"    'Movement Type': 'Automatic with manual winding',\n",
"    # Strap, weight and price\n",
"    'Bracelet/Strap Material': 'Calfskin',\n",
"    'Product Weight': '59.0g',\n",
"    'Price': 1900,\n",
"    # Listing details\n",
"    'Photo URL': 'https://www.seikowatches.com/us-en/-/media/Images/Product--Image/All/Seiko/2023/10/16/11/57/SPB441J1/SPB441J1.png?mh=1000&mw=1000&hash=FED3D8AEDD94CFCC97DDA5671B158436',\n",
"    'Merchant Name': 'Seiko Watch Corporation',\n",
"    'Product URL': 'https://www.seikowatches.com/us-en/products/presage/spb441j1',\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-06 14:25:20,007:INFO - HTTP Request: POST https://gvvfniijngcyqnwvrbal.supabase.co/rest/v1/watches \"HTTP/1.1 409 Conflict\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'code': '23505', 'details': 'Key (\"Brand\", \"Reference Number\")=(Seiko, SPB441J1) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint \"watches_pkey\"'}\n",
"data=[{'Brand': 'Seiko', 'Model': 'SPB441J1', 'Reference Number': 'SPB441J1', 'Case Material': 'Stainless steel (super hard coating)', 'Case Diameter': 35, 'Case Thickness': 12.3, 'Lug Width': 11, 'Lug-to-Lug': 37, 'Dial Color': 'Enamel', 'Crystal Type': 'Box shaped sapphire crystal', 'Water Resistance': '5 bar', 'Movement': '6R5H', 'Caliber': '6R5H', 'Movement Type': 'Automatic with manual winding', 'Power Reserve': None, 'Bracelet/Strap Material': 'Calfskin', 'Clasp Type': None, 'Product Weight': '59.0g', 'Features': None, 'Price': 1900, 'Availability': None, 'Photo URL': 'https://www.seikowatches.com/us-en/-/media/Images/Product--Image/All/Seiko/2023/10/16/11/57/SPB441J1/SPB441J1.png?mh=1000&mw=1000&hash=FED3D8AEDD94CFCC97DDA5671B158436', 'Merchant Name': 'Seiko Watch Corporation', 'Product URL': 'https://www.seikowatches.com/us-en/products/presage/spb441j1'}] count=None\n"
]
}
],
"source": [
"from supabase import create_client, Client\n",
"\n",
"# Supabase project details\n",
"SUPABASE_URL = os.environ['SUPABASE_WATCHDB_URL']\n",
"SUPABASE_KEY = os.environ['SUPABASE_WATCHDB_SERVICE_ROLE_KEY']\n",
"\n",
"supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)\n",
"\n",
"# Your JSON data\n",
"data = [\n",
"    # Add your JSON objects here, e.g.,\n",
"    test_json,\n",
"]\n",
"\n",
"# Push data to the 'watches' table\n",
"try:\n",
"    response = supabase.table(\"watches\").insert(data).execute()\n",
"except Exception as e:\n",
"    # execute() raised, so this run never assigned `response`; report only the error\n",
"    print(e)\n",
"else:\n",
"    # Reached only when the insert succeeded, so `response` belongs to this run\n",
"    print(response)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 68f37ba

Please sign in to comment.