
Commit

update reddit crawler to handle duplicate insertions into the queue
aapatni committed Feb 7, 2024
1 parent e651303 commit 6a19290
Showing 2 changed files with 7 additions and 7 deletions.
11 changes: 5 additions & 6 deletions src/data_collection/reddit_crawler.py
@@ -4,6 +4,7 @@
 import time
 import argparse
 
+from postgrest.exceptions import APIError
 from supabase import create_client, Client
 
 def main(time_filter, post_limit, comments_limit):
@@ -24,7 +25,6 @@ def main(time_filter, post_limit, comments_limit):
                          client_secret=client_secret,
                          user_agent=user_agent)
 
-    # The subreddit you want to scrape
     subreddit = reddit.subreddit('watchexchange')
 
     # Fetch the top posts from the subreddit
@@ -44,13 +44,12 @@
         }
 
         try:
-            # Attempt to insert post_data into your Supabase table
             data_insert_response = supabase.table('rqueue').insert(post_data).execute()
-        except Exception as e:
-            if 'duplicate key value violates unique constraint "rqueue_pkey"' in str(e):
-                print(f"Skipping insertion for post_id={post_data['post_id']} as it already exists.")
+        except APIError as api_error:
+            if api_error.code == '23505':
+                print(f"Duplicate entry ({post.id}), skipping")
             else:
-                raise
+                raise api_error
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Reddit WatchExchange Crawler")
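In short, the change stops matching on the text of a generic Exception and instead catches postgrest's APIError, checking PostgreSQL's unique-violation SQLSTATE (23505), which stays stable across Postgres versions and constraint names while the message text may not. A minimal, self-contained sketch of that pattern, assuming an already-configured Supabase client and the rqueue table from the diff; the insert_post helper is illustrative and not part of this commit:

from postgrest.exceptions import APIError
from supabase import Client

def insert_post(supabase: Client, post_data: dict) -> None:
    # Illustrative helper (not in this commit): queue one crawled post,
    # treating a primary-key collision on rqueue as "already queued".
    try:
        supabase.table('rqueue').insert(post_data).execute()
    except APIError as api_error:
        if api_error.code == '23505':
            # 23505 is PostgreSQL's unique_violation SQLSTATE.
            print(f"Duplicate entry ({post_data['post_id']}), skipping")
        else:
            raise

Checking the error code rather than the message also extends naturally to other constraint violations without rewriting the handler.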
3 changes: 2 additions & 1 deletion src/data_collection/requirements.txt
@@ -1,4 +1,5 @@
 praw
 supabase
 openai
-schema-validator
+schema-validator
+postgrest
