
Commit

update reddit crawler to handle duplicate insertions into the queue
aapatni committed Feb 7, 2024
1 parent e651303 commit 6a19290
Showing 2 changed files with 7 additions and 7 deletions.
11 changes: 5 additions & 6 deletions src/data_collection/reddit_crawler.py
@@ -4,6 +4,7 @@
 import time
 import argparse
 
+from postgrest.exceptions import APIError
 from supabase import create_client, Client
 
 def main(time_filter, post_limit, comments_limit):
@@ -24,7 +25,6 @@ def main(time_filter, post_limit, comments_limit):
                          client_secret=client_secret,
                          user_agent=user_agent)
 
-    # The subreddit you want to scrape
     subreddit = reddit.subreddit('watchexchange')
 
     # Fetch the top posts from the subreddit
@@ -44,13 +44,12 @@
         }
 
         try:
-            # Attempt to insert post_data into your Supabase table
             data_insert_response = supabase.table('rqueue').insert(post_data).execute()
-        except Exception as e:
-            if 'duplicate key value violates unique constraint "rqueue_pkey"' in str(e):
-                print(f"Skipping insertion for post_id={post_data['post_id']} as it already exists.")
+        except APIError as api_error:
+            if api_error.code == '23505':
+                print(f"Duplicate entry ({post.id}), skipping")
             else:
-                raise
+                raise api_error
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Reddit WatchExchange Crawler")
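In short, the change stops matching on the text of a generic Exception and instead catches postgrest's APIError, checking PostgreSQL's unique-violation SQLSTATE (23505), which stays stable across Postgres versions and constraint names while the message text may not. A minimal, self-contained sketch of that pattern, assuming an already-configured Supabase client and the rqueue table from the diff; the insert_post helper is illustrative and not part of this commit:

from postgrest.exceptions import APIError
from supabase import Client

def insert_post(supabase: Client, post_data: dict) -> None:
    # Illustrative helper (not in this commit): queue one crawled post,
    # treating a primary-key collision on rqueue as "already queued".
    try:
        supabase.table('rqueue').insert(post_data).execute()
    except APIError as api_error:
        if api_error.code == '23505':
            # 23505 is PostgreSQL's unique_violation SQLSTATE.
            print(f"Duplicate entry ({post_data['post_id']}), skipping")
        else:
            raise

Checking the error code rather than the message also extends naturally to other constraint violations without rewriting the handler.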
3 changes: 2 additions & 1 deletion src/data_collection/requirements.txt
@@ -1,4 +1,5 @@
 praw
 supabase
 openai
-schema-validator
+schema-validator
+postgrest
