From 8e0e7c3d29e107c081a00a9d380a37720c47ad14 Mon Sep 17 00:00:00 2001
From: Adam Patni
Date: Fri, 15 Mar 2024 11:00:50 -0700
Subject: [PATCH] update the comment grabbing

---
 .github/workflows/main.yml            |  2 +-
 src/data_collection/reddit_crawler.py | 60 ++++++++++++++++++++-------
 2 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 50f5f48..bf3eba6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -44,7 +44,7 @@ jobs:
           DB_PORT: ${{ secrets.DB_PORT }}
           SUPABASE_WATCHDB_URL: ${{ secrets.SUPABASE_WATCHDB_URL }}
           SUPABASE_WATCHDB_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_WATCHDB_SERVICE_ROLE_KEY }}
-        run: python src/data_collection/reddit_crawler.py --post_limit 15
+        run: python src/data_collection/reddit_crawler.py --post_limit 15 --comments_limit 25
 
       - name: Run the watchdb populator to
         env:
diff --git a/src/data_collection/reddit_crawler.py b/src/data_collection/reddit_crawler.py
index 8fd569e..4ecb0e2 100644
--- a/src/data_collection/reddit_crawler.py
+++ b/src/data_collection/reddit_crawler.py
@@ -3,10 +3,10 @@
 import logging
 import os
 from datetime import datetime
-from dotenv import load_dotenv
 
 import praw
-from supabase import create_client, Client
+from dotenv import load_dotenv
+from supabase import Client, create_client
 
 load_dotenv()
 # Configure logging
@@ -14,6 +14,7 @@
     level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
+
 def main(time_filter, post_limit, comments_limit):
     # Reddit API Credentials
     client_id = os.environ.get("REDDIT_APP_ID")
@@ -48,39 +49,66 @@ def main(time_filter, post_limit, comments_limit):
 
     # Push the data collected to Supabase
     for post in top_posts:
-        post.comments.replace_more(limit=comments_limit)  # Load all comments
-        comments = " | ".join(
-            [
-                f"{comment.author.name}: {comment.body}"
-                for comment in post.comments.list()
-                if comment.author and comment.author.name and comment.body
-            ]
-        )
-        logging.debug(f"Collected comments for post ID: {post.id}")
+        comments = ""
+        post.comment_sort = "top"
+        post.comments.replace_more(limit=0)
+        for comment in post.comments.list():
+            if comment.author and comment.author.name == post.author.name:
+                comments += f"{comment.author.name}: {comment.body}"
+                break
+
+        post.comments.replace_more(limit=comments_limit)
+
+        max_iter = min(len(post.comments), comments_limit)
+        for i in range(max_iter):
+            if (
+                post.comments[i].author
+                and post.comments[i].author.name
+                and post.comments[i].body
+            ):
+                comments += (
+                    f" | {post.comments[i].author.name}: {post.comments[i].body}"
+                )
+        # comments = " | ".join(
+        #     [
+        #         f"{comment.author.name}: {comment.body}"
+        #         for comment in post.comments.list()
+        #         if comment.author and comment.author.name and comment.body
+        #     ]
+        # )
+        logging.info(f"Collected comments for post ID: {post.id}")
+        logging.info(f"comments: {len(comments)}{comments}")
 
         post_data = {
             "post_id": post.id,
-            "created_at": datetime.utcfromtimestamp(post.created_utc).strftime("%Y-%m-%d %H:%M:%S"),
+            "created_at": datetime.utcfromtimestamp(post.created_utc).strftime(
+                "%Y-%m-%d %H:%M:%S"
+            ),
             "author_id": post.author.name,
             "title": post.title,
             "url": post.url,
             "comments": comments,
-            "processed": False
+            "processed": False,
         }
 
         try:
             data, error = supabase.table("queued_posts").insert(post_data).execute()
             logging.info(f"Data inserted successfully for post ID: {post.id}")
         except Exception as e:
-            if hasattr(e, 'args') and len(e.args) > 0:
+            if hasattr(e, "args") and len(e.args) > 0:
                 error_message = e.args[0]
-                if '23505' in error_message:  # Check if the error message contains the unique constraint violation code
-                    logging.warning(f"Duplicate entry for post ID: {post.id}, skipping.")
+                if (
+                    "23505" in error_message
+                ):  # Check if the error message contains the unique constraint violation code
+                    logging.warning(
+                        f"Duplicate entry for post ID: {post.id}, skipping."
+                    )
                 else:
                     logging.error(f"Error inserting data: {error_message}")
             else:
                 logging.error(f"Error inserting data with unknown format: {e}")
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Reddit WatchExchange Crawler")
     parser.add_argument("--time_filter", help="Time filter for posts", default="hour")
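
Note (reviewer sketch, not part of the commit): the revised flow is two passes over each post, first pinning the submitter's own comment, then appending up to --comments_limit top-level comments. A minimal standalone version is below, assuming PRAW credentials in the environment; REDDIT_APP_KEY, the user_agent string, and the "Watchexchange" subreddit name are illustrative guesses, since only REDDIT_APP_ID and the CLI flags appear in the diff. One observation: replace_more(limit=0) already strips every MoreComments placeholder, so the later replace_more(limit=comments_limit) in the patch is effectively a no-op; the sketch omits it.

    import os

    import praw

    reddit = praw.Reddit(
        client_id=os.environ["REDDIT_APP_ID"],
        client_secret=os.environ["REDDIT_APP_KEY"],  # assumed secret name
        user_agent="watchdb-crawler sketch",  # assumed user agent
    )

    comments_limit = 25  # matches --comments_limit 25 in the workflow
    subreddit = reddit.subreddit("Watchexchange")  # assumed subreddit
    for post in subreddit.top(time_filter="hour", limit=15):
        comments = ""
        post.comment_sort = "top"
        post.comments.replace_more(limit=0)  # drop all MoreComments stubs
        # Pass 1: find the submitter's own comment (listing details) first.
        for comment in post.comments.list():
            if (
                comment.author
                and post.author  # guard for deleted submitters
                and comment.author.name == post.author.name
            ):
                comments += f"{comment.author.name}: {comment.body}"
                break
        # Pass 2: append up to comments_limit top-level comments, skipping
        # deleted authors and empty bodies, mirroring the patched loop.
        for i in range(min(len(post.comments), comments_limit)):
            c = post.comments[i]
            if c.author and c.author.name and c.body:
                comments += f" | {c.author.name}: {c.body}"
        print(post.id, len(comments))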