
update reddit cron job
MuslemRahimi committed Jul 27, 2024
1 parent 343797b commit ba91246
Showing 3 changed files with 80 additions and 84 deletions.
36 changes: 23 additions & 13 deletions app/cron_reddit_statistics.py
@@ -1,26 +1,37 @@
 import json
 import re
-import requests
+import praw
 from datetime import datetime
 from collections import defaultdict
 
-def get_subscriber_count():
-    url = "https://www.reddit.com/r/wallstreetbets/new.json"
-    headers = {'User-agent': 'Mozilla/5.0'}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        data = response.json()
-        return data['data']['children'][0]['data']['subreddit_subscribers']
-    return None
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
+
+
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
+
+# Function to save data
+def save_data(data):
+    with open('json/reddit-tracker/wallstreetbets/stats.json', 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
 
 
 def compute_daily_statistics(file_path):
     # Load the data from the JSON file
     with open(file_path, 'r', encoding='utf-8') as f:
         data = json.load(f)
 
-    # Get current subscriber count
-    subscriber_count = get_subscriber_count()
+
     # Initialize a defaultdict to store daily statistics
     daily_stats = defaultdict(lambda: {
         'post_count': 0,

@@ -56,7 +67,6 @@ def compute_daily_statistics(file_path):
             'date': date.isoformat(),
             'totalPosts': stats['post_count'],
             'totalComments': stats['total_comments'],
-            'subscribersCount': subscriber_count,
             'totalMentions': sum(stats['ticker_mentions'].values()),
             'companySpread': len(stats['unique_tickers']),
             'tickerMentions': dict(stats['ticker_mentions'])  # Optional: include detailed ticker mentions

@@ -67,4 +77,4 @@ def compute_daily_statistics(file_path):
 # Usage
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 daily_statistics = compute_daily_statistics(file_path)
-print(json.dumps(daily_statistics, indent=2))
+save_data(daily_statistics)
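For reference, a minimal sketch of the shape of one daily record the reworked compute_daily_statistics now writes to stats.json. The keys mirror the dict built in the diff above (note subscribersCount is gone along with get_subscriber_count); the values here are purely illustrative:

import json

# Hypothetical example of a single daily entry in stats.json
sample_entry = {
    'date': '2024-07-27',                         # date.isoformat()
    'totalPosts': 142,                            # stats['post_count']
    'totalComments': 3890,                        # stats['total_comments']
    'totalMentions': 510,                         # sum(stats['ticker_mentions'].values())
    'companySpread': 37,                          # len(stats['unique_tickers'])
    'tickerMentions': {'NVDA': 120, 'TSLA': 85},  # dict(stats['ticker_mentions'])
}
print(json.dumps(sample_entry, indent=2))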
125 changes: 55 additions & 70 deletions app/cron_reddit_tracker.py
@@ -1,18 +1,18 @@
-import requests
+import praw
 import json
 from datetime import datetime
 import os
+from dotenv import load_dotenv
+import time
 
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
 
-# URL of the Reddit API endpoint
-url = "https://www.reddit.com/r/wallstreetbets/new.json"
 # File path for the JSON data
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 
-headers = {
-    'User-Agent': 'python:myapp:v1.0 (by /u/realstocknear)'
-}
-
 
 # Ensure the directory exists
 os.makedirs(os.path.dirname(file_path), exist_ok=True)

@@ -28,76 +28,61 @@ def save_data(data):
     with open(file_path, 'w', encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
 
-# Function to get updated post data
-def get_updated_post_data(permalink):
-    post_url = f"https://www.reddit.com{permalink}.json"
-    response = requests.get(post_url, headers=headers)
-    if response.status_code == 200:
-        post_data = response.json()[0]['data']['children'][0]['data']
-        return post_data
-    return None
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
 
 # Load existing data
 existing_data = load_existing_data()
 
 # Create a dictionary of existing posts for faster lookup and update
 existing_posts = {post['id']: post for post in existing_data}
 
-# Send a GET request to the API
-response = requests.get(url, headers=headers)
+# Flag to check if any data was added or updated
+data_changed = False
 
-counter = 0
-# Check if the request was successful
-if response.status_code == 200:
-    # Parse the JSON data
-    data = response.json()
-
-    # Flag to check if any data was added or updated
-    data_changed = False
-
-    # Iterate through each post in the 'children' list
-    for post in data['data']['children']:
-        post_data = post['data']
-        post_id = post_data.get('id', '')
+# Get the subreddit
+subreddit = reddit.subreddit("wallstreetbets")
+
+# Iterate through new submissions
+for submission in subreddit.new(limit=1000):
+    post_id = submission.id
+    # Check if this post is already in our data
+    if post_id in existing_posts:
+        # Update existing post
+        existing_posts[post_id]['upvote_ratio'] = submission.upvote_ratio
+        existing_posts[post_id]['num_comments'] = submission.num_comments
+        data_changed = True
+    else:
+        # Extract the required fields for new post
+        extracted_post = {
+            "id": post_id,
+            "permalink": submission.permalink,
+            "title": submission.title,
+            "selftext": submission.selftext,
+            "created_utc": int(submission.created_utc),
+            "upvote_ratio": submission.upvote_ratio,
+            "num_comments": submission.num_comments,
+            "link_flair_text": submission.link_flair_text,
+            "author": str(submission.author),
+        }
 
-        # Check if this post is already in our data
-        if post_id in existing_posts:
-            # Update existing post
-            if counter < 25:  # Only update the latest 25 posts to not overload the reddit server
-                updated_data = get_updated_post_data(post_data['permalink'])
-                if updated_data:
-                    existing_posts[post_id]['upvote_ratio'] = updated_data.get('upvote_ratio', existing_posts[post_id]['upvote_ratio'])
-                    existing_posts[post_id]['num_comments'] = updated_data.get('num_comments', existing_posts[post_id]['num_comments'])
-                    data_changed = True
-                counter += 1
-                print(counter)
-        else:
-            # Extract the required fields for new post
-            extracted_post = {
-                "id": post_id,
-                "permalink": post_data.get('permalink', ''),
-                "title": post_data.get('title', ''),
-                "selftext": post_data.get('selftext', ''),
-                "created_utc": post_data.get('created_utc', ''),
-                "upvote_ratio": post_data.get('upvote_ratio', ''),
-                "num_comments": post_data.get('num_comments', ''),
-                "link_flair_text": post_data.get('link_flair_text', ''),
-                "author": post_data.get('author', ''),
-            }
-
-            # Add the new post to the existing data
-            existing_posts[post_id] = extracted_post
-            data_changed = True
+        # Add the new post to the existing data
+        existing_posts[post_id] = extracted_post
+        data_changed = True
 
+    time.sleep(1)  # Add a 1-second delay between processing submissions
+
+if data_changed:
+    # Convert the dictionary back to a list and sort by created_utc
+    updated_data = list(existing_posts.values())
+    updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
 
-    if data_changed:
-        # Convert the dictionary back to a list and sort by created_utc
-        updated_data = list(existing_posts.values())
-        updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
-
-        # Save the updated data
-        save_data(updated_data)
-        print(f"Data updated and saved to {file_path}")
-    else:
-        print("No new data to add or update.")
-else:
-    print(f"Failed to retrieve data. Status code: {response.status_code}")
+    # Save the updated data
+    save_data(updated_data)
+    print(f"Data updated and saved to {file_path}")
+else:
+    print("No new data to add or update.")
3 changes: 2 additions & 1 deletion requirements.txt
@@ -36,4 +36,5 @@ faker
 finnhub-python
 intrinio_sdk
 openai
-slowapi
+slowapi
+praw
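Both cron scripts load their credentials with python-dotenv, so a local .env file along these lines is assumed (the variable names come from the scripts above; the values are placeholders obtained when registering a Reddit script app):

REDDIT_API_KEY=your_client_id
REDDIT_API_SECRET=your_client_secret
REDDIT_USER_AGENT=python:myapp:v1.0 (by /u/your_username)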
