add reddit tracker
MuslemRahimi committed Jul 27, 2024
1 parent 7100791 commit 11a0619
Showing 2 changed files with 168 additions and 0 deletions.
70 changes: 70 additions & 0 deletions app/cron_reddit_statistics.py
@@ -0,0 +1,70 @@
import json
import re
import requests
from datetime import datetime, timezone
from collections import defaultdict

def get_subscriber_count():
    """Fetch the current r/wallstreetbets subscriber total from the newest-posts listing."""
    url = "https://www.reddit.com/r/wallstreetbets/new.json"
    headers = {'User-agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        data = response.json()
        # Every post in the listing carries the subreddit-wide subscriber count
        return data['data']['children'][0]['data']['subreddit_subscribers']
    return None

def compute_daily_statistics(file_path):
# Load the data from the JSON file
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)

# Get current subscriber count
subscriber_count = get_subscriber_count()

# Initialize a defaultdict to store daily statistics
daily_stats = defaultdict(lambda: {
'post_count': 0,
'total_comments': 0,
'ticker_mentions': defaultdict(int),
'unique_tickers': set()
})

# Compile regex pattern for finding tickers
ticker_pattern = re.compile(r'\$([A-Z]+)')
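    # e.g. "YOLO into $GME and $AMC" -> ['GME', 'AMC']; bare symbols without '$' are not counted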

# Process each post
for post in data:
        # Convert the UTC timestamp to a date (timezone-aware; utcfromtimestamp is deprecated)
        post_date = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc).date()

# Update statistics for this day
daily_stats[post_date]['post_count'] += 1
daily_stats[post_date]['total_comments'] += post['num_comments']

# Find ticker mentions in title and selftext
        text_to_search = post.get('title', '') + ' ' + post.get('selftext', '')
tickers = ticker_pattern.findall(text_to_search)

for ticker in tickers:
daily_stats[post_date]['ticker_mentions'][ticker] += 1
daily_stats[post_date]['unique_tickers'].add(ticker)

# Calculate averages and format the results
formatted_stats = []
for date, stats in sorted(daily_stats.items(), reverse=True):
formatted_stats.append({
'date': date.isoformat(),
'totalPosts': stats['post_count'],
'totalComments': stats['total_comments'],
'subscribersCount': subscriber_count,
'totalMentions': sum(stats['ticker_mentions'].values()),
'companySpread': len(stats['unique_tickers']),
'tickerMentions': dict(stats['ticker_mentions']) # Optional: include detailed ticker mentions
})

return formatted_stats

# Usage
file_path = 'json/reddit-tracker/wallstreetbets/data.json'
daily_statistics = compute_daily_statistics(file_path)
print(json.dumps(daily_statistics, indent=2))
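
For reference, the printed result is a list of per-day summaries, newest first. An illustrative entry (hypothetical numbers, not from a real run):

    [
      {
        "date": "2024-07-27",
        "totalPosts": 150,
        "totalComments": 4200,
        "subscribersCount": 15000000,
        "totalMentions": 320,
        "companySpread": 45,
        "tickerMentions": {"GME": 80, "NVDA": 60}
      }
    ]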
98 changes: 98 additions & 0 deletions app/cron_reddit_tracker.py
@@ -0,0 +1,98 @@
import requests
import json
from datetime import datetime
import os

# URL of the Reddit API endpoint
url = "https://www.reddit.com/r/wallstreetbets/new.json"
# File path for the JSON data
file_path = 'json/reddit-tracker/wallstreetbets/data.json'

# Ensure the directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Function to load existing data
def load_existing_data():
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
return []

# Function to save data
def save_data(data):
with open(file_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)

# Function to get updated post data
def get_updated_post_data(permalink):
post_url = f"https://www.reddit.com{permalink}.json"
    response = requests.get(post_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=10)
if response.status_code == 200:
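        # The permalink endpoint returns a two-element array: [post listing, comment listing]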
post_data = response.json()[0]['data']['children'][0]['data']
return post_data
return None

# Load existing data
existing_data = load_existing_data()

# Create a dictionary of existing posts for faster lookup and update
existing_posts = {post['id']: post for post in existing_data}

# Send a GET request to the API
response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'}, timeout=10)

counter = 0
# Check if the request was successful
if response.status_code == 200:
# Parse the JSON data
data = response.json()

# Flag to check if any data was added or updated
data_changed = False

# Iterate through each post in the 'children' list
for post in data['data']['children']:
post_data = post['data']
post_id = post_data.get('id', '')

# Check if this post is already in our data
        if post_id in existing_posts:
            # Refresh engagement metrics on posts we already track
            if counter < 25:  # Only update the latest 25 posts, to avoid overloading the Reddit servers
                updated_data = get_updated_post_data(post_data['permalink'])
                if updated_data:
                    existing_posts[post_id]['upvote_ratio'] = updated_data.get('upvote_ratio', existing_posts[post_id]['upvote_ratio'])
                    existing_posts[post_id]['num_comments'] = updated_data.get('num_comments', existing_posts[post_id]['num_comments'])
                    data_changed = True
                counter += 1
                print(f"Refreshed post {counter} of 25")
else:
# Extract the required fields for new post
extracted_post = {
"id": post_id,
"permalink": post_data.get('permalink', ''),
"title": post_data.get('title', ''),
"selftext": post_data.get('selftext', ''),
"created_utc": post_data.get('created_utc', ''),
"upvote_ratio": post_data.get('upvote_ratio', ''),
"num_comments": post_data.get('num_comments', ''),
"link_flair_text": post_data.get('link_flair_text', ''),
"author": post_data.get('author', ''),
}

# Add the new post to the existing data
existing_posts[post_id] = extracted_post
data_changed = True

if data_changed:
# Convert the dictionary back to a list and sort by created_utc
updated_data = list(existing_posts.values())
updated_data.sort(key=lambda x: x['created_utc'], reverse=True)

# Save the updated data
save_data(updated_data)
print(f"Data updated and saved to {file_path}")
else:
print("No new data to add or update.")
else:
print(f"Failed to retrieve data. Status code: {response.status_code}")
