diff --git a/app/cron_reddit_statistics.py b/app/cron_reddit_statistics.py
new file mode 100644
index 0000000..8c40ab2
--- /dev/null
+++ b/app/cron_reddit_statistics.py
@@ -0,0 +1,80 @@
+import json
+import re
+import requests
+from datetime import datetime, timezone
+from collections import defaultdict
+
+def get_subscriber_count():
+    url = "https://www.reddit.com/r/wallstreetbets/new.json"
+    headers = {'User-agent': 'Mozilla/5.0'}
+    response = requests.get(url, headers=headers, timeout=10)
+    if response.status_code == 200:
+        data = response.json()
+        return data['data']['children'][0]['data']['subreddit_subscribers']
+    return None
+
+def compute_daily_statistics(file_path):
+    # Load the data from the JSON file
+    with open(file_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    # Get the current subscriber count
+    subscriber_count = get_subscriber_count()
+
+    # Initialize a defaultdict to store daily statistics
+    daily_stats = defaultdict(lambda: {
+        'post_count': 0,
+        'total_comments': 0,
+        'ticker_mentions': defaultdict(int),
+        'unique_tickers': set()
+    })
+
+    # Compile a regex that matches tickers written with a "$" prefix, e.g. "$GME"
+    ticker_pattern = re.compile(r'\$([A-Z]+)')
+
+    # Process each post
+    for post in data:
+        # Convert the UTC timestamp to a date (datetime.utcfromtimestamp is
+        # deprecated, so use fromtimestamp with an explicit timezone)
+        post_date = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc).date()
+
+        # Update statistics for this day
+        daily_stats[post_date]['post_count'] += 1
+        daily_stats[post_date]['total_comments'] += post.get('num_comments', 0)
+
+        # Find ticker mentions in title and selftext
+        text_to_search = post.get('title', '') + ' ' + post.get('selftext', '')
+        tickers = ticker_pattern.findall(text_to_search)
+
+        for ticker in tickers:
+            daily_stats[post_date]['ticker_mentions'][ticker] += 1
+            daily_stats[post_date]['unique_tickers'].add(ticker)
+
+    # Format the results, newest day first
+    formatted_stats = []
+    for date, stats in sorted(daily_stats.items(), reverse=True):
+        formatted_stats.append({
+            'date': date.isoformat(),
+            'totalPosts': stats['post_count'],
+            'totalComments': stats['total_comments'],
+            'subscribersCount': subscriber_count,
+            'totalMentions': sum(stats['ticker_mentions'].values()),
+            'companySpread': len(stats['unique_tickers']),
+            'tickerMentions': dict(stats['ticker_mentions'])  # Optional: detailed per-ticker counts
+        })
+
+    return formatted_stats
+
+# Usage
+file_path = 'json/reddit-tracker/wallstreetbets/data.json'
+daily_statistics = compute_daily_statistics(file_path)
+print(json.dumps(daily_statistics, indent=2))
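+
+# Optional sketch: persist the computed statistics next to the raw data so a
+# frontend or another cron job can pick them up. The output path below is an
+# assumption, not a file the tracker already writes.
+def save_statistics(stats, out_path='json/reddit-tracker/wallstreetbets/stats.json'):
+    with open(out_path, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, ensure_ascii=False, indent=2)
+
+# save_statistics(daily_statistics)  # uncomment to write the stats to disk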
diff --git a/app/cron_reddit_tracker.py b/app/cron_reddit_tracker.py
new file mode 100644
index 0000000..c3b8bed
--- /dev/null
+++ b/app/cron_reddit_tracker.py
@@ -0,0 +1,109 @@
+import requests
+import json
+import os
+
+# URL of the subreddit's newest-posts endpoint
+url = "https://www.reddit.com/r/wallstreetbets/new.json"
+# File path for the JSON data
+file_path = 'json/reddit-tracker/wallstreetbets/data.json'
+
+# Ensure the directory exists
+os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+# Load previously collected posts, if any
+def load_existing_data():
+    if os.path.exists(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return []
+
+# Save the collected posts
+def save_data(data):
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+# Fetch fresh data for a single post via its permalink
+def get_updated_post_data(permalink):
+    post_url = f"https://www.reddit.com{permalink}.json"
+    response = requests.get(post_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=10)
+    if response.status_code == 200:
+        return response.json()[0]['data']['children'][0]['data']
+    return None
+
+# Load existing data
+existing_data = load_existing_data()
+
+# Index existing posts by id for faster lookup and update
+existing_posts = {post['id']: post for post in existing_data}
+
+# Send a GET request to the API
+response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'}, timeout=10)
+
+counter = 0
+# Check if the request was successful
+if response.status_code == 200:
+    # Parse the JSON data
+    data = response.json()
+
+    # Flag to check if any data was added or updated
+    data_changed = False
+
+    # Iterate through each post in the 'children' list
+    for post in data['data']['children']:
+        post_data = post['data']
+        post_id = post_data.get('id', '')
+
+        # Check if this post is already in our data
+        if post_id in existing_posts:
+            # Only refresh the 25 newest posts so we don't overload Reddit's servers
+            if counter < 25:
+                refreshed_data = get_updated_post_data(post_data['permalink'])
+                if refreshed_data:
+                    existing_posts[post_id]['upvote_ratio'] = refreshed_data.get('upvote_ratio', existing_posts[post_id]['upvote_ratio'])
+                    existing_posts[post_id]['num_comments'] = refreshed_data.get('num_comments', existing_posts[post_id]['num_comments'])
+                    data_changed = True
+                counter += 1
+        else:
+            # Extract the required fields for the new post; numeric fields
+            # default to 0 so later sorting and arithmetic stay type-safe
+            extracted_post = {
+                "id": post_id,
+                "permalink": post_data.get('permalink', ''),
+                "title": post_data.get('title', ''),
+                "selftext": post_data.get('selftext', ''),
+                "created_utc": post_data.get('created_utc', 0),
+                "upvote_ratio": post_data.get('upvote_ratio', 0.0),
+                "num_comments": post_data.get('num_comments', 0),
+                "link_flair_text": post_data.get('link_flair_text', ''),
+                "author": post_data.get('author', ''),
+            }
+
+            # Add the new post to the existing data
+            existing_posts[post_id] = extracted_post
+            data_changed = True
+
+    if data_changed:
+        # Convert the dictionary back to a list and sort by created_utc, newest first
+        updated_data = list(existing_posts.values())
+        updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
+
+        # Save the updated data
+        save_data(updated_data)
+        print(f"Data updated and saved to {file_path}")
+    else:
+        print("No new data to add or update.")
+else:
+    print(f"Failed to retrieve data. Status code: {response.status_code}")
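+
+# Optional sketch (not wired into the job above): a polite GET helper that
+# backs off when Reddit answers 429 Too Many Requests. The retry count and
+# delay below are assumptions.
+import time
+
+def polite_get(get_url, retries=3, delay=2.0):
+    for attempt in range(retries):
+        resp = requests.get(get_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=10)
+        if resp.status_code != 429:
+            return resp
+        time.sleep(delay * (attempt + 1))  # linear backoff before retrying
+    return resp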