
update reddit cron job
MuslemRahimi committed Jul 27, 2024
1 parent 343797b commit ba91246
Showing 3 changed files with 80 additions and 84 deletions.
36 changes: 23 additions & 13 deletions app/cron_reddit_statistics.py
@@ -1,26 +1,37 @@
 import json
 import re
-import requests
+import praw
 from datetime import datetime
 from collections import defaultdict
 
-def get_subscriber_count():
-    url = "https://www.reddit.com/r/wallstreetbets/new.json"
-    headers = {'User-agent': 'Mozilla/5.0'}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        data = response.json()
-        return data['data']['children'][0]['data']['subreddit_subscribers']
-    return None
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
+
+
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
+
+# Function to save data
+def save_data(data):
+    with open('json/reddit-tracker/wallstreetbets/stats.json', 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
 
 
 def compute_daily_statistics(file_path):
     # Load the data from the JSON file
     with open(file_path, 'r', encoding='utf-8') as f:
         data = json.load(f)
 
-    # Get current subscriber count
-    subscriber_count = get_subscriber_count()
+
     # Initialize a defaultdict to store daily statistics
     daily_stats = defaultdict(lambda: {
         'post_count': 0,

@@ -56,7 +67,6 @@ def compute_daily_statistics(file_path):
             'date': date.isoformat(),
             'totalPosts': stats['post_count'],
             'totalComments': stats['total_comments'],
-            'subscribersCount': subscriber_count,
             'totalMentions': sum(stats['ticker_mentions'].values()),
             'companySpread': len(stats['unique_tickers']),
             'tickerMentions': dict(stats['ticker_mentions'])  # Optional: include detailed ticker mentions

@@ -67,4 +77,4 @@ def compute_daily_statistics(file_path):
 # Usage
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 daily_statistics = compute_daily_statistics(file_path)
-print(json.dumps(daily_statistics, indent=2))
+save_data(daily_statistics)
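For reference, a minimal sketch of the shape of one daily record the reworked compute_daily_statistics now writes to stats.json. The keys mirror the dict built in the diff above (note subscribersCount is gone along with get_subscriber_count); the values here are purely illustrative:

import json

# Hypothetical example of a single daily entry in stats.json
sample_entry = {
    'date': '2024-07-27',                         # date.isoformat()
    'totalPosts': 142,                            # stats['post_count']
    'totalComments': 3890,                        # stats['total_comments']
    'totalMentions': 510,                         # sum(stats['ticker_mentions'].values())
    'companySpread': 37,                          # len(stats['unique_tickers'])
    'tickerMentions': {'NVDA': 120, 'TSLA': 85},  # dict(stats['ticker_mentions'])
}
print(json.dumps(sample_entry, indent=2))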
125 changes: 55 additions & 70 deletions app/cron_reddit_tracker.py
@@ -1,18 +1,18 @@
-import requests
+import praw
 import json
 from datetime import datetime
 import os
+from dotenv import load_dotenv
+import time
 
+load_dotenv()
+client_key = os.getenv('REDDIT_API_KEY')
+client_secret = os.getenv('REDDIT_API_SECRET')
+user_agent = os.getenv('REDDIT_USER_AGENT')
 
-# URL of the Reddit API endpoint
-url = "https://www.reddit.com/r/wallstreetbets/new.json"
 # File path for the JSON data
 file_path = 'json/reddit-tracker/wallstreetbets/data.json'
 
-headers = {
-    'User-Agent': 'python:myapp:v1.0 (by /u/realstocknear)'
-}
-
 
 # Ensure the directory exists
 os.makedirs(os.path.dirname(file_path), exist_ok=True)

@@ -28,76 +28,61 @@ def save_data(data):
     with open(file_path, 'w', encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
 
-# Function to get updated post data
-def get_updated_post_data(permalink):
-    post_url = f"https://www.reddit.com{permalink}.json"
-    response = requests.get(post_url, headers=headers)
-    if response.status_code == 200:
-        post_data = response.json()[0]['data']['children'][0]['data']
-        return post_data
-    return None
+# Initialize Reddit instance
+reddit = praw.Reddit(
+    client_id=client_key,
+    client_secret=client_secret,
+    user_agent=user_agent
+)
 
 # Load existing data
 existing_data = load_existing_data()
 
 # Create a dictionary of existing posts for faster lookup and update
 existing_posts = {post['id']: post for post in existing_data}
 
-# Send a GET request to the API
-response = requests.get(url, headers=headers)
+# Flag to check if any data was added or updated
+data_changed = False
 
-counter = 0
-# Check if the request was successful
-if response.status_code == 200:
-    # Parse the JSON data
-    data = response.json()
-
-    # Flag to check if any data was added or updated
-    data_changed = False
-
-    # Iterate through each post in the 'children' list
-    for post in data['data']['children']:
-        post_data = post['data']
-        post_id = post_data.get('id', '')
+# Get the subreddit
+subreddit = reddit.subreddit("wallstreetbets")
+
+# Iterate through new submissions
+for submission in subreddit.new(limit=1000):
+    post_id = submission.id
+    # Check if this post is already in our data
+    if post_id in existing_posts:
+        # Update existing post
+        existing_posts[post_id]['upvote_ratio'] = submission.upvote_ratio
+        existing_posts[post_id]['num_comments'] = submission.num_comments
+        data_changed = True
+    else:
+        # Extract the required fields for new post
+        extracted_post = {
+            "id": post_id,
+            "permalink": submission.permalink,
+            "title": submission.title,
+            "selftext": submission.selftext,
+            "created_utc": int(submission.created_utc),
+            "upvote_ratio": submission.upvote_ratio,
+            "num_comments": submission.num_comments,
+            "link_flair_text": submission.link_flair_text,
+            "author": str(submission.author),
+        }
 
-        # Check if this post is already in our data
-        if post_id in existing_posts:
-            # Update existing post
-            if counter < 25:  # Only update the latest 25 posts to not overload the reddit server
-                updated_data = get_updated_post_data(post_data['permalink'])
-                if updated_data:
-                    existing_posts[post_id]['upvote_ratio'] = updated_data.get('upvote_ratio', existing_posts[post_id]['upvote_ratio'])
-                    existing_posts[post_id]['num_comments'] = updated_data.get('num_comments', existing_posts[post_id]['num_comments'])
-                    data_changed = True
-                counter += 1
-                print(counter)
-        else:
-            # Extract the required fields for new post
-            extracted_post = {
-                "id": post_id,
-                "permalink": post_data.get('permalink', ''),
-                "title": post_data.get('title', ''),
-                "selftext": post_data.get('selftext', ''),
-                "created_utc": post_data.get('created_utc', ''),
-                "upvote_ratio": post_data.get('upvote_ratio', ''),
-                "num_comments": post_data.get('num_comments', ''),
-                "link_flair_text": post_data.get('link_flair_text', ''),
-                "author": post_data.get('author', ''),
-            }
-
-            # Add the new post to the existing data
-            existing_posts[post_id] = extracted_post
-            data_changed = True
+        # Add the new post to the existing data
+        existing_posts[post_id] = extracted_post
+        data_changed = True
 
+    time.sleep(1)  # Add a 1-second delay between processing submissions
+
+if data_changed:
+    # Convert the dictionary back to a list and sort by created_utc
+    updated_data = list(existing_posts.values())
+    updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
 
-    if data_changed:
-        # Convert the dictionary back to a list and sort by created_utc
-        updated_data = list(existing_posts.values())
-        updated_data.sort(key=lambda x: x['created_utc'], reverse=True)
-
-        # Save the updated data
-        save_data(updated_data)
-        print(f"Data updated and saved to {file_path}")
-    else:
-        print("No new data to add or update.")
-else:
-    print(f"Failed to retrieve data. Status code: {response.status_code}")
+    # Save the updated data
+    save_data(updated_data)
+    print(f"Data updated and saved to {file_path}")
+else:
+    print("No new data to add or update.")
3 changes: 2 additions & 1 deletion requirements.txt
@@ -36,4 +36,5 @@ faker
 finnhub-python
 intrinio_sdk
 openai
-slowapi
+slowapi
+praw
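Both cron scripts load their credentials with python-dotenv, so a local .env file along these lines is assumed (the variable names come from the scripts above; the values are placeholders obtained when registering a Reddit script app):

REDDIT_API_KEY=your_client_id
REDDIT_API_SECRET=your_client_secret
REDDIT_USER_AGENT=python:myapp:v1.0 (by /u/your_username)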
