diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..df4d4b3
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,42 @@
+name: Hourly Job
+
+on:
+  schedule:
+    # Runs at the start of every hour
+    - cron: '0 * * * *'
+
+jobs:
+  run-script:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Install Python packages
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r src/data_collection/requirements.txt
+
+      - name: Run the Reddit crawler
+        env:
+          REDDIT_APP_ID: ${{ secrets.REDDIT_APP_ID }}
+          REDDIT_APP_KEY: ${{ secrets.REDDIT_APP_KEY }}
+          SUPABASE_WATCHDB_URL: ${{ secrets.SUPABASE_WATCHDB_URL }}
+          SUPABASE_WATCHDB_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_WATCHDB_SERVICE_ROLE_KEY }}
+        run: python src/data_collection/reddit_crawler.py
+
+      - name: Run the watchdb populator
+        env:
+          REDDIT_APP_ID: ${{ secrets.REDDIT_APP_ID }}
+          REDDIT_APP_KEY: ${{ secrets.REDDIT_APP_KEY }}
+          SUPABASE_WATCHDB_URL: ${{ secrets.SUPABASE_WATCHDB_URL }}
+          SUPABASE_WATCHDB_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_WATCHDB_SERVICE_ROLE_KEY }}
+          # watchdb_populator.py reads OPENAI_API_CHRONO_KEY; assumes a repository secret of the same name is configured
+          OPENAI_API_CHRONO_KEY: ${{ secrets.OPENAI_API_CHRONO_KEY }}
+        run: python src/data_collection/watchdb_populator.py
diff --git a/src/data_collection/reddit_crawler.py b/src/data_collection/reddit_crawler.py
index 297aba4..b9316fd 100644
--- a/src/data_collection/reddit_crawler.py
+++ b/src/data_collection/reddit_crawler.py
@@ -7,17 +7,19 @@ from supabase import create_client, Client
 
 
 def main(time_filter, post_limit, comments_limit):
-    # Supabase setup
+    # Supabase credentials
     url: str = os.environ.get('SUPABASE_WATCHDB_URL')
     key: str = os.environ.get('SUPABASE_WATCHDB_SERVICE_ROLE_KEY')
-    supabase: Client = create_client(url, key)
 
     # Reddit API Credentials
     client_id = os.environ.get('REDDIT_APP_ID')
     client_secret = os.environ.get('REDDIT_APP_KEY')
-    user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)'
 
-    # Initialize PRAW with your credentials
+    # Supabase setup
+    supabase: Client = create_client(url, key)
+
+    # Initialize PRAW with credentials
+    user_agent = 'User-Agent:chrono-codex-server:v1 (by /u/ChronoCrawler)'
     reddit = praw.Reddit(client_id=client_id,
                          client_secret=client_secret,
                          user_agent=user_agent)
@@ -51,7 +53,7 @@ def main(time_filter, post_limit, comments_limit):
         raise
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Reddit Crawler for Subreddit Posts")
+    parser = argparse.ArgumentParser(description="Reddit WatchExchange Crawler")
     parser.add_argument("--time_filter", help="Time filter for posts", default="hour")
     parser.add_argument("--post_limit", help="Limit of posts to fetch", type=int, default=10)
     parser.add_argument("--comments_limit", help="Limit of comments to fetch for each post", type=int, default=25)
diff --git a/src/data_collection/requirements.txt b/src/data_collection/requirements.txt
new file mode 100644
index 0000000..c6ef90d
--- /dev/null
+++ b/src/data_collection/requirements.txt
@@ -0,0 +1,4 @@
+praw
+supabase
+openai
+schema-validator
\ No newline at end of file
diff --git a/src/data_collection/schema_validator.py b/src/data_collection/schema_validator.py
index 49b44f3..4aee0c8 100644
--- a/src/data_collection/schema_validator.py
+++ b/src/data_collection/schema_validator.py
@@ -36,7 +36,7 @@ def validate_schema(data: dict):
     # filtering out incorrect data.
 
     # Load the schema
-    with open('watch_schema.json', 'r') as file:
+    with open('src/data_collection/watch_schema.json', 'r') as file:
         schema = json.load(file)
 
     json_data = filter_invalid_entries(data, schema)
diff --git a/src/data_collection/gpt_formatter.py b/src/data_collection/watchdb_populator.py
similarity index 84%
rename from src/data_collection/gpt_formatter.py
rename to src/data_collection/watchdb_populator.py
index 575db56..1ddf43d 100644
--- a/src/data_collection/gpt_formatter.py
+++ b/src/data_collection/watchdb_populator.py
@@ -6,6 +6,10 @@ from schema_validator import validate_schema
 
 
 def process_queue(supabase_url, supabase_key, openai_key):
+    supabase_url = os.environ.get('SUPABASE_WATCHDB_URL')
+    supabase_key = os.environ.get('SUPABASE_WATCHDB_SERVICE_ROLE_KEY')
+    openai_key = os.environ.get('OPENAI_API_CHRONO_KEY')
+
     # Supabase setup
     supabase: Client = create_client(supabase_url, supabase_key)
 
@@ -14,7 +18,7 @@ def process_queue(supabase_url, supabase_key, openai_key):
         api_key=openai_key
     )
 
-    with open('query_schema.json') as f:
+    with open('src/data_collection/query_schema.json') as f:
         output_schema_str = f.read()
 
     # Fetch data from Supabase queue
@@ -56,9 +60,9 @@ def process_queue(supabase_url, supabase_key, openai_key):
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Process queue items and format them using OpenAI")
-    parser.add_argument("--supabase_url", required=True, help="Supabase project URL")
-    parser.add_argument("--supabase_key", required=True, help="Supabase service role key")
-    parser.add_argument("--openai_key", required=True, help="OpenAI API key")
+    # parser.add_argument("--supabase_url", required=True, help="Supabase project URL")
+    # parser.add_argument("--supabase_key", required=True, help="Supabase service role key")
+    # parser.add_argument("--openai_key", required=True, help="OpenAI API key")
 
     args = parser.parse_args()
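
Note on the populator entry point: this diff moves credential handling from required CLI flags to environment variables, but the line that actually calls process_queue() is outside the hunks shown above. Below is a minimal sketch of how that call could be wired up, assuming the three environment variables used elsewhere in this PR (SUPABASE_WATCHDB_URL, SUPABASE_WATCHDB_SERVICE_ROLE_KEY, OPENAI_API_CHRONO_KEY) are exported, e.g. by the GitHub Actions job. The __main__ wiring itself is illustrative and not part of the diff.

    import os
    import sys

    from watchdb_populator import process_queue  # local module renamed in this PR

    if __name__ == "__main__":
        # Mirror the env block the workflow passes to the populator step.
        supabase_url = os.environ.get("SUPABASE_WATCHDB_URL")
        supabase_key = os.environ.get("SUPABASE_WATCHDB_SERVICE_ROLE_KEY")
        openai_key = os.environ.get("OPENAI_API_CHRONO_KEY")

        # Fail fast with a readable message instead of a None-related error later.
        if not all([supabase_url, supabase_key, openai_key]):
            sys.exit("Missing one of the required environment variables.")

        process_queue(supabase_url, supabase_key, openai_key)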
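
The schema files are now opened via paths relative to the repository root ('src/data_collection/...'), which matches how the workflow invokes the scripts but breaks if either script is run from inside src/data_collection/. A small optional alternative (not what this PR does) is to resolve the schema next to the module itself:

    import json
    import os

    # Resolve watch_schema.json relative to this file rather than the current working directory.
    SCHEMA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "watch_schema.json")

    with open(SCHEMA_PATH, "r") as file:
        schema = json.load(file)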