From 8dd26ffbe78f7fad9b0461c5d79fa8a5919cd8f1 Mon Sep 17 00:00:00 2001
From: dcjohnson24 <dcjohnson24@gmail.com>
Date: Sun, 11 Feb 2024 21:09:23 -0600
Subject: [PATCH 1/5] Action for downloading ridership data and saving JSON to
 s3

---
 .github/workflows/ridership-action.yml | 35 ++++++++++++++++++++
 data_analysis/ridership_to_json.py     | 17 ++++++----
 scrape_data/ridership_download.py      | 46 ++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/ridership-action.yml
 create mode 100644 scrape_data/ridership_download.py

diff --git a/.github/workflows/ridership-action.yml b/.github/workflows/ridership-action.yml
new file mode 100644
index 0000000..e9c70b0
--- /dev/null
+++ b/.github/workflows/ridership-action.yml
@@ -0,0 +1,35 @@
+name: Automate ridership data updates
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - ridership-gh-action
+  schedule:
+    # Run every day at 12:30pm CST which is 5:30pm UTC
+    - cron: 30 17 * * * 
+
+env: 
+  PYTHON_VERSION: 3.10.6
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+
+jobs:
+  download-ridership-data:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+        
+      - name: Download and save ridership data to s3
+
+        run: |
+          pip install -r requirements.txt
+          python -c 'from scrape_data.ridership_download import save_ridership_json; \
+            save_ridership_json()' \
+            $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY
diff --git a/data_analysis/ridership_to_json.py b/data_analysis/ridership_to_json.py
index 4778dfa..9ed894d 100644
--- a/data_analysis/ridership_to_json.py
+++ b/data_analysis/ridership_to_json.py
@@ -64,7 +64,8 @@ def get_latest_month_and_year(ridership_df: pd.DataFrame) -> tuple:
     return latest_date.month, latest_date.year
 
 
-def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = None) -> None:
+def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = None,
+                      save: bool = True) -> None:
     """
     Save ridership data to JSON for given month and year.
     Note that the data is typically a few months 
@@ -83,6 +84,7 @@ def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int =
         4	9	01/01/2001	U	11207
         month (int): Month of interest. Defaults to None
         year (int): Year of interest. Defaults to None
+        save (bool): If True, write the JSON to a local file; if False, return it as a JSON string instead. Defaults to True.
     """
     ridership = ridership_df.copy()
     latest_month, latest_year = get_latest_month_and_year(ridership)
@@ -119,13 +121,16 @@ def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int =
     df_daytype_summary_json = df_daytype_summary.to_json(orient='records')
     full_json = {'date': f'{month_name} {year}'}
     full_json['data'] = json.loads(df_daytype_summary_json)
-    with open(DATA_PATH / f'{month_name}_{year}_cta_ridership_data_day_type_summary.json', 'w') as outfile:
-        json.dump(full_json, outfile)
-
+    if save:
+        with open(DATA_PATH / f'{month_name}_{year}_cta_ridership_data_day_type_summary.json', 'w') as outfile:
+            json.dump(full_json, outfile)
+    else:
+        return json.dumps(full_json, indent=4)
+    
 app = typer.Typer()
 
 @app.command()
-def main(month: int = None, year: int = None) -> None:
+def main(month: int = None, year: int = None, save: bool = True) -> None:
     
     print("Loading data from data.cityofchicago.org")
     ridership_df = pd.read_csv(
@@ -133,7 +138,7 @@ def main(month: int = None, year: int = None) -> None:
         'jyb9-n7fm/rows.csv?accessType=DOWNLOAD'
     )
     print("Done!")
-    ridership_to_json(ridership_df=ridership_df, month=month, year=year)
+    ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save)
 
 
 if __name__ == '__main__':
diff --git a/scrape_data/ridership_download.py b/scrape_data/ridership_download.py
new file mode 100644
index 0000000..59ada06
--- /dev/null
+++ b/scrape_data/ridership_download.py
@@ -0,0 +1,46 @@
+import boto3
+import sys
+import data_analysis.ridership_to_json as ridership_to_json
+import data_analysis.compare_scheduled_and_rt as csrt
+
+ACCESS_KEY = sys.argv[1]
+SECRET_KEY = sys.argv[2]
+
+client = boto3.client(
+    's3',
+    aws_access_key_id=ACCESS_KEY,
+    aws_secret_access_key=SECRET_KEY
+)
+
+s3 = boto3.resource(
+    's3',
+    region_name='us-east-1',
+    aws_access_key_id=ACCESS_KEY,
+    aws_secret_access_key=SECRET_KEY
+)
+
+def save_ridership_json() -> None:
+    ridership_json = ridership_to_json.main(save=False)
+    s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json'
+    print(f'Saving {s3_ridership_json_path}')
+    s3.Object(
+        csrt.BUCKET_PUBLIC,
+        f'{s3_ridership_json_path}')\
+        .put(Body=ridership_json)
+
+    # Check that the file was uploaded successfully
+    keys(csrt.BUCKET_PUBLIC, [s3_ridership_json_path])
+
+
+# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3
+def keys(bucket_name: str, filenames: list,
+         prefix: str='/', delimiter: str='/',
+         start_after: str='') -> None:
+    s3_paginator = client.get_paginator('list_objects_v2')
+    prefix = prefix.lstrip(delimiter)
+    start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after
+    for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after):
+        for content in page.get('Contents', ()):
+            if content['Key'] in filenames:
+                print(f"{content['Key']} exists")
+                
\ No newline at end of file

From 5d263ea59077bf096263a93c046d767816f03337 Mon Sep 17 00:00:00 2001
From: dcjohnson24 <dcjohnson24@gmail.com>
Date: Mon, 12 Feb 2024 10:33:29 -0600
Subject: [PATCH 2/5] Fix import error

---
 scrape_data/ridership_download.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrape_data/ridership_download.py b/scrape_data/ridership_download.py
index 59ada06..4afdc1a 100644
--- a/scrape_data/ridership_download.py
+++ b/scrape_data/ridership_download.py
@@ -1,7 +1,7 @@
 import boto3
 import sys
 import data_analysis.ridership_to_json as ridership_to_json
-import data_analysis.compare_scheduled_and_rt as csrt
+import data_analysis.static_gtfs_analysis as sga
 
 ACCESS_KEY = sys.argv[1]
 SECRET_KEY = sys.argv[2]
@@ -24,12 +24,12 @@ def save_ridership_json() -> None:
     s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json'
     print(f'Saving {s3_ridership_json_path}')
     s3.Object(
-        csrt.BUCKET_PUBLIC,
+        sga.BUCKET_PUBLIC,
         f'{s3_ridership_json_path}')\
         .put(Body=ridership_json)
 
     # Check that the file was uploaded successfully
-    keys(csrt.BUCKET_PUBLIC, [s3_ridership_json_path])
+    keys(sga.BUCKET_PUBLIC, [s3_ridership_json_path])
 
 
 # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3

From 8eeb097e2a2249ec125fe95164d09234468f04fe Mon Sep 17 00:00:00 2001
From: dcjohnson24 <dcjohnson24@gmail.com>
Date: Mon, 12 Feb 2024 11:00:22 -0600
Subject: [PATCH 3/5] Fix attribute error

---
 scrape_data/ridership_download.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrape_data/ridership_download.py b/scrape_data/ridership_download.py
index 4afdc1a..4758661 100644
--- a/scrape_data/ridership_download.py
+++ b/scrape_data/ridership_download.py
@@ -24,12 +24,12 @@ def save_ridership_json() -> None:
     s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json'
     print(f'Saving {s3_ridership_json_path}')
     s3.Object(
-        sga.BUCKET_PUBLIC,
+        sga.BUCKET,
         f'{s3_ridership_json_path}')\
         .put(Body=ridership_json)
 
     # Check that the file was uploaded successfully
-    keys(sga.BUCKET_PUBLIC, [s3_ridership_json_path])
+    keys(sga.BUCKET, [s3_ridership_json_path])
 
 
 # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3

From 78b90283bf98608119a65956525e0b002b5427dd Mon Sep 17 00:00:00 2001
From: dcjohnson24 <dcjohnson24@gmail.com>
Date: Mon, 12 Feb 2024 11:19:04 -0600
Subject: [PATCH 4/5] Return a value from the main function

---
 data_analysis/ridership_to_json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_analysis/ridership_to_json.py b/data_analysis/ridership_to_json.py
index 9ed894d..ee84535 100644
--- a/data_analysis/ridership_to_json.py
+++ b/data_analysis/ridership_to_json.py
@@ -138,7 +138,7 @@ def main(month: int = None, year: int = None, save: bool = True) -> None:
         'jyb9-n7fm/rows.csv?accessType=DOWNLOAD'
     )
     print("Done!")
-    ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save)
+    return ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save)
 
 
 if __name__ == '__main__':

From a5126ae15a0c7f8d9bc5b1abbc17c7a791680876 Mon Sep 17 00:00:00 2001
From: dcjohnson24 <dcjohnson24@gmail.com>
Date: Tue, 13 Feb 2024 12:05:27 -0600
Subject: [PATCH 5/5] Remove on push. Change to run monthly

---
 .github/workflows/ridership-action.yml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ridership-action.yml b/.github/workflows/ridership-action.yml
index e9c70b0..6f87816 100644
--- a/.github/workflows/ridership-action.yml
+++ b/.github/workflows/ridership-action.yml
@@ -2,12 +2,10 @@ name: Automate ridership data updates
 
 on:
   workflow_dispatch:
-  push:
-    branches:
-      - ridership-gh-action
+  
   schedule:
-    # Run every day at 12:30pm CST which is 5:30pm UTC
-    - cron: 30 17 * * * 
+    # Run on the 1st of each month at 17:30 UTC (11:30am CST / 12:30pm CDT)
+    - cron: 30 17 1 * * 
 
 env: 
   PYTHON_VERSION: 3.10.6