From 8dd26ffbe78f7fad9b0461c5d79fa8a5919cd8f1 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 11 Feb 2024 21:09:23 -0600 Subject: [PATCH 1/5] Action for downloading ridership data and saving JSON to s3 --- .github/workflows/ridership-action.yml | 35 ++++++++++++++++++++ data_analysis/ridership_to_json.py | 17 ++++++---- scrape_data/ridership_download.py | 46 ++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/ridership-action.yml create mode 100644 scrape_data/ridership_download.py diff --git a/.github/workflows/ridership-action.yml b/.github/workflows/ridership-action.yml new file mode 100644 index 0000000..e9c70b0 --- /dev/null +++ b/.github/workflows/ridership-action.yml @@ -0,0 +1,35 @@ +name: Automate ridership data updates + +on: + workflow_dispatch: + push: + branches: + - ridership-gh-action + schedule: + # Run every day at 12:30pm CST which is 5:30pm UTC + - cron: 30 17 * * * + +env: + PYTHON_VERSION: 3.10.6 + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + +jobs: + download-ridership-data: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Download and save ridership data to s3 + + run: | + pip install -r requirements.txt + python -c 'from scrape_data.ridership_download import save_ridership_json; \ + save_ridership_json()' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY diff --git a/data_analysis/ridership_to_json.py b/data_analysis/ridership_to_json.py index 4778dfa..9ed894d 100644 --- a/data_analysis/ridership_to_json.py +++ b/data_analysis/ridership_to_json.py @@ -64,7 +64,8 @@ def get_latest_month_and_year(ridership_df: pd.DataFrame) -> tuple: return latest_date.month, latest_date.year -def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = None) -> None: +def 
ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = None, + save: bool = True) -> None: """ Save ridership data to JSON for given month and year. Note that the data is typically a few months @@ -83,6 +84,7 @@ def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = 4 9 01/01/2001 U 11207 month (int): Month of interest. Defaults to None year (int): Year of interest. Defaults to None + save (bool): Whether to save JSON locally. Defaults to True. """ ridership = ridership_df.copy() latest_month, latest_year = get_latest_month_and_year(ridership) @@ -119,13 +121,16 @@ def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = df_daytype_summary_json = df_daytype_summary.to_json(orient='records') full_json = {'date': f'{month_name} {year}'} full_json['data'] = json.loads(df_daytype_summary_json) - with open(DATA_PATH / f'{month_name}_{year}_cta_ridership_data_day_type_summary.json', 'w') as outfile: - json.dump(full_json, outfile) - + if save: + with open(DATA_PATH / f'{month_name}_{year}_cta_ridership_data_day_type_summary.json', 'w') as outfile: + json.dump(full_json, outfile) + else: + return json.dumps(full_json, indent=4) + app = typer.Typer() @app.command() -def main(month: int = None, year: int = None) -> None: +def main(month: int = None, year: int = None, save: bool = True) -> None: print("Loading data from data.cityofchicago.org") ridership_df = pd.read_csv( @@ -133,7 +138,7 @@ def main(month: int = None, year: int = None) -> None: 'jyb9-n7fm/rows.csv?accessType=DOWNLOAD' ) print("Done!") - ridership_to_json(ridership_df=ridership_df, month=month, year=year) + ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save) if __name__ == '__main__': diff --git a/scrape_data/ridership_download.py b/scrape_data/ridership_download.py new file mode 100644 index 0000000..59ada06 --- /dev/null +++ b/scrape_data/ridership_download.py @@ -0,0 +1,46 @@ +import boto3 +import sys 
+import data_analysis.ridership_to_json as ridership_to_json +import data_analysis.compare_scheduled_and_rt as csrt + +ACCESS_KEY = sys.argv[1] +SECRET_KEY = sys.argv[2] + +client = boto3.client( + 's3', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +s3 = boto3.resource( + 's3', + region_name='us-east-1', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +def save_ridership_json() -> None: + ridership_json = ridership_to_json.main(save=False) + s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json' + print(f'Saving {s3_ridership_json_path}') + s3.Object( + csrt.BUCKET_PUBLIC, + f'{s3_ridership_json_path}')\ + .put(Body=ridership_json) + + # Check that the file was uploaded successfully + keys(csrt.BUCKET_PUBLIC, [s3_ridership_json_path]) + + +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +def keys(bucket_name: str, filenames: list, + prefix: str='/', delimiter: str='/', + start_after: str='') -> None: + s3_paginator = client.get_paginator('list_objects_v2') + prefix = prefix.lstrip(delimiter) + start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): + for content in page.get('Contents', ()): + if content['Key'] in filenames: + print(f"{content['Key']} exists") + \ No newline at end of file From 5d263ea59077bf096263a93c046d767816f03337 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 12 Feb 2024 10:33:29 -0600 Subject: [PATCH 2/5] Fix import error --- scrape_data/ridership_download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrape_data/ridership_download.py b/scrape_data/ridership_download.py index 59ada06..4afdc1a 100644 --- a/scrape_data/ridership_download.py +++ b/scrape_data/ridership_download.py @@ -1,7 +1,7 @@ import boto3 import sys import data_analysis.ridership_to_json as 
ridership_to_json -import data_analysis.compare_scheduled_and_rt as csrt +import data_analysis.static_gtfs_analysis as sga ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -24,12 +24,12 @@ def save_ridership_json() -> None: s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json' print(f'Saving {s3_ridership_json_path}') s3.Object( - csrt.BUCKET_PUBLIC, + sga.BUCKET_PUBLIC, f'{s3_ridership_json_path}')\ .put(Body=ridership_json) # Check that the file was uploaded successfully - keys(csrt.BUCKET_PUBLIC, [s3_ridership_json_path]) + keys(sga.BUCKET_PUBLIC, [s3_ridership_json_path]) # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 From 8eeb097e2a2249ec125fe95164d09234468f04fe Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 12 Feb 2024 11:00:22 -0600 Subject: [PATCH 3/5] Fix attribute error --- scrape_data/ridership_download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrape_data/ridership_download.py b/scrape_data/ridership_download.py index 4afdc1a..4758661 100644 --- a/scrape_data/ridership_download.py +++ b/scrape_data/ridership_download.py @@ -24,12 +24,12 @@ def save_ridership_json() -> None: s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json' print(f'Saving {s3_ridership_json_path}') s3.Object( - sga.BUCKET_PUBLIC, + sga.BUCKET, f'{s3_ridership_json_path}')\ .put(Body=ridership_json) # Check that the file was uploaded successfully - keys(sga.BUCKET_PUBLIC, [s3_ridership_json_path]) + keys(sga.BUCKET, [s3_ridership_json_path]) # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 From 78b90283bf98608119a65956525e0b002b5427dd Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 12 Feb 2024 11:19:04 -0600 Subject: [PATCH 4/5] Add return a value in main function --- data_analysis/ridership_to_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/data_analysis/ridership_to_json.py b/data_analysis/ridership_to_json.py index 9ed894d..ee84535 100644 --- a/data_analysis/ridership_to_json.py +++ b/data_analysis/ridership_to_json.py @@ -138,7 +138,7 @@ def main(month: int = None, year: int = None, save: bool = True) -> None: 'jyb9-n7fm/rows.csv?accessType=DOWNLOAD' ) print("Done!") - ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save) + return ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save) if __name__ == '__main__': From a5126ae15a0c7f8d9bc5b1abbc17c7a791680876 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 13 Feb 2024 12:05:27 -0600 Subject: [PATCH 5/5] Remove on push. Change to run monthly --- .github/workflows/ridership-action.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ridership-action.yml b/.github/workflows/ridership-action.yml index e9c70b0..6f87816 100644 --- a/.github/workflows/ridership-action.yml +++ b/.github/workflows/ridership-action.yml @@ -2,12 +2,10 @@ name: Automate ridership data updates on: workflow_dispatch: - push: - branches: - - ridership-gh-action + schedule: - # Run every day at 12:30pm CST which is 5:30pm UTC - - cron: 30 17 * * * + # Run monthly on the 1st at 5:30pm UTC (11:30am CST / 12:30pm CDT) + - cron: 30 17 1 * * env: PYTHON_VERSION: 3.10.6