From a2af9bf9ed9c1443e068a8bc2b222e757f4c3c71 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 18 Jul 2023 18:07:00 -0500 Subject: [PATCH 01/32] First commit for downloading and saving schedule data --- .github/workflows/cta_schedule_data.yml | 25 +++++++++ data_analysis/static_gtfs_analysis.py | 69 ++++++++++++++++++------- scrape_data/cta_schedule_versions.py | 36 +++++++++++++ 3 files changed, 112 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/cta_schedule_data.yml create mode 100644 scrape_data/cta_schedule_versions.py diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml new file mode 100644 index 0000000..170a1ac --- /dev/null +++ b/.github/workflows/cta_schedule_data.yml @@ -0,0 +1,25 @@ +name: Automated job + +on: [push, workflow_dispatch] + branches: + - 'automate-schedule-downloads' + + +jobs: + download-cta-schedule-data: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Download and save CTA schedule data + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: | + pip install -r requirements.txt + python scrape_data.cta_schedule_versions.py' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index 119366c..de5ffc8 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -38,7 +38,6 @@ datefmt='%m/%d/%Y %I:%M:%S %p' ) - @dataclass class GTFSFeed: """Class for storing GTFSFeed data. @@ -53,14 +52,14 @@ class GTFSFeed: @classmethod def extract_data(cls, gtfs_zipfile: zipfile.ZipFile, - version_id: str = None) -> GTFSFeed: + version_id: str = None, cta_download: bool = True) -> GTFSFeed: """Load each text file in zipfile into a DataFrame Args: gtfs_zipfile (zipfile.ZipFile): Zipfile downloaded from - CTA transit feeds e.g. + transitfeeds.com or transitchicago.com e.g. https://transitfeeds.com/p/chicago-transit-authority/ - 165/20220718/download" + 165/20220718/download or https://www.transitchicago.com/downloads/sch_data/ version_id (str, optional): The schedule version in use. Defaults to None. @@ -68,9 +67,17 @@ def extract_data(cls, gtfs_zipfile: zipfile.ZipFile, GTFSFeed: A GTFSFeed object containing multiple DataFrames accessible by name. 
""" - if version_id is None: - version_id = VERSION_ID - logging.info(f"Extracting data from CTA zipfile version {version_id}") + if cta_download: + if version_id is not None: + raise ValueError("version_id is not used for downloads directly from CTA") + else: + logging.info(f"Extracting data from transitchicago.com zipfile") + + else: + if version_id is None: + version_id = VERSION_ID + logging.info(f"Extracting data from transitfeeds.com zipfile version {version_id}") + data_dict = {} pbar = tqdm(cls.__annotations__.keys()) for txt_file in pbar: @@ -140,14 +147,16 @@ def format_dates_hours(data: GTFSFeed) -> GTFSFeed: def make_trip_summary( data: GTFSFeed, - feed_start_date: pendulum.datetime, - feed_end_date: pendulum.datetime) -> pd.DataFrame: + feed_start_date: pendulum.datetime = None, + feed_end_date: pendulum.datetime = None) -> pd.DataFrame: """Create a summary of trips with one row per date Args: data (GTFSFeed): GTFS data from CTA - feed_start_date (datetime): Date from which this feed is valid (inclusive) - feed_end_date (datetime): Date until which this feed is valid (inclusive) + feed_start_date (datetime): Date from which this feed is valid (inclusive). + Defaults to None + feed_end_date (datetime): Date until which this feed is valid (inclusive). + Defaults to None Returns: pd.DataFrame: A DataFrame with each trip that occurred per row. @@ -161,7 +170,7 @@ def make_trip_summary( ), columns=["raw_date"], ) - + # cross join calendar index with actual calendar to get all combos of # possible dates & services calendar_cross = calendar_date_range.merge(data.calendar, how="cross") @@ -244,9 +253,10 @@ def make_trip_summary( trip_stop_hours, how="left", on="trip_id") # filter to only the rows for the period where this specific feed version was in effect - trip_summary = trip_summary.loc[ - (trip_summary['raw_date'] >= feed_start_date) - & (trip_summary['raw_date'] <= feed_end_date), :] + if feed_start_date is not None and feed_end_date is not None: + trip_summary = trip_summary.loc[ + (trip_summary['raw_date'] >= feed_start_date) + & (trip_summary['raw_date'] <= feed_end_date), :] return trip_summary @@ -321,6 +331,24 @@ def make_linestring_of_points( return shapely.geometry.LineString(list(sorted_df["pt"])) +def download_cta_zip() -> zipfile.ZipFile: + """Download CTA schedule data from transitchicago.com + + Returns: + zipfile.ZipFile: A zipfile of the latest GTFS schedule data from transitchicago.com + """ + logger.info('Downloading CTA data') + CTA_GTFS = zipfile.ZipFile( + BytesIO( + requests.get("https://www.transitchicago.com/downloads/sch_data/google_transit.zip" + ).content + ) + ) + logging.info('Download complete') + return CTA_GTFS + + + def download_zip(version_id: str) -> zipfile.ZipFile: """Download a version schedule from transitfeeds.com @@ -344,17 +372,22 @@ def download_zip(version_id: str) -> zipfile.ZipFile: return CTA_GTFS -def download_extract_format(version_id: str) -> GTFSFeed: +def download_extract_format(version_id: str = None) -> GTFSFeed: """Download a zipfile of GTFS data for a given version_id, extract data, and format date column. Args: - version_id (str): The version of the GTFS schedule data to download + version_id (str): The version of the GTFS schedule data to download. 
Defaults to None + If version_id is None, data will be downloaded from the CTA directly (transitchicag.com) + instead of transitfeeds.com Returns: GTFSFeed: A GTFSFeed object with formated dates """ - CTA_GTFS = download_zip(version_id) + if version_id is None: + CTA_GTFS = download_cta_zip() + else: + CTA_GTFS = download_zip(version_id) data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id) data = format_dates_hours(data) return data diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py new file mode 100644 index 0000000..f2d4538 --- /dev/null +++ b/scrape_data/cta_schedule_versions.py @@ -0,0 +1,36 @@ +import boto3 +import sys +import data_analysis.static_gtfs_analysis as sga +import pendulum +from io import StringIO + +ACCESS_KEY = sys.argv[1] +SECRET_KEY = sys.argv[2] + +client = boto3.client( + 's3', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +s3 = boto3.resource( + 's3', + region_name='us-east-1', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +data = sga.download_extract_format() +trip_summary = sga.make_trip_summary(data) + +route_daily_summary = ( + sga.summarize_date_rt(trip_summary) +) +date = pendulum.now().to_date_string() + +csv_buffer = StringIO() +route_daily_summary.to_csv(csv_buffer) + +s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ + .put(Body=csv_buffer.getvalue()) + From 4b38f62fb19f8b370777015a2a28d340fd70b50f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 18 Jul 2023 18:12:30 -0500 Subject: [PATCH 02/32] Fix syntax error --- .github/workflows/cta_schedule_data.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 170a1ac..0a75a8d 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -1,7 +1,8 @@ name: Automated job -on: [push, workflow_dispatch] - branches: +on: + push: + branches: - 'automate-schedule-downloads' From 50a8a4e1c05d563ee99d9c099e1c1fc09e0e0314 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 18 Jul 2023 19:03:14 -0500 Subject: [PATCH 03/32] Change version constraint of mapclassify --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a709e5d..9e3218d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ python-dotenv==0.20.0 seaborn==0.12.0 PyQt5==5.15.7 folium==0.12.1.post1 -mapclassify==2.4.2+55.g0155c6e +mapclassify>=2.4.2+55.g0155c6e plotly==5.11.0 kaleido==0.2.1 pre-commit==2.20.0 From f56d0d43d45a996df369755a1962cb0646b7af15 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 20 Jul 2023 14:29:45 -0500 Subject: [PATCH 04/32] remove single quote --- .github/workflows/cta_schedule_data.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 0a75a8d..d835f54 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -23,4 +23,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | pip install -r requirements.txt - python scrape_data.cta_schedule_versions.py' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python scrape_data.cta_schedule_versions.py $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 140ffbc828b8919000312036716287269f089d05 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 20 Jul 2023 14:40:43 
-0500 Subject: [PATCH 05/32] Run as a module --- .github/workflows/cta_schedule_data.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index d835f54..909c345 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -23,4 +23,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | pip install -r requirements.txt - python scrape_data.cta_schedule_versions.py $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -m scrape_data.cta_schedule_versions $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 9f00363e9314100332e8102b27d2d3645facbf72 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 24 Jul 2023 20:52:49 -0500 Subject: [PATCH 06/32] Add print function for saving csv to public bucket --- scrape_data/cta_schedule_versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index f2d4538..e24ca84 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -31,6 +31,7 @@ csv_buffer = StringIO() route_daily_summary.to_csv(csv_buffer) +print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ .put(Body=csv_buffer.getvalue()) From 12f6b0887ff660831c9793969a00bbff2d20a221 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 25 Jul 2023 14:31:11 -0500 Subject: [PATCH 07/32] Download schedule daily at 5:30pm UTC --- .github/workflows/cta_schedule_data.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 909c345..d4eb7d8 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -5,6 +5,10 @@ on: branches: - 'automate-schedule-downloads' + schedule: + # Run every day at 12:30pm CST which is 5:30pm UTC + - cron: 30 17 * * * + jobs: download-cta-schedule-data: From 8aa369130663f7ab4b422229412e7abde2446627 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 25 Jul 2023 18:18:30 -0500 Subject: [PATCH 08/32] Save zipfile from transitchicago.com to s3 --- scrape_data/cta_schedule_versions.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index e24ca84..44e5c7f 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -20,13 +20,22 @@ aws_secret_access_key=SECRET_KEY ) +date = pendulum.now().to_date_string() + +zipfile = sga.download_cta_zip() +print(f'Saving zipfile available at ' + f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' + f'on {date} to public bucket') + +s3.Object('chn-ghost-buses-public', f'google_transit_{date}.zip')\ + .put(Body=zipfile) + data = sga.download_extract_format() trip_summary = sga.make_trip_summary(data) route_daily_summary = ( sga.summarize_date_rt(trip_summary) ) -date = pendulum.now().to_date_string() csv_buffer = StringIO() route_daily_summary.to_csv(csv_buffer) @@ -34,4 +43,3 @@ print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ .put(Body=csv_buffer.getvalue()) - From 2ee3d05fc6a125c59a5ac6be34154c424adf48e7 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 25 Jul 2023 20:03:10 -0500 Subject: [PATCH 09/32] Change 
method of uploading zipfile --- data_analysis/static_gtfs_analysis.py | 13 ++++++------- scrape_data/cta_schedule_versions.py | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index de5ffc8..65bba88 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -13,7 +13,7 @@ import os from pathlib import Path from dataclasses import dataclass -from typing import List +from typing import Tuple import logging import calendar @@ -331,21 +331,20 @@ def make_linestring_of_points( return shapely.geometry.LineString(list(sorted_df["pt"])) -def download_cta_zip() -> zipfile.ZipFile: +def download_cta_zip() -> Tuple[zipfile.ZipFile, BytesIO]: """Download CTA schedule data from transitchicago.com Returns: zipfile.ZipFile: A zipfile of the latest GTFS schedule data from transitchicago.com """ logger.info('Downloading CTA data') - CTA_GTFS = zipfile.ZipFile( - BytesIO( + zip_bytes_io = BytesIO( requests.get("https://www.transitchicago.com/downloads/sch_data/google_transit.zip" ).content ) - ) + CTA_GTFS = zipfile.ZipFile(zip_bytes_io) logging.info('Download complete') - return CTA_GTFS + return CTA_GTFS, zip_bytes_io @@ -385,7 +384,7 @@ def download_extract_format(version_id: str = None) -> GTFSFeed: GTFSFeed: A GTFSFeed object with formated dates """ if version_id is None: - CTA_GTFS = download_cta_zip() + CTA_GTFS, _ = download_cta_zip() else: CTA_GTFS = download_zip(version_id) data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index 44e5c7f..f6676d0 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -22,13 +22,12 @@ date = pendulum.now().to_date_string() -zipfile = sga.download_cta_zip() +zipfile, zipfile_bytes_io = sga.download_cta_zip() print(f'Saving zipfile available at ' f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' f'on {date} to public bucket') - -s3.Object('chn-ghost-buses-public', f'google_transit_{date}.zip')\ - .put(Body=zipfile) +zipfile_bytes_io.seek(0) +client.upload_fileobj(zipfile_bytes_io, 'chn-ghost-buses-public', f'google_transit_{date}.zip') data = sga.download_extract_format() trip_summary = sga.make_trip_summary(data) From 7c6a42e80f6130a39bd61c6fea6b6c53fd83470a Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Wed, 26 Jul 2023 20:45:47 -0500 Subject: [PATCH 10/32] Check that objects exist in bucket --- scrape_data/cta_schedule_versions.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index f6676d0..66e882e 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -42,3 +42,18 @@ print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ .put(Body=csv_buffer.getvalue()) + + +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +print('Confirm that objects exist in bucket') +s3_paginator = client.get_paginator('list_objects_v2') + +def keys(bucket_name, prefix='/', delimiter='/', start_after=''): + prefix = prefix.lstrip(delimiter) + start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): + for 
content in page.get('Contents', ()): + if content['Key'] in [f'cta_route_daily_summary_{date}.csv', f'google_transit_{date}.zip']: + yield f"{content['Key']} exists" + +keys('chn-ghost-buses-public') \ No newline at end of file From bc9176676f2c22d556a983924599e02442db44c4 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 27 Jul 2023 18:51:25 -0500 Subject: [PATCH 11/32] Change yield to print --- scrape_data/cta_schedule_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index 66e882e..bf79ab2 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -54,6 +54,6 @@ def keys(bucket_name, prefix='/', delimiter='/', start_after=''): for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): for content in page.get('Contents', ()): if content['Key'] in [f'cta_route_daily_summary_{date}.csv', f'google_transit_{date}.zip']: - yield f"{content['Key']} exists" + print(f"{content['Key']} exists") keys('chn-ghost-buses-public') \ No newline at end of file From c0c153c3f7e94daf6225528c92418d6700771e32 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 7 Aug 2023 21:01:37 -0500 Subject: [PATCH 12/32] Separate downloading zip file and saving daily summaries --- .github/workflows/cta_schedule_data.yml | 34 +++++++++++-- scrape_data/cta_schedule_versions.py | 63 +++++++++++++++---------- 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index d4eb7d8..3fb75d3 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -1,4 +1,4 @@ -name: Automated job +name: Automate CTA schedule downloads on: push: @@ -20,11 +20,39 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - + - name: Download and save CTA schedule data env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_schedule_versions import save_cta_zip; \ + save_cta_zip()' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + + save-schedule-daily-summary: + needs: download-cta-schedule-data + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: 'Save schedule summaries' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: | pip install -r requirements.txt - python -m scrape_data.cta_schedule_versions $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_schedule_versions \ + import save_route_daily_summary; save_route_daily_summary()' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index bf79ab2..a3d8986 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -20,40 +20,55 @@ aws_secret_access_key=SECRET_KEY ) -date = pendulum.now().to_date_string() +today = pendulum.now().to_date_string() -zipfile, zipfile_bytes_io = sga.download_cta_zip() -print(f'Saving zipfile available at ' - f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' - f'on {date} to public bucket') -zipfile_bytes_io.seek(0) -client.upload_fileobj(zipfile_bytes_io, 
'chn-ghost-buses-public', f'google_transit_{date}.zip') +def save_cta_zip(): + _, zipfile_bytes_io = sga.download_cta_zip() + print(f'Saving zipfile available at ' + f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' + f'on {today} to public bucket') + zipfile_bytes_io.seek(0) + client.upload_fileobj( + zipfile_bytes_io, + 'chn-ghost-buses-public', + f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' + ) + print('Confirm that object exists in bucket') + keys('chn-ghost-buses-public', [ + f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' + ]) -data = sga.download_extract_format() -trip_summary = sga.make_trip_summary(data) +def save_route_daily_summary(): + data = sga.download_extract_format() + trip_summary = sga.make_trip_summary(data) -route_daily_summary = ( - sga.summarize_date_rt(trip_summary) -) + route_daily_summary = ( + sga.summarize_date_rt(trip_summary) + ) + route_daily_summary['date'] = route_daily_summary['date'].astype(str) + route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] -csv_buffer = StringIO() -route_daily_summary.to_csv(csv_buffer) + csv_buffer = StringIO() + route_daily_summary_today.to_csv(csv_buffer) -print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') -s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ - .put(Body=csv_buffer.getvalue()) + print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') + s3.Object( + 'chn-ghost-buses-public', + f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv')\ + .put(Body=csv_buffer.getvalue()) + print('Confirm that object exists in bucket') + keys('chn-ghost-buses-public', [ + f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv', + ]) -# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 -print('Confirm that objects exist in bucket') -s3_paginator = client.get_paginator('list_objects_v2') -def keys(bucket_name, prefix='/', delimiter='/', start_after=''): +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +def keys(bucket_name: str, filenames: list, prefix: str='/', delimiter: str='/', start_after: str=''): + s3_paginator = client.get_paginator('list_objects_v2') prefix = prefix.lstrip(delimiter) start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): for content in page.get('Contents', ()): - if content['Key'] in [f'cta_route_daily_summary_{date}.csv', f'google_transit_{date}.zip']: + if content['Key'] in filenames: print(f"{content['Key']} exists") - -keys('chn-ghost-buses-public') \ No newline at end of file From 2dc18f3772e6a4fa5ab6060751d3d01c1bc6ba8e Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 7 Aug 2023 21:45:00 -0500 Subject: [PATCH 13/32] remove job dependency --- .github/workflows/cta_schedule_data.yml | 5 ++--- scrape_data/cta_schedule_versions.py | 6 ++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 3fb75d3..66b0802 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -34,7 +34,6 @@ jobs: save-schedule-daily-summary: - needs: download-cta-schedule-data runs-on: ubuntu-latest steps: @@ -51,8 +50,8 @@ jobs: run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_schedule_versions \ - import 
save_route_daily_summary; save_route_daily_summary()' \ + python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ + save_route_daily_summary()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index a3d8986..6323085 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -22,8 +22,9 @@ today = pendulum.now().to_date_string() +CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() + def save_cta_zip(): - _, zipfile_bytes_io = sga.download_cta_zip() print(f'Saving zipfile available at ' f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' f'on {today} to public bucket') @@ -39,7 +40,8 @@ def save_cta_zip(): ]) def save_route_daily_summary(): - data = sga.download_extract_format() + data = sga.GTFSFeed.extract_data(CTA_GTFS) + data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) route_daily_summary = ( From d35f310d3a0584ef09e9ea53295f7de32da505f3 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 7 Aug 2023 21:57:11 -0500 Subject: [PATCH 14/32] Add args to same line --- .github/workflows/cta_schedule_data.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 66b0802..3a11f65 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -51,7 +51,6 @@ jobs: run: | pip install -r requirements.txt python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ - save_route_daily_summary()' \ - $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From ee7b05721fd829788c0d086d005f399bd61ff1f0 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 17:19:53 -0500 Subject: [PATCH 15/32] Save realtime summary file --- ...hedule_data.yml => cta_data_downloads.yml} | 34 ++++-- scrape_data/cta_data_downloads.py | 110 ++++++++++++++++++ scrape_data/cta_schedule_versions.py | 76 ------------ 3 files changed, 134 insertions(+), 86 deletions(-) rename .github/workflows/{cta_schedule_data.yml => cta_data_downloads.yml} (57%) create mode 100644 scrape_data/cta_data_downloads.py delete mode 100644 scrape_data/cta_schedule_versions.py diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_data_downloads.yml similarity index 57% rename from .github/workflows/cta_schedule_data.yml rename to .github/workflows/cta_data_downloads.yml index 3a11f65..c9d08d4 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -1,4 +1,4 @@ -name: Automate CTA schedule downloads +name: Automate CTA schedule and realtime downloads on: push: @@ -9,6 +9,10 @@ on: # Run every day at 12:30pm CST which is 5:30pm UTC - cron: 30 17 * * * +env: + PYTHON_VERSION: '3.10' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} jobs: download-cta-schedule-data: @@ -19,12 +23,9 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: $PYTHON_VERSION - name: Download and save CTA schedule data - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | pip install -r requirements.txt @@ -41,16 +42,29 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: 
$PYTHON_VERSION - name: 'Save schedule summaries' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | pip install -r requirements.txt python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save-realtime-daily-summary: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: $PYTHON_VERSION + + - name: 'Save realtime summaries' + + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ + save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + \ No newline at end of file diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py new file mode 100644 index 0000000..4375b10 --- /dev/null +++ b/scrape_data/cta_data_downloads.py @@ -0,0 +1,110 @@ +import boto3 +import sys +import data_analysis.static_gtfs_analysis as sga +import data_analysis.compare_scheduled_and_rt as csrt +import pendulum +from io import StringIO +import pandas as pd + + +ACCESS_KEY = sys.argv[1] +SECRET_KEY = sys.argv[2] + +client = boto3.client( + 's3', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +s3 = boto3.resource( + 's3', + region_name='us-east-1', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +today = pendulum.now().to_date_string() + +CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() + +def save_cta_zip() -> None: + print(f'Saving zipfile available at ' + f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' + f'on {today} to public bucket') + filename = f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' + zipfile_bytes_io.seek(0) + client.upload_fileobj( + zipfile_bytes_io, + csrt.BUCKET_PUBLIC, + filename + ) + print(f'Confirm that {filename} exists in bucket') + keys('chn-ghost-buses-public', [filename]) + + +def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: + """Save pandas DataFrame to csv in s3 + + Args: + df (pd.DataFrame): DataFrame to be saved + filename (str): Name of the saved filename in s3. + Should contain the .csv suffix. + """ + csv_buffer = StringIO() + df.to_csv(csv_buffer) + + print(f'Saving {filename} to public bucket') + s3.Object( + csrt.BUCKET_PUBLIC, + f'{filename}')\ + .put(Body=csv_buffer.getvalue()) + + +def save_route_daily_summary() -> None: + data = sga.GTFSFeed.extract_data(CTA_GTFS) + data = sga.format_dates_hours(data) + trip_summary = sga.make_trip_summary(data) + + route_daily_summary = ( + sga.summarize_date_rt(trip_summary) + ) + route_daily_summary['date'] = route_daily_summary['date'].astype(str) + route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] + + print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') + filename = f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv' + save_csv_to_bucket( + route_daily_summary_today, + filename=filename + ) + print(f'Confirm that {filename} exists in bucket') + keys(csrt.BUCKET_PUBLIC, [filename]) + + +def save_realtime_daily_summary() -> None: + # This will be run at 5 pm Central time. bus_full_day_data_v2/{today}.csv + # will be in the public bucket by 11 am Central time, so there shouldn't be any issues. 
+ daily_data = pd.read_csv( + (csrt.BASE_PATH / f"bus_full_day_data_v2/{today}.csv") + .as_uri(), + low_memory=False + ) + + daily_data = csrt.make_daily_summary(daily_data) + filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{today}.csv' + save_csv_to_bucket(daily_data, filename=filename) + + print(f'Confirm that {filename} exists in bucket') + keys(csrt.BUCKET_PUBLIC, [filename]) + +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +def keys(bucket_name: str, filenames: list, + prefix: str='/', delimiter: str='/', + start_after: str='') -> None: + s3_paginator = client.get_paginator('list_objects_v2') + prefix = prefix.lstrip(delimiter) + start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): + for content in page.get('Contents', ()): + if content['Key'] in filenames: + print(f"{content['Key']} exists") diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py deleted file mode 100644 index 6323085..0000000 --- a/scrape_data/cta_schedule_versions.py +++ /dev/null @@ -1,76 +0,0 @@ -import boto3 -import sys -import data_analysis.static_gtfs_analysis as sga -import pendulum -from io import StringIO - -ACCESS_KEY = sys.argv[1] -SECRET_KEY = sys.argv[2] - -client = boto3.client( - 's3', - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=SECRET_KEY -) - -s3 = boto3.resource( - 's3', - region_name='us-east-1', - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=SECRET_KEY -) - -today = pendulum.now().to_date_string() - -CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() - -def save_cta_zip(): - print(f'Saving zipfile available at ' - f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' - f'on {today} to public bucket') - zipfile_bytes_io.seek(0) - client.upload_fileobj( - zipfile_bytes_io, - 'chn-ghost-buses-public', - f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' - ) - print('Confirm that object exists in bucket') - keys('chn-ghost-buses-public', [ - f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' - ]) - -def save_route_daily_summary(): - data = sga.GTFSFeed.extract_data(CTA_GTFS) - data = sga.format_dates_hours(data) - trip_summary = sga.make_trip_summary(data) - - route_daily_summary = ( - sga.summarize_date_rt(trip_summary) - ) - route_daily_summary['date'] = route_daily_summary['date'].astype(str) - route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] - - csv_buffer = StringIO() - route_daily_summary_today.to_csv(csv_buffer) - - print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') - s3.Object( - 'chn-ghost-buses-public', - f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv')\ - .put(Body=csv_buffer.getvalue()) - - print('Confirm that object exists in bucket') - keys('chn-ghost-buses-public', [ - f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv', - ]) - - -# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 -def keys(bucket_name: str, filenames: list, prefix: str='/', delimiter: str='/', start_after: str=''): - s3_paginator = client.get_paginator('list_objects_v2') - prefix = prefix.lstrip(delimiter) - start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after - for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): - for content in page.get('Contents', 
()): - if content['Key'] in filenames: - print(f"{content['Key']} exists") From 461df4274f783a4e69209922d89cff40fd785a70 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 17:40:20 -0500 Subject: [PATCH 16/32] Change to string --- .github/workflows/cta_data_downloads.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c9d08d4..fcd681d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: '3.10' + PYTHON_VERSION: 3.10 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save schedule summaries' run: | @@ -59,10 +59,10 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save realtime summaries' - + run: | pip install -r requirements.txt python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ From e1baeaa648169fe20a32bd7568a354ab3bae7e87 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 17:50:18 -0500 Subject: [PATCH 17/32] Correct python version name --- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index fcd681d..c8dcee3 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.10 + PYTHON_VERSION: '3.10.0' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: $PYTHON_VERSION - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: $PYTHON_VERSION - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: $PYTHON_VERSION - name: 'Save realtime summaries' From 398d62add781a3d9ff203e3f4a5a4bf938fc7367 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:04:06 -0500 Subject: [PATCH 18/32] Add quotes --- .github/workflows/cta_data_downloads.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c8dcee3..12ed31d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - 
python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save realtime summaries' From 77ef70816356c6c0bce370946413feef63f374a7 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:15:26 -0500 Subject: [PATCH 19/32] Add environment context --- .github/workflows/cta_data_downloads.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 12ed31d..94607f1 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: ${{ env.PYTHON_VERSION }} - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save realtime summaries' From 4842fa07c13850c75373c0dcca08dd3657358a1e Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:23:30 -0500 Subject: [PATCH 20/32] Remove quotes --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 94607f1..62b8968 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: '3.10.0' + PYTHON_VERSION: 3.10.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 3817614186c70519a1e15fd0569148637836d532 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:36:23 -0500 Subject: [PATCH 21/32] Test without environment variables --- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 62b8968..1d9dd73 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.10.0 + # PYTHON_VERSION: 3.10.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: '3.10.0' - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: '3.10.0' - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: '3.10.0' - name: 'Save realtime summaries' From cfb09606e9434875a07c8991d52f9ec79057599d Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:45:23 -0500 Subject: [PATCH 22/32] Revert "Test without environment variables" This reverts commit 3817614186c70519a1e15fd0569148637836d532. 
--- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1d9dd73..62b8968 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - # PYTHON_VERSION: 3.10.0 + PYTHON_VERSION: 3.10.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10.0' + python-version: ${{ env.PYTHON_VERSION }} - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10.0' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10.0' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save realtime summaries' From c08335a5c36e156cd61a0a970980087bd4073c73 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:52:41 -0500 Subject: [PATCH 23/32] Change python version --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 62b8968..66bbf7f 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.10.0 + PYTHON_VERSION: 3.11.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 1eef5d2f85e5adff82014b41a9e473ab90b966a5 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:03:30 -0500 Subject: [PATCH 24/32] Loosen constraint on pandas version --- data_analysis/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_analysis/requirements.txt b/data_analysis/requirements.txt index 6d8ddfa..6634a98 100644 --- a/data_analysis/requirements.txt +++ b/data_analysis/requirements.txt @@ -1,5 +1,5 @@ boto3==1.21.21 # The version can also be removed to resolve conflict. 
-pandas==1.4.3 +pandas>=1.4.3 geopandas==0.11.1 s3fs==2022.7.1 shapely==1.8.4 From b56f0c8fbe464be9410a88653d111303838c0801 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:14:37 -0500 Subject: [PATCH 25/32] Change cta_schedule_versions to cta_data_downloads --- .github/workflows/cta_data_downloads.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 66bbf7f..1e36c80 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -29,7 +29,7 @@ jobs: run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_schedule_versions import save_cta_zip; \ + python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -47,7 +47,7 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ + python -c 'from scrape_data.cta_data_downloads import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 665e90e7b1ec4bc74bcdc276a98d472967c7d67f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:25:35 -0500 Subject: [PATCH 26/32] Install libgeo-dev --- .github/workflows/cta_data_downloads.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1e36c80..e566ebb 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -29,6 +29,7 @@ jobs: run: | pip install -r requirements.txt + sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -47,6 +48,7 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt + sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -65,6 +67,7 @@ jobs: run: | pip install -r requirements.txt + sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY \ No newline at end of file From 6e287ec8ee06cfec6bcd60708ace8f82ea2ce10f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:36:55 -0500 Subject: [PATCH 27/32] Back to python 3.10 --- .github/workflows/cta_data_downloads.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index e566ebb..86fc0c0 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.11.0 + PYTHON_VERSION: 3.10.6 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -29,7 +29,6 @@ jobs: run: | pip install -r requirements.txt - sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -48,7 +47,6 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt - sudo apt-get install libgeos-dev python -c 
'from scrape_data.cta_data_downloads import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -67,7 +65,7 @@ jobs: run: | pip install -r requirements.txt - sudo apt-get install libgeos-dev + python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY \ No newline at end of file From 9b0497031c1ed36d7ab0d836de8ef5c404d40f9c Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:41:34 -0500 Subject: [PATCH 28/32] Change back to version constraint --- data_analysis/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_analysis/requirements.txt b/data_analysis/requirements.txt index 6634a98..6d8ddfa 100644 --- a/data_analysis/requirements.txt +++ b/data_analysis/requirements.txt @@ -1,5 +1,5 @@ boto3==1.21.21 # The version can also be removed to resolve conflict. -pandas>=1.4.3 +pandas==1.4.3 geopandas==0.11.1 s3fs==2022.7.1 shapely==1.8.4 From f0bd45a5bb2a04a5b8269823a3f69b1aa5c1aec9 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:44:02 -0500 Subject: [PATCH 29/32] Change timezone to America/Chicago --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 4375b10..c486755 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -23,7 +23,7 @@ aws_secret_access_key=SECRET_KEY ) -today = pendulum.now().to_date_string() +today = pendulum.now('America/Chicago').to_date_string() CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() From cebd713fd1b3014671f7e8fc82c0d079b260187b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 20:44:50 -0500 Subject: [PATCH 30/32] Change to correct end date for realtime data --- scrape_data/cta_data_downloads.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index c486755..aa69aef 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -82,16 +82,21 @@ def save_route_daily_summary() -> None: def save_realtime_daily_summary() -> None: - # This will be run at 5 pm Central time. bus_full_day_data_v2/{today}.csv - # will be in the public bucket by 11 am Central time, so there shouldn't be any issues. 
+ if pendulum.now("America/Chicago").hour >= 11: + end_date = pendulum.yesterday("America/Chicago") + else: + end_date = pendulum.now("America/Chicago").subtract(days=2) + + end_date = end_date.to_date_string() + daily_data = pd.read_csv( - (csrt.BASE_PATH / f"bus_full_day_data_v2/{today}.csv") + (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv") .as_uri(), low_memory=False ) daily_data = csrt.make_daily_summary(daily_data) - filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{today}.csv' + filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv' save_csv_to_bucket(daily_data, filename=filename) print(f'Confirm that {filename} exists in bucket') From 20c595fe7687fbc716c2da5f1808030c330be175 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 14 Aug 2023 19:17:53 -0500 Subject: [PATCH 31/32] rename schedule summary function --- .github/workflows/cta_data_downloads.yml | 4 ++-- scrape_data/cta_data_downloads.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 86fc0c0..c36457d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -47,8 +47,8 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import save_route_daily_summary; \ - save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ + save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY save-realtime-daily-summary: diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index aa69aef..75f10d3 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -60,7 +60,7 @@ def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: .put(Body=csv_buffer.getvalue()) -def save_route_daily_summary() -> None: +def save_sched_daily_summary() -> None: data = sga.GTFSFeed.extract_data(CTA_GTFS) data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) From 4c06991f3c66e2760f0abaea504e751026316681 Mon Sep 17 00:00:00 2001 From: Laurie <55149902+lauriemerrell@users.noreply.github.com> Date: Tue, 19 Sep 2023 21:14:49 -0500 Subject: [PATCH 32/32] remove on push --- .github/workflows/cta_data_downloads.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c36457d..d98520b 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -1,9 +1,6 @@ name: Automate CTA schedule and realtime downloads on: - push: - branches: - - 'automate-schedule-downloads' schedule: # Run every day at 12:30pm CST which is 5:30pm UTC @@ -68,4 +65,4 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - \ No newline at end of file +
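
A minimal sketch of how the three daily jobs introduced in this patch series might be exercised locally, outside the scheduled GitHub Actions run. It assumes the repository root is on PYTHONPATH and that AWS credentials are supplied the same way the workflow supplies them, as two positional arguments, since scrape_data/cta_data_downloads.py reads sys.argv[1] and sys.argv[2] at import time; the credential values below are placeholders, and note that importing the module also triggers the transitchicago.com zip download at import time.

    import sys

    # The module expects the access key and secret key as positional arguments,
    # so populate sys.argv before importing it (placeholder values shown).
    sys.argv = ["cta_data_downloads", "<AWS_ACCESS_KEY_ID>", "<AWS_SECRET_ACCESS_KEY>"]

    from scrape_data.cta_data_downloads import (
        save_cta_zip,
        save_sched_daily_summary,
        save_realtime_daily_summary,
    )

    save_cta_zip()                 # cta_schedule_zipfiles_raw/google_transit_<date>.zip
    save_sched_daily_summary()     # schedule_summaries/daily_job/cta_route_daily_summary_<date>.csv
    save_realtime_daily_summary()  # realtime_summaries/daily_job/bus_full_day_data_v2/<date>.csv

Each call uploads its file to the public bucket and then prints a confirmation by listing the uploaded key, mirroring what the three independent workflow jobs do.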