From a2af9bf9ed9c1443e068a8bc2b222e757f4c3c71 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 18 Jul 2023 18:07:00 -0500 Subject: [PATCH 01/32] First commit for downloading and saving schedule data --- .github/workflows/cta_schedule_data.yml | 25 +++++++++ data_analysis/static_gtfs_analysis.py | 69 ++++++++++++++++++------- scrape_data/cta_schedule_versions.py | 36 +++++++++++++ 3 files changed, 112 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/cta_schedule_data.yml create mode 100644 scrape_data/cta_schedule_versions.py diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml new file mode 100644 index 0000000..170a1ac --- /dev/null +++ b/.github/workflows/cta_schedule_data.yml @@ -0,0 +1,25 @@ +name: Automated job + +on: [push, workflow_dispatch] + branches: + - 'automate-schedule-downloads' + + +jobs: + download-cta-schedule-data: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Download and save CTA schedule data + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: | + pip install -r requirements.txt + python scrape_data.cta_schedule_versions.py' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index 119366c..de5ffc8 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -38,7 +38,6 @@ datefmt='%m/%d/%Y %I:%M:%S %p' ) - @dataclass class GTFSFeed: """Class for storing GTFSFeed data. @@ -53,14 +52,14 @@ class GTFSFeed: @classmethod def extract_data(cls, gtfs_zipfile: zipfile.ZipFile, - version_id: str = None) -> GTFSFeed: + version_id: str = None, cta_download: bool = True) -> GTFSFeed: """Load each text file in zipfile into a DataFrame Args: gtfs_zipfile (zipfile.ZipFile): Zipfile downloaded from - CTA transit feeds e.g. + transitfeeds.com or transitchicago.com e.g. https://transitfeeds.com/p/chicago-transit-authority/ - 165/20220718/download" + 165/20220718/download or https://www.transitchicago.com/downloads/sch_data/ version_id (str, optional): The schedule version in use. Defaults to None. @@ -68,9 +67,17 @@ def extract_data(cls, gtfs_zipfile: zipfile.ZipFile, GTFSFeed: A GTFSFeed object containing multiple DataFrames accessible by name. 
""" - if version_id is None: - version_id = VERSION_ID - logging.info(f"Extracting data from CTA zipfile version {version_id}") + if cta_download: + if version_id is not None: + raise ValueError("version_id is not used for downloads directly from CTA") + else: + logging.info(f"Extracting data from transitchicago.com zipfile") + + else: + if version_id is None: + version_id = VERSION_ID + logging.info(f"Extracting data from transitfeeds.com zipfile version {version_id}") + data_dict = {} pbar = tqdm(cls.__annotations__.keys()) for txt_file in pbar: @@ -140,14 +147,16 @@ def format_dates_hours(data: GTFSFeed) -> GTFSFeed: def make_trip_summary( data: GTFSFeed, - feed_start_date: pendulum.datetime, - feed_end_date: pendulum.datetime) -> pd.DataFrame: + feed_start_date: pendulum.datetime = None, + feed_end_date: pendulum.datetime = None) -> pd.DataFrame: """Create a summary of trips with one row per date Args: data (GTFSFeed): GTFS data from CTA - feed_start_date (datetime): Date from which this feed is valid (inclusive) - feed_end_date (datetime): Date until which this feed is valid (inclusive) + feed_start_date (datetime): Date from which this feed is valid (inclusive). + Defaults to None + feed_end_date (datetime): Date until which this feed is valid (inclusive). + Defaults to None Returns: pd.DataFrame: A DataFrame with each trip that occurred per row. @@ -161,7 +170,7 @@ def make_trip_summary( ), columns=["raw_date"], ) - + # cross join calendar index with actual calendar to get all combos of # possible dates & services calendar_cross = calendar_date_range.merge(data.calendar, how="cross") @@ -244,9 +253,10 @@ def make_trip_summary( trip_stop_hours, how="left", on="trip_id") # filter to only the rows for the period where this specific feed version was in effect - trip_summary = trip_summary.loc[ - (trip_summary['raw_date'] >= feed_start_date) - & (trip_summary['raw_date'] <= feed_end_date), :] + if feed_start_date is not None and feed_end_date is not None: + trip_summary = trip_summary.loc[ + (trip_summary['raw_date'] >= feed_start_date) + & (trip_summary['raw_date'] <= feed_end_date), :] return trip_summary @@ -321,6 +331,24 @@ def make_linestring_of_points( return shapely.geometry.LineString(list(sorted_df["pt"])) +def download_cta_zip() -> zipfile.ZipFile: + """Download CTA schedule data from transitchicago.com + + Returns: + zipfile.ZipFile: A zipfile of the latest GTFS schedule data from transitchicago.com + """ + logger.info('Downloading CTA data') + CTA_GTFS = zipfile.ZipFile( + BytesIO( + requests.get("https://www.transitchicago.com/downloads/sch_data/google_transit.zip" + ).content + ) + ) + logging.info('Download complete') + return CTA_GTFS + + + def download_zip(version_id: str) -> zipfile.ZipFile: """Download a version schedule from transitfeeds.com @@ -344,17 +372,22 @@ def download_zip(version_id: str) -> zipfile.ZipFile: return CTA_GTFS -def download_extract_format(version_id: str) -> GTFSFeed: +def download_extract_format(version_id: str = None) -> GTFSFeed: """Download a zipfile of GTFS data for a given version_id, extract data, and format date column. Args: - version_id (str): The version of the GTFS schedule data to download + version_id (str): The version of the GTFS schedule data to download. 
Defaults to None + If version_id is None, data will be downloaded from the CTA directly (transitchicag.com) + instead of transitfeeds.com Returns: GTFSFeed: A GTFSFeed object with formated dates """ - CTA_GTFS = download_zip(version_id) + if version_id is None: + CTA_GTFS = download_cta_zip() + else: + CTA_GTFS = download_zip(version_id) data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id) data = format_dates_hours(data) return data diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py new file mode 100644 index 0000000..f2d4538 --- /dev/null +++ b/scrape_data/cta_schedule_versions.py @@ -0,0 +1,36 @@ +import boto3 +import sys +import data_analysis.static_gtfs_analysis as sga +import pendulum +from io import StringIO + +ACCESS_KEY = sys.argv[1] +SECRET_KEY = sys.argv[2] + +client = boto3.client( + 's3', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +s3 = boto3.resource( + 's3', + region_name='us-east-1', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +data = sga.download_extract_format() +trip_summary = sga.make_trip_summary(data) + +route_daily_summary = ( + sga.summarize_date_rt(trip_summary) +) +date = pendulum.now().to_date_string() + +csv_buffer = StringIO() +route_daily_summary.to_csv(csv_buffer) + +s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ + .put(Body=csv_buffer.getvalue()) + From 4b38f62fb19f8b370777015a2a28d340fd70b50f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 18 Jul 2023 18:12:30 -0500 Subject: [PATCH 02/32] Fix syntax error --- .github/workflows/cta_schedule_data.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 170a1ac..0a75a8d 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -1,7 +1,8 @@ name: Automated job -on: [push, workflow_dispatch] - branches: +on: + push: + branches: - 'automate-schedule-downloads' From 50a8a4e1c05d563ee99d9c099e1c1fc09e0e0314 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 18 Jul 2023 19:03:14 -0500 Subject: [PATCH 03/32] Change version constraint of mapclassify --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a709e5d..9e3218d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ python-dotenv==0.20.0 seaborn==0.12.0 PyQt5==5.15.7 folium==0.12.1.post1 -mapclassify==2.4.2+55.g0155c6e +mapclassify>=2.4.2+55.g0155c6e plotly==5.11.0 kaleido==0.2.1 pre-commit==2.20.0 From f56d0d43d45a996df369755a1962cb0646b7af15 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 20 Jul 2023 14:29:45 -0500 Subject: [PATCH 04/32] remove single quote --- .github/workflows/cta_schedule_data.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 0a75a8d..d835f54 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -23,4 +23,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | pip install -r requirements.txt - python scrape_data.cta_schedule_versions.py' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python scrape_data.cta_schedule_versions.py $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 140ffbc828b8919000312036716287269f089d05 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 20 Jul 2023 14:40:43 
-0500 Subject: [PATCH 05/32] Run as a module --- .github/workflows/cta_schedule_data.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index d835f54..909c345 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -23,4 +23,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | pip install -r requirements.txt - python scrape_data.cta_schedule_versions.py $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -m scrape_data.cta_schedule_versions $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 9f00363e9314100332e8102b27d2d3645facbf72 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 24 Jul 2023 20:52:49 -0500 Subject: [PATCH 06/32] Add print function for saving csv to public bucket --- scrape_data/cta_schedule_versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index f2d4538..e24ca84 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -31,6 +31,7 @@ csv_buffer = StringIO() route_daily_summary.to_csv(csv_buffer) +print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ .put(Body=csv_buffer.getvalue()) From 12f6b0887ff660831c9793969a00bbff2d20a221 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 25 Jul 2023 14:31:11 -0500 Subject: [PATCH 07/32] Download schedule daily at 5:30pm UTC --- .github/workflows/cta_schedule_data.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 909c345..d4eb7d8 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -5,6 +5,10 @@ on: branches: - 'automate-schedule-downloads' + schedule: + # Run every day at 12:30pm CST which is 5:30pm UTC + - cron: 30 17 * * * + jobs: download-cta-schedule-data: From 8aa369130663f7ab4b422229412e7abde2446627 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 25 Jul 2023 18:18:30 -0500 Subject: [PATCH 08/32] Save zipfile from transitchicago.com to s3 --- scrape_data/cta_schedule_versions.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index e24ca84..44e5c7f 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -20,13 +20,22 @@ aws_secret_access_key=SECRET_KEY ) +date = pendulum.now().to_date_string() + +zipfile = sga.download_cta_zip() +print(f'Saving zipfile available at ' + f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' + f'on {date} to public bucket') + +s3.Object('chn-ghost-buses-public', f'google_transit_{date}.zip')\ + .put(Body=zipfile) + data = sga.download_extract_format() trip_summary = sga.make_trip_summary(data) route_daily_summary = ( sga.summarize_date_rt(trip_summary) ) -date = pendulum.now().to_date_string() csv_buffer = StringIO() route_daily_summary.to_csv(csv_buffer) @@ -34,4 +43,3 @@ print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ .put(Body=csv_buffer.getvalue()) - From 2ee3d05fc6a125c59a5ac6be34154c424adf48e7 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 25 Jul 2023 20:03:10 -0500 Subject: [PATCH 09/32] Change 
method of uploading zipfile --- data_analysis/static_gtfs_analysis.py | 13 ++++++------- scrape_data/cta_schedule_versions.py | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index de5ffc8..65bba88 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -13,7 +13,7 @@ import os from pathlib import Path from dataclasses import dataclass -from typing import List +from typing import Tuple import logging import calendar @@ -331,21 +331,20 @@ def make_linestring_of_points( return shapely.geometry.LineString(list(sorted_df["pt"])) -def download_cta_zip() -> zipfile.ZipFile: +def download_cta_zip() -> Tuple[zipfile.ZipFile, BytesIO]: """Download CTA schedule data from transitchicago.com Returns: zipfile.ZipFile: A zipfile of the latest GTFS schedule data from transitchicago.com """ logger.info('Downloading CTA data') - CTA_GTFS = zipfile.ZipFile( - BytesIO( + zip_bytes_io = BytesIO( requests.get("https://www.transitchicago.com/downloads/sch_data/google_transit.zip" ).content ) - ) + CTA_GTFS = zipfile.ZipFile(zip_bytes_io) logging.info('Download complete') - return CTA_GTFS + return CTA_GTFS, zip_bytes_io @@ -385,7 +384,7 @@ def download_extract_format(version_id: str = None) -> GTFSFeed: GTFSFeed: A GTFSFeed object with formated dates """ if version_id is None: - CTA_GTFS = download_cta_zip() + CTA_GTFS, _ = download_cta_zip() else: CTA_GTFS = download_zip(version_id) data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index 44e5c7f..f6676d0 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -22,13 +22,12 @@ date = pendulum.now().to_date_string() -zipfile = sga.download_cta_zip() +zipfile, zipfile_bytes_io = sga.download_cta_zip() print(f'Saving zipfile available at ' f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' f'on {date} to public bucket') - -s3.Object('chn-ghost-buses-public', f'google_transit_{date}.zip')\ - .put(Body=zipfile) +zipfile_bytes_io.seek(0) +client.upload_fileobj(zipfile_bytes_io, 'chn-ghost-buses-public', f'google_transit_{date}.zip') data = sga.download_extract_format() trip_summary = sga.make_trip_summary(data) From 7c6a42e80f6130a39bd61c6fea6b6c53fd83470a Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Wed, 26 Jul 2023 20:45:47 -0500 Subject: [PATCH 10/32] Check that objects exist in bucket --- scrape_data/cta_schedule_versions.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index f6676d0..66e882e 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -42,3 +42,18 @@ print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ .put(Body=csv_buffer.getvalue()) + + +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +print('Confirm that objects exist in bucket') +s3_paginator = client.get_paginator('list_objects_v2') + +def keys(bucket_name, prefix='/', delimiter='/', start_after=''): + prefix = prefix.lstrip(delimiter) + start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): + for 
content in page.get('Contents', ()): + if content['Key'] in [f'cta_route_daily_summary_{date}.csv', f'google_transit_{date}.zip']: + yield f"{content['Key']} exists" + +keys('chn-ghost-buses-public') \ No newline at end of file From bc9176676f2c22d556a983924599e02442db44c4 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 27 Jul 2023 18:51:25 -0500 Subject: [PATCH 11/32] Change yield to print --- scrape_data/cta_schedule_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index 66e882e..bf79ab2 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -54,6 +54,6 @@ def keys(bucket_name, prefix='/', delimiter='/', start_after=''): for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): for content in page.get('Contents', ()): if content['Key'] in [f'cta_route_daily_summary_{date}.csv', f'google_transit_{date}.zip']: - yield f"{content['Key']} exists" + print(f"{content['Key']} exists") keys('chn-ghost-buses-public') \ No newline at end of file From c0c153c3f7e94daf6225528c92418d6700771e32 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 7 Aug 2023 21:01:37 -0500 Subject: [PATCH 12/32] Separate downloading zip file and saving daily summaries --- .github/workflows/cta_schedule_data.yml | 34 +++++++++++-- scrape_data/cta_schedule_versions.py | 63 +++++++++++++++---------- 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index d4eb7d8..3fb75d3 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -1,4 +1,4 @@ -name: Automated job +name: Automate CTA schedule downloads on: push: @@ -20,11 +20,39 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - + - name: Download and save CTA schedule data env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_schedule_versions import save_cta_zip; \ + save_cta_zip()' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + + save-schedule-daily-summary: + needs: download-cta-schedule-data + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: 'Save schedule summaries' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + run: | pip install -r requirements.txt - python -m scrape_data.cta_schedule_versions $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_schedule_versions \ + import save_route_daily_summary; save_route_daily_summary()' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index bf79ab2..a3d8986 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -20,40 +20,55 @@ aws_secret_access_key=SECRET_KEY ) -date = pendulum.now().to_date_string() +today = pendulum.now().to_date_string() -zipfile, zipfile_bytes_io = sga.download_cta_zip() -print(f'Saving zipfile available at ' - f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' - f'on {date} to public bucket') -zipfile_bytes_io.seek(0) -client.upload_fileobj(zipfile_bytes_io, 
'chn-ghost-buses-public', f'google_transit_{date}.zip') +def save_cta_zip(): + _, zipfile_bytes_io = sga.download_cta_zip() + print(f'Saving zipfile available at ' + f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' + f'on {today} to public bucket') + zipfile_bytes_io.seek(0) + client.upload_fileobj( + zipfile_bytes_io, + 'chn-ghost-buses-public', + f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' + ) + print('Confirm that object exists in bucket') + keys('chn-ghost-buses-public', [ + f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' + ]) -data = sga.download_extract_format() -trip_summary = sga.make_trip_summary(data) +def save_route_daily_summary(): + data = sga.download_extract_format() + trip_summary = sga.make_trip_summary(data) -route_daily_summary = ( - sga.summarize_date_rt(trip_summary) -) + route_daily_summary = ( + sga.summarize_date_rt(trip_summary) + ) + route_daily_summary['date'] = route_daily_summary['date'].astype(str) + route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] -csv_buffer = StringIO() -route_daily_summary.to_csv(csv_buffer) + csv_buffer = StringIO() + route_daily_summary_today.to_csv(csv_buffer) -print(f'Saving cta_route_daily_summary_{date}.csv to public bucket') -s3.Object('chn-ghost-buses-public', f'cta_route_daily_summary_{date}.csv')\ - .put(Body=csv_buffer.getvalue()) + print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') + s3.Object( + 'chn-ghost-buses-public', + f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv')\ + .put(Body=csv_buffer.getvalue()) + print('Confirm that object exists in bucket') + keys('chn-ghost-buses-public', [ + f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv', + ]) -# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 -print('Confirm that objects exist in bucket') -s3_paginator = client.get_paginator('list_objects_v2') -def keys(bucket_name, prefix='/', delimiter='/', start_after=''): +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +def keys(bucket_name: str, filenames: list, prefix: str='/', delimiter: str='/', start_after: str=''): + s3_paginator = client.get_paginator('list_objects_v2') prefix = prefix.lstrip(delimiter) start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): for content in page.get('Contents', ()): - if content['Key'] in [f'cta_route_daily_summary_{date}.csv', f'google_transit_{date}.zip']: + if content['Key'] in filenames: print(f"{content['Key']} exists") - -keys('chn-ghost-buses-public') \ No newline at end of file From 2dc18f3772e6a4fa5ab6060751d3d01c1bc6ba8e Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 7 Aug 2023 21:45:00 -0500 Subject: [PATCH 13/32] remove job dependency --- .github/workflows/cta_schedule_data.yml | 5 ++--- scrape_data/cta_schedule_versions.py | 6 ++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 3fb75d3..66b0802 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -34,7 +34,6 @@ jobs: save-schedule-daily-summary: - needs: download-cta-schedule-data runs-on: ubuntu-latest steps: @@ -51,8 +50,8 @@ jobs: run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_schedule_versions \ - import 
save_route_daily_summary; save_route_daily_summary()' \ + python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ + save_route_daily_summary()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py index a3d8986..6323085 100644 --- a/scrape_data/cta_schedule_versions.py +++ b/scrape_data/cta_schedule_versions.py @@ -22,8 +22,9 @@ today = pendulum.now().to_date_string() +CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() + def save_cta_zip(): - _, zipfile_bytes_io = sga.download_cta_zip() print(f'Saving zipfile available at ' f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' f'on {today} to public bucket') @@ -39,7 +40,8 @@ def save_cta_zip(): ]) def save_route_daily_summary(): - data = sga.download_extract_format() + data = sga.GTFSFeed.extract_data(CTA_GTFS) + data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) route_daily_summary = ( From d35f310d3a0584ef09e9ea53295f7de32da505f3 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 7 Aug 2023 21:57:11 -0500 Subject: [PATCH 14/32] Add args to same line --- .github/workflows/cta_schedule_data.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_schedule_data.yml index 66b0802..3a11f65 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_schedule_data.yml @@ -51,7 +51,6 @@ jobs: run: | pip install -r requirements.txt python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ - save_route_daily_summary()' \ - $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From ee7b05721fd829788c0d086d005f399bd61ff1f0 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 17:19:53 -0500 Subject: [PATCH 15/32] Save realtime summary file --- ...hedule_data.yml => cta_data_downloads.yml} | 34 ++++-- scrape_data/cta_data_downloads.py | 110 ++++++++++++++++++ scrape_data/cta_schedule_versions.py | 76 ------------ 3 files changed, 134 insertions(+), 86 deletions(-) rename .github/workflows/{cta_schedule_data.yml => cta_data_downloads.yml} (57%) create mode 100644 scrape_data/cta_data_downloads.py delete mode 100644 scrape_data/cta_schedule_versions.py diff --git a/.github/workflows/cta_schedule_data.yml b/.github/workflows/cta_data_downloads.yml similarity index 57% rename from .github/workflows/cta_schedule_data.yml rename to .github/workflows/cta_data_downloads.yml index 3a11f65..c9d08d4 100644 --- a/.github/workflows/cta_schedule_data.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -1,4 +1,4 @@ -name: Automate CTA schedule downloads +name: Automate CTA schedule and realtime downloads on: push: @@ -9,6 +9,10 @@ on: # Run every day at 12:30pm CST which is 5:30pm UTC - cron: 30 17 * * * +env: + PYTHON_VERSION: '3.10' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} jobs: download-cta-schedule-data: @@ -19,12 +23,9 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: $PYTHON_VERSION - name: Download and save CTA schedule data - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | pip install -r requirements.txt @@ -41,16 +42,29 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: 
$PYTHON_VERSION - name: 'Save schedule summaries' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - run: | pip install -r requirements.txt python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save-realtime-daily-summary: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: $PYTHON_VERSION + + - name: 'Save realtime summaries' + + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ + save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + \ No newline at end of file diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py new file mode 100644 index 0000000..4375b10 --- /dev/null +++ b/scrape_data/cta_data_downloads.py @@ -0,0 +1,110 @@ +import boto3 +import sys +import data_analysis.static_gtfs_analysis as sga +import data_analysis.compare_scheduled_and_rt as csrt +import pendulum +from io import StringIO +import pandas as pd + + +ACCESS_KEY = sys.argv[1] +SECRET_KEY = sys.argv[2] + +client = boto3.client( + 's3', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +s3 = boto3.resource( + 's3', + region_name='us-east-1', + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=SECRET_KEY +) + +today = pendulum.now().to_date_string() + +CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() + +def save_cta_zip() -> None: + print(f'Saving zipfile available at ' + f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' + f'on {today} to public bucket') + filename = f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' + zipfile_bytes_io.seek(0) + client.upload_fileobj( + zipfile_bytes_io, + csrt.BUCKET_PUBLIC, + filename + ) + print(f'Confirm that {filename} exists in bucket') + keys('chn-ghost-buses-public', [filename]) + + +def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: + """Save pandas DataFrame to csv in s3 + + Args: + df (pd.DataFrame): DataFrame to be saved + filename (str): Name of the saved filename in s3. + Should contain the .csv suffix. + """ + csv_buffer = StringIO() + df.to_csv(csv_buffer) + + print(f'Saving {filename} to public bucket') + s3.Object( + csrt.BUCKET_PUBLIC, + f'{filename}')\ + .put(Body=csv_buffer.getvalue()) + + +def save_route_daily_summary() -> None: + data = sga.GTFSFeed.extract_data(CTA_GTFS) + data = sga.format_dates_hours(data) + trip_summary = sga.make_trip_summary(data) + + route_daily_summary = ( + sga.summarize_date_rt(trip_summary) + ) + route_daily_summary['date'] = route_daily_summary['date'].astype(str) + route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] + + print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') + filename = f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv' + save_csv_to_bucket( + route_daily_summary_today, + filename=filename + ) + print(f'Confirm that {filename} exists in bucket') + keys(csrt.BUCKET_PUBLIC, [filename]) + + +def save_realtime_daily_summary() -> None: + # This will be run at 5 pm Central time. bus_full_day_data_v2/{today}.csv + # will be in the public bucket by 11 am Central time, so there shouldn't be any issues. 
+ daily_data = pd.read_csv( + (csrt.BASE_PATH / f"bus_full_day_data_v2/{today}.csv") + .as_uri(), + low_memory=False + ) + + daily_data = csrt.make_daily_summary(daily_data) + filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{today}.csv' + save_csv_to_bucket(daily_data, filename=filename) + + print(f'Confirm that {filename} exists in bucket') + keys(csrt.BUCKET_PUBLIC, [filename]) + +# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 +def keys(bucket_name: str, filenames: list, + prefix: str='/', delimiter: str='/', + start_after: str='') -> None: + s3_paginator = client.get_paginator('list_objects_v2') + prefix = prefix.lstrip(delimiter) + start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): + for content in page.get('Contents', ()): + if content['Key'] in filenames: + print(f"{content['Key']} exists") diff --git a/scrape_data/cta_schedule_versions.py b/scrape_data/cta_schedule_versions.py deleted file mode 100644 index 6323085..0000000 --- a/scrape_data/cta_schedule_versions.py +++ /dev/null @@ -1,76 +0,0 @@ -import boto3 -import sys -import data_analysis.static_gtfs_analysis as sga -import pendulum -from io import StringIO - -ACCESS_KEY = sys.argv[1] -SECRET_KEY = sys.argv[2] - -client = boto3.client( - 's3', - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=SECRET_KEY -) - -s3 = boto3.resource( - 's3', - region_name='us-east-1', - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=SECRET_KEY -) - -today = pendulum.now().to_date_string() - -CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() - -def save_cta_zip(): - print(f'Saving zipfile available at ' - f'https://www.transitchicago.com/downloads/sch_data/google_transit.zip ' - f'on {today} to public bucket') - zipfile_bytes_io.seek(0) - client.upload_fileobj( - zipfile_bytes_io, - 'chn-ghost-buses-public', - f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' - ) - print('Confirm that object exists in bucket') - keys('chn-ghost-buses-public', [ - f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' - ]) - -def save_route_daily_summary(): - data = sga.GTFSFeed.extract_data(CTA_GTFS) - data = sga.format_dates_hours(data) - trip_summary = sga.make_trip_summary(data) - - route_daily_summary = ( - sga.summarize_date_rt(trip_summary) - ) - route_daily_summary['date'] = route_daily_summary['date'].astype(str) - route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] - - csv_buffer = StringIO() - route_daily_summary_today.to_csv(csv_buffer) - - print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') - s3.Object( - 'chn-ghost-buses-public', - f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv')\ - .put(Body=csv_buffer.getvalue()) - - print('Confirm that object exists in bucket') - keys('chn-ghost-buses-public', [ - f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv', - ]) - - -# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 -def keys(bucket_name: str, filenames: list, prefix: str='/', delimiter: str='/', start_after: str=''): - s3_paginator = client.get_paginator('list_objects_v2') - prefix = prefix.lstrip(delimiter) - start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after - for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): - for content in page.get('Contents', 
()): - if content['Key'] in filenames: - print(f"{content['Key']} exists") From 461df4274f783a4e69209922d89cff40fd785a70 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 17:40:20 -0500 Subject: [PATCH 16/32] Change to string --- .github/workflows/cta_data_downloads.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c9d08d4..fcd681d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: '3.10' + PYTHON_VERSION: 3.10 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save schedule summaries' run: | @@ -59,10 +59,10 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save realtime summaries' - + run: | pip install -r requirements.txt python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ From e1baeaa648169fe20a32bd7568a354ab3bae7e87 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 17:50:18 -0500 Subject: [PATCH 17/32] Correct python version name --- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index fcd681d..c8dcee3 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.10 + PYTHON_VERSION: '3.10.0' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: $PYTHON_VERSION - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: $PYTHON_VERSION - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: $PYTHON_VERSION - name: 'Save realtime summaries' From 398d62add781a3d9ff203e3f4a5a4bf938fc7367 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:04:06 -0500 Subject: [PATCH 18/32] Add quotes --- .github/workflows/cta_data_downloads.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c8dcee3..12ed31d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - 
python-version: $PYTHON_VERSION + python-version: '$PYTHON_VERSION' - name: 'Save realtime summaries' From 77ef70816356c6c0bce370946413feef63f374a7 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:15:26 -0500 Subject: [PATCH 19/32] Add environment context --- .github/workflows/cta_data_downloads.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 12ed31d..94607f1 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: ${{ env.PYTHON_VERSION }} - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '$PYTHON_VERSION' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save realtime summaries' From 4842fa07c13850c75373c0dcca08dd3657358a1e Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:23:30 -0500 Subject: [PATCH 20/32] Remove quotes --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 94607f1..62b8968 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: '3.10.0' + PYTHON_VERSION: 3.10.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 3817614186c70519a1e15fd0569148637836d532 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:36:23 -0500 Subject: [PATCH 21/32] Test without environment variables --- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 62b8968..1d9dd73 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.10.0 + # PYTHON_VERSION: 3.10.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: '3.10.0' - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: '3.10.0' - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: ${{ env.PYTHON_VERSION }} + python-version: '3.10.0' - name: 'Save realtime summaries' From cfb09606e9434875a07c8991d52f9ec79057599d Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:45:23 -0500 Subject: [PATCH 22/32] Revert "Test without environment variables" This reverts commit 3817614186c70519a1e15fd0569148637836d532. 
--- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1d9dd73..62b8968 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - # PYTHON_VERSION: 3.10.0 + PYTHON_VERSION: 3.10.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -23,7 +23,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10.0' + python-version: ${{ env.PYTHON_VERSION }} - name: Download and save CTA schedule data @@ -42,7 +42,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10.0' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save schedule summaries' run: | @@ -59,7 +59,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10.0' + python-version: ${{ env.PYTHON_VERSION }} - name: 'Save realtime summaries' From c08335a5c36e156cd61a0a970980087bd4073c73 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 18:52:41 -0500 Subject: [PATCH 23/32] Change python version --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 62b8968..66bbf7f 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.10.0 + PYTHON_VERSION: 3.11.0 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} From 1eef5d2f85e5adff82014b41a9e473ab90b966a5 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:03:30 -0500 Subject: [PATCH 24/32] Loosen constraint on pandas version --- data_analysis/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_analysis/requirements.txt b/data_analysis/requirements.txt index 6d8ddfa..6634a98 100644 --- a/data_analysis/requirements.txt +++ b/data_analysis/requirements.txt @@ -1,5 +1,5 @@ boto3==1.21.21 # The version can also be removed to resolve conflict. 
-pandas==1.4.3 +pandas>=1.4.3 geopandas==0.11.1 s3fs==2022.7.1 shapely==1.8.4 From b56f0c8fbe464be9410a88653d111303838c0801 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:14:37 -0500 Subject: [PATCH 25/32] Change cta_schedule_versions to cta_data_downloads --- .github/workflows/cta_data_downloads.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 66bbf7f..1e36c80 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -29,7 +29,7 @@ jobs: run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_schedule_versions import save_cta_zip; \ + python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -47,7 +47,7 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_schedule_versions import save_route_daily_summary; \ + python -c 'from scrape_data.cta_data_downloads import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 665e90e7b1ec4bc74bcdc276a98d472967c7d67f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:25:35 -0500 Subject: [PATCH 26/32] Install libgeo-dev --- .github/workflows/cta_data_downloads.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1e36c80..e566ebb 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -29,6 +29,7 @@ jobs: run: | pip install -r requirements.txt + sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -47,6 +48,7 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt + sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -65,6 +67,7 @@ jobs: run: | pip install -r requirements.txt + sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY \ No newline at end of file From 6e287ec8ee06cfec6bcd60708ace8f82ea2ce10f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:36:55 -0500 Subject: [PATCH 27/32] Back to python 3.10 --- .github/workflows/cta_data_downloads.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index e566ebb..86fc0c0 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -10,7 +10,7 @@ on: - cron: 30 17 * * * env: - PYTHON_VERSION: 3.11.0 + PYTHON_VERSION: 3.10.6 AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -29,7 +29,6 @@ jobs: run: | pip install -r requirements.txt - sudo apt-get install libgeos-dev python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -48,7 +47,6 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt - sudo apt-get install libgeos-dev python -c 
'from scrape_data.cta_data_downloads import save_route_daily_summary; \ save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY @@ -67,7 +65,7 @@ jobs: run: | pip install -r requirements.txt - sudo apt-get install libgeos-dev + python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY \ No newline at end of file From 9b0497031c1ed36d7ab0d836de8ef5c404d40f9c Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:41:34 -0500 Subject: [PATCH 28/32] Change back to version constraint --- data_analysis/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_analysis/requirements.txt b/data_analysis/requirements.txt index 6634a98..6d8ddfa 100644 --- a/data_analysis/requirements.txt +++ b/data_analysis/requirements.txt @@ -1,5 +1,5 @@ boto3==1.21.21 # The version can also be removed to resolve conflict. -pandas>=1.4.3 +pandas==1.4.3 geopandas==0.11.1 s3fs==2022.7.1 shapely==1.8.4 From f0bd45a5bb2a04a5b8269823a3f69b1aa5c1aec9 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 19:44:02 -0500 Subject: [PATCH 29/32] Change timezone to America/Chicago --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 4375b10..c486755 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -23,7 +23,7 @@ aws_secret_access_key=SECRET_KEY ) -today = pendulum.now().to_date_string() +today = pendulum.now('America/Chicago').to_date_string() CTA_GTFS, zipfile_bytes_io = sga.download_cta_zip() From cebd713fd1b3014671f7e8fc82c0d079b260187b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 13 Aug 2023 20:44:50 -0500 Subject: [PATCH 30/32] Change to correct end date for realtime data --- scrape_data/cta_data_downloads.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index c486755..aa69aef 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -82,16 +82,21 @@ def save_route_daily_summary() -> None: def save_realtime_daily_summary() -> None: - # This will be run at 5 pm Central time. bus_full_day_data_v2/{today}.csv - # will be in the public bucket by 11 am Central time, so there shouldn't be any issues. 
+ if pendulum.now("America/Chicago").hour >= 11: + end_date = pendulum.yesterday("America/Chicago") + else: + end_date = pendulum.now("America/Chicago").subtract(days=2) + + end_date = end_date.to_date_string() + daily_data = pd.read_csv( - (csrt.BASE_PATH / f"bus_full_day_data_v2/{today}.csv") + (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv") .as_uri(), low_memory=False ) daily_data = csrt.make_daily_summary(daily_data) - filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{today}.csv' + filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv' save_csv_to_bucket(daily_data, filename=filename) print(f'Confirm that {filename} exists in bucket') From 20c595fe7687fbc716c2da5f1808030c330be175 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 14 Aug 2023 19:17:53 -0500 Subject: [PATCH 31/32] rename schedule summary function --- .github/workflows/cta_data_downloads.yml | 4 ++-- scrape_data/cta_data_downloads.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 86fc0c0..c36457d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -47,8 +47,8 @@ jobs: - name: 'Save schedule summaries' run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import save_route_daily_summary; \ - save_route_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ + save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY save-realtime-daily-summary: diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index aa69aef..75f10d3 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -60,7 +60,7 @@ def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: .put(Body=csv_buffer.getvalue()) -def save_route_daily_summary() -> None: +def save_sched_daily_summary() -> None: data = sga.GTFSFeed.extract_data(CTA_GTFS) data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) From 4c06991f3c66e2760f0abaea504e751026316681 Mon Sep 17 00:00:00 2001 From: Laurie <55149902+lauriemerrell@users.noreply.github.com> Date: Tue, 19 Sep 2023 21:14:49 -0500 Subject: [PATCH 32/32] remove on push --- .github/workflows/cta_data_downloads.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c36457d..d98520b 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -1,9 +1,6 @@ name: Automate CTA schedule and realtime downloads on: - push: - branches: - - 'automate-schedule-downloads' schedule: # Run every day at 12:30pm CST which is 5:30pm UTC @@ -68,4 +65,4 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - \ No newline at end of file +
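
A minimal sketch of how the three daily jobs introduced in this patch series might be exercised locally, outside the scheduled GitHub Actions run. It assumes the repository root is on PYTHONPATH and that AWS credentials are supplied the same way the workflow supplies them, as two positional arguments, since scrape_data/cta_data_downloads.py reads sys.argv[1] and sys.argv[2] at import time; the credential values below are placeholders, and note that importing the module also triggers the transitchicago.com zip download at import time.

    import sys

    # The module expects the access key and secret key as positional arguments,
    # so populate sys.argv before importing it (placeholder values shown).
    sys.argv = ["cta_data_downloads", "<AWS_ACCESS_KEY_ID>", "<AWS_SECRET_ACCESS_KEY>"]

    from scrape_data.cta_data_downloads import (
        save_cta_zip,
        save_sched_daily_summary,
        save_realtime_daily_summary,
    )

    save_cta_zip()                 # cta_schedule_zipfiles_raw/google_transit_<date>.zip
    save_sched_daily_summary()     # schedule_summaries/daily_job/cta_route_daily_summary_<date>.csv
    save_realtime_daily_summary()  # realtime_summaries/daily_job/bus_full_day_data_v2/<date>.csv

Each call uploads its file to the public bucket and then prints a confirmation by listing the uploaded key, mirroring what the three independent workflow jobs do.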