Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ridership gh action #75

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/workflows/ridership-action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: Automate ridership data updates

on:
  workflow_dispatch:

  schedule:
    # Run monthly on the 1st at 17:30 UTC.
    # NOTE(review): 17:30 UTC is 11:30am CST (UTC-6) / 12:30pm CDT (UTC-5).
    # The original comment claimed "12:30pm CST which is 5:30pm UTC" — that is
    # off by an hour; GitHub cron schedules are always evaluated in UTC and do
    # not track daylight saving. Quoted because the expression contains '*'.
    - cron: '30 17 1 * *'

env:
  PYTHON_VERSION: '3.10.6'
  # Credentials for the S3 upload performed by scrape_data.ridership_download.
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}


jobs:
  download-ridership-data:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Download and save ridership data to s3
        # The module reads the two keys positionally from sys.argv at import
        # time, so they are passed as trailing arguments to `python -c`.
        run: |
          pip install -r requirements.txt
          python -c 'from scrape_data.ridership_download import save_ridership_json; \
          save_ridership_json()' \
          $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY
17 changes: 11 additions & 6 deletions data_analysis/ridership_to_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def get_latest_month_and_year(ridership_df: pd.DataFrame) -> tuple:
return latest_date.month, latest_date.year


def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = None) -> None:
def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int = None,
save: bool = True) -> None:
"""
Save ridership data to JSON for given month and year.
Note that the data is typically a few months
Expand All @@ -83,6 +84,7 @@ def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int =
4 9 01/01/2001 U 11207
month (int): Month of interest. Defaults to None
year (int): Year of interest. Defaults to None
save (bool): Whether to save JSON locally. Defaults to True.
"""
ridership = ridership_df.copy()
latest_month, latest_year = get_latest_month_and_year(ridership)
Expand Down Expand Up @@ -119,21 +121,24 @@ def ridership_to_json(ridership_df: pd.DataFrame, month: int = None, year: int =
df_daytype_summary_json = df_daytype_summary.to_json(orient='records')
full_json = {'date': f'{month_name} {year}'}
full_json['data'] = json.loads(df_daytype_summary_json)
with open(DATA_PATH / f'{month_name}_{year}_cta_ridership_data_day_type_summary.json', 'w') as outfile:
json.dump(full_json, outfile)

if save:
with open(DATA_PATH / f'{month_name}_{year}_cta_ridership_data_day_type_summary.json', 'w') as outfile:
json.dump(full_json, outfile)
else:
return json.dumps(full_json, indent=4)

app = typer.Typer()


@app.command()
def main(month: int = None, year: int = None, save: bool = True) -> "str | None":
    """Download the CTA ridership CSV and convert it to the day-type summary JSON.

    Args:
        month: Month of interest; defaults to the latest month in the data.
        year: Year of interest; defaults to the latest year in the data.
        save: When True, ridership_to_json writes the JSON to disk and this
            returns None. When False, the JSON document is returned as a string
            (used by the GitHub Action to upload directly to S3).

    Returns:
        The JSON string when save is False, otherwise None.
        (The original annotation said ``-> None`` even though the save=False
        path propagates a string from ridership_to_json.)
    """
    print("Loading data from data.cityofchicago.org")
    # Full ridership-by-station dataset from the Chicago open-data portal.
    ridership_df = pd.read_csv(
        'https://data.cityofchicago.org/api/views/'
        'jyb9-n7fm/rows.csv?accessType=DOWNLOAD'
    )
    print("Done!")
    return ridership_to_json(ridership_df=ridership_df, month=month, year=year, save=save)


if __name__ == '__main__':
Expand Down
46 changes: 46 additions & 0 deletions scrape_data/ridership_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import boto3
import sys
import data_analysis.ridership_to_json as ridership_to_json
import data_analysis.static_gtfs_analysis as sga

# AWS credentials are read positionally from the command line at import time;
# the GitHub Action passes $AWS_ACCESS_KEY_ID and $AWS_SECRET_ACCESS_KEY as
# trailing arguments after the `python -c` snippet.
# NOTE(review): importing this module without both arguments raises
# IndexError — consider reading os.environ instead.
ACCESS_KEY = sys.argv[1]
SECRET_KEY = sys.argv[2]

# Low-level S3 client, used for list_objects_v2 pagination in keys().
client = boto3.client(
's3',
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY
)

# High-level S3 resource, used to upload the JSON in save_ridership_json().
s3 = boto3.resource(
's3',
region_name='us-east-1',
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY
)

def save_ridership_json() -> None:
    """Generate the ridership day-type summary JSON and upload it to S3.

    Calls ridership_to_json.main with save=False so the JSON document is
    returned as a string rather than written to a local file, uploads it to
    the frontend data path in the project bucket, then verifies the key
    exists in the bucket via keys().
    """
    ridership_json = ridership_to_json.main(save=False)
    s3_ridership_json_path = 'frontend_data_files/cta_ridership_data_day_type_summary.json'
    print(f'Saving {s3_ridership_json_path}')
    # s3_ridership_json_path is already a str — the original wrapped it in a
    # redundant f-string. A failed upload raises botocore ClientError here.
    s3.Object(sga.BUCKET, s3_ridership_json_path).put(Body=ridership_json)

    # Check that the file was uploaded successfully
    keys(sga.BUCKET, [s3_ridership_json_path])


# https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3
def keys(bucket_name: str, filenames: list,
         prefix: str = '/', delimiter: str = '/',
         start_after: str = '') -> None:
    """Print which of *filenames* exist in the given S3 bucket.

    Paginates list_objects_v2 over the bucket (the default prefix '/' is
    stripped to '', i.e. no prefix filter) and prints "<key> exists" for each
    requested filename found. Filenames never seen in the listing are reported
    explicitly — the original version skipped them silently, so the upload
    "verification" in save_ridership_json could never surface a failure.
    """
    s3_paginator = client.get_paginator('list_objects_v2')
    prefix = prefix.lstrip(delimiter)
    start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after
    remaining = set(filenames)
    for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after):
        for content in page.get('Contents', ()):
            if content['Key'] in remaining:
                print(f"{content['Key']} exists")
                remaining.discard(content['Key'])
    # Make missing uploads visible in the Action log instead of passing silently.
    for missing in sorted(remaining):
        print(f"WARNING: {missing} not found in bucket {bucket_name}")