diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py index eb48705..7ff0652 100644 --- a/data_analysis/compare_scheduled_and_rt.py +++ b/data_analysis/compare_scheduled_and_rt.py @@ -11,14 +11,18 @@ from tqdm import tqdm from dotenv import load_dotenv -import static_gtfs_analysis - +import data_analysis.static_gtfs_analysis as static_gtfs_analysis +from scrape_data.scrape_schedule_versions import create_schedule_list load_dotenv() BUCKET_PUBLIC = os.getenv('BUCKET_PUBLIC', 'chn-ghost-buses-public') logger = logging.getLogger() -logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%m/%d/%Y %I:%M:%S %p' +) BASE_PATH = S3Path(f"/{BUCKET_PUBLIC}") @@ -96,7 +100,7 @@ def sum_trips_by_rt_by_freq( rt_df: pd.DataFrame, sched_df: pd.DataFrame, agg_info: AggInfo, - holidays: List[str] = ["2022-05-30", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"]) -> Tuple[pd.DataFrame, pd.DataFrame]: + holidays: List[str] = ["2022-05-30", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"]) -> Tuple[pd.DataFrame, pd.DataFrame]: """Calculate ratio of trips to scheduled trips per route per specified frequency. 
@@ -172,7 +176,7 @@ def combine_real_time_rt_comparison( schedule_data_list: List[dict], agg_info: AggInfo, holidays: List[str] = ["2022-05-31", "2022-07-04", "2022-09-05", "2022-11-24", "2022-12-25"], - save: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]: + save: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]: """Generate a combined DataFrame with the realtime route comparisons Args: @@ -222,7 +226,7 @@ def combine_real_time_rt_comparison( for day in date_pbar: date_str = day.to_date_string() pbar.set_description( - f"Processing {date_str} at" + f" Processing {date_str} at " f"{pendulum.now().to_datetime_string()}" ) @@ -266,7 +270,7 @@ def combine_real_time_rt_comparison( outpath, index=False, ) - logger.info(f"Processing {feed['schedule_version']}") + logger.info(f" Processing version {feed['schedule_version']}") combined_grouped = pd.concat([combined_grouped, compare_by_day_type]) combined_long = pd.concat([combined_long, compare_freq_by_rte]) @@ -275,7 +279,7 @@ def combine_real_time_rt_comparison( def build_summary( combined_df: pd.DataFrame, - save: bool = True) -> pd.DataFrame: + save: bool = True) -> pd.DataFrame: """Create a summary by route and day type Args: @@ -317,58 +321,12 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: Args: freq (str): Frequency of aggregation. Defaults to Daily. Returns: - pd.DataFrame: A DataFrame of every day in the specified data with scheduled and - observed count of trips. + pd.DataFrame: A DataFrame of every day in the specified data with + scheduled and observed count of trips. pd.DataFrame: A DataFrame summary across versioned schedule comparisons. 
""" - - schedule_feeds = [{'schedule_version': '20220507', - 'feed_start_date': '2022-05-20', - 'feed_end_date': '2022-06-02'}, - {'schedule_version': '20220603', - 'feed_start_date': '2022-06-04', - 'feed_end_date': '2022-06-07'}, - {'schedule_version': '20220608', - 'feed_start_date': '2022-06-09', - 'feed_end_date': '2022-07-08'}, - {'schedule_version': '20220709', - 'feed_start_date': '2022-07-10', - 'feed_end_date': '2022-07-17'}, - {'schedule_version': '20220718', - 'feed_start_date': '2022-07-19', - 'feed_end_date': '2022-07-29'}, - {'schedule_version': '20220730', - 'feed_start_date': '2022-07-31', - 'feed_end_date': '2022-08-10'}, - {'schedule_version': '20220811', - 'feed_start_date': '2022-08-12', - 'feed_end_date': '2022-08-12'}, - {'schedule_version': '20220813', - 'feed_start_date': '2022-08-14', - 'feed_end_date': '2022-08-16'}, - {'schedule_version': '20220817', - 'feed_start_date': '2022-08-18', - 'feed_end_date': '2022-09-07'}, - {'schedule_version': '20220908', - 'feed_start_date': '2022-09-09', - 'feed_end_date': '2022-09-17'}, - {'schedule_version': '20220918', - 'feed_start_date': '2022-09-19', - 'feed_end_date': '2022-09-28'}, - {'schedule_version': '20220929', - 'feed_start_date': '2022-09-30', - 'feed_end_date': '2022-10-06'}, - {'schedule_version': '20221007', - 'feed_start_date': '2022-10-08', - 'feed_end_date': '2022-10-11'}, - {'schedule_version': '20221012', - 'feed_start_date': '2022-10-13', - 'feed_end_date': '2022-10-19'}, - {'schedule_version': '20221020', - 'feed_start_date': '2022-10-21', - 'feed_end_date': '2022-10-21'} - ] + schedule_feeds = create_schedule_list(month=5, year=2022) schedule_data_list = [] pbar = tqdm(schedule_feeds) @@ -378,19 +336,19 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: f"Generating daily schedule data for " f"schedule version {schedule_version}" ) - logging.info( + logger.info( f"\nDownloading zip file for schedule version " f"{schedule_version}" ) CTA_GTFS = 
static_gtfs_analysis.download_zip(schedule_version) - logging.info("\nExtracting data") + logger.info("\nExtracting data") data = static_gtfs_analysis.GTFSFeed.extract_data( CTA_GTFS, version_id=schedule_version ) data = static_gtfs_analysis.format_dates_hours(data) - logging.info("\nSummarizing trip data") + logger.info("\nSummarizing trip data") trip_summary = static_gtfs_analysis.make_trip_summary(data, pendulum.from_format(feed['feed_start_date'], 'YYYY-MM-DD'), pendulum.from_format(feed['feed_end_date'], 'YYYY-MM-DD')) @@ -404,7 +362,6 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: {"schedule_version": schedule_version, "data": route_daily_summary} ) - agg_info = AggInfo(freq=freq) combined_long, combined_grouped = combine_real_time_rt_comparison( schedule_feeds, diff --git a/scrape_data/requirements.txt b/scrape_data/requirements.txt index 54d45b6..9e57baa 100644 --- a/scrape_data/requirements.txt +++ b/scrape_data/requirements.txt @@ -1,2 +1,4 @@ pendulum==2.1.2 -requests==2.26.0 \ No newline at end of file +requests==2.26.0 +beautifulsoup4==4.11.1 +lxml==4.9.1 \ No newline at end of file
diff --git a/scrape_data/scrape_schedule_versions.py b/scrape_data/scrape_schedule_versions.py
new file mode 100644
index 0000000..5747edc
--- /dev/null
+++ b/scrape_data/scrape_schedule_versions.py
@@ -0,0 +1,251 @@
+"""Scrape CTA schedule versions from transitfeeds.com.
+
+Builds, for each schedule version, the date range over which that version is
+compared against real-time bus data.
+"""
+import calendar
+import logging
+from typing import List, Tuple
+
+from bs4 import BeautifulSoup
+import pandas as pd
+import pendulum
+import requests
+
+# NOTE(review): this configures the root logger at import time. Because
+# logging.basicConfig does nothing once the root logger has handlers, a
+# basicConfig call made later by an importing script is silently ignored.
+# Left unchanged so existing import-time behavior is preserved.
+logger = logging.getLogger()
+logging.basicConfig(level=logging.INFO)
+logger.setLevel(logging.INFO)
+
+BASE_URL = "https://transitfeeds.com"
+TIMEZONE = "America/Chicago"
+DATE_FORMAT = "YYYY-MM-DD"
+REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely without one
+
+# Schedule version 20220507 was in effect when data collection started on
+# 2022-05-20. It is temporarily re-dated to the day before collection began so
+# that its computed start date (version date + 1 day) is 2022-05-20.
+FIRST_VERSION = pendulum.date(2022, 5, 7)
+DAY_BEFORE_COLLECTION = pendulum.date(2022, 5, 19)
+
+
+def check_latest_rt_data_date() -> str:
+    """Fetch the latest available date of real-time bus data.
+
+    From 11:00 Chicago time onward the latest date is yesterday; earlier in
+    the day it is two days ago.
+
+    Returns:
+        str: A string of the latest date in YYYY-MM-DD format.
+    """
+    now = pendulum.now(TIMEZONE)
+    days_back = 1 if now.hour >= 11 else 2
+    return now.subtract(days=days_back).date().format(DATE_FORMAT)
+
+
+def fetch_schedule_versions(month: int, year: int) -> List[pendulum.date]:
+    """Get the schedule versions from transitfeeds.com from the most recent
+    to specified month and year (inclusive). In case there are
+    multiple schedules for a given month and year pair,
+    all schedules will be fetched, including ones listed on a later page.
+
+    Args:
+        month (int): The month of interest
+        year (int): The year of interest
+
+    Returns:
+        List[pendulum.date]: A list of unique schedule versions, sorted from
+            oldest to newest.
+
+    Raises:
+        requests.HTTPError: If transitfeeds.com returns an error status.
+        ValueError: If the listing ends before a schedule for the requested
+            month and year is found.
+    """
+    date_strings = []
+    page = 1
+    found = False
+    finished = False
+    while not finished:
+        logger.info(f" Searching page {page}")
+        url = f"{BASE_URL}/p/chicago-transit-authority/165?p={page}"
+        response = requests.get(url, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "lxml")
+        tables = soup.find_all("table")
+        has_body = bool(tables) and tables[0].tbody is not None
+        rows = tables[0].tbody.find_all("tr") if has_body else []
+        if not rows:
+            # Ran past the last page of the listing. Without this check the
+            # loop would crash on the missing table or never terminate.
+            if found:
+                break
+            raise ValueError(
+                f"No schedule found for {calendar.month_name[month]} {year}"
+            )
+        for row in rows:
+            # The first column of each row holds the version date.
+            date_text = row.find_all("td")[0].text.strip()
+            date = pendulum.parse(date_text, strict=False)
+            # Find schedules up to and including the specified date.
+            if date.month == month and date.year == year:
+                logger.info(
+                    f" Found schedule for"
+                    f" {calendar.month_name[date.month]} {date.year}"
+                )
+                logger.info(
+                    f" Adding schedule for {calendar.month_name[date.month]}"
+                    f" {date.day}, {date.year}"
+                )
+                date_strings.append(date_text)
+                found = True
+                continue
+            if found:
+                # First non-matching row after a match: nothing more to
+                # collect. Only now is it safe to stop paging, since rows for
+                # the requested month may continue onto the next page.
+                finished = True
+                break
+            date_strings.append(date_text)
+        page += 1
+
+    versions = pd.Series(date_strings)
+    # Check for duplicates. The presence of duplicates could mean
+    # that the schedule was not in-effect.
+    # See https://github.com/chihacknight/chn-ghost-buses/issues/30
+    duplicates = versions[versions.duplicated()].values
+    if len(duplicates) > 0:
+        logger.info(
+            f" The duplicate schedule versions are"
+            f" {set(duplicates)}. Check whether these were in-effect."
+        )
+    # Keep the first occurrence of duplicates.
+    # The listing is returned from Transitfeeds from newest to oldest
+    # [...'14 September 2021', '7 September 2021', '1 September 2021',
+    # '1 September 2021', '1 September 2021', '1 September 2021',
+    # '1 September 2021', '2 August 2021', '15 June 2021',..]
+    # By keeping the first entry, the entry that appears
+    # first on TransitFeeds site will be kept, which is the version that
+    # was left on the CTA website the longest.
+    unique_versions = versions.drop_duplicates()
+    return sorted(pendulum.parse(d, strict=False).date() for d in unique_versions)
+
+
+def modify_data_collection_start(date_list: List[pendulum.date]) -> List[pendulum.date]:
+    """Re-date schedule version 2022-05-07 to reflect the start of
+    data collection on May 20, 2022
+
+    Args:
+        date_list (List[pendulum.date]): A list of dates in pendulum format
+
+    Returns:
+        List[pendulum.date]: A new list of dates in pendulum format where the
+            start date for schedule version 2022-05-07
+            is 2022-05-19. This will ensure that the date
+            ranges are valid i.e. starting with 2022-05-20 up to the day
+            before the next schedule version. The input list is not modified.
+    """
+    # Version 20220507 is moved to May 19th 2022, one day before the start of
+    # data collection, so that its start date falls on 2022-05-20 in
+    # calculate_version_date_ranges.
+    return [
+        DAY_BEFORE_COLLECTION if date == FIRST_VERSION else date
+        for date in date_list
+    ]
+
+
+def calculate_version_date_ranges(
+    month: int, year: int, start2022: bool = True
+) -> Tuple[List[pendulum.date], List[Tuple[pendulum.date, pendulum.date]]]:
+    """Get the start and end dates for each schedule version from the most
+    recent version to the version specified by the month and year
+
+    Args:
+        month (int): month of interest
+        year (int): year of interest
+        start2022 (bool, optional): Whether to modify the
+            start date of version 20220507 to reflect the start of
+            real-time bus data collection. Defaults to True.
+
+    Returns:
+        Tuple[List[pendulum.date], List[Tuple[pendulum.date, pendulum.date]]]:
+            A list of schedule versions and list of tuples for the
+            start and end dates corresponding to those versions.
+    """
+    schedule_list = fetch_schedule_versions(month=month, year=year)
+    if start2022:
+        schedule_list = modify_data_collection_start(schedule_list)
+
+    # A version's range starts the day after its own date and ends the day
+    # before the next version's date.
+    start_end_list = [
+        (version.add(days=1), next_version.subtract(days=1))
+        for version, next_version in zip(schedule_list, schedule_list[1:])
+    ]
+
+    # Handle the current schedule version by setting the end date as the latest
+    # available date for data. The helper returns a string, so it is parsed to
+    # keep every tuple the same (date, date) type.
+    latest_rt_date = pendulum.from_format(
+        check_latest_rt_data_date(), DATE_FORMAT
+    ).date()
+    start_end_list.append((schedule_list[-1].add(days=1), latest_rt_date))
+    return schedule_list, start_end_list
+
+
+def create_schedule_list_dict(
+    schedule_list: List[pendulum.date],
+    start_end_list: List[Tuple[pendulum.date, pendulum.date]],
+) -> List[dict]:
+    """Create a list of dictionaries with keys for the schedule_version,
+    feed_start_date, and feed_end_date
+
+    Args:
+        schedule_list (List[pendulum.date]): A list of schedule versions from
+            transitfeeds.com
+        start_end_list (List[Tuple[pendulum.date, pendulum.date]]): A list of
+            start and end dates for each version
+
+    Returns:
+        List[dict]: A list of dictionaries with the start and end dates
+            corresponding to each schedule version.
+    """
+    schedule_list_dict = []
+    for version, (start_date, end_date) in zip(schedule_list, start_end_list):
+        # Changing back the starting version to 20220507
+        if version == DAY_BEFORE_COLLECTION:
+            version = FIRST_VERSION
+        schedule_list_dict.append(
+            {
+                "schedule_version": version.format("YYYYMMDD"),
+                "feed_start_date": start_date.format(DATE_FORMAT),
+                "feed_end_date": end_date.format(DATE_FORMAT),
+            }
+        )
+    return schedule_list_dict
+
+
+def create_schedule_list(month: int, year: int, start2022: bool = True) -> List[dict]:
+    """Return a list of dictionaries with start and end dates
+    for each schedule version.
+
+    Args:
+        month (int): month of interest
+        year (int): year of interest
+        start2022 (bool, optional): Whether to modify the
+            start date of version 20220507 to reflect the start of
+            real-time bus data collection. Defaults to True.
+
+    Returns:
+        List[dict]: A list of dictionaries with the start and end dates
+            corresponding to each schedule version.
+    """
+    schedule_list, start_end_list = calculate_version_date_ranges(
+        month=month, year=year, start2022=start2022
+    )
+    return create_schedule_list_dict(
+        schedule_list=schedule_list, start_end_list=start_end_list
+    )