diff --git a/news_events/etl/mitpe_events.py b/news_events/etl/mitpe_events.py index 2fa8d89c9e..fe2f9afa75 100644 --- a/news_events/etl/mitpe_events.py +++ b/news_events/etl/mitpe_events.py @@ -8,7 +8,7 @@ from main.utils import now_in_utc from news_events.constants import ALL_AUDIENCES, FeedType -from news_events.etl.utils import fetch_data_by_page, parse_date +from news_events.etl.utils import fetch_data_by_page, parse_date_time_range log = logging.getLogger(__name__) MITPE_EVENTS_TITLE = "MIT Professional Education Events" @@ -66,16 +66,9 @@ def transform_item(item: dict) -> dict: """ - times = item.get("time_range", "").split("-") - start_dt = parse_date( - f"{item.get("start_date")} {times[0] if len(times) > 0 else ''}" + start_dt, end_dt = parse_date_time_range( + item.get("start_date"), item.get("end_date"), item.get("time_range") ) - if not start_dt: - # Time range may be invalid, try without it - start_dt = parse_date(f"{item.get("start_date")}") - end_dt = parse_date(f"{item.get("end_date")} {times[1] if len(times) > 1 else ''}") - if not end_dt: - end_dt = parse_date(f"{item.get("end_date")}") # Do not bother transforming past events now = now_in_utc() diff --git a/news_events/etl/mitpe_events_test.py b/news_events/etl/mitpe_events_test.py index 8759f2650c..48f4e75682 100644 --- a/news_events/etl/mitpe_events_test.py +++ b/news_events/etl/mitpe_events_test.py @@ -64,8 +64,8 @@ def test_transform(mitpe_events_json_data): 2024, 8, 15, 21, 0, 0, tzinfo=UTC ) assert items[3]["detail"]["event_datetime"] == datetime( - 2023, 5, 12, 4, 0, 0, tzinfo=UTC + 2023, 5, 12, 16, 0, 0, tzinfo=UTC ) assert items[3]["detail"]["event_end_datetime"] == datetime( - 2023, 5, 12, 4, 0, 0, tzinfo=UTC + 2023, 5, 12, 16, 0, 0, tzinfo=UTC ) diff --git a/news_events/etl/utils.py b/news_events/etl/utils.py index af7cccacbd..ba54fe9114 100644 --- a/news_events/etl/utils.py +++ b/news_events/etl/utils.py @@ -1,14 +1,18 @@ """Utility functions for news/events ETL pipelines""" import 
logging +import re +from dataclasses import dataclass from datetime import UTC, datetime from time import mktime, struct_time +from typing import Optional from zoneinfo import ZoneInfo import dateparser import requests from bs4 import BeautifulSoup as Soup from bs4 import Tag +from dateparser import timezone_parser from django.conf import settings from main.constants import ISOFORMAT @@ -16,6 +20,14 @@ log = logging.getLogger(__name__) +@dataclass +class FormattedTime: + hour: Optional[str] + minute: Optional[str] + ampm: Optional[str] + tz: Optional[str] + + def get_soup(url: str) -> Soup: """ Get a BeautifulSoup object from a URL. @@ -131,3 +143,132 @@ def parse_date(text_date: str) -> datetime: except: # noqa: E722 logging.exception("unparsable date received - ignoring '%s'", text_date) return dt_utc + + +def convert_to_utc(dt: datetime, known_tz: str) -> datetime: + """ + Convert a datetime object to UTC timezone. If its + original timezone is not known, assume it is in US/Eastern. + + Args: + dt (datetime): The datetime object to convert + known_tz (str): The timezone string if known + + Returns: + datetime: The datetime object in UTC timezone, or None if dt is empty + """ + if not dt: + return None + if not known_tz: + # Assume it is in US/Eastern where MIT is + dt = dt.replace(tzinfo=ZoneInfo("US/Eastern")) + return dt.astimezone(UTC) + + +def format_time(matched_time: re.Match) -> FormattedTime: + """ + Format a time regex match group into a standard format + + Args: + matched_time (re.Match): The regex match for a single time + + Returns: + FormattedTime: A formatted time object + """ + # Regex for AM/PM and timezone + ampm_tz_regex = re.compile(r"(am|pm)\s*([A-Za-z]{2,3})?", re.IGNORECASE) + ampm, tz = "", "" + hour = matched_time.group(1) or "" + minute = matched_time.group(2) or (":00" if hour else "") + ampm_and_tz_match = re.search(ampm_tz_regex, matched_time.group(3) or "") + if ampm_and_tz_match: + ampm = ampm_and_tz_match.group(1) or "" + tz = ampm_and_tz_match.group(2) or "" + return 
FormattedTime( + hour, minute, ampm, (tz if timezone_parser.word_is_tz(tz.upper()) else "") + ) + + +def parse_date_time_range( + start_date_str: str, end_date_str: str, time_range_str: str +) -> tuple[datetime, datetime]: + """ + Attempt to parse the time range from the MITPE events API. + If the time cannot be parsed, default to noon Eastern time, + then convert to UTC. + The field might not always contain a valid time/range. + + Args: + start_date_str (str): start date string + end_date_str (str): end date string + time_range_str (str): time range string + + Returns: + tuple(datetime, datetime): start and end datetimes in UTC timezone + + """ + # If one date is missing, set it to the other + end_date_str = end_date_str or start_date_str + start_date_str = start_date_str or end_date_str + + default_time = FormattedTime("12", ":00", "PM", "") + default_time_str = "12:00 PM" + # Set start/end times to noon as default + start_time, end_time = (default_time, default_time) + # Try to split the string into start and end times + split_times = list( + re.finditer( + re.compile(r"(\d{1,2})(:\d{2})?(\D*)", re.IGNORECASE), time_range_str or "" + ) + ) + if split_times: + # At least one time match was found + formatted_times = [format_time(time_match) for time_match in split_times] + # make ruff happy + TWO = 2 + TWELVE = 12 + if len(formatted_times) == TWO: + # Both start and end times were found + start_time, end_time = formatted_times + if start_time.hour and end_time.hour: + # Times must at least have an hour to be valid + if int(start_time.hour) > int(end_time.hour): + # Example: 8 - 1 PM; 8 AM - 1 + start_time.ampm = start_time.ampm or "AM" + end_time.ampm = end_time.ampm or "PM" + elif int(end_time.hour) == TWELVE and int(start_time.hour) < TWELVE: + # Example: 10 - 12 PM + start_time.ampm = start_time.ampm or "AM" + end_time.ampm = end_time.ampm or "PM" + else: + # Anything else, if AM/PM missing for one, set it to the other, + # or "" if both are missing + 
start_time.ampm = start_time.ampm or end_time.ampm or "" + end_time.ampm = end_time.ampm or start_time.ampm or "" + # If timezone missing for one, set it to the other, + # or "" if both are missing + start_time.tz = start_time.tz or end_time.tz or "" + end_time.tz = end_time.tz or start_time.tz or "" + elif len(formatted_times) == 1: + # Only one time was found, set both start and end to that time + start_time = formatted_times[0] + end_time = start_time + + # Ignore time range and use default time range if dates aren't parsable with it + start_date = dateparser.parse( + f"{start_date_str} {start_time.hour}{start_time.minute} " + f"{start_time.ampm} {start_time.tz}" + ) or dateparser.parse(f"{start_date_str} {default_time_str}") + end_date = dateparser.parse( + f"{end_date_str} {end_time.hour}{end_time.minute} " + f"{end_time.ampm} {end_time.tz}" + ) or dateparser.parse(f"{end_date_str} {default_time_str}") + + if end_date and start_date and end_date < start_date: + # This is nonsensical, so just set the end date to the start date + end_date = start_date + if not start_date: + log.error("Failed to parse start date %s", start_date_str) + return convert_to_utc(start_date, start_time.tz), convert_to_utc( + end_date, end_time.tz + ) diff --git a/news_events/etl/utils_test.py b/news_events/etl/utils_test.py index 856ae147d7..f7caf1de8e 100644 --- a/news_events/etl/utils_test.py +++ b/news_events/etl/utils_test.py @@ -1,5 +1,6 @@ """Tests for utils functions""" +from datetime import UTC, datetime from pathlib import Path from time import struct_time from urllib.error import HTTPError @@ -82,3 +83,136 @@ def test_get_request_json_error_raise(mocker): ) with pytest.raises(HTTPError): utils.get_request_json("https://test.mit.edu", raise_on_error=True) + + +@pytest.mark.parametrize( + ("start_date_str", "end_date_str", "time_range_str", "start_dt", "end_dt"), + [ + ( + "2024-01-15", + "2024-01-15", + "9-10 AM", + datetime(2024, 1, 15, 14, 0, 0, tzinfo=UTC), + datetime(2024, 
1, 15, 15, 0, 0, tzinfo=UTC), + ), + ( + "2024-01-15", + None, + "9-10 AM", + datetime(2024, 1, 15, 14, 0, 0, tzinfo=UTC), + datetime(2024, 1, 15, 15, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-16", + "9 - 12 PM", + datetime(2024, 7, 15, 13, 0, 0, tzinfo=UTC), + datetime(2024, 7, 16, 16, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "3:30 PM and ends at 5:45 PM", + datetime(2024, 7, 15, 19, 30, 0, tzinfo=UTC), + datetime(2024, 7, 15, 21, 45, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "3:30 PM - 5:30 PM pdt", # Should figure out this is Pacific Daylight Time + datetime(2024, 7, 15, 22, 30, 0, tzinfo=UTC), + datetime(2024, 7, 16, 0, 30, 0, tzinfo=UTC), + ), + ( + "Future date tbd", + None, + None, + None, + None, + ), + ( + "2024-07-15", + "2024-07-30", + "Every afternoon after end of class", + datetime(2024, 7, 15, 16, 0, 0, tzinfo=UTC), + datetime(2024, 7, 30, 16, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "1pm", + datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC), + datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "8 to 1pm", # Should correctly guess that 8 is AM + datetime(2024, 7, 15, 12, 0, 0, tzinfo=UTC), + datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "8 AM to 1", # Should correctly guess that 1 is PM + datetime(2024, 7, 15, 12, 0, 0, tzinfo=UTC), + datetime(2024, 7, 15, 17, 0, 0, tzinfo=UTC), + ), + ( + "2024-12-15", + "2024-12-15", + "11 to 12 pm", # Should correctly guess that 11 is AM + datetime(2024, 12, 15, 16, 0, 0, tzinfo=UTC), + datetime(2024, 12, 15, 17, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "Beginning at 4:30 and ending at about 6pm", + datetime(2024, 7, 15, 20, 30, 0, tzinfo=UTC), + datetime(2024, 7, 15, 22, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "3:00pm; weather permitting", + datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC), + datetime(2024, 7, 15, 19, 0, 0, 
tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "3:00pm; doors open at 2:30pm", # Ignore any end time before the start time + datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC), + datetime(2024, 7, 15, 19, 0, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-15", + "Beginning at 4:30", # No AM/PM, so take it literally as is + datetime(2024, 7, 15, 8, 30, 0, tzinfo=UTC), + datetime(2024, 7, 15, 8, 30, 0, tzinfo=UTC), + ), + ( + "2024-07-15", + "2024-07-30", + "Beginning at 16:30", # No AM/PM, so take it literally as is + datetime(2024, 7, 15, 20, 30, 0, tzinfo=UTC), + datetime(2024, 7, 30, 20, 30, 0, tzinfo=UTC), + ), + ( + None, + "2024-11-30", + "Bldg. 123, E52nd Street, Salon MIT", # Invalid time, default to noon Eastern time, convert to UTC + datetime(2024, 11, 30, 17, 0, 0, tzinfo=UTC), + datetime(2024, 11, 30, 17, 0, 0, tzinfo=UTC), + ), + ], +) +def test_parse_date_time_range( + start_date_str, end_date_str, time_range_str, start_dt, end_dt +): + """parse_date_time_range should return the expected start and end datetimes""" + assert utils.parse_date_time_range( + start_date_str, end_date_str, time_range_str + ) == (start_dt, end_dt)