MT: fix import error due to duplicate events #5123

Merged: 3 commits, Dec 9, 2024

Changes from all commits
60 changes: 45 additions & 15 deletions scrapers/mt/events.py
@@ -1,3 +1,5 @@
+from typing import Union
+
 from openstates.scrape import Scraper, Event
 from utils.events import match_coordinates
 import datetime
@@ -10,17 +12,21 @@
 
 class MTEventScraper(Scraper):
     _tz = pytz.timezone("America/Denver")
+    # the same MT event can be listed more than once at the source URLs
+    # where each listing is an alternate media stream (video vs. audio)
+    # so we need to do some data combining before yielding
+    _events = []
 
     def scrape(self):
 
-        yield from self.scrape_upcoming()
+        self.scrape_upcoming()
 
         # scrape events from this month, and last month
         today = datetime.date.today()
-        yield from self.scrape_cal_month(today)
-        yield from self.scrape_cal_month(
-            today + dateutil.relativedelta.relativedelta(months=-1)
-        )
+        self.scrape_cal_month(today)
+        self.scrape_cal_month(today + dateutil.relativedelta.relativedelta(months=-1))
+        for event in self._events:
+            yield event
 
     def scrape_upcoming(self):
         url = "https://sg001-harmony.sliq.net/00309/Harmony/en/View/UpcomingEvents"
@@ -30,7 +36,7 @@ def scrape_upcoming(self):
         page.make_links_absolute(url)
 
         for link in page.xpath("//div[@class='divEvent']/a[1]"):
-            yield from self.scrape_event(link.xpath("@href")[0])
+            self.scrape_event(link.xpath("@href")[0])
 
     def scrape_cal_month(self, when: datetime.datetime.date):
         date_str = when.strftime("%Y%m01")
@@ -43,7 +49,7 @@ def scrape_cal_month(self, when: datetime.datetime.date):
             if when.date() < datetime.datetime.today().date():
                 event_id = str(row["Id"])
                 event_url = f"https://sg001-harmony.sliq.net/00309/Harmony/en/PowerBrowser/PowerBrowserV2/1/-1/{event_id}"
-                yield from self.scrape_event(event_url)
+                self.scrape_event(event_url)
 
     def scrape_event(self, url: str):
         html = self.get(url).text
@@ -62,17 +68,25 @@ def scrape_event(self, url: str):
         when = dateutil.parser.parse(f"{when_date} {when_time}")
         when = self._tz.localize(when)
 
-        event = Event(
-            name=title,
-            location_name=location,
-            start_date=when,
-            classification="committee-meeting",
-        )
+        # Check if event already exists in the self._events list
+        # and if so, add data to that instead of creating duplicate
+        existing_event = self.check_for_existing_event(title, location, when)
+        if existing_event is None:
+            # No existing event found, create one
+            event = Event(
+                name=title,
+                location_name=location,
+                start_date=when,
+                classification="committee-meeting",
+            )
+        else:
+            event = existing_event
 
         self.scrape_versions(event, html)
         self.scrape_media(event, html)
 
-        event.add_source(url)
+        if existing_event is None:
+            event.add_source(url)
 
         if "HB" not in title.lower() and "SB" not in title.lower():
             event.add_committee(title)
@@ -84,7 +98,22 @@
             },
         )
 
-        yield event
+        # Make sure we add any new event to the list
+        if existing_event is None:
+            self._events.append(event)
+
+    def check_for_existing_event(
+        self, title: str, location_name: str, start_date: datetime.datetime.date
+    ) -> Union[Event, None]:
+        for event in self._events:
+            if (
+                event.name == title
+                and event.location["name"] == location_name
+                and event.start_date == start_date
+            ):
+                return event
+
+        return None
 
     # versions and media are in the 'dataModel' js variable on the page
     def scrape_versions(self, event: Event, html: str):
@@ -107,4 +136,5 @@ def scrape_media(self, event: Event, html: str):
                 m["textTags"]["DESCRIPTION"]["text"],
                 m["textTags"]["URL"]["text"],
                 media_type="application/vnd",
+                on_duplicate="ignore",  # we are combining links from duplicate "event" listings into one
            )
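
Note on the approach (a sketch, not part of the diff): the source site can list the same hearing twice, once per media stream, and yielding both listings used to trip the importer on duplicate events. The change above collects events in self._events, merges listings that share a name, location, and start time, and only yields once scrape() has finished both passes. Below is a minimal standalone sketch of that merge pattern, using a hypothetical EventRecord stand-in rather than the real openstates.scrape.Event class.

# Illustrative only: EventRecord is a hypothetical stand-in for openstates.scrape.Event,
# and the merge key mirrors check_for_existing_event (name + location + start time).
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional, Set


@dataclass
class EventRecord:
    name: str
    location_name: str
    start_date: datetime
    media_urls: Set[str] = field(default_factory=set)
    sources: Set[str] = field(default_factory=set)


class EventCollector:
    def __init__(self) -> None:
        self._events: List[EventRecord] = []

    def find_existing(
        self, name: str, location_name: str, start_date: datetime
    ) -> Optional[EventRecord]:
        # same matching rule as check_for_existing_event in the diff above
        for event in self._events:
            if (
                event.name == name
                and event.location_name == location_name
                and event.start_date == start_date
            ):
                return event
        return None

    def add_listing(
        self, name: str, location_name: str, start_date: datetime, media_url: str, source: str
    ) -> EventRecord:
        event = self.find_existing(name, location_name, start_date)
        if event is None:
            # first listing for this hearing: create the record and keep its source
            event = EventRecord(name, location_name, start_date)
            event.sources.add(source)
            self._events.append(event)
        # a duplicate listing only contributes an extra media link
        event.media_urls.add(media_url)
        return event


collector = EventCollector()
start = datetime(2024, 12, 9, 8, 0)
collector.add_listing("Senate Judiciary", "Room 303", start, "https://example.com/video", "https://example.com/1")
collector.add_listing("Senate Judiciary", "Room 303", start, "https://example.com/audio", "https://example.com/2")
assert len(collector._events) == 1
assert collector._events[0].media_urls == {"https://example.com/video", "https://example.com/audio"}

The matching rule is deliberately strict: listings merge only when name, location, and start time all agree, so two committees meeting in different rooms at the same time remain separate events.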