From 29f8b64d9b8cd0aa2c2de573c6b37abec522b2b9 Mon Sep 17 00:00:00 2001
From: Jesse Mortenson
Date: Wed, 11 Dec 2024 20:33:31 -0600
Subject: [PATCH 1/2] MT: enable 2025 session prefiles, implement new API
 endpoints

---
 scrapers/mt/__init__.py |  22 +++-
 scrapers/mt/bills.py    | 233 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 239 insertions(+), 16 deletions(-)

diff --git a/scrapers/mt/__init__.py b/scrapers/mt/__init__.py
index a4c0c23987..a7ca992304 100644
--- a/scrapers/mt/__init__.py
+++ b/scrapers/mt/__init__.py
@@ -60,8 +60,17 @@ class Montana(State):
             # TODO: update dates
             "start_date": "2023-01-04",
             "end_date": "2023-04-25",
+            "active": False,
+            "extras": {"legislatureOrdinal": 68, "newAPIIdentifier": None},
+        },
+        {
+            "_scraped_name": "20251",
+            "identifier": "2025",
+            "name": "2025 Regular Session",
+            "start_date": "2025-01-06",
+            "end_date": "2025-05-05",
             "active": True,
-            "extras": {"legislatureOrdinal": 68},
+            "extras": {"legislatureOrdinal": 69, "newAPIIdentifier": 2},
         },
     ]
     ignored_scraped_sessions = [
@@ -77,9 +86,20 @@ class Montana(State):
     ]
 
     def get_session_list(self):
+        # archive of sessions
        url = "https://api.legmt.gov/archive/v1/sessions"
        sessions = []
        page = requests.get(url).json()
        for row in page:
            sessions.append(str(row["sessionId"]))
+
+        # the incoming session can be found via another endpoint
+        legislators_sessions_url = "https://api.legmt.gov/legislators/v1/sessions"
+        page = requests.get(legislators_sessions_url).json()
+        for row in page:
+            # skip if this session was already found above
+            if row["ordinals"] in sessions:
+                continue
+            sessions.append(row["ordinals"])
+
        return sessions
diff --git a/scrapers/mt/bills.py b/scrapers/mt/bills.py
index 3bf453e822..cc01ce619f 100644
--- a/scrapers/mt/bills.py
+++ b/scrapers/mt/bills.py
@@ -26,13 +26,216 @@ def scrape(self, session=None):
 
         for i in self.jurisdiction.legislative_sessions:
             if i["identifier"] == session:
-                self.session_ord = i["extras"]["legislatureOrdinal"]
+                self.session_ord = i.get("extras", {}).get("legislatureOrdinal", None)
                 self.mt_session_id = i["_scraped_name"]
                 self.session_year = i["start_date"][0:4]
+                self.new_api_session_identifier = i.get("extras", {}).get(
+                    "newAPIIdentifier", None
+                )
 
-        yield from self.scrape_list_page(session, 0)
+        # MT appears to have two sets of API endpoints: archive and non-archive
+        # if the session has no newAPIIdentifier, fall back to the archive endpoints
+        if self.new_api_session_identifier is None:
+            yield from self.scrape_archive_list_page(session, 0)
+        else:
+            # Get prerequisite data
+            self.scrape_non_standing_committees()
+            self.scrape_requesting_agencies()
+            self.scrape_legislators()
+
+            # scrape bills (TODO: votes)
+            yield from self.scrape_list_page(session, 0)
+
+    def scrape_legislators(self):
+        self.legislators = []
+        url = "https://api.legmt.gov/legislators/v1/legislators"
+        response = requests.get(url).json()
+
+        for legislator in response:
+            self.legislators.append(
+                {
+                    "id": legislator["id"],
+                    "first_name": legislator["firstName"],
+                    "last_name": legislator["lastName"],
+                    "middle_name": legislator["middleName"],
+                    "start_date": legislator["startDate"],
+                    "end_date": legislator["endDate"],
+                    "chamber": "upper"
+                    if legislator["chamber"] == "SENATE"
+                    else "lower",
+                    "party": legislator["politicalParty"]["name"],
+                    "district": legislator["district"]["number"],
+                    "email": legislator["emailAddress"],
+                    "legislative_position": legislator["position"]
+                    if legislator["position"]
+                    else None,
+                }
+            )
+
+    def scrape_requesting_agencies(self):
+        self.requesting_agencies = []
+        url = "https://api.legmt.gov/legislators/v1/organizations"
+        response = requests.get(url).json()
+
+        for agency in response:
+            self.requesting_agencies.append(
+                {
+                    "id": agency["id"],
+                    "name": agency["name"],
+                    "type": agency["type"],
+                }
+            )
+
+    def scrape_non_standing_committees(self):
+        self.non_standing_committees = []
+        url = "https://api.legmt.gov/committees/v1/nonStandingCommittees/search"
+        params = {"limit": 500, "offset": 0}
+        json_data = {"legislatureIds": [self.new_api_session_identifier]}
+        response = requests.post(url, params=params, json=json_data).json()
+
+        for committee in response["content"]:
+            cmte_code = committee["committeeDetails"]["committeeCode"]
+            self.non_standing_committees.append(
+                {
+                    "id": committee["id"],
+                    "committee_code_id": cmte_code["id"],
+                    "name": cmte_code["name"],
+                    "code": cmte_code["code"],
+                    "type": cmte_code["committeeType"]["description"],
+                }
+            )
 
     def scrape_list_page(self, session, page_num: int):
+        self.info(f"Scraping page {str(page_num)}")
+        params = {
+            "limit": str(self.results_per_page),
+            "offset": str(page_num),
+            "includeCounts": "true",  # TODO do we need the "counts" part of response?
+            "sort": ["billType.code,desc", "billNumber,asc", "draft.draftNumber,asc"],
+        }
+
+        json_data = {
+            "sessionIds": [self.new_api_session_identifier],
+        }
+
+        response = requests.post(
+            "https://api.legmt.gov/bills/v1/bills/search", params=params, json=json_data
+        ).json()
+
+        for row in response["content"]:
+            is_draft = False
+            if row["billNumber"]:
+                bill_id = f"{row['billType']['code']} {row['billNumber']}"
+            else:
+                bill_id = row["draft"]["draftNumber"]
+                is_draft = True
+
+            chamber = self.bill_chambers[bill_id[0]]
+            title = row["draft"]["shortTitle"]
+            bill = Bill(
+                bill_id,
+                legislative_session=session,
+                chamber=chamber,
+                title=title,
+                classification=self.bill_types[bill_id[1]],
+            )
+
+            bills_base_url = "https://bills.legmt.gov/#"
+            if is_draft:
+                source_url = f"{bills_base_url}/lc/bill/{self.new_api_session_identifier}/{row['draft']['draftNumber']}"
+            else:
+                source_url = (
+                    f"{bills_base_url}/laws/bill/{self.new_api_session_identifier}/{row['draft']['draftNumber']}"
+                    f"?open_tab=sum"
+                )
+            bill.add_source(source_url)
+
+            if not is_draft:
+                # attempt to add a bill relation to the LC/draft version of this bill
+                bill.add_related_bill(row["draft"]["draftNumber"], session, "replaces")
+
+            # TODO: votes, which used to be processed in actions
+            self.scrape_actions(bill, row)
+            self.scrape_extras(bill, row)
+            self.scrape_subjects(bill, row)
+
+            if not is_draft:
+                self.scrape_versions(bill, row["billType"]["code"], row["billNumber"])
+                if row["draft"]["fiscalNote"]:
+                    self.scrape_fiscal_note(bill, row["billType"]["code"], row["billNumber"])
+
+            if row["sponsorId"]:
+                for legislator in self.legislators:
+                    if row["sponsorId"] == legislator["id"]:
+                        sponsor_name = f"{legislator['first_name']} {legislator['last_name']}"
+                        bill.add_sponsorship(
+                            sponsor_name,
+                            classification="primary",
+                            entity_type="person",
+                            primary=True,
+                        )
+
+            yield bill
+
+        if response["totalPages"] > page_num:
+            yield from self.scrape_list_page(session, page_num + 1)
+
+    def scrape_actions(self, bill: Bill, row: dict):
+        for action in row["draft"]["billStatuses"]:
+            name = action["billStatusCode"]["name"]
+            when = dateutil.parser.parse(action["timeStamp"])
+            when = self.TIMEZONE.localize(when)
+            if "(H)" in name:
+                chamber = "lower"
+            elif "(S)" in name:
+                chamber = "upper"
+            else:
+                chamber = "legislature"
+
+            bill.add_action(
+                name,
+                date=when,
+                chamber=chamber,
+                classification=categorize_actions(name),
+            )
+
+        # TODO vote processing
+        # at this time, no new bills have votes yet,
+        # so we don't yet know how the data will appear
+
+    def scrape_extras(self, bill: Bill, row: dict):
+        bill.extras["bill_draft_number"] = row["draft"]["draftNumber"]
+
+        # MT-specific data point of legislation requester (by_request_of)
+        requester_type = row["draft"]["requesterType"]
+        requester_id = row["draft"]["requesterId"]
+        if requester_type == "LEGISLATOR":
+            for legislator in self.legislators:
+                if requester_id == legislator["id"]:
+                    bill.extras[
+                        "by_request_of"
+                    ] = f"{legislator['first_name']} {legislator['last_name']}"
+        elif requester_type == "AGENCY":
+            for agency in self.requesting_agencies:
+                if requester_id == agency["id"]:
+                    bill.extras["by_request_of"] = agency["name"]
+        elif requester_type == "NON_STANDING_COMMITTEE":
+            for committee in self.non_standing_committees:
+                if requester_id == committee["id"]:
+                    bill.extras["by_request_of"] = committee["name"]
+
+        # legal citation
+        # TODO verify this still works with new API, currently no data populates this field
+        if row["sessionLawChapterNumber"]:
+            cite = f"{self.session_year} Chapter {row['sessionLawChapterNumber']}, {bill.identifier}"
+            bill.add_citation("Montana Chapter Laws", cite, "chapter")
+
+    def scrape_subjects(self, bill: Bill, row: dict):
+        for subject in row["draft"]["subjects"]:
+            bill.add_subject(subject["subjectCode"]["description"])
+
+
+    def scrape_archive_list_page(self, session, page_num: int):
         self.info(f"Scraping page {str(page_num)}")
         params = {
             "limit": str(self.results_per_page),
@@ -71,15 +274,15 @@ def scrape_list_page(self, session, page_num: int):
                 f"https://bills.legmt.gov/#/bill/{self.mt_session_id}/{row['id']['billDraftNumber']}"
             )
 
-        yield from self.scrape_actions(bill, row)
-        self.scrape_extras(bill, row)
-        self.scrape_subjects(bill, row)
+        yield from self.scrape_archive_actions(bill, row)
+        self.scrape_archive_extras(bill, row)
+        self.scrape_archive_subjects(bill, row)
 
         if not is_draft:
-            self.scrape_versions(bill, row)
+            self.scrape_versions(bill, row['billType'], row['billNumber'])
 
             if row["hasFiscalNote"]:
-                self.scrape_fiscal_note(bill, row)
+                self.scrape_fiscal_note(bill, row['billType'], row['billNumber'])
 
         if row["coSponsor"]:
             print(row["coSponsor"])
@@ -97,9 +300,9 @@ def scrape_list_page(self, session, page_num: int):
         yield bill
 
         if page["bills"]["totalPages"] > page_num:
-            yield from self.scrape_list_page(session, page_num + 1)
+            yield from self.scrape_archive_list_page(session, page_num + 1)
 
-    def scrape_actions(self, bill: Bill, row: dict):
+    def scrape_archive_actions(self, bill: Bill, row: dict):
         for action in row["billActions"]:
             name = action["actionType"]["description"]
             when = dateutil.parser.parse(action["date"])
@@ -135,7 +338,7 @@ def scrape_actions(self, bill: Bill, row: dict):
                 vote.add_source(bill.sources[0]["url"])
                 yield vote
 
-    def scrape_extras(self, bill: Bill, row: dict):
+    def scrape_archive_extras(self, bill: Bill, row: dict):
         bill.extras["bill_draft_number"] = row["id"]["billDraftNumber"]
 
         # this is a for loop but there's only ever one entity
@@ -151,8 +354,8 @@ def scrape_extras(self, bill: Bill, row: dict):
             cite = f"{self.session_year} Chapter {row['sessionLawChapterNumber']}, {bill.identifier}"
             bill.add_citation("Montanta Chapter Laws", cite, "chapter")
 
-    def scrape_fiscal_note(self, bill: Bill, row: dict):
-        url = f"https://api.legmt.gov/docs/v1/documents/getBillFiscalNotes?legislatureOrdinal={self.session_ord}&sessionOrdinal={self.mt_session_id}&billType={row['billType']}&billNumber={row['billNumber']}"
+    def scrape_fiscal_note(self, bill: Bill, bill_type: str, bill_number: str):
+        url = f"https://api.legmt.gov/docs/v1/documents/getBillFiscalNotes?legislatureOrdinal={self.session_ord}&sessionOrdinal={self.mt_session_id}&billType={bill_type}&billNumber={bill_number}"
         try:
             page = self.get(url).json()
         except scrapelib.HTTPError:
@@ -168,13 +371,13 @@ def scrape_fiscal_note(self, bill: Bill, row: dict):
                 on_duplicate="ignore",
             )
 
-    def scrape_subjects(self, bill: Bill, row: dict):
+    def scrape_archive_subjects(self, bill: Bill, row: dict):
         for subject in row["subjects"]:
             bill.add_subject(subject["subject"]["description"])
 
-    def scrape_versions(self, bill: Bill, row: dict):
+    def scrape_versions(self, bill: Bill, bill_type: str, bill_number: str):
         for endpoint in ["Versions", "Amendments", "Other"]:
-            url = f"https://api.legmt.gov/docs/v1/documents/getBill{endpoint}?legislatureOrdinal={self.session_ord}&sessionOrdinal={self.mt_session_id}&billType={row['billType']}&billNumber={row['billNumber']}"
+            url = f"https://api.legmt.gov/docs/v1/documents/getBill{endpoint}?legislatureOrdinal={self.session_ord}&sessionOrdinal={self.mt_session_id}&billType={bill_type}&billNumber={bill_number}"
             try:
                 page = self.get(url).json()
             except scrapelib.HTTPError:

From 741dad1aaea176de5fcff306edd0195590bbbded Mon Sep 17 00:00:00 2001
From: Jesse Mortenson
Date: Wed, 11 Dec 2024 20:37:01 -0600
Subject: [PATCH 2/2] MT: fix linting

---
 scrapers/mt/bills.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/scrapers/mt/bills.py b/scrapers/mt/bills.py
index cc01ce619f..b3797ddf5f 100644
--- a/scrapers/mt/bills.py
+++ b/scrapers/mt/bills.py
@@ -162,12 +162,16 @@ def scrape_list_page(self, session, page_num: int):
             if not is_draft:
                 self.scrape_versions(bill, row["billType"]["code"], row["billNumber"])
                 if row["draft"]["fiscalNote"]:
-                    self.scrape_fiscal_note(bill, row["billType"]["code"], row["billNumber"])
+                    self.scrape_fiscal_note(
+                        bill, row["billType"]["code"], row["billNumber"]
+                    )
 
             if row["sponsorId"]:
                 for legislator in self.legislators:
                     if row["sponsorId"] == legislator["id"]:
-                        sponsor_name = f"{legislator['first_name']} {legislator['last_name']}"
+                        sponsor_name = (
+                            f"{legislator['first_name']} {legislator['last_name']}"
+                        )
                         bill.add_sponsorship(
                             sponsor_name,
                             classification="primary",
@@ -234,7 +238,6 @@ def scrape_subjects(self, bill: Bill, row: dict):
         for subject in row["draft"]["subjects"]:
             bill.add_subject(subject["subjectCode"]["description"])
 
-
     def scrape_archive_list_page(self, session, page_num: int):
         self.info(f"Scraping page {str(page_num)}")
         params = {
@@ -279,10 +282,10 @@ def scrape_archive_list_page(self, session, page_num: int):
         self.scrape_archive_subjects(bill, row)
 
         if not is_draft:
-            self.scrape_versions(bill, row['billType'], row['billNumber'])
+            self.scrape_versions(bill, row["billType"], row["billNumber"])
 
             if row["hasFiscalNote"]:
-                self.scrape_fiscal_note(bill, row['billType'], row['billNumber'])
+                self.scrape_fiscal_note(bill, row["billType"], row["billNumber"])
 
         if row["coSponsor"]:
             print(row["coSponsor"])