From 4f9dfbd24b601960da499ccec0b9583741f5edf8 Mon Sep 17 00:00:00 2001
From: Jesse Mortenson
Date: Thu, 12 Dec 2024 13:29:38 -0600
Subject: [PATCH] MT: add bill versions for LC draft versions

---
Note: line breaks and leading indentation in this patch were reconstructed
from a copy that had been collapsed onto a single line. Verify the context
indentation against scrapers/mt/bills.py before applying, or apply with
`git apply --ignore-whitespace`.

 scrapers/mt/bills.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/scrapers/mt/bills.py b/scrapers/mt/bills.py
index b3797ddf5f..794c470784 100644
--- a/scrapers/mt/bills.py
+++ b/scrapers/mt/bills.py
@@ -165,6 +165,7 @@ def scrape_list_page(self, session, page_num: int):
                 self.scrape_fiscal_note(
                     bill, row["billType"]["code"], row["billNumber"]
                 )
+                self.scrape_lc_versions(bill, row["draft"]["draftNumber"])
 
             if row["sponsorId"]:
                 for legislator in self.legislators:
@@ -398,3 +399,29 @@ def scrape_versions(self, bill: Bill, bill_type: str, bill_number: str):
                 media_type="application/pdf",
                 on_duplicate="ignore",
             )
+
+    def scrape_lc_versions(self, bill: Bill, lc_number: str):
+        """Add the documents filed under an LC draft number to ``bill`` as versions.
+
+        Fetches the getBillLcs document list for ``lc_number`` and adds one PDF
+        version link per returned row, named after the row's ``fileName``.
+        Returns without adding anything when the request raises an HTTPError.
+        """
+        lc_docs_url = f"https://api.legmt.gov/docs/v1/documents/getBillLcs?legislatureOrdinal={self.session_ord}&sessionOrdinal={self.mt_session_id}&lcnumber={lc_number}"
+        try:
+            response = self.get(lc_docs_url).json()
+        except scrapelib.HTTPError:
+            # no data = 404 instead of empty json
+            return
+
+        # TODO: this url returns binary data without the correct content type header,
+        # we could POST to https://api.legmt.gov/docs/v1/documents/shortPdfUrl?documentId=2710 and get back a better
+        # GET url, but is that worth 5x the requests?
+        for doc_row in response:
+            doc_url = f"https://api.legmt.gov/docs/v1/documents/getContent?documentId={doc_row['id']}"
+            bill.add_version_link(
+                doc_row["fileName"],
+                doc_url,
+                media_type="application/pdf",
+                on_duplicate="ignore",
+            )