From 838842d145ee0056094aae6a526ec8a23af9022b Mon Sep 17 00:00:00 2001 From: braykuka Date: Tue, 26 Nov 2024 16:20:14 +0100 Subject: [PATCH 1/3] WV: Scraper needs fixes to how it selects sessions --- scrapers/wv/__init__.py | 6 ++-- scrapers/wv/bills.py | 76 ++++++++++++++--------------------------- 2 files changed, 28 insertions(+), 54 deletions(-) diff --git a/scrapers/wv/__init__.py b/scrapers/wv/__init__.py index d38c1f3aad..6b72726dd1 100644 --- a/scrapers/wv/__init__.py +++ b/scrapers/wv/__init__.py @@ -183,7 +183,7 @@ class WestVirginia(State): "_scraped_name": "2022", "classification": "primary", "identifier": "2022", - "name": "2022 Regular", + "name": "2022 Regular Session", "start_date": "2022-01-12", "end_date": "2022-03-12", "active": False, @@ -221,7 +221,7 @@ class WestVirginia(State): "_scraped_name": "2023", "classification": "primary", "identifier": "2023", - "name": "2023 Regular", + "name": "2023 Regular Session", "start_date": "2023-01-11", "end_date": "2023-03-11", "active": False, @@ -239,7 +239,7 @@ class WestVirginia(State): "_scraped_name": "2024", "classification": "primary", "identifier": "2024", - "name": "2024 Regular", + "name": "2024 Regular Session", "start_date": "2024-01-11", "end_date": "2024-03-09", "active": False, diff --git a/scrapers/wv/bills.py b/scrapers/wv/bills.py index 8ff3c65fc5..5a9d1cd5a9 100644 --- a/scrapers/wv/bills.py +++ b/scrapers/wv/bills.py @@ -34,27 +34,6 @@ def __hash__(self): class WVBillScraper(Scraper): categorizer = Categorizer() - - _special_names = { - "20161S": "1X", - "2017": "rs", - "20171S": "1X", - "20172S": "2X", - "20181S": "1x", - "20182S": "2x", - "20191S": "1x", - "20211S": "1x", - "20212S": "2x", - "20213S": "3x", - "20221S": "1X", - "20222S": "2X", - "20223S": "3X", - "20224S": "4X", - "20231S": "1X", - "20241S": "1X", - "20242S": "2X", - } - bill_types = { "B": "bill", "R": "resolution", @@ -62,8 +41,15 @@ class WVBillScraper(Scraper): "JR": "joint resolution", } - def scrape(self, chamber=None, session=None): + def get_year_and_stype(self, session): + year = session[0:4] + session_type = session[4:].lower().replace("s", "x") + if not session_type: + session_type = "rs" + session_type = session_type.upper() + return (year, session_type) + def scrape(self, chamber=None, session=None): chambers = [chamber] if chamber is not None else ["upper", "lower"] for chamber in chambers: yield from self.scrape_chamber(chamber, session) @@ -75,22 +61,16 @@ def scrape_chamber(self, chamber, session): orig = "s" # scrape bills - if "special" in self.jurisdiction.legislative_sessions[-1]["name"].lower(): - url = ( - "https://www.wvlegislature.gov/Bill_Status/Bills_all_bills.cfm?" - "year=%s&sessiontype=%s&btype=bill&orig=%s" - % ( - self.jurisdiction.legislative_sessions[-1]["_scraped_name"], - self._special_names[session], - orig, - ) - ) - else: - url = ( - "https://www.wvlegislature.gov/Bill_Status/Bills_all_bills.cfm?" - "year=%s&sessiontype=RS&btype=bill&orig=%s" % (session, orig) + year, session_type = self.get_year_and_stype(session) + url = ( + "https://www.wvlegislature.gov/Bill_Status/Bills_all_bills.cfm?" + "year=%s&sessiontype=%s&btype=bill&orig=%s" + % ( + year, + session_type, + orig, ) - + ) page = lxml.html.fromstring(self.get(url, timeout=80, verify=False).text) page.make_links_absolute(url) @@ -114,21 +94,15 @@ def scrape_chamber(self, chamber, session): ) # scrape resolutions - if "special" in self.jurisdiction.legislative_sessions[-1]["name"].lower(): - res_url = ( - "https://www.wvlegislature.gov/Bill_Status/res_list.cfm?year=%s" - "&sessiontype=%s&btype=res" - % ( - self.jurisdiction.legislative_sessions[-1]["_scraped_name"], - self._special_names[session], - ) - ) - else: - res_url = ( - "https://www.wvlegislature.gov/Bill_Status/res_list.cfm?year=%s" - "&sessiontype=rs&btype=res" - % (self.jurisdiction.legislative_sessions[-1]["_scraped_name"]) + year, session_type = self.get_year_and_stype(session) + res_url = ( + "https://www.wvlegislature.gov/Bill_Status/res_list.cfm?year=%s" + "&sessiontype=%s&btype=res" + % ( + year, + session_type, ) + ) doc = lxml.html.fromstring(self.get(res_url, timeout=80, verify=False).text) doc.make_links_absolute(res_url) From 96049c61f7f39003c26ef5b6691c60af83cde02b Mon Sep 17 00:00:00 2001 From: braykuka Date: Tue, 26 Nov 2024 16:25:25 +0100 Subject: [PATCH 2/3] add docs of the function --- scrapers/wv/bills.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrapers/wv/bills.py b/scrapers/wv/bills.py index 5a9d1cd5a9..6bae2b16b1 100644 --- a/scrapers/wv/bills.py +++ b/scrapers/wv/bills.py @@ -42,6 +42,9 @@ class WVBillScraper(Scraper): } def get_year_and_stype(self, session): + # Get year and session type from the given session + # 2024 => year is 2024, session type is RS + # 20231S => year is 2023, session type is 1X year = session[0:4] session_type = session[4:].lower().replace("s", "x") if not session_type: From 98770d99ef6223ffe7e14f4f7abd79303c5a90a9 Mon Sep 17 00:00:00 2001 From: braykuka Date: Tue, 26 Nov 2024 16:26:08 +0100 Subject: [PATCH 3/3] fix small issue --- scrapers/wv/bills.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scrapers/wv/bills.py b/scrapers/wv/bills.py index 6bae2b16b1..50c8d44677 100644 --- a/scrapers/wv/bills.py +++ b/scrapers/wv/bills.py @@ -46,10 +46,9 @@ def get_year_and_stype(self, session): # 2024 => year is 2024, session type is RS # 20231S => year is 2023, session type is 1X year = session[0:4] - session_type = session[4:].lower().replace("s", "x") + session_type = session[4:].upper().replace("S", "X") if not session_type: - session_type = "rs" - session_type = session_type.upper() + session_type = "RS" return (year, session_type) def scrape(self, chamber=None, session=None):