Skip to content

Commit

Permalink
Merge pull request #5097 from braykuka/wv-scraper-needs-fixes-to-how-…
Browse files Browse the repository at this point in the history
…it-selects-sessions

WV Scraper needs fixes to how it selects sessions
  • Loading branch information
jessemortenson authored Nov 26, 2024
2 parents 50a5c0b + 98770d9 commit 4df5311
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 54 deletions.
6 changes: 3 additions & 3 deletions scrapers/wv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ class WestVirginia(State):
"_scraped_name": "2022",
"classification": "primary",
"identifier": "2022",
"name": "2022 Regular",
"name": "2022 Regular Session",
"start_date": "2022-01-12",
"end_date": "2022-03-12",
"active": False,
Expand Down Expand Up @@ -221,7 +221,7 @@ class WestVirginia(State):
"_scraped_name": "2023",
"classification": "primary",
"identifier": "2023",
"name": "2023 Regular",
"name": "2023 Regular Session",
"start_date": "2023-01-11",
"end_date": "2023-03-11",
"active": False,
Expand All @@ -239,7 +239,7 @@ class WestVirginia(State):
"_scraped_name": "2024",
"classification": "primary",
"identifier": "2024",
"name": "2024 Regular",
"name": "2024 Regular Session",
"start_date": "2024-01-11",
"end_date": "2024-03-09",
"active": False,
Expand Down
78 changes: 27 additions & 51 deletions scrapers/wv/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,36 +34,24 @@ def __hash__(self):

class WVBillScraper(Scraper):
categorizer = Categorizer()

_special_names = {
"20161S": "1X",
"2017": "rs",
"20171S": "1X",
"20172S": "2X",
"20181S": "1x",
"20182S": "2x",
"20191S": "1x",
"20211S": "1x",
"20212S": "2x",
"20213S": "3x",
"20221S": "1X",
"20222S": "2X",
"20223S": "3X",
"20224S": "4X",
"20231S": "1X",
"20241S": "1X",
"20242S": "2X",
}

bill_types = {
"B": "bill",
"R": "resolution",
"CR": "concurrent resolution",
"JR": "joint resolution",
}

def scrape(self, chamber=None, session=None):
def get_year_and_stype(self, session):
    """Split a session identifier into its (year, session_type) pair.

    Examples:
        "2024"   -> ("2024", "RS")  # regular session
        "20231S" -> ("2023", "1X")  # first special session
    """
    year, suffix = session[:4], session[4:]
    # Special sessions are suffixed like "1S"; the site expects "1X".
    # An empty suffix means a regular session ("RS").
    session_type = suffix.upper().replace("S", "X") or "RS"
    return (year, session_type)

def scrape(self, chamber=None, session=None):
    """Yield scraped bills for the given session.

    Scrapes a single chamber when one is specified; otherwise covers
    both "upper" and "lower".
    """
    targets = ["upper", "lower"] if chamber is None else [chamber]
    for target in targets:
        yield from self.scrape_chamber(target, session)
Expand All @@ -75,22 +63,16 @@ def scrape_chamber(self, chamber, session):
orig = "s"

# scrape bills
if "special" in self.jurisdiction.legislative_sessions[-1]["name"].lower():
url = (
"https://www.wvlegislature.gov/Bill_Status/Bills_all_bills.cfm?"
"year=%s&sessiontype=%s&btype=bill&orig=%s"
% (
self.jurisdiction.legislative_sessions[-1]["_scraped_name"],
self._special_names[session],
orig,
)
)
else:
url = (
"https://www.wvlegislature.gov/Bill_Status/Bills_all_bills.cfm?"
"year=%s&sessiontype=RS&btype=bill&orig=%s" % (session, orig)
year, session_type = self.get_year_and_stype(session)
url = (
"https://www.wvlegislature.gov/Bill_Status/Bills_all_bills.cfm?"
"year=%s&sessiontype=%s&btype=bill&orig=%s"
% (
year,
session_type,
orig,
)

)
page = lxml.html.fromstring(self.get(url, timeout=80, verify=False).text)
page.make_links_absolute(url)

Expand All @@ -114,21 +96,15 @@ def scrape_chamber(self, chamber, session):
)

# scrape resolutions
if "special" in self.jurisdiction.legislative_sessions[-1]["name"].lower():
res_url = (
"https://www.wvlegislature.gov/Bill_Status/res_list.cfm?year=%s"
"&sessiontype=%s&btype=res"
% (
self.jurisdiction.legislative_sessions[-1]["_scraped_name"],
self._special_names[session],
)
)
else:
res_url = (
"https://www.wvlegislature.gov/Bill_Status/res_list.cfm?year=%s"
"&sessiontype=rs&btype=res"
% (self.jurisdiction.legislative_sessions[-1]["_scraped_name"])
year, session_type = self.get_year_and_stype(session)
res_url = (
"https://www.wvlegislature.gov/Bill_Status/res_list.cfm?year=%s"
"&sessiontype=%s&btype=res"
% (
year,
session_type,
)
)

doc = lxml.html.fromstring(self.get(res_url, timeout=80, verify=False).text)
doc.make_links_absolute(res_url)
Expand Down

0 comments on commit 4df5311

Please sign in to comment.