Skip to content

Commit

Permalink
Merge pull request #5085 from alexobaseki/add-entity-types-multi-jurisdictions
Browse files Browse the repository at this point in the history

Add entity types multi jurisdictions
  • Loading branch information
alexobaseki authored Nov 16, 2024
2 parents b7a2fe5 + 04e87e5 commit bb7e0da
Show file tree
Hide file tree
Showing 14 changed files with 168 additions and 44 deletions.
7 changes: 6 additions & 1 deletion scrapers/ar/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
from .common import get_slug_for_session, get_biennium_year

TIMEZONE = pytz.timezone("US/Central")
# Substrings that mark an Arkansas sponsor link as a committee or other
# legislative body (an "organization") rather than an individual legislator.
# Matched case-sensitively by get_entity_name below.
_AR_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
"Committee",
"House Management",
"Senate Efficiency",
]


# Needed because they're using a port python doesn't expect
Expand Down Expand Up @@ -209,7 +214,7 @@ def scrape_actions(self):

def get_entity_name(self, link):
    """Return the sponsorship entity type implied by a sponsor link.

    Gives "organization" when the link text contains any known
    committee/body keyword, otherwise "person".
    """
    # A link mentioning a committee or administrative body is a committee
    # sponsorship, not an individual legislator.
    for marker in _AR_ORGANIZATION_ENTITY_NAME_KEYWORDS:
        if marker in link:
            return "organization"
    return "person"

Expand Down
26 changes: 25 additions & 1 deletion scrapers/fl/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@
SPONSOR_RE = re.compile(
r"by\s+(?P<sponsors>[^(]+)(\(CO-INTRODUCERS\)\s+(?P<cosponsors>[\s\S]+))?"
)
# Lower-case substrings that identify a Florida sponsor name as a committee
# or other legislative organization rather than an individual legislator.
# Callers compare against sponsor.lower(), so entries here must stay
# lower-case.
FL_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
"affairs",
"agriculture",
"appropriations",
"banking and insurance",
"committee",
"commerce and tourism",
"criminal justice",
"education",
"ethics and elections",
"environment and natural resources",
"fiscal policy",
"finance and tax",
"governmental oversight",
"health policy",
"regulated industries",
"rules",
"transportation",
]

requests.packages.urllib3.disable_warnings()
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL"
Expand Down Expand Up @@ -157,7 +176,12 @@ def process_sponsors(self):
for sp in sponsors.split("; "):
sp = sp.strip()
if sp:
sp_type = "organization" if "committee" in sp.lower() else "person"
sp_type = "person"
if any(
keyword in sp.lower()
for keyword in FL_ORGANIZATION_ENTITY_NAME_KEYWORDS
):
sp_type = "organization"
self.input.add_sponsorship(sp, "primary", sp_type, True)

cosponsors = match.groupdict()["cosponsors"]
Expand Down
9 changes: 8 additions & 1 deletion scrapers/ia/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from openstates.scrape import Scraper, Bill
from .actions import Categorizer

# Upper-case substrings that mark an Iowa sponsor string as a committee or
# other legislative body; matched against the raw (upper-cased) sponsor text.
_IA_ORGANIZATION_ENTITY_NAME_KEYWORDS = ["COMMITTEE", "RULES AND ADMINISTRATION"]


class IABillScraper(Scraper):
categorizer = Categorizer()
Expand Down Expand Up @@ -237,10 +239,15 @@ def scrape_bill(
sponsor_array = sponsors.replace("and", ",").split(",")

for sponsor in sponsor_array:
entity_type = "person"
if any(
keyword in sponsor for keyword in _IA_ORGANIZATION_ENTITY_NAME_KEYWORDS
):
entity_type = "organization"
bill.add_sponsorship(
name=sponsor.strip(),
classification="primary",
entity_type="organization" if "COMMITTEE ON" in sponsor else "person",
entity_type=entity_type,
primary=True,
)

Expand Down
3 changes: 3 additions & 0 deletions scrapers/id/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,13 +169,16 @@ def _split(string):
# sponsors range from a committee to one legislator to a group of legs
sponsor_lists = bill_tables[0].text_content().split("by")
if len(sponsor_lists) > 1:
# Adding chamber to further filter search results for committee
# This is based on the assumption that a House Bill can only be sponsored by a House Committee and so on
for sponsors in sponsor_lists[1:]:
if "COMMITTEE" in sponsors.upper():
bill.add_sponsorship(
name=sponsors.strip(),
entity_type="organization",
primary=True,
classification="primary",
chamber=chamber,
)
else:
for person in _split(sponsors):
Expand Down
29 changes: 19 additions & 10 deletions scrapers/ks/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,30 +71,37 @@ def scrape_bill_from_api(self, session, bill_id, bill_url):

bill.add_source(api_url)
bill.add_source(bill_url)

# An "original sponsor" is the API's expression of "primary sponsor"
for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
primary_sponsor = self.clean_sponsor_name(primary_sponsor)
primary_sponsor, sponsor_chamber = self.clean_sponsor_name(primary_sponsor)
if primary_sponsor:
bill.add_sponsorship(
name=primary_sponsor,
entity_type="organization"
if "committee" in primary_sponsor.lower()
else "person",
entity_type=(
"organization"
if "committee" in primary_sponsor.lower()
else "person"
),
primary=True,
classification="primary",
# Using global "chamber" here because we assume
# the primary sponsor i.e. bill_data["ORIGINAL_SPONSOR"]
# will be a committee from the chamber of bill origin
# Not confident enough to do the same for bill_data["SPONSOR_NAMES"].
chamber=sponsor_chamber or chamber,
)
for sponsor in bill_data["SPONSOR_NAMES"]:
if sponsor in bill_data["ORIGINAL_SPONSOR"]:
continue
sponsor = self.clean_sponsor_name(sponsor)
sponsor, sponsor_chamber = self.clean_sponsor_name(sponsor)
bill.add_sponsorship(
name=sponsor,
entity_type="organization"
if "committee" in sponsor.lower()
else "person",
entity_type=(
"organization" if "committee" in sponsor.lower() else "person"
),
primary=False,
classification="cosponsor",
chamber=sponsor_chamber,
)

# history is backwards
Expand Down Expand Up @@ -142,6 +149,8 @@ def classify_chamber(self, bill_id):
return "upper" if (bill_id[0] == "S") else "lower"

def clean_sponsor_name(self, sponsor):
    """Strip a leading chamber title from a Kansas sponsor name.

    Returns a ``(name, chamber)`` tuple: ``chamber`` is "upper" when the
    name begins with "Senator", "lower" when it begins with
    "Representative", and None otherwise (e.g. committee sponsors).
    """
    sp_chamber = None
    if sponsor and sponsor.split()[0] in ["Representative", "Senator"]:
        sp_chamber = "upper" if sponsor.split()[0] == "Senator" else "lower"
        # Re-join the remaining words with spaces; joining on "" would
        # mangle multi-word names ("Senator J. Smith" -> "J.Smith").
        sponsor = " ".join(sponsor.split()[1:])
    return sponsor, sp_chamber
12 changes: 11 additions & 1 deletion scrapers/ma/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,16 @@ def scrape_bill(self, session, bill_meta, chamber):
'//dt[text()="Sponsor:" or text()="Presenter:"]/'
"following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
)
# Sponsors always have link that follows pattern <a href="/Legislators/Profile/JNR1/193">Jeffrey N. Roy</a>
# If this is a person i.e. "legislators" it will show in sponsor_href.
sponsor_href = page.xpath(
'//dt[text()="Sponsor:" or text()="Presenter:"]/following-sibling::dd//a/@href'
)
sponsor_href = sponsor_href[0] if sponsor_href else ""
entity_type = (
"person" if "legislators/" in sponsor_href.lower() else "organization"
)

if sponsor:
sponsor = (
sponsor[0]
Expand All @@ -198,7 +208,7 @@ def scrape_bill(self, session, bill_meta, chamber):
.strip()
)
bill.add_sponsorship(
sponsor, classification="primary", primary=True, entity_type="person"
sponsor, classification="primary", primary=True, entity_type=entity_type
)

self.scrape_cosponsors(bill, bill_url)
Expand Down
6 changes: 5 additions & 1 deletion scrapers/nc/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,14 @@ def scrape_bill(self, chamber, session, bill_id, bill_type, bill_title):
spon_type = "cosponsor"
if not name:
continue
entity_type = "person"
if "rules, calendar, and operations of the house" in name.lower():
name = name.replace(")", "")
entity_type = "organization"
bill.add_sponsorship(
name,
classification=spon_type,
entity_type="person",
entity_type=entity_type,
primary=(spon_type == "primary"),
)
except IndexError:
Expand Down
10 changes: 8 additions & 2 deletions scrapers/ne/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,12 @@ def bill_info(self, bill_link, session, main_url):
introduced_by = introduced_by.split("Introduced By:")[1].strip()

introduced_by = introduced_by.strip()
entity_type = "person"
if "committee" in introduced_by.lower():
entity_type = "organization"
bill.add_sponsorship(
name=introduced_by,
entity_type="person",
entity_type=entity_type,
primary=True,
classification="primary",
)
Expand All @@ -165,9 +168,12 @@ def bill_info(self, bill_link, session, main_url):
# NE legislature site does not list cosponsors, so we grab it from action statements
if "name added" in action:
cosponsor_name = action.split("name added")[0].strip()
entity_type = "person"
if "committee" in cosponsor_name.lower():
entity_type = "organization"
bill.add_sponsorship(
cosponsor_name,
entity_type="person",
entity_type=entity_type,
classification="cosponsor",
primary=False,
)
Expand Down
11 changes: 10 additions & 1 deletion scrapers/nv/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,25 @@ def add_sponsors(self, bill, sponsor_links, primary):
if "Sponsors" in name or name == "":
continue
# Removes leg position from name
# Use position to determine chamber
# Example: Assemblywoman Alexis Hansen
# Also check if sponsor is an organization or person
# Example: "Assembly Committee on Government Affairs" is an organization
chamber = None
entity_type = "person"
if "committee" in name.lower():
entity_type = "organization"
if name.split()[0] in ["Assemblywoman", "Assemblyman", "Senator"]:
chamber = "lower" if "Assembly" in name.split()[0] else "upper"
name = " ".join(name.split()[1:]).strip()
if name not in seen:
seen.add(name)
bill.add_sponsorship(
name=name,
classification="sponsor" if primary else "cosponsor",
entity_type="person",
entity_type=entity_type,
primary=primary,
chamber=chamber,
)

def add_actions(self, bill, chamber):
Expand Down
19 changes: 16 additions & 3 deletions scrapers/sc/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def scrape_subjects(self, session):
try:
self.info(url)
data = urllib.request.urlopen(url).read()
except (http.client.IncompleteRead) as e:
except http.client.IncompleteRead as e:
self.warning("Client IncompleteRead error on {}".format(url))
data = e.partial

Expand Down Expand Up @@ -394,24 +394,37 @@ def scrape_details(self, bill_detail_url, session, chamber, bill_id):

subjects = list(self._subjects[bill_id])

def _get_sponsor_chamber(url):
url = url.get("href")
return (
"upper"
if "chamber=S" in url
else ("lower" if "chamber=H" in url else None)
)

for subject in subjects:
bill.add_subject(subject)

# sponsors
for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
for sponsor in doc.xpath('//a[contains(@href, "member.php")]'):
sp_chamber = _get_sponsor_chamber(sponsor)
sponsor = sponsor.text.strip()
bill.add_sponsorship(
name=sponsor,
classification="primary",
primary=True,
entity_type="person",
chamber=sp_chamber,
)
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]'):
sp_chamber = _get_sponsor_chamber(sponsor)
sponsor = sponsor.replace("\xa0", " ").strip()
bill.add_sponsorship(
name=sponsor,
classification="primary",
primary=True,
entity_type="organization",
chamber=sp_chamber,
)

# find versions
Expand Down
11 changes: 11 additions & 0 deletions scrapers/sd/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
"2023": "68",
"2024": "69",
}
# Map the South Dakota API's single-letter member-type prefix to an
# Open States chamber name.
_CHAMBER_MAP = {
"H": "lower",
"S": "upper",
}


class SDBillScraper(Scraper, LXMLMixin):
Expand Down Expand Up @@ -100,15 +104,22 @@ def scrape_bill(self, chamber, session, bill_id, title, url):
classification="primary",
primary=True,
entity_type=sponsor_type,
chamber=_CHAMBER_MAP.get(sponsor["MemberType"], None),
)
else:
sponsor_type = "organization"
committee_sponsor = re.search(r">(.*)</a>", page["BillCommitteeSponsor"])[1]
csp_chamber = (
"upper"
if "Senate" in committee_sponsor
else ("lower" if "House" in committee_sponsor else None)
)
bill.add_sponsorship(
committee_sponsor,
classification="primary",
primary=True,
entity_type=sponsor_type,
chamber=csp_chamber or chamber,
)

for keyword in page["Keywords"]:
Expand Down
21 changes: 15 additions & 6 deletions scrapers/vt/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,27 @@ def scrape(self, session=None):
sponsor_type = "cosponsor"
continue

sponsor_name = (
sponsor.xpath("a/text()")[0]
.replace("Rep.", "")
.replace("Sen.", "")
.strip()
chamber = None
sponsor_name = sponsor.xpath("a/text()")[0]

if sponsor_name.startswith("Rep.") or sponsor_name.startswith("House"):
chamber = "lower"
sponsor_name = sponsor_name.replace("Rep.", "").strip()
elif sponsor_name.startswith("Sen.") or sponsor_name.startswith(
"Senate"
):
chamber = "upper"
sponsor_name = sponsor_name.replace("Sen.", "").strip()
entity_type = (
"organization" if "committee" in sponsor_name else "person"
)
if sponsor_name and sponsor_name != "Less…":
bill.add_sponsorship(
name=sponsor_name,
classification=sponsor_type,
entity_type="person",
entity_type=entity_type,
primary=(sponsor_type == "primary"),
chamber=chamber,
)

version_links = doc.xpath("//ul[contains(@class,'bill-path')]/li/div/a")
Expand Down
4 changes: 3 additions & 1 deletion scrapers/wi/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ def parse_sponsors(self, bill, action, chamber):
elif type == "Cosponsored":
sponsor_type = "cosponsor"

entity_type = "person"
if title == "Senator":
sponsor_chamber = "upper"
elif title == "Representative":
Expand All @@ -298,6 +299,7 @@ def parse_sponsors(self, bill, action, chamber):
elif title in ("Joint Legislative Council", "Law Revision Committee"):
sponsor_chamber = chamber
people = title
entity_type = "organization"

for r in re.split(r"\sand\s|\,", people):
if r.strip():
Expand All @@ -306,7 +308,7 @@ def parse_sponsors(self, bill, action, chamber):
chamber=sponsor_chamber,
classification=sponsor_type,
primary=sponsor_type == "primary",
entity_type="person",
entity_type=entity_type,
)

def add_vote(self, bill, chamber, date, text, url):
Expand Down
Loading

0 comments on commit bb7e0da

Please sign in to comment.