Skip to content

Commit

Permalink
Merge pull request #5085 from alexobaseki/add-entity-types-multi-jurisdictions
Browse files Browse the repository at this point in the history

Add entity types multi jurisdictions
  • Loading branch information
alexobaseki authored Nov 16, 2024
2 parents b7a2fe5 + 04e87e5 commit bb7e0da
Show file tree
Hide file tree
Showing 14 changed files with 168 additions and 44 deletions.
7 changes: 6 additions & 1 deletion scrapers/ar/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
from .common import get_slug_for_session, get_biennium_year

TIMEZONE = pytz.timezone("US/Central")
# Substrings that mark an Arkansas sponsor link as a committee or other
# legislative body (an "organization") rather than an individual legislator.
# Matched case-sensitively by get_entity_name below.
_AR_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
"Committee",
"House Management",
"Senate Efficiency",
]


# Needed because they're using a port python doesn't expect
Expand Down Expand Up @@ -209,7 +214,7 @@ def scrape_actions(self):

def get_entity_name(self, link):
    """Return the sponsorship entity type implied by a sponsor link.

    Gives "organization" when the link text contains any known
    committee/body keyword, otherwise "person".
    """
    # A link mentioning a committee or administrative body is a committee
    # sponsorship, not an individual legislator.
    for marker in _AR_ORGANIZATION_ENTITY_NAME_KEYWORDS:
        if marker in link:
            return "organization"
    return "person"

Expand Down
26 changes: 25 additions & 1 deletion scrapers/fl/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@
SPONSOR_RE = re.compile(
r"by\s+(?P<sponsors>[^(]+)(\(CO-INTRODUCERS\)\s+(?P<cosponsors>[\s\S]+))?"
)
# Lower-case substrings that identify a Florida sponsor name as a committee
# or other legislative organization rather than an individual legislator.
# Callers compare against sponsor.lower(), so entries here must stay
# lower-case.
FL_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
"affairs",
"agriculture",
"appropriations",
"banking and insurance",
"committee",
"commerce and tourism",
"criminal justice",
"education",
"ethics and elections",
"environment and natural resources",
"fiscal policy",
"finance and tax",
"governmental oversight",
"health policy",
"regulated industries",
"rules",
"transportation",
]

requests.packages.urllib3.disable_warnings()
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL"
Expand Down Expand Up @@ -157,7 +176,12 @@ def process_sponsors(self):
for sp in sponsors.split("; "):
sp = sp.strip()
if sp:
sp_type = "organization" if "committee" in sp.lower() else "person"
sp_type = "person"
if any(
keyword in sp.lower()
for keyword in FL_ORGANIZATION_ENTITY_NAME_KEYWORDS
):
sp_type = "organization"
self.input.add_sponsorship(sp, "primary", sp_type, True)

cosponsors = match.groupdict()["cosponsors"]
Expand Down
9 changes: 8 additions & 1 deletion scrapers/ia/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from openstates.scrape import Scraper, Bill
from .actions import Categorizer

# Upper-case substrings that mark an Iowa sponsor string as a committee or
# other legislative body; matched against the raw (upper-cased) sponsor text.
_IA_ORGANIZATION_ENTITY_NAME_KEYWORDS = ["COMMITTEE", "RULES AND ADMINISTRATION"]


class IABillScraper(Scraper):
categorizer = Categorizer()
Expand Down Expand Up @@ -237,10 +239,15 @@ def scrape_bill(
sponsor_array = sponsors.replace("and", ",").split(",")

for sponsor in sponsor_array:
entity_type = "person"
if any(
keyword in sponsor for keyword in _IA_ORGANIZATION_ENTITY_NAME_KEYWORDS
):
entity_type = "organization"
bill.add_sponsorship(
name=sponsor.strip(),
classification="primary",
entity_type="organization" if "COMMITTEE ON" in sponsor else "person",
entity_type=entity_type,
primary=True,
)

Expand Down
3 changes: 3 additions & 0 deletions scrapers/id/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,13 +169,16 @@ def _split(string):
# sponsors range from a committee to one legislator to a group of legs
sponsor_lists = bill_tables[0].text_content().split("by")
if len(sponsor_lists) > 1:
# Adding chamber to further filter search results for committee
# This is based on the assumption that a House Bill can only be sponsored by a House Committee and so on
for sponsors in sponsor_lists[1:]:
if "COMMITTEE" in sponsors.upper():
bill.add_sponsorship(
name=sponsors.strip(),
entity_type="organization",
primary=True,
classification="primary",
chamber=chamber,
)
else:
for person in _split(sponsors):
Expand Down
29 changes: 19 additions & 10 deletions scrapers/ks/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,30 +71,37 @@ def scrape_bill_from_api(self, session, bill_id, bill_url):

bill.add_source(api_url)
bill.add_source(bill_url)

# An "original sponsor" is the API's expression of "primary sponsor"
for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
primary_sponsor = self.clean_sponsor_name(primary_sponsor)
primary_sponsor, sponsor_chamber = self.clean_sponsor_name(primary_sponsor)
if primary_sponsor:
bill.add_sponsorship(
name=primary_sponsor,
entity_type="organization"
if "committee" in primary_sponsor.lower()
else "person",
entity_type=(
"organization"
if "committee" in primary_sponsor.lower()
else "person"
),
primary=True,
classification="primary",
# Using global "chamber" here because we assume
# the primary sponsor i.e. bill_data["ORIGINAL_SPONSOR"]
# will be a committee from the chamber of bill origin
# Not confident enough to do the same for bill_data["SPONSOR_NAMES"].
chamber=sponsor_chamber or chamber,
)
for sponsor in bill_data["SPONSOR_NAMES"]:
if sponsor in bill_data["ORIGINAL_SPONSOR"]:
continue
sponsor = self.clean_sponsor_name(sponsor)
sponsor, sponsor_chamber = self.clean_sponsor_name(sponsor)
bill.add_sponsorship(
name=sponsor,
entity_type="organization"
if "committee" in sponsor.lower()
else "person",
entity_type=(
"organization" if "committee" in sponsor.lower() else "person"
),
primary=False,
classification="cosponsor",
chamber=sponsor_chamber,
)

# history is backwards
Expand Down Expand Up @@ -142,6 +149,8 @@ def classify_chamber(self, bill_id):
return "upper" if (bill_id[0] == "S") else "lower"

def clean_sponsor_name(self, sponsor):
    """Strip a leading chamber title from a Kansas sponsor name.

    Returns a ``(name, chamber)`` tuple: ``chamber`` is "upper" when the
    name begins with "Senator", "lower" when it begins with
    "Representative", and None otherwise (e.g. committee sponsors).
    """
    sp_chamber = None
    if sponsor and sponsor.split()[0] in ["Representative", "Senator"]:
        sp_chamber = "upper" if sponsor.split()[0] == "Senator" else "lower"
        # Re-join the remaining words with spaces; joining on "" would
        # mangle multi-word names ("Senator J. Smith" -> "J.Smith").
        sponsor = " ".join(sponsor.split()[1:])
    return sponsor, sp_chamber
12 changes: 11 additions & 1 deletion scrapers/ma/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,16 @@ def scrape_bill(self, session, bill_meta, chamber):
'//dt[text()="Sponsor:" or text()="Presenter:"]/'
"following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
)
# Sponsors always have link that follows pattern <a href="/Legislators/Profile/JNR1/193">Jeffrey N. Roy</a>
# If this is a person i.e. "legislators" it will show in sponsor_href.
sponsor_href = page.xpath(
'//dt[text()="Sponsor:" or text()="Presenter:"]/following-sibling::dd//a/@href'
)
sponsor_href = sponsor_href[0] if sponsor_href else ""
entity_type = (
"person" if "legislators/" in sponsor_href.lower() else "organization"
)

if sponsor:
sponsor = (
sponsor[0]
Expand All @@ -198,7 +208,7 @@ def scrape_bill(self, session, bill_meta, chamber):
.strip()
)
bill.add_sponsorship(
sponsor, classification="primary", primary=True, entity_type="person"
sponsor, classification="primary", primary=True, entity_type=entity_type
)

self.scrape_cosponsors(bill, bill_url)
Expand Down
6 changes: 5 additions & 1 deletion scrapers/nc/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,14 @@ def scrape_bill(self, chamber, session, bill_id, bill_type, bill_title):
spon_type = "cosponsor"
if not name:
continue
entity_type = "person"
if "rules, calendar, and operations of the house" in name.lower():
name = name.replace(")", "")
entity_type = "organization"
bill.add_sponsorship(
name,
classification=spon_type,
entity_type="person",
entity_type=entity_type,
primary=(spon_type == "primary"),
)
except IndexError:
Expand Down
10 changes: 8 additions & 2 deletions scrapers/ne/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,12 @@ def bill_info(self, bill_link, session, main_url):
introduced_by = introduced_by.split("Introduced By:")[1].strip()

introduced_by = introduced_by.strip()
entity_type = "person"
if "committee" in introduced_by.lower():
entity_type = "organization"
bill.add_sponsorship(
name=introduced_by,
entity_type="person",
entity_type=entity_type,
primary=True,
classification="primary",
)
Expand All @@ -165,9 +168,12 @@ def bill_info(self, bill_link, session, main_url):
# NE legislature site does not list cosponsors, so we grab it from action statements
if "name added" in action:
cosponsor_name = action.split("name added")[0].strip()
entity_type = "person"
if "committee" in cosponsor_name.lower():
entity_type = "organization"
bill.add_sponsorship(
cosponsor_name,
entity_type="person",
entity_type=entity_type,
classification="cosponsor",
primary=False,
)
Expand Down
11 changes: 10 additions & 1 deletion scrapers/nv/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,25 @@ def add_sponsors(self, bill, sponsor_links, primary):
if "Sponsors" in name or name == "":
continue
# Removes leg position from name
# Use position to determine chamber
# Example: Assemblywoman Alexis Hansen
# Also check if sponsor is an organization or person
# Example: "Assembly Committee on Government Affairs" is an organization
chamber = None
entity_type = "person"
if "committee" in name.lower():
entity_type = "organization"
if name.split()[0] in ["Assemblywoman", "Assemblyman", "Senator"]:
chamber = "lower" if "Assembly" in name.split()[0] else "upper"
name = " ".join(name.split()[1:]).strip()
if name not in seen:
seen.add(name)
bill.add_sponsorship(
name=name,
classification="sponsor" if primary else "cosponsor",
entity_type="person",
entity_type=entity_type,
primary=primary,
chamber=chamber,
)

def add_actions(self, bill, chamber):
Expand Down
19 changes: 16 additions & 3 deletions scrapers/sc/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def scrape_subjects(self, session):
try:
self.info(url)
data = urllib.request.urlopen(url).read()
except (http.client.IncompleteRead) as e:
except http.client.IncompleteRead as e:
self.warning("Client IncompleteRead error on {}".format(url))
data = e.partial

Expand Down Expand Up @@ -394,24 +394,37 @@ def scrape_details(self, bill_detail_url, session, chamber, bill_id):

subjects = list(self._subjects[bill_id])

def _get_sponsor_chamber(url):
url = url.get("href")
return (
"upper"
if "chamber=S" in url
else ("lower" if "chamber=H" in url else None)
)

for subject in subjects:
bill.add_subject(subject)

# sponsors
for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
for sponsor in doc.xpath('//a[contains(@href, "member.php")]'):
sp_chamber = _get_sponsor_chamber(sponsor)
sponsor = sponsor.text.strip()
bill.add_sponsorship(
name=sponsor,
classification="primary",
primary=True,
entity_type="person",
chamber=sp_chamber,
)
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]'):
sp_chamber = _get_sponsor_chamber(sponsor)
sponsor = sponsor.replace("\xa0", " ").strip()
bill.add_sponsorship(
name=sponsor,
classification="primary",
primary=True,
entity_type="organization",
chamber=sp_chamber,
)

# find versions
Expand Down
11 changes: 11 additions & 0 deletions scrapers/sd/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
"2023": "68",
"2024": "69",
}
# Map the South Dakota API's single-letter member-type prefix to an
# Open States chamber name.
_CHAMBER_MAP = {
"H": "lower",
"S": "upper",
}


class SDBillScraper(Scraper, LXMLMixin):
Expand Down Expand Up @@ -100,15 +104,22 @@ def scrape_bill(self, chamber, session, bill_id, title, url):
classification="primary",
primary=True,
entity_type=sponsor_type,
chamber=_CHAMBER_MAP.get(sponsor["MemberType"], None),
)
else:
sponsor_type = "organization"
committee_sponsor = re.search(r">(.*)</a>", page["BillCommitteeSponsor"])[1]
csp_chamber = (
"upper"
if "Senate" in committee_sponsor
else ("lower" if "House" in committee_sponsor else None)
)
bill.add_sponsorship(
committee_sponsor,
classification="primary",
primary=True,
entity_type=sponsor_type,
chamber=csp_chamber or chamber,
)

for keyword in page["Keywords"]:
Expand Down
21 changes: 15 additions & 6 deletions scrapers/vt/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,27 @@ def scrape(self, session=None):
sponsor_type = "cosponsor"
continue

sponsor_name = (
sponsor.xpath("a/text()")[0]
.replace("Rep.", "")
.replace("Sen.", "")
.strip()
chamber = None
sponsor_name = sponsor.xpath("a/text()")[0]

if sponsor_name.startswith("Rep.") or sponsor_name.startswith("House"):
chamber = "lower"
sponsor_name = sponsor_name.replace("Rep.", "").strip()
elif sponsor_name.startswith("Sen.") or sponsor_name.startswith(
"Senate"
):
chamber = "upper"
sponsor_name = sponsor_name.replace("Sen.", "").strip()
entity_type = (
"organization" if "committee" in sponsor_name else "person"
)
if sponsor_name and sponsor_name != "Less…":
bill.add_sponsorship(
name=sponsor_name,
classification=sponsor_type,
entity_type="person",
entity_type=entity_type,
primary=(sponsor_type == "primary"),
chamber=chamber,
)

version_links = doc.xpath("//ul[contains(@class,'bill-path')]/li/div/a")
Expand Down
4 changes: 3 additions & 1 deletion scrapers/wi/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ def parse_sponsors(self, bill, action, chamber):
elif type == "Cosponsored":
sponsor_type = "cosponsor"

entity_type = "person"
if title == "Senator":
sponsor_chamber = "upper"
elif title == "Representative":
Expand All @@ -298,6 +299,7 @@ def parse_sponsors(self, bill, action, chamber):
elif title in ("Joint Legislative Council", "Law Revision Committee"):
sponsor_chamber = chamber
people = title
entity_type = "organization"

for r in re.split(r"\sand\s|\,", people):
if r.strip():
Expand All @@ -306,7 +308,7 @@ def parse_sponsors(self, bill, action, chamber):
chamber=sponsor_chamber,
classification=sponsor_type,
primary=sponsor_type == "primary",
entity_type="person",
entity_type=entity_type,
)

def add_vote(self, bill, chamber, date, text, url):
Expand Down
Loading

0 comments on commit bb7e0da

Please sign in to comment.