Merge pull request #40 from chance-on-brink/master
fix new disclosure detection, model refusals, name exceptions, output easy copy paste
jlopp authored Nov 28, 2024
2 parents b973316 + 7903686 commit c2c11d2
Showing 16 changed files with 733 additions and 643 deletions.
Binary file modified .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions automated_updates/.gitignore
@@ -5,6 +5,7 @@ all_processed_data
all_processed_data_test
all_source_data_test
bitcoin-politicians-env
env-bitcoin-politicians
intermediate_files
intermediate_files_test
missing_disclosures.txt
1 change: 1 addition & 0 deletions automated_updates/Automated Updates README.md
@@ -79,6 +79,7 @@ To quickly set up and ensure all code paths are functioning, a pre-defined test
**parse_asset_names_llm.py**
* Sends images to OpenAI's API with specific prompts, extracting asset names and saving them to `./all_processed_data`.
* Use the `--new-only` flag to parse only new disclosures since last run.
* To parallelize this step, use `parse_asset_names_llm_parallel.py` instead (8 workers by default). Requires OpenAI API Usage Tier 2 or higher (usage sketch below).
**summarize_results.py**
* Summarizes files in `./all_processed_data` into `final_datasets/final_asset_data.csv` and `final_datasets/final_summary_data.csv`.
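For reference, the parse step described in the README hunk above can be scripted like this. A minimal sketch, not part of the repository: the `subprocess` wrapper and the working directory (`automated_updates/`) are assumptions, while the script names and the `--new-only` flag come from the README itself.

```python
import subprocess

# Parse only disclosures that are new since the last run (sketch).
subprocess.run(
    ["python", "parse_asset_names_llm.py", "--new-only"],
    check=True,  # stop if the parse step fails
)

# With OpenAI API Usage Tier 2 or higher, the parallel variant (8 workers by
# default) can be substituted; whether it accepts --new-only is assumed here.
# subprocess.run(["python", "parse_asset_names_llm_parallel.py", "--new-only"], check=True)
```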
Binary file not shown.
1,068 changes: 534 additions & 534 deletions automated_updates/all_source_data/source_data_links.csv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions automated_updates/gather_source_data.py
@@ -55,15 +55,15 @@ def retry_with_delay(func, *args, retries=3, delay=60, **kwargs):

if house_senate == 'House':
# site will occasionally deny access. if this happens, wait and try again
success = retry_with_delay(download_house_source_data_most_recent, last_name, first_name, state_abbr)
success = retry_with_delay(download_house_source_data_most_recent, last_name, first_name, state_abbr, party)
if not success:
print(f'\033[91m{first_name} {last_name} no disclosures found.\033[0m')
no_disclosures.append(member)

elif house_senate == 'Senate':
headless=True # whether to run the chromedriver headless
# site will occasionally deny access. if this happens, wait and try again
success = retry_with_delay(download_senate_source_data_most_recent, last_name, first_name, state_abbr, headless)
success = retry_with_delay(download_senate_source_data_most_recent, last_name, first_name, state_abbr, party, headless)
if not success:
print(f'\033[91m{first_name} {last_name} no disclosures found.\033[0m')
no_disclosures.append(member)
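For context, `retry_with_delay` (visible only in the hunk header above) wraps a download call and retries after a pause when the site denies access. A minimal sketch consistent with that signature; the body is an assumption, since only the signature and the surrounding comments appear in this diff:

```python
import time

def retry_with_delay(func, *args, retries=3, delay=60, **kwargs):
    """Call func(*args, **kwargs); on failure, wait `delay` seconds and try again.

    Returns the function's result, or False once all attempts are exhausted.
    Sketch only: the real helper in gather_source_data.py may differ.
    """
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            print(f"attempt {attempt}/{retries} failed: {exc}")
            if attempt < retries:
                time.sleep(delay)
    return False
```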
7 changes: 7 additions & 0 deletions automated_updates/modules/gather/congress_members.py
@@ -150,12 +150,19 @@ def get_congress_members(limit=250, ignore_cache=True, test_set=False):
'Van Drew, Jefferson', # house messy
'Vance, J. D.', # senate easy extract
'Markey, Edward J.', # senate gifs
'Hinson, Ashley'
]
members = [member for member in members if member[0] in test_set_names]

print('using test set:')
for member in members: print(member)

# manual exceptions for name mismatches that are impossible to accommodate programmatically
# example: congress.gov api gives "Hinson, Ashley" (Iowa) who files under Arenholz as of 2022
for i in range(len(members)):
if members[i][0] == 'Hinson, Ashley' and members[i][2] == 'IA':
members[i][0] = 'Arenholz, Ashley'

return members

def parse_members(members):
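If more filing-name mismatches turn up, one maintainable option is a single lookup table instead of inline conditionals. A sketch of that alternative; the only mapping shown is the Hinson/Arenholz exception from the hunk above, and each `member` entry is assumed to be a mutable list with the name at index 0 and the state at index 2, as in the existing code:

```python
# Hypothetical refactor: filing-name exceptions keyed by (api_name, state).
NAME_EXCEPTIONS = {
    ("Hinson, Ashley", "IA"): "Arenholz, Ashley",  # files under Arenholz as of 2022
}

def apply_name_exceptions(members):
    """Swap congress.gov API names for the names members actually file under."""
    for member in members:
        key = (member[0], member[2])
        if key in NAME_EXCEPTIONS:
            member[0] = NAME_EXCEPTIONS[key]
    return members
```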
21 changes: 9 additions & 12 deletions automated_updates/modules/gather/house_scrape.py
@@ -12,7 +12,7 @@ def remove_accents(text):
if unicodedata.category(c) != 'Mn'
)

def download_house_source_data_specific_year(last_name, first_name, state_abbr, filing_year):
def download_house_source_data_specific_year(last_name, first_name, state_abbr, filing_year, party):
search_url = "https://disclosures-clerk.house.gov/FinancialDisclosure/ViewMemberSearchResult"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
@@ -52,43 +52,40 @@ def download_house_source_data_specific_year(last_name, first_name, state_abbr,
with open(filename, 'wb') as pdf_file:
pdf_file.write(pdf_response.content)
print(f"\033[32mDownloaded {filename}\033[0m")
add_link_to_source_file(last_name, first_name, state_abbr, filing_year, pdf_link)
add_link_to_source_file(last_name, first_name, state_abbr, filing_year, pdf_link, party)
return True
else:
return False
else:
return False

def download_house_source_data_most_recent(last_name, first_name, state_abbr):
def download_house_source_data_most_recent(last_name, first_name, state_abbr, party):
current_year = datetime.now().year
for year in range(current_year, current_year - 5, -1):
success = download_house_source_data_specific_year(last_name=last_name, first_name=first_name, state_abbr=state_abbr, filing_year=year)
success = download_house_source_data_specific_year(last_name=last_name, first_name=first_name, state_abbr=state_abbr, filing_year=year, party=party)
if success:
return True

## try different variations of the name
# normalize accented characters
if last_name != remove_accents(last_name):
last_name = remove_accents(last_name)
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

# convert a hyphenated last name to space-separated parts
if '-' in last_name:
last_name = last_name.replace('-', ' ')
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

# use second part of multipart last name
if ' ' in last_name:
last_name = last_name.split(' ')[1]
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

# add second part of multipart first name to last name
if ' ' in first_name:
last_name = first_name.split(' ')[-1] + ' ' + last_name
first_name = first_name.split(' ')[0]
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

return False

if __name__ == '__main__':
download_house_source_data_specific_year(last_name='Bilirakis', state_abbr='FL', filing_year=2023)
return False
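The `remove_accents` helper at the top of this file is only partially visible in the hunk; the fragment (`unicodedata.category(c) != 'Mn'`) matches the standard NFD-decomposition idiom, sketched here for readers following the name-variation fallbacks above. The normalization call is an assumption based on that fragment:

```python
import unicodedata

def remove_accents(text):
    """Strip combining accent marks, e.g. 'Sánchez' -> 'Sanchez' (sketch)."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)  # decompose accented chars
        if unicodedata.category(c) != 'Mn'             # drop combining marks
    )
```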
10 changes: 3 additions & 7 deletions automated_updates/modules/gather/senate_scrape.py
@@ -27,7 +27,7 @@ def start_chrome_driver(chrome_driver_path, headless=True):

return driver

def download_senate_source_data_most_recent(last_name, first_name, state_abbr, headless):
def download_senate_source_data_most_recent(last_name, first_name, state_abbr, party, headless):
driver = start_chrome_driver(chrome_driver_path, headless=headless)
# beware, this gets ugly
driver.set_window_size(1920, 1080)
@@ -71,7 +71,7 @@ def download_senate_source_data_most_recent(last_name, first_name, state_abbr, h
annual_report_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Annual Report') and not(contains(text(), 'Amendment'))]")
report_url = annual_report_link.get_attribute("href")
driver.get(report_url)
add_link_to_source_file(last_name, first_name, state_abbr, most_recent_date.year, report_url)
add_link_to_source_file(last_name, first_name, state_abbr, most_recent_date.year, report_url, party)

if 'view/annual' in report_url:
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#grid_items tbody tr")))
@@ -119,8 +119,4 @@ def download_senate_source_data_most_recent(last_name, first_name, state_abbr, h
return True

else:
raise RuntimeError("Unrecognized URL encountered during execution.")

if __name__ == '__main__':
driver = start_chrome_driver(chrome_driver_path=chrome_driver_path, headless=False)
success = download_senate_source_data_most_recent(driver=driver, last_name='Ricketts', state_abbr='NE')
raise RuntimeError("Unrecognized URL encountered during execution.")
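The `start_chrome_driver` helper is only partially visible above (its signature and `return driver`). A typical headless setup consistent with how it is called might look like the following; the option values are standard Selenium 4, but the helper's actual body is an assumption:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def start_chrome_driver(chrome_driver_path, headless=True):
    """Start a Chrome WebDriver, optionally headless (sketch; the real helper may differ)."""
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless=new")  # plain "--headless" on older Chrome builds
    options.add_argument("--window-size=1920,1080")
    service = Service(executable_path=chrome_driver_path)
    return webdriver.Chrome(service=service, options=options)
```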
44 changes: 23 additions & 21 deletions automated_updates/modules/gather/source_file_links.py
@@ -1,14 +1,14 @@
import csv
from config import source_data_dir
from config import source_data_dir, processed_data_dir
import pandas as pd
import os
import re
from collections import defaultdict

def add_link_to_source_file(last_name, first_name, state_abbr, filing_year, link):
def add_link_to_source_file(last_name, first_name, state_abbr, filing_year, link, party):

csv_file_path = os.path.join(source_data_dir, "source_data_links.csv")
csv_columns = ["last_name", "first_name", "state", "filing_year", "link"]
csv_columns = ["last_name", "first_name", "party", "state", "filing_year", "link"]

file_exists = os.path.isfile(csv_file_path)
with open(csv_file_path, mode="a", newline="") as csv_file:
@@ -20,6 +20,7 @@ def add_link_to_source_file(last_name, first_name, state_abbr, filing_year, link
writer.writerow({
"last_name": last_name,
"first_name": first_name,
"party": party,
"state": state_abbr,
"filing_year": filing_year,
"link": link
@@ -36,33 +37,31 @@ def deduplicate_link_source_file():
df.to_csv(csv_file_path, index=False)

def get_new_disclosures():
source_file_path = os.path.join(source_data_dir, "source_data_links.csv")
df = pd.read_csv(source_file_path)
df_sorted = df.sort_values(by=['last_name', 'first_name', 'state', 'filing_year'], ascending=[True, True, True, False])
current_disclosures = df_sorted.drop_duplicates(subset=['last_name', 'first_name', 'state'], keep='first')
old_disclosures = df_sorted[~df_sorted['link'].isin(list(current_disclosures['link']))]

common_identifiers = current_disclosures.merge(
old_disclosures,
on=['last_name', 'first_name', 'state'],
how='inner'
)[['last_name', 'first_name', 'state']]
source_data_links = pd.read_csv(os.path.join(source_data_dir, "source_data_links.csv"))
final_summary_data = pd.read_csv('./final_datasets/final_summary_data.csv')

new_disclosures = current_disclosures.merge(
common_identifiers,
merged_data = source_data_links.merge(
final_summary_data,
on=['last_name', 'first_name', 'state'],
how='inner'
suffixes=('_source', '_final'),
how='left'
)

new_disclosures.to_csv('./new_disclosures.csv', index=False)
current_disclosures.to_csv(source_file_path, index=False)
new_disclosures = merged_data[
(merged_data['filing_year_final'].isna()) | # Not in final_summary_data
(merged_data['filing_year_source'] > merged_data['filing_year_final']) # More recent year
]

new_disclosures = new_disclosures[['last_name', 'first_name', 'state', 'filing_year_source']].rename(
columns={'filing_year_source': 'filing_year'}
)

print("\nNew disclosures since last run:")
if len(new_disclosures): print(new_disclosures)
else: print('None')
print('\n')

return new_disclosures
new_disclosures.to_csv('./new_disclosures.csv', index=False)


def get_outdated_source_files(files):
file_pattern = re.compile(r'(.+?)_(.+?)_(.+?)_(\d{4})_.+')
@@ -84,3 +83,6 @@ def get_outdated_source_files(files):
outdated_files.extend(entry[4] for entry in entries[1:])

return outdated_files

if __name__ == '__main__':
print(get_new_disclosures())
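To make the new detection rule concrete, here is a small self-contained illustration of the merge introduced in `get_new_disclosures` (member names and years are made up):

```python
import pandas as pd

# Toy stand-ins for source_data_links.csv and final_summary_data.csv.
source_data_links = pd.DataFrame({
    "last_name":   ["Lopp",    "Nakamoto", "Finney"],
    "first_name":  ["Jameson", "Satoshi",  "Hal"],
    "state":       ["TX",      "CA",       "CA"],
    "filing_year": [2024,       2023,       2023],
})
final_summary_data = pd.DataFrame({
    "last_name":   ["Lopp",    "Nakamoto"],
    "first_name":  ["Jameson", "Satoshi"],
    "state":       ["TX",      "CA"],
    "filing_year": [2023,       2023],
})

merged = source_data_links.merge(
    final_summary_data,
    on=["last_name", "first_name", "state"],
    suffixes=("_source", "_final"),
    how="left",
)

new = merged[
    merged["filing_year_final"].isna()                              # never processed
    | (merged["filing_year_source"] > merged["filing_year_final"])  # newer filing
]
print(new[["last_name", "filing_year_source"]])
# Lopp (2024 > 2023) and Finney (absent from final_summary_data) are flagged;
# Nakamoto (2023 == 2023) is not.
```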
17 changes: 8 additions & 9 deletions automated_updates/modules/process/parse_house_clean_llm.py
@@ -8,17 +8,16 @@

def assets_from_house_clean_image_to_csv(input_image_path):
base64_image = encode_image(input_image_path)
if False:
# this idea was to skip pages that don't list assets. it is a waste of tokens and doesn't seem to hurt accuracy to just shove all pages
response = send_to_api("is there an assets table in the image with a column called 'Asset'? answer Y or N only.", base64_image)

if response.lower() == 'n':
# not present, skip the page
print('skipped')
return []

# use '|' separator to avoid characters in asset names
response = send_to_api("get the asset names in the 'Asset' column of the Assets table in the public disclosure form. return only a | separated list. no other commentary.", base64_image)
response = send_to_api("This is a public disclosure form for a US congressman from house.gov. Get the asset names in the 'Asset' column of the Assets table. Return only a | separated list. If there are no assets listed, state: 'None'. No other commentary.",
base64_image)

# print(response);print('\n')
if re.sub(r'[^a-zA-Z]', '', response.strip().lower() ) == 'none':
# print('No assets on this page.')
response = ''

asset_list = [response.strip() for response in response.split("|")]

return asset_list
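The `re.sub` check above is what absorbs model refusals and empty pages: it strips everything except letters before comparing against 'none', so replies like 'None.', '**None**', or ' none ' all collapse to an empty asset list. A standalone illustration with made-up replies:

```python
import re

def is_none_response(response):
    """True if the model's reply is some formatting of the word 'None' (sketch)."""
    return re.sub(r"[^a-zA-Z]", "", response.strip().lower()) == "none"

for reply in ["None", "None.", "**None**", " none ", "AAPL | MSFT"]:
    print(repr(reply), "->", is_none_response(reply))
# The first four are treated as "no assets on this page";
# "AAPL | MSFT" is split on "|" into asset names downstream.
```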
21 changes: 9 additions & 12 deletions automated_updates/modules/process/parse_house_messy_llm.py
@@ -8,20 +8,17 @@

def assets_from_house_messy_image_to_csv(input_image_path):
base64_image = encode_image(input_image_path)
if False:
# this idea was to skip pages that don't list assets. it is a waste of tokens and doesn't seem to hurt accuracy to just shove all pages
response = send_to_api(message="is this the schedule A part of the form that lists assets? answer Y or N only.",
base64_image=base64_image)

if response.lower() == 'n':
# not schedule A, skip the page
print('skipped')
return []

# use '|' separator to avoid characters in asset names
response = send_to_api(message="get the asset names in the public disclosure form, return them in a | separated list only, no other commentary.",
base64_image=base64_image,
model="gpt-4o")
response = send_to_api(message="This is a public disclosure form for a US congressman from house.gov. Get the asset names in the form. Return them in a | separated list only. If there are no assets listed, state: 'None'. No other commentary.",
base64_image=base64_image,
model="gpt-4o")

# print(response);print('\n')
if re.sub(r'[^a-zA-Z]', '', response.strip().lower() ) == 'none':
# print('No assets on this page.')
response = ''

asset_list = [response.strip() for response in response.split("|")]

return asset_list
21 changes: 9 additions & 12 deletions automated_updates/modules/process/parse_senate_llm.py
@@ -8,20 +8,17 @@

def assets_from_senate_image_to_csv(input_image_path):
base64_image = encode_image(input_image_path)
if False:
# this idea was to skip pages that don't list assets. it is a waste of tokens and doesn't seem to hurt accuracy to just shove all pages
response = send_to_api(message="does this part of the form list asset disclosures? answer Y or N only.",
base64_image=base64_image)

if response.lower() == 'n':
print('skipped')
# not schedule A, skip the page
return []

# use '|' separator to avoid characters in asset names
response = send_to_api(message="get the asset names in the public disclosure form. return them in a | separated list only. no other commentary.",
base64_image=base64_image,
model='gpt-4o')
response = send_to_api(message="This is a public disclosure form for a US congressman from senate.gov. Get the asset names in the form. Return them in a | separated list only. If there are no assets listed, state: 'None'. No other commentary.",
base64_image=base64_image,
model='gpt-4o')

# print(response);print('\n')
if re.sub(r'[^a-zA-Z]', '', response.strip().lower() ) == 'none':
# print('No assets on this page.')
response = ''

asset_list = [response.strip() for response in response.split("|")]

return asset_list
59 changes: 31 additions & 28 deletions automated_updates/parse_asset_names_llm.py
@@ -38,39 +38,42 @@

# TODO: this should be parallelized, bottleneck will be OpenAI usage tier
# Process folders in house_clean_pdf_dir
for root, dirs, files in os.walk(house_clean_pdf_dir):
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue
root = next(os.walk(house_clean_pdf_dir))[0]
dirs = sorted(next(os.walk(house_clean_pdf_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"\nProgress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_clean_pdf_dir: {folder_path}")
assets_from_house_clean_to_csv_entire_folder(folder_path)
folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_clean_pdf_dir: {folder_path}")
assets_from_house_clean_to_csv_entire_folder(folder_path)

# # Process folders in house_messy_pdf_dir
for root, dirs, files in os.walk(house_messy_pdf_dir):
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue
# Process folders in house_messy_pdf_dir
root = next(os.walk(house_messy_pdf_dir))[0]
dirs = sorted(next(os.walk(house_messy_pdf_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_messy_pdf_dir: {folder_path}")
assets_from_house_messy_to_csv_entire_folder(folder_path)
folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_messy_pdf_dir: {folder_path}")
assets_from_house_messy_to_csv_entire_folder(folder_path)

# Process folders in senate_dir
for root, dirs, files in os.walk(senate_dir):
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue
root = next(os.walk(senate_dir))[0]
dirs = sorted(next(os.walk(senate_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in senate_dir: {folder_path}")
assets_from_senate_to_csv_entire_folder(folder_path)
folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in senate_dir: {folder_path}")
assets_from_senate_to_csv_entire_folder(folder_path)

print('\n')
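The restructuring above relies on `next(os.walk(path))` returning only the top-level `(root, dirs, files)` tuple, and sorting `dirs` gives a deterministic processing order. A small standalone demonstration; the directory name is hypothetical:

```python
import os

# os.walk() is a generator; its first yield describes the top level only.
root, dirs, _files = next(os.walk("all_source_data_pdf"))  # hypothetical path

for folder in sorted(dirs):  # alphabetical, repeatable order
    folder_path = os.path.join(root, folder)
    print("would process:", folder_path)
```

Note that the committed version calls `next(os.walk(...))` twice per directory tree (once for `[0]`, once for `[1]`); unpacking the single tuple, as sketched here, walks the directory only once.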
