Merge pull request #40 from chance-on-brink/master
fix new disclosure detection, model refusals, name exceptions, output easy copy paste
jlopp authored Nov 28, 2024
2 parents b973316 + 7903686 commit c2c11d2
Showing 16 changed files with 733 additions and 643 deletions.
Binary file modified .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions automated_updates/.gitignore
@@ -5,6 +5,7 @@ all_processed_data
all_processed_data_test
all_source_data_test
bitcoin-politicians-env
env-bitcoin-politicians
intermediate_files
intermediate_files_test
missing_disclosures.txt
1 change: 1 addition & 0 deletions automated_updates/Automated Updates README.md
@@ -79,6 +79,7 @@ To quickly set up and ensure all code paths are functioning, a pre-defined test
**parse_asset_names_llm.py**
* Sends images to OpenAI's API with specific prompts, extracting asset names and saving them to `./all_processed_data`.
* Use the `--new-only` flag to parse only new disclosures since last run.
* To parallelize this step, use `parse_asset_names_llm_parallel.py` instead (8 workers by default). Requires OpenAI API Usage Tier 2 or higher (usage sketch below).
**summarize_results.py**
* Summarizes files in `./all_processed_data` into `final_datasets/final_asset_data.csv` and `final_datasets/final_summary_data.csv`.
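For reference, the parse step described in the README hunk above can be scripted like this. A minimal sketch, not part of the repository: the `subprocess` wrapper and the working directory (`automated_updates/`) are assumptions, while the script names and the `--new-only` flag come from the README itself.

```python
import subprocess

# Parse only disclosures that are new since the last run (sketch).
subprocess.run(
    ["python", "parse_asset_names_llm.py", "--new-only"],
    check=True,  # stop if the parse step fails
)

# With OpenAI API Usage Tier 2 or higher, the parallel variant (8 workers by
# default) can be substituted; whether it accepts --new-only is assumed here.
# subprocess.run(["python", "parse_asset_names_llm_parallel.py", "--new-only"], check=True)
```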
Binary file not shown.
1,068 changes: 534 additions & 534 deletions automated_updates/all_source_data/source_data_links.csv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions automated_updates/gather_source_data.py
@@ -55,15 +55,15 @@ def retry_with_delay(func, *args, retries=3, delay=60, **kwargs):

if house_senate == 'House':
# site will occasionally deny access. if this happens, wait and try again
success = retry_with_delay(download_house_source_data_most_recent, last_name, first_name, state_abbr)
success = retry_with_delay(download_house_source_data_most_recent, last_name, first_name, state_abbr, party)
if not success:
print(f'\033[91m{first_name} {last_name} no disclosures found.\033[0m')
no_disclosures.append(member)

elif house_senate == 'Senate':
headless=True # whether to run the chromedriver headless
# site will occasionally deny access. if this happens, wait and try again
success = retry_with_delay(download_senate_source_data_most_recent, last_name, first_name, state_abbr, headless)
success = retry_with_delay(download_senate_source_data_most_recent, last_name, first_name, state_abbr, party, headless)
if not success:
print(f'\033[91m{first_name} {last_name} no disclosures found.\033[0m')
no_disclosures.append(member)
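For context, `retry_with_delay` (visible only in the hunk header above) wraps a download call and retries after a pause when the site denies access. A minimal sketch consistent with that signature; the body is an assumption, since only the signature and the surrounding comments appear in this diff:

```python
import time

def retry_with_delay(func, *args, retries=3, delay=60, **kwargs):
    """Call func(*args, **kwargs); on failure, wait `delay` seconds and try again.

    Returns the function's result, or False once all attempts are exhausted.
    Sketch only: the real helper in gather_source_data.py may differ.
    """
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            print(f"attempt {attempt}/{retries} failed: {exc}")
            if attempt < retries:
                time.sleep(delay)
    return False
```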
7 changes: 7 additions & 0 deletions automated_updates/modules/gather/congress_members.py
@@ -150,12 +150,19 @@ def get_congress_members(limit=250, ignore_cache=True, test_set=False):
'Van Drew, Jefferson', # house messy
'Vance, J. D.', # senate easy extract
'Markey, Edward J.', # senate gifs
'Hinson, Ashley'
]
members = [member for member in members if member[0] in test_set_names]

print('using test set:')
for member in members: print(member)

# manual exceptions for name mismatches that are impossible to accommodate programmatically
# example: congress.gov api gives "Hinson, Ashley" (Iowa) who files under Arenholz as of 2022
for i in range(len(members)):
if members[i][0] == 'Hinson, Ashley' and members[i][2] == 'IA':
members[i][0] = 'Arenholz, Ashley'

return members

def parse_members(members):
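If more filing-name mismatches turn up, one maintainable option is a single lookup table instead of inline conditionals. A sketch of that alternative; the only mapping shown is the Hinson/Arenholz exception from the hunk above, and each `member` entry is assumed to be a mutable list with the name at index 0 and the state at index 2, as in the existing code:

```python
# Hypothetical refactor: filing-name exceptions keyed by (api_name, state).
NAME_EXCEPTIONS = {
    ("Hinson, Ashley", "IA"): "Arenholz, Ashley",  # files under Arenholz as of 2022
}

def apply_name_exceptions(members):
    """Swap congress.gov API names for the names members actually file under."""
    for member in members:
        key = (member[0], member[2])
        if key in NAME_EXCEPTIONS:
            member[0] = NAME_EXCEPTIONS[key]
    return members
```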
21 changes: 9 additions & 12 deletions automated_updates/modules/gather/house_scrape.py
@@ -12,7 +12,7 @@ def remove_accents(text):
if unicodedata.category(c) != 'Mn'
)

def download_house_source_data_specific_year(last_name, first_name, state_abbr, filing_year):
def download_house_source_data_specific_year(last_name, first_name, state_abbr, filing_year, party):
search_url = "https://disclosures-clerk.house.gov/FinancialDisclosure/ViewMemberSearchResult"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
@@ -52,43 +52,40 @@ def download_house_source_data_specific_year(last_name, first_name, state_abbr,
with open(filename, 'wb') as pdf_file:
pdf_file.write(pdf_response.content)
print(f"\033[32mDownloaded {filename}\033[0m")
add_link_to_source_file(last_name, first_name, state_abbr, filing_year, pdf_link)
add_link_to_source_file(last_name, first_name, state_abbr, filing_year, pdf_link, party)
return True
else:
return False
else:
return False

def download_house_source_data_most_recent(last_name, first_name, state_abbr):
def download_house_source_data_most_recent(last_name, first_name, state_abbr, party):
current_year = datetime.now().year
for year in range(current_year, current_year - 5, -1):
success = download_house_source_data_specific_year(last_name=last_name, first_name=first_name, state_abbr=state_abbr, filing_year=year)
success = download_house_source_data_specific_year(last_name=last_name, first_name=first_name, state_abbr=state_abbr, filing_year=year, party=party)
if success:
return True

## try different variations of the name
# normalize accented characters
if last_name != remove_accents(last_name):
last_name = remove_accents(last_name)
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

# convert a hyphenated last name to space-separated parts
if '-' in last_name:
last_name = last_name.replace('-', ' ')
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

# use second part of multipart last name
if ' ' in last_name:
last_name = last_name.split(' ')[1]
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

# add second part of multipart first name to last name
if ' ' in first_name:
last_name = first_name.split(' ')[-1] + ' ' + last_name
first_name = first_name.split(' ')[0]
return download_house_source_data_most_recent(last_name, first_name, state_abbr)
return download_house_source_data_most_recent(last_name, first_name, state_abbr, party)

return False

if __name__ == '__main__':
download_house_source_data_specific_year(last_name='Bilirakis', state_abbr='FL', filing_year=2023)
return False
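The `remove_accents` helper at the top of this file is only partially visible in the hunk; the fragment (`unicodedata.category(c) != 'Mn'`) matches the standard NFD-decomposition idiom, sketched here for readers following the name-variation fallbacks above. The normalization call is an assumption based on that fragment:

```python
import unicodedata

def remove_accents(text):
    """Strip combining accent marks, e.g. 'Sánchez' -> 'Sanchez' (sketch)."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)  # decompose accented chars
        if unicodedata.category(c) != 'Mn'             # drop combining marks
    )
```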
10 changes: 3 additions & 7 deletions automated_updates/modules/gather/senate_scrape.py
@@ -27,7 +27,7 @@ def start_chrome_driver(chrome_driver_path, headless=True):

return driver

def download_senate_source_data_most_recent(last_name, first_name, state_abbr, headless):
def download_senate_source_data_most_recent(last_name, first_name, state_abbr, party, headless):
driver = start_chrome_driver(chrome_driver_path, headless=headless)
# beware, this gets ugly
driver.set_window_size(1920, 1080)
@@ -71,7 +71,7 @@ def download_senate_source_data_most_recent(last_name, first_name, state_abbr, h
annual_report_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Annual Report') and not(contains(text(), 'Amendment'))]")
report_url = annual_report_link.get_attribute("href")
driver.get(report_url)
add_link_to_source_file(last_name, first_name, state_abbr, most_recent_date.year, report_url)
add_link_to_source_file(last_name, first_name, state_abbr, most_recent_date.year, report_url, party)

if 'view/annual' in report_url:
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#grid_items tbody tr")))
@@ -119,8 +119,4 @@ def download_senate_source_data_most_recent(last_name, first_name, state_abbr, h
return True

else:
raise RuntimeError("Unrecognized URL encountered during execution.")

if __name__ == '__main__':
driver = start_chrome_driver(chrome_driver_path=chrome_driver_path, headless=False)
success = download_senate_source_data_most_recent(driver=driver, last_name='Ricketts', state_abbr='NE')
raise RuntimeError("Unrecognized URL encountered during execution.")
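The `start_chrome_driver` helper is only partially visible above (its signature and `return driver`). A typical headless setup consistent with how it is called might look like the following; the option values are standard Selenium 4, but the helper's actual body is an assumption:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def start_chrome_driver(chrome_driver_path, headless=True):
    """Start a Chrome WebDriver, optionally headless (sketch; the real helper may differ)."""
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless=new")  # plain "--headless" on older Chrome builds
    options.add_argument("--window-size=1920,1080")
    service = Service(executable_path=chrome_driver_path)
    return webdriver.Chrome(service=service, options=options)
```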
44 changes: 23 additions & 21 deletions automated_updates/modules/gather/source_file_links.py
@@ -1,14 +1,14 @@
import csv
from config import source_data_dir
from config import source_data_dir, processed_data_dir
import pandas as pd
import os
import re
from collections import defaultdict

def add_link_to_source_file(last_name, first_name, state_abbr, filing_year, link):
def add_link_to_source_file(last_name, first_name, state_abbr, filing_year, link, party):

csv_file_path = os.path.join(source_data_dir, "source_data_links.csv")
csv_columns = ["last_name", "first_name", "state", "filing_year", "link"]
csv_columns = ["last_name", "first_name", "party", "state", "filing_year", "link"]

file_exists = os.path.isfile(csv_file_path)
with open(csv_file_path, mode="a", newline="") as csv_file:
@@ -20,6 +20,7 @@ def add_link_to_source_file(last_name, first_name, state_abbr, filing_year, link
writer.writerow({
"last_name": last_name,
"first_name": first_name,
"party": party,
"state": state_abbr,
"filing_year": filing_year,
"link": link
@@ -36,33 +37,31 @@ def deduplicate_link_source_file():
df.to_csv(csv_file_path, index=False)

def get_new_disclosures():
source_file_path = os.path.join(source_data_dir, "source_data_links.csv")
df = pd.read_csv(source_file_path)
df_sorted = df.sort_values(by=['last_name', 'first_name', 'state', 'filing_year'], ascending=[True, True, True, False])
current_disclosures = df_sorted.drop_duplicates(subset=['last_name', 'first_name', 'state'], keep='first')
old_disclosures = df_sorted[~df_sorted['link'].isin(list(current_disclosures['link']))]

common_identifiers = current_disclosures.merge(
old_disclosures,
on=['last_name', 'first_name', 'state'],
how='inner'
)[['last_name', 'first_name', 'state']]
source_data_links = pd.read_csv(os.path.join(source_data_dir, "source_data_links.csv"))
final_summary_data = pd.read_csv('./final_datasets/final_summary_data.csv')

new_disclosures = current_disclosures.merge(
common_identifiers,
merged_data = source_data_links.merge(
final_summary_data,
on=['last_name', 'first_name', 'state'],
how='inner'
suffixes=('_source', '_final'),
how='left'
)

new_disclosures.to_csv('./new_disclosures.csv', index=False)
current_disclosures.to_csv(source_file_path, index=False)
new_disclosures = merged_data[
(merged_data['filing_year_final'].isna()) | # Not in final_summary_data
(merged_data['filing_year_source'] > merged_data['filing_year_final']) # More recent year
]

new_disclosures = new_disclosures[['last_name', 'first_name', 'state', 'filing_year_source']].rename(
columns={'filing_year_source': 'filing_year'}
)

print("\nNew disclosures since last run:")
if len(new_disclosures): print(new_disclosures)
else: print('None')
print('\n')

return new_disclosures
new_disclosures.to_csv('./new_disclosures.csv', index=False)


def get_outdated_source_files(files):
file_pattern = re.compile(r'(.+?)_(.+?)_(.+?)_(\d{4})_.+')
@@ -84,3 +83,6 @@ def get_outdated_source_files(files):
outdated_files.extend(entry[4] for entry in entries[1:])

return outdated_files

if __name__ == '__main__':
print(get_new_disclosures())
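To make the new detection rule concrete, here is a small self-contained illustration of the merge introduced in `get_new_disclosures` (member names and years are made up):

```python
import pandas as pd

# Toy stand-ins for source_data_links.csv and final_summary_data.csv.
source_data_links = pd.DataFrame({
    "last_name":   ["Lopp",    "Nakamoto", "Finney"],
    "first_name":  ["Jameson", "Satoshi",  "Hal"],
    "state":       ["TX",      "CA",       "CA"],
    "filing_year": [2024,       2023,       2023],
})
final_summary_data = pd.DataFrame({
    "last_name":   ["Lopp",    "Nakamoto"],
    "first_name":  ["Jameson", "Satoshi"],
    "state":       ["TX",      "CA"],
    "filing_year": [2023,       2023],
})

merged = source_data_links.merge(
    final_summary_data,
    on=["last_name", "first_name", "state"],
    suffixes=("_source", "_final"),
    how="left",
)

new = merged[
    merged["filing_year_final"].isna()                              # never processed
    | (merged["filing_year_source"] > merged["filing_year_final"])  # newer filing
]
print(new[["last_name", "filing_year_source"]])
# Lopp (2024 > 2023) and Finney (absent from final_summary_data) are flagged;
# Nakamoto (2023 == 2023) is not.
```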
17 changes: 8 additions & 9 deletions automated_updates/modules/process/parse_house_clean_llm.py
@@ -8,17 +8,16 @@

def assets_from_house_clean_image_to_csv(input_image_path):
base64_image = encode_image(input_image_path)
if False:
# this idea was to skip pages that don't list assets. it is a waste of tokens and doesn't seem to hurt accuracy to just shove all pages
response = send_to_api("is there an assets table in the image with a column called 'Asset'? answer Y or N only.", base64_image)

if response.lower() == 'n':
# not present, skip the page
print('skipped')
return []

# use '|' separator to avoid characters in asset names
response = send_to_api("get the asset names in the 'Asset' column of the Assets table in the public disclosure form. return only a | separated list. no other commentary.", base64_image)
response = send_to_api("This is a public disclosure form for a US congressman from house.gov. Get the asset names in the 'Asset' column of the Assets table. Return only a | separated list. If there are no assets listed, state: 'None'. No other commentary.",
base64_image)

# print(response);print('\n')
if re.sub(r'[^a-zA-Z]', '', response.strip().lower() ) == 'none':
# print('No assets on this page.')
response = ''

asset_list = [response.strip() for response in response.split("|")]

return asset_list
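The `re.sub` check above is what absorbs model refusals and empty pages: it strips everything except letters before comparing against 'none', so replies like 'None.', '**None**', or ' none ' all collapse to an empty asset list. A standalone illustration with made-up replies:

```python
import re

def is_none_response(response):
    """True if the model's reply is some formatting of the word 'None' (sketch)."""
    return re.sub(r"[^a-zA-Z]", "", response.strip().lower()) == "none"

for reply in ["None", "None.", "**None**", " none ", "AAPL | MSFT"]:
    print(repr(reply), "->", is_none_response(reply))
# The first four are treated as "no assets on this page";
# "AAPL | MSFT" is split on "|" into asset names downstream.
```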
21 changes: 9 additions & 12 deletions automated_updates/modules/process/parse_house_messy_llm.py
@@ -8,20 +8,17 @@

def assets_from_house_messy_image_to_csv(input_image_path):
base64_image = encode_image(input_image_path)
if False:
# this idea was to skip pages that don't list assets. it is a waste of tokens and doesn't seem to hurt accuracy to just shove all pages
response = send_to_api(message="is this the schedule A part of the form that lists assets? answer Y or N only.",
base64_image=base64_image)

if response.lower() == 'n':
# not schedule A, skip the page
print('skipped')
return []

# use '|' separator to avoid characters in asset names
response = send_to_api(message="get the asset names in the public disclosure form, return them in a | separated list only, no other commentary.",
base64_image=base64_image,
model="gpt-4o")
response = send_to_api(message="This is a public disclosure form for a US congressman from house.gov. Get the asset names in the form. Return them in a | separated list only. If there are no assets listed, state: 'None'. No other commentary.",
base64_image=base64_image,
model="gpt-4o")

# print(response);print('\n')
if re.sub(r'[^a-zA-Z]', '', response.strip().lower() ) == 'none':
# print('No assets on this page.')
response = ''

asset_list = [response.strip() for response in response.split("|")]

return asset_list
21 changes: 9 additions & 12 deletions automated_updates/modules/process/parse_senate_llm.py
@@ -8,20 +8,17 @@

def assets_from_senate_image_to_csv(input_image_path):
base64_image = encode_image(input_image_path)
if False:
# this idea was to skip pages that don't list assets. it is a waste of tokens and doesn't seem to hurt accuracy to just shove all pages
response = send_to_api(message="does this part of the form list asset disclosures? answer Y or N only.",
base64_image=base64_image)

if response.lower() == 'n':
print('skipped')
# not schedule A, skip the page
return []

# use '|' separator to avoid characters in asset names
response = send_to_api(message="get the asset names in the public disclosure form. return them in a | separated list only. no other commentary.",
base64_image=base64_image,
model='gpt-4o')
response = send_to_api(message="This is a public disclosure form for a US congressman from senate.gov. Get the asset names in the form. Return them in a | separated list only. If there are no assets listed, state: 'None'. No other commentary.",
base64_image=base64_image,
model='gpt-4o')

# print(response);print('\n')
if re.sub(r'[^a-zA-Z]', '', response.strip().lower() ) == 'none':
# print('No assets on this page.')
response = ''

asset_list = [response.strip() for response in response.split("|")]

return asset_list
59 changes: 31 additions & 28 deletions automated_updates/parse_asset_names_llm.py
@@ -38,39 +38,42 @@

# TODO: this should be parallelized, bottleneck will be OpenAI usage tier
# Process folders in house_clean_pdf_dir
for root, dirs, files in os.walk(house_clean_pdf_dir):
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue
root = next(os.walk(house_clean_pdf_dir))[0]
dirs = sorted(next(os.walk(house_clean_pdf_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"\nProgress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_clean_pdf_dir: {folder_path}")
assets_from_house_clean_to_csv_entire_folder(folder_path)
folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_clean_pdf_dir: {folder_path}")
assets_from_house_clean_to_csv_entire_folder(folder_path)

# # Process folders in house_messy_pdf_dir
for root, dirs, files in os.walk(house_messy_pdf_dir):
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue
# Process folders in house_messy_pdf_dir
root = next(os.walk(house_messy_pdf_dir))[0]
dirs = sorted(next(os.walk(house_messy_pdf_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_messy_pdf_dir: {folder_path}")
assets_from_house_messy_to_csv_entire_folder(folder_path)
folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_messy_pdf_dir: {folder_path}")
assets_from_house_messy_to_csv_entire_folder(folder_path)

# Process folders in senate_dir
for root, dirs, files in os.walk(senate_dir):
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue
root = next(os.walk(senate_dir))[0]
dirs = sorted(next(os.walk(senate_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in senate_dir: {folder_path}")
assets_from_senate_to_csv_entire_folder(folder_path)
folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in senate_dir: {folder_path}")
assets_from_senate_to_csv_entire_folder(folder_path)

print('\n')
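The restructuring above relies on `next(os.walk(path))` returning only the top-level `(root, dirs, files)` tuple, and sorting `dirs` gives a deterministic processing order. A small standalone demonstration; the directory name is hypothetical:

```python
import os

# os.walk() is a generator; its first yield describes the top level only.
root, dirs, _files = next(os.walk("all_source_data_pdf"))  # hypothetical path

for folder in sorted(dirs):  # alphabetical, repeatable order
    folder_path = os.path.join(root, folder)
    print("would process:", folder_path)
```

Note that the committed version calls `next(os.walk(...))` twice per directory tree (once for `[0]`, once for `[1]`); unpacking the single tuple, as sketched here, walks the directory only once.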
