Skip to content

Commit

Permalink
Merge pull request #41 from chance-on-brink/master
Browse files Browse the repository at this point in the history
minor update, include members with no asset disclosures in final output
  • Loading branch information
jlopp authored Nov 28, 2024
2 parents 6d39794 + 575053c commit 3b69904
Show file tree
Hide file tree
Showing 8 changed files with 20,317 additions and 15,989 deletions.
1,069 changes: 534 additions & 535 deletions automated_updates/all_source_data/source_data_links.csv

Large diffs are not rendered by default.

29 changes: 16 additions & 13 deletions automated_updates/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
'btc',
'bitcoin',
# bitcoin etfs
'arka',
# 'arka', # too many false positives
'arkb',
'bitb',
'bitc',
Expand All @@ -20,8 +20,8 @@
'bitx',
'brrr',
'btco',
'btcw',
'defi',
'btcw',
# 'defi', # too many false positives
'ezbc',
'fbtc',
'grayscale',
Expand All @@ -36,25 +36,25 @@
'arkz',
'ceth',
'eeth',
'etha',
# 'etha', # too many false positives
'ethd',
'ethe',
# 'ethe', # too many false positives
'etht',
'ethu',
'ethv',
'feth',
'seth',
# 'seth', # too many false positives
# mixed crypto etfs
'bete',
'beth',
# 'bete', # too many false positives
# 'beth', # too many false positives
'bitq',
'bits',
'bitw',
'blkc',
'btf',
# 'btf', # too many false positives
'btop',
'dapp',
'sato',
# 'sato', # too many false positives
'spbc',
'stce',
# miners
Expand Down Expand Up @@ -99,9 +99,9 @@
'\(sq\)',
# shitcoins
'avalanche',
'avax',
# 'avax', # too many false positives
'binance',
'bch',
# 'bch', # too many false positives
'cardano',
'chainlink',
'dogecoin',
Expand Down Expand Up @@ -133,7 +133,10 @@
'Marriott',
'Pershing',
'Quonset',
'Squibb'
'Squibb',
'CALIF PRSMULTCCN EGID COR',
'XRPO Inc PC 360',
'Canaan LLC',
]

source_data_dir = './all_source_data/'
Expand Down
33,577 changes: 18,682 additions & 14,895 deletions automated_updates/final_datasets/final_asset_data.csv

Large diffs are not rendered by default.

1,069 changes: 534 additions & 535 deletions automated_updates/final_datasets/final_summary_data.csv

Large diffs are not rendered by default.

535 changes: 535 additions & 0 deletions automated_updates/final_datasets/final_summary_data.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions automated_updates/modules/gather/source_file_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def get_new_disclosures():
columns={'filing_year_source': 'filing_year'}
)

print('new disclosures:')
if len(new_disclosures): print(new_disclosures)
else: print('None')
print('\n')
Expand Down
10 changes: 4 additions & 6 deletions automated_updates/parse_asset_names_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
from modules.process.parse_house_messy_llm import assets_from_house_messy_to_csv_entire_folder
from modules.process.parse_senate_llm import assets_from_senate_to_csv_entire_folder
from modules.gather.source_file_links import get_outdated_source_files
# TODO: these parse files share a lot of code.
# could all be the same module, dynamically get paths and prompts

import os
import argparse
Expand Down Expand Up @@ -42,12 +40,12 @@
dirs = sorted(next(os.walk(house_clean_pdf_dir))[1])
for folder in dirs:
processed_folders += 1
print(f"\nProgress: {processed_folders} / {total_folders}")
print(f"Progress: {processed_folders} / {total_folders}")
if folder in skips: continue
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_clean_pdf_dir: {folder_path}")
print(f"\nProcessing: {folder_path}")
assets_from_house_clean_to_csv_entire_folder(folder_path)

# Process folders in house_messy_pdf_dir
Expand All @@ -60,7 +58,7 @@
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in house_messy_pdf_dir: {folder_path}")
print(f"\nProcessing: {folder_path}")
assets_from_house_messy_to_csv_entire_folder(folder_path)

# Process folders in senate_dir
Expand All @@ -73,7 +71,7 @@
if args.new_only and folder.replace('_house','').replace('_senate','') not in new_disclosure_names: continue

folder_path = os.path.join(root, folder)
print(f"\nProcessing folder in senate_dir: {folder_path}")
print(f"\nProcessing: {folder_path}")
assets_from_senate_to_csv_entire_folder(folder_path)

print('\n')
16 changes: 11 additions & 5 deletions automated_updates/summarize_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,17 @@ def combine_processed_data():
# Remove rows with blank or null asset_name
df = df[df['asset_name'].notna() & (df['asset_name'] != '')]

df['last_name'] = last_name
df['first_name'] = first_name
df['state'] = state
df['year'] = year
df['chamber'] = house_senate
# If no asset rows remain after filtering, add a single placeholder row (asset_name=None) so the member still appears in the final output
if df.empty:
df = pd.DataFrame({'asset_name': [None], 'last_name': [last_name],
'first_name': [first_name], 'state': [state],
'year': [year], 'chamber': [house_senate]})
else:
df['last_name'] = last_name
df['first_name'] = first_name
df['state'] = state
df['year'] = year
df['chamber'] = house_senate

dataframes.append(df)

Expand Down

0 comments on commit 3b69904

Please sign in to comment.