Skip to content

Commit

Permalink
update cron job
Browse files Browse the repository at this point in the history
  • Loading branch information
MuslemRahimi committed Aug 7, 2024
1 parent 064b7ad commit 57c07d0
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 53 deletions.
85 changes: 38 additions & 47 deletions app/cron_corporate_lobbying.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
import pandas as pd
from dotenv import load_dotenv
import requests


from collections import defaultdict
import math
from fuzzywuzzy import process
import sqlite3
import concurrent.futures

BASE_SESSION = requests.Session()

Expand Down Expand Up @@ -345,60 +348,48 @@ def save_json(symbol, data):
with open(f"json/corporate-lobbying/companies/{symbol}.json", 'w') as file:
json.dump(data, file)

def create_dataset():
from fuzzywuzzy import process
import sqlite3
import math
from collections import defaultdict

def process_stock(stock, csv_files, reports_folder, threshold):
    """Aggregate per-year lobbying amounts for one stock and save them.

    Every CSV in ``csv_files`` is read from ``reports_folder``; rows whose
    ``ClientName`` fuzzy-matches the stock's name with a score of at least
    ``threshold`` contribute their ``AmountReported`` to the total of their
    ``FilingYear``. If any year was matched, the totals are written through
    ``save_json`` as a list of ``{'year': ..., 'amount': ...}`` dicts.

    Args:
        stock: Mapping with at least the keys ``'name'`` and ``'symbol'``.
        csv_files: File names (relative to ``reports_folder``) to scan.
        reports_folder: Directory that contains the CSV reports.
        threshold: Minimum fuzzy-match score for a row to count.
    """
    print(stock['name'])
    year_totals = defaultdict(float)
    stock_name_lower = stock['name'].lower()

    for csv_file in csv_files:
        print(csv_file)
        df = pd.read_csv(
            os.path.join(reports_folder, csv_file),
            usecols=['ClientName', 'AmountReported', 'FilingYear'],
        )

        # Rows without a client name cannot be matched against the stock name.
        df = df.dropna(subset=['ClientName'])
        if df.empty:
            continue

        client_names = df['ClientName'].astype(str).str.lower()
        scores = client_names.apply(
            lambda client: process.extractOne(stock_name_lower, [client])[1]
        )
        matched = df[scores >= threshold]

        # Coerce non-numeric amounts to NaN so the sum skips them instead of
        # failing or concatenating strings.
        amounts = pd.to_numeric(matched['AmountReported'], errors='coerce')
        per_year = amounts.groupby(matched['FilingYear']).sum()

        # Accumulate across files: dict.update() would replace a year's total
        # with the value from whichever CSV happened to be read last.
        for year, amount in per_year.items():
            year_totals[year] += float(amount)

    all_res_list = [{'year': year, 'amount': amount} for year, amount in year_totals.items()]

    if all_res_list:
        save_json(stock['symbol'], all_res_list)
        print(f"Saved data for {stock['symbol']} ({len(all_res_list)} matches)")

def create_dataset():
reports_folder = "json/corporate-lobbying/reports"
threshold = 95
csv_files = [f for f in os.listdir(reports_folder) if f.endswith('.csv')]

con = sqlite3.connect('stocks.db')
cursor = con.cursor()
cursor.execute("PRAGMA journal_mode = wal")
cursor.execute("SELECT DISTINCT symbol,name FROM stocks WHERE marketCap >= 1E9 AND symbol NOT LIKE '%.%'")
cursor.execute("SELECT DISTINCT symbol, name FROM stocks WHERE marketCap >= 10E9 AND symbol NOT LIKE '%.%' AND symbol NOT LIKE '%-%'")
stock_data = [{'symbol': row[0], 'name': row[1]} for row in cursor.fetchall()]
print(len(stock_data))
con.close()
# Set a threshold for similarity (0-100)
threshold = 95


# Get all CSV files in the reports folder
reports_folder = 'json/corporate-lobbying/reports'
csv_files = [f for f in os.listdir(reports_folder) if f.endswith('.csv')]

for stock in stock_data:
all_res_list = []
print(stock['name'])
for csv_file in csv_files:
# Read the CSV file into a DataFrame
print(csv_file)
df = pd.read_csv(os.path.join(reports_folder, csv_file))

# Convert the DataFrame to a list of dictionaries
df_list = df.to_dict(orient='records')

for item in df_list:
company_name = item['ClientName']

best_match, score = process.extractOne(stock['name'].lower(), [company_name.lower()])
if score >= threshold:
all_res_list.append({'amount': item['AmountReported'], 'year': item['FilingYear']})

all_res_list = [item for item in all_res_list if isinstance(item.get("amount"), (int, float)) and not math.isnan(item["amount"])]

# Group amounts by year
year_totals = defaultdict(float)
for item in all_res_list:
year_totals[item['year']] += item['amount']

all_res_list = [{'year': year, 'amount': amount} for year, amount in year_totals.items()]

with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
executor.map(lambda stock: process_stock(stock, csv_files, reports_folder, threshold), stock_data)

if len(all_res_list) > 0:
save_json(stock['symbol'], all_res_list)
print(f"Saved data for {stock['symbol']} ({len(all_res_list)} matches)")

if '__main__' == __name__:

#get_historical_data()
get_historical_data()
#update_latest_quarter()
create_dataset()
#create_dataset()
8 changes: 2 additions & 6 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1922,24 +1922,20 @@ def remove_text_before_operator(text):
return "Operator not found in the text."



def extract_names_and_descriptions(text):
    """Split ``text`` into ``{'name', 'description'}`` entries.

    An entry starts with a name (a capital letter followed by one or more
    letters or whitespace characters), then a colon and whitespace. Its
    description runs until the next line that starts with such a name and a
    colon, or until the end of the text.

    Args:
        text: The text to parse.

    Returns:
        A list of dicts with the stripped ``'name'`` and ``'description'`` of
        each entry, in order of appearance; empty when nothing matches.
    """
    # The lookahead requires a name of at least two characters before the
    # colon, so a line such as "A: ..." inside a description does not end it.
    pattern = r'([A-Z][a-zA-Z\s]+):\s+(.*?)(?=\n[A-Z][a-zA-Z\s]+:|$)'
    # DOTALL lets a description span multiple lines.
    matches = re.findall(pattern, text, re.DOTALL)

    return [
        {'name': name.strip(), 'description': description.strip()}
        for name, description in matches
    ]



@app.post("/earnings-call-transcripts")
async def get_earnings_call_transcripts(data:TranscriptData, api_key: str = Security(get_api_key)):
data = data.dict()
Expand Down

0 comments on commit 57c07d0

Please sign in to comment.