Skip to content

Commit

Permalink
scripts to calc/graph funder data sharing by year
Browse files Browse the repository at this point in the history
  • Loading branch information
agt24 committed Aug 23, 2024
1 parent 61c6c81 commit 2eda432
Show file tree
Hide file tree
Showing 7 changed files with 592 additions and 0 deletions.
32 changes: 32 additions & 0 deletions scripts/biomedical_research_funders.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"Name","Country","Acronym","Estimated Yearly Funds (USD)","Grant Code Format Example"
"National Institutes of Health","USA","NIH",41000000000,
"European Commission","EU","EC",13500000000,101000000
"National Natural Science Foundation of China","China","NSFC",4500000000,81630001
"German Research Foundation","Germany","DFG",3500000000,"SFB 1361"
"Japan Agency for Medical Research and Development","Japan","AMED",1500000000,JP20fk0108104
"Wellcome Trust","UK","WT",1200000000,"209031/Z/17/Z"
"Canadian Institutes of Health Research","Canada","CIHR",1000000000,"FDN-148477"
"Medical Research Council","UK","MRC",900000000,"MR/N003713/1"
"Howard Hughes Medical Institute","USA","HHMI",750000000,GT10178
"Bill & Melinda Gates Foundation","USA","BMGF",5000000000,OPP1191684
"National Cancer Institute","USA","NCI",6400000000,"R01 CA123456"
"National Institute of Allergy and Infectious Diseases","USA","NIAID",6100000000,"R01 AI123456"
"National Institute on Aging","USA","NIA",3900000000,"R01 AG123456"
"National Heart Lung and Blood Institute","USA","NHLBI",3700000000,"R01 HL123456"
"National Institute of General Medical Sciences","USA","NIGMS",3000000000,"R01 GM123456"
"National Institute of Neurological Disorders and Stroke","USA","NINDS",2500000000,"R01 NS123456"
"National Institute of Diabetes and Digestive and Kidney Diseases","USA","NIDDK",2200000000,"R01 DK123456"
"National Institute of Mental Health","USA","NIMH",2100000000,"R01 MH123456-01A1"
"National Institute of Child Health and Human Development","USA","NICHD",1600000000,"R01 HD123456"
"National Institute on Drug Abuse","USA","NIDA",1500000000,"R01 DA123456"
"National Institute of Environmental Health Sciences","USA","NIEHS",900000000,"R01 ES123456"
"National Eye Institute","USA","NEI",800000000,"R01 EY123456"
"National Human Genome Research Institute","USA","NHGRI",600000000,"R01 HG123456"
"National Institute of Arthritis and Musculoskeletal and Skin Diseases","USA","NIAMS",650000000,"R01 AR123456"
"National Institute on Alcohol Abuse and Alcoholism","USA","NIAAA",550000000,"R01 AA123456"
"National Institute of Dental and Craniofacial Research","USA","NIDCR",500000000,"R01 DE123456"
"National Library of Medicine","USA","NLM",450000000,"R01 LM123456"
"National Institute of Biomedical Imaging and Bioengineering","USA","NIBIB",400000000,"R01 EB123456"
"National Institute on Minority Health and Health Disparities","USA","NIMHD",400000000,"R01 MD123456"
"National Institute of Nursing Research","USA","NINR",180000000,"R01 NR123456"
"National Center for Complementary and Integrative Health","USA","NCCIH",150000000,"R01 AT123456"
185 changes: 185 additions & 0 deletions scripts/funder-line-graph_v15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import pandas as pd
import matplotlib.pyplot as plt
import sys
import logging
from tabulate import tabulate
import argparse
import itertools

def setup_logging(log_level):
    """Configure the root logger from a textual level name.

    Args:
        log_level: Name of a standard logging level in any letter case,
            for example "debug" or "INFO".

    Raises:
        ValueError: If the upper-cased name does not resolve to an integer
            attribute of the ``logging`` module.
    """
    resolved_level = getattr(logging, log_level.upper(), None)
    if isinstance(resolved_level, int):
        logging.basicConfig(
            level=resolved_level,
            format='%(levelname)s: %(message)s',
        )
        return
    raise ValueError(f'Invalid log level: {log_level}')

def get_acronym(name):
    """Build an acronym from the capitalised words of *name*.

    The name is split on whitespace; the first character of every word that
    starts with an upper-case letter is kept, and all other words (for
    example "of", "and", "&") are ignored.

    Args:
        name: Full name, e.g. "National Institutes of Health".

    Returns:
        The concatenated initials, e.g. "NIH"; an empty string when no word
        starts with an upper-case letter.
    """
    initials = []
    for token in name.split():
        leading = token[0]
        if leading.isupper():
            initials.append(leading.upper())
    return ''.join(initials)

def print_results_table(results, percentages, source_acronyms):
    """Print a grid with one row per year and count/percentage columns per funder.

    Args:
        results: Mapping of funding source -> {year: count}.
        percentages: Mapping of funding source -> {year: percentage}.
        source_acronyms: Mapping of funding source -> acronym; its values
            label the columns.

    Years missing for a given source are shown as 0 (count) and 0.00%
    (percentage). The table is written to stdout via ``tabulate``.
    """
    acronyms = list(source_acronyms.values())
    headers = ["Year"]
    headers.extend(f"{acr} (Count)" for acr in acronyms)
    headers.extend(f"{acr} (%)" for acr in acronyms)

    # Collect every year that appears for at least one source.
    all_years = set()
    for per_year_counts in results.values():
        all_years.update(per_year_counts)

    table_data = [
        [year]
        + [per_year.get(year, 0) for per_year in results.values()]
        + [f"{per_year.get(year, 0):.2f}%" for per_year in percentages.values()]
        for year in sorted(all_years)
    ]

    print(tabulate(table_data, headers=headers, tablefmt="grid"))

def _rank_by_row_total(per_source):
    """Return a source-by-year DataFrame ordered by descending row total."""
    frame = pd.DataFrame(per_source).T  # rows = sources, columns = years
    frame['Total'] = frame.sum(axis=1)
    frame = frame.sort_values('Total', ascending=False)
    return frame.drop('Total', axis=1)


def _save_line_plot(frame, ylabel, title, png_filename, xticks=None):
    """Draw one line per row of *frame* over its sorted columns and save it as a PNG."""
    # Four line styles are cycled so that neighbouring lines are easier to tell apart.
    line_styles = itertools.cycle(['-', '--', '-.', ':'])
    years = sorted(frame.columns)

    plt.figure(figsize=(15, 10))
    for source, line_style in zip(frame.index, line_styles):
        values = [frame.loc[source, year] for year in years]
        plt.plot(years, values, marker='o', label=source, linestyle=line_style)
    plt.xlabel('Year')
    plt.ylabel(ylabel)
    plt.title(title)
    # The legend is anchored outside the axes, to the right of the plot.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.grid(True)
    if xticks is not None:
        plt.xticks(xticks)
    plt.savefig(png_filename, dpi=300, bbox_inches='tight')
    plt.close()


def create_plots_and_csv(results, percentages, source_acronyms, output_filename_base):
    """Write count/percentage CSV files and line plots for every funding source.

    Four files are produced, all prefixed with *output_filename_base*:
    ``_count.csv`` / ``_count.png`` (all years) and
    ``_percentage_2015_2020.csv`` / ``_percentage_2015_2020.png``
    (years 2015 through 2020 only). Rows are funder acronyms ordered by
    descending row total.

    Args:
        results: Mapping of funding source -> {year: count}.
        percentages: Mapping of funding source -> {year: percentage}.
        source_acronyms: Mapping of funding source -> acronym used as the
            row label and legend entry.
        output_filename_base: Path prefix for the generated files.
    """
    # Counts: CSV first, then the plot.
    count_df = _rank_by_row_total(
        {source_acronyms[source]: data for source, data in results.items()}
    )
    count_csv_filename = f'{output_filename_base}_count.csv'
    count_df.to_csv(count_csv_filename)
    logging.info(f"Count data has been saved to '{count_csv_filename}'")

    _save_line_plot(
        count_df,
        ylabel='Number of TRUE values in is_code_pred',
        title='Funding Sources and Code Predictions Over Time (Count)',
        png_filename=f'{output_filename_base}_count.png',
    )

    # Percentages: restricted to the 2015-2020 window before ranking.
    percentage_df = _rank_by_row_total({
        source_acronyms[source]: {
            year: perc for year, perc in data.items() if 2015 <= year <= 2020
        }
        for source, data in percentages.items()
    })
    percentage_csv_filename = f'{output_filename_base}_percentage_2015_2020.csv'
    percentage_df.to_csv(percentage_csv_filename)
    logging.info(f"Percentage data has been saved to '{percentage_csv_filename}'")

    _save_line_plot(
        percentage_df,
        ylabel='Percentage of TRUE values in is_code_pred',
        title='Funding Sources and Code Predictions Over Time (Percentage, 2015-2020)',
        png_filename=f'{output_filename_base}_percentage_2015_2020.png',
        xticks=range(2015, 2021),
    )

def main(csv_filename, log_level):
    """Report, per funder and per year, how many rows have 'is_code_pred' set.

    The CSV is expected to hold the funder flags in columns 1-31 (read as
    bool by position) plus 'year' and 'is_code_pred' columns. For each year
    and funder the function counts rows where both the funder flag and
    'is_code_pred' are true, and expresses that as a percentage of the rows
    with the funder flag set. It prints a table, writes plots/CSV files via
    ``create_plots_and_csv`` and logs an overall summary.

    Args:
        csv_filename: Path to the input CSV file.
        log_level: Logging level name passed to ``setup_logging``.

    Exits with status 1 when the file cannot be read, a required column is
    missing, or no row has a usable year.
    """
    setup_logging(log_level)

    logging.info(f"Reading file '{csv_filename}'")

    try:
        # Positions 1-31 are the funder flag columns (see funding_sources below).
        df = pd.read_csv(csv_filename, dtype={col: bool for col in range(1, 32)})
        logging.info(f"Successfully read CSV file. Shape: {df.shape}")
    except FileNotFoundError:
        logging.error(f"File '{csv_filename}' not found.")
        sys.exit(1)
    except pd.errors.EmptyDataError:
        logging.error(f"File '{csv_filename}' is empty.")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An error occurred while reading the file: {str(e)}")
        sys.exit(1)

    # Fail with a readable message instead of a KeyError traceback further down.
    missing_columns = [col for col in ('year', 'is_code_pred') if col not in df.columns]
    if missing_columns:
        logging.error(f"Required column(s) missing from '{csv_filename}': {', '.join(missing_columns)}")
        sys.exit(1)

    logging.debug("First few rows of the DataFrame:")
    logging.debug(df.head().to_string())

    logging.debug("Data types of columns:")
    logging.debug(df.dtypes)

    logging.debug(f"Unique values in 'year' column: {df['year'].unique()}")

    df['year_numeric'] = pd.to_numeric(df['year'], errors='coerce')
    logging.info(f"Unique years after numeric conversion: {sorted(df['year_numeric'].dropna().unique())}")

    if df['year_numeric'].isna().all():
        logging.warning("No valid numeric years found. Attempting to extract year from 'pmid'.")
        # NOTE(review): confirm that the first four characters of 'pmid' really
        # encode a year in this dataset.
        # Non-numeric prefixes become NaN and are dropped below instead of
        # aborting the run with a ValueError.
        df['year_numeric'] = pd.to_numeric(df['pmid'].astype(str).str[:4], errors='coerce')
        logging.info(f"Unique years extracted from 'pmid': {sorted(df['year_numeric'].dropna().unique())}")

    # Explicit copy so the assignment below does not write into a view of the
    # original frame (avoids pandas' SettingWithCopyWarning).
    df = df.dropna(subset=['year_numeric']).copy()
    df['year_numeric'] = df['year_numeric'].astype(int)  # Convert to integer
    logging.info(f"Shape after dropping NaN years: {df.shape}")

    if df.empty:
        logging.error("No valid data remaining after processing years. Please check the 'year' column in your CSV file.")
        sys.exit(1)

    funding_sources = df.columns[1:32]
    source_acronyms = {source: get_acronym(source) for source in funding_sources}
    logging.debug(f"Funding sources: {', '.join(source_acronyms.values())}")

    results = {source: {} for source in funding_sources}
    percentages = {source: {} for source in funding_sources}

    for year in df['year_numeric'].unique():
        year_key = int(year)
        year_data = df[df['year_numeric'] == year]
        has_code = year_data['is_code_pred']
        logging.debug(f"Processing year {year}, {len(year_data)} rows")
        for source in funding_sources:
            funded = year_data[source]
            total_count = funded.sum()
            code_pred_count = year_data[funded & has_code].shape[0]
            results[source][year_key] = code_pred_count
            # A funder with no rows in this year gets 0% rather than a division by zero.
            percentages[source][year_key] = (code_pred_count / total_count * 100) if total_count > 0 else 0
            logging.debug(f"{source_acronyms[source]} - {code_pred_count} TRUE values, {percentages[source][year_key]:.2f}%")

    print("\nResults Table:")
    print_results_table(results, percentages, source_acronyms)

    if all(len(data) == 0 for data in results.values()):
        logging.warning("No data to plot. All counts are zero.")
    else:
        create_plots_and_csv(results, percentages, source_acronyms, 'funding_sources_code_predictions')
        logging.info("Graphs and CSV files have been saved.")

    logging.info(f"Total rows in DataFrame: {len(df)}")
    logging.info(f"Unique years: {sorted(df['year_numeric'].unique())}")
    logging.info("Funding sources summary:")
    for source in funding_sources:
        true_count = df[source].sum()
        code_pred_count = df[df[source] & df['is_code_pred']].shape[0]
        percentage = (code_pred_count / true_count * 100) if true_count > 0 else 0
        logging.info(f"  {source_acronyms[source]}: {code_pred_count} TRUE out of {true_count} ({percentage:.2f}%)")

    code_pred_count = df['is_code_pred'].sum()
    total_count = len(df)
    code_pred_percentage = (code_pred_count / total_count * 100) if total_count > 0 else 0
    logging.info(f"Total 'is_code_pred' TRUE values: {code_pred_count} out of {total_count} ({code_pred_percentage:.2f}%)")

if __name__ == "__main__":
    # Command-line entry point: one positional CSV path plus an optional log level.
    cli = argparse.ArgumentParser(
        description="Analyze funding sources and code predictions from CSV data."
    )
    cli.add_argument("csv_file", help="Path to the input CSV file")
    cli.add_argument(
        "--log",
        default="INFO",
        help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
    )
    options = cli.parse_args()

    main(options.csv_file, options.log)
103 changes: 103 additions & 0 deletions scripts/funder-mapping-chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
import logging
import sys
import re

# TODO(AGT, 2024-08-23): this version is still untested, and its logging still needs to be revised.

# Log records carry a timestamp, the logger name, the level and the message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Funder reference table; its 'Name' and 'Acronym' columns are read by process_chunks.
# The relative path is resolved against the current working directory.
funders_df = pd.read_csv('biomedical_research_funders.csv')

# Input columns that are cleaned and searched for funder names and acronyms.
funding_columns = [
    'fund_text',
    'fund_pmc_institute',
    'fund_pmc_source',
    'fund_pmc_anysource',
]

def validate_pmcid(pmcid):
    """Return True only when *pmcid* is a strictly positive integer.

    Accepted types are Python ``int`` and NumPy integer scalars. Booleans
    are rejected explicitly: ``bool`` is a subclass of ``int``, so ``True``
    would otherwise pass as the PMCID 1. Missing values (None, NaN),
    floats, strings and any other type are invalid.

    Args:
        pmcid: Candidate identifier of any type.

    Returns:
        A plain Python ``bool``.
    """
    if isinstance(pmcid, (bool, np.bool_)):
        return False
    if not isinstance(pmcid, (int, np.integer)):
        return False
    # bool() keeps the return type uniform for NumPy scalars as well.
    return bool(pmcid > 0)

def data_cleaning_processing(df, columns=None):
    """Strip punctuation and symbols from the funding text columns.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is deleted; whitespace itself is kept.
    Only columns holding text (object or string dtype) are processed.

    Args:
        df: Input frame; it is not modified.
        columns: Column names to clean. Defaults to the module-level
            ``funding_columns``.

    Returns:
        A cleaned copy of *df*.
    """
    if columns is None:
        columns = funding_columns

    # Work on a copy so the caller's frame (often a filtered slice) is left intact.
    cleaned = df.copy()
    for col in columns:
        series = cleaned[col]
        if series.dtype == 'object' or pd.api.types.is_string_dtype(series.dtype):
            cleaned[col] = series.str.replace(r'[^\w\s]', '', regex=True)
    return cleaned

def _funder_pattern(label, whole_word):
    """Return regex source matching *label* in punctuation-stripped text, or None if unusable."""
    if not isinstance(label, str):
        return None
    # Apply the same punctuation stripping as the text cleaning step, so that a
    # label such as "Bill & Melinda Gates Foundation" can still match.
    tokens = re.sub(r'[^\w\s]', '', label).split()
    if not tokens:
        return None
    body = r'\s+'.join(re.escape(token) for token in tokens)
    return rf'\b{body}\b' if whole_word else body


def funder_mapping(chunk, funder_names, funder_acronyms, columns=None):
    """Flag, for every row of *chunk*, which funders are mentioned in its funding text.

    A funder is flagged when its full name or its acronym is found in any of
    the text columns. Matching is case-insensitive. Full names match as
    substrings (any run of whitespace between words); acronyms must match as
    whole words so that, for example, "EC" is not found inside "research".
    Every match is logged at INFO level.

    Args:
        chunk: Frame with a 'pmcid' column and the funding text columns.
        funder_names: Full funder names; each becomes an output column.
        funder_acronyms: Acronyms, paired by position with *funder_names*.
        columns: Text columns to search. Defaults to the module-level
            ``funding_columns``.

    Returns:
        A frame with the 'pmcid' column followed by one boolean column per
        funder name, sharing the index of *chunk*.
    """
    if columns is None:
        columns = funding_columns
    # Same object as the module-level logger; looked up here so the function
    # does not depend on module initialisation order.
    log = logging.getLogger(__name__)

    text_columns = [
        column for column in columns
        if chunk[column].dtype == 'object'
        or pd.api.types.is_string_dtype(chunk[column].dtype)
    ]

    output_chunk = pd.DataFrame(chunk['pmcid'])

    for name, acronym in zip(funder_names, funder_acronyms):
        # Build both patterns once per funder instead of once per column and row.
        searches = []
        for label, whole_word in ((name, False), (acronym, True)):
            pattern = _funder_pattern(label, whole_word)
            if pattern is not None:
                searches.append((label, pattern, re.compile(pattern, re.IGNORECASE)))

        matched = pd.Series(False, index=chunk.index)
        for column in text_columns:
            for label, pattern, finder in searches:
                hits = chunk[column].str.contains(
                    pattern, case=False, na=False, regex=True
                ).astype(bool)
                matched |= hits
                for pmcid, text in zip(chunk.loc[hits, 'pmcid'], chunk.loc[hits, column]):
                    found = finder.search(str(text))
                    if found:
                        log.info(f"PMCID {pmcid}: Found {label} in {column}: {found.group()}")

        output_chunk[name] = matched.astype(bool)

    return output_chunk

def process_chunks(chunk_size=10000):
    """Read a CSV from stdin in chunks and build the PMCID-by-funder matrix.

    Each chunk has its 'pmcid_pmc' column (if present) renamed to 'pmcid',
    rows with invalid PMCIDs are set aside, and the remaining rows are
    cleaned and mapped to funders. Funder names and acronyms come from the
    module-level ``funders_df``.

    Args:
        chunk_size: Number of CSV rows read per chunk.

    Returns:
        One frame with a 'pmcid' column and one column per funder name. It
        has no rows when the input contained no data rows.

    Raises:
        KeyError: If the input has neither a 'pmcid' nor a 'pmcid_pmc' column.
        ValueError: If any PMCID is invalid; each one is logged first.
    """
    funder_names = funders_df['Name'].tolist()
    funder_acronyms = funders_df['Acronym'].tolist()

    output_chunks = []
    invalid_pmcids = []

    for chunk in pd.read_csv(sys.stdin, chunksize=chunk_size):
        if 'pmcid_pmc' in chunk.columns:
            chunk = chunk.rename(columns={'pmcid_pmc': 'pmcid'})
        if 'pmcid' not in chunk.columns:
            raise KeyError("Input must contain a 'pmcid' or 'pmcid_pmc' column")

        # astype(bool) keeps the mask usable even for a chunk without rows.
        valid_mask = chunk['pmcid'].apply(validate_pmcid).astype(bool)
        invalid_pmcids.extend(chunk.loc[~valid_mask, 'pmcid'].tolist())

        # Copy the filtered rows so later steps never write into a slice of the chunk.
        chunk = chunk[valid_mask].copy()
        if chunk.empty:
            continue

        chunk = data_cleaning_processing(chunk)
        output_chunks.append(funder_mapping(chunk, funder_names, funder_acronyms))

    if invalid_pmcids:
        for pmcid in invalid_pmcids:
            logger.error(f"Invalid PMCID: '{pmcid}'")
        raise ValueError("Invalid PMCIDs found in the input. Please check the log for details.")

    if not output_chunks:
        # pd.concat([]) raises; return an empty matrix with the expected columns instead.
        return pd.DataFrame(columns=['pmcid', *funder_names])

    return pd.concat(output_chunks, ignore_index=True)

def output_to_file(output_df: pd.DataFrame, path: str = 'pmcid-funding-matrix.csv') -> None:
    """Write *output_df* as CSV with booleans spelled TRUE/FALSE.

    Args:
        output_df: Frame to write; it is not modified.
        path: Destination file. Defaults to 'pmcid-funding-matrix.csv' in
            the current working directory.
    """
    output = output_df.replace({True: 'TRUE', False: 'FALSE'})
    output.to_csv(path, index=False)

def main():
    """Build the funding matrix from stdin and write it to disk.

    Any exception raised along the way is logged and the process exits
    with status 1.
    """
    try:
        output_to_file(process_chunks())
    except Exception as error:
        logger.error(f"Failed to process data: {error}")
        sys.exit(1)

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 2eda432

Please sign in to comment.