-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scripts to calc/graph funder data sharing by year
- Loading branch information
Showing
7 changed files
with
592 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
"Name","Country","Acronym","Estimated Yearly Funds (USD)","Grant Code Format Example" | ||
"National Institutes of Health","USA","NIH",41000000000, | ||
"European Commission","EU","EC",13500000000,101000000 | ||
"National Natural Science Foundation of China","China","NSFC",4500000000,81630001 | ||
"German Research Foundation","Germany","DFG",3500000000,"SFB 1361" | ||
"Japan Agency for Medical Research and Development","Japan","AMED",1500000000,JP20fk0108104 | ||
"Wellcome Trust","UK","WT",1200000000,"209031/Z/17/Z" | ||
"Canadian Institutes of Health Research","Canada","CIHR",1000000000,"FDN-148477" | ||
"Medical Research Council","UK","MRC",900000000,"MR/N003713/1" | ||
"Howard Hughes Medical Institute","USA","HHMI",750000000,GT10178 | ||
"Bill & Melinda Gates Foundation","USA","BMGF",5000000000,OPP1191684 | ||
"National Cancer Institute","USA","NCI",6400000000,"R01 CA123456" | ||
"National Institute of Allergy and Infectious Diseases","USA","NIAID",6100000000,"R01 AI123456" | ||
"National Institute on Aging","USA","NIA",3900000000,"R01 AG123456" | ||
"National Heart Lung and Blood Institute","USA","NHLBI",3700000000,"R01 HL123456" | ||
"National Institute of General Medical Sciences","USA","NIGMS",3000000000,"R01 GM123456" | ||
"National Institute of Neurological Disorders and Stroke","USA","NINDS",2500000000,"R01 NS123456" | ||
"National Institute of Diabetes and Digestive and Kidney Diseases","USA","NIDDK",2200000000,"R01 DK123456" | ||
"National Institute of Mental Health","USA","NIMH",2100000000,"R01 MH123456-01A1" | ||
"National Institute of Child Health and Human Development","USA","NICHD",1600000000,"R01 HD123456" | ||
"National Institute on Drug Abuse","USA","NIDA",1500000000,"R01 DA123456" | ||
"National Institute of Environmental Health Sciences","USA","NIEHS",900000000,"R01 ES123456" | ||
"National Eye Institute","USA","NEI",800000000,"R01 EY123456" | ||
"National Human Genome Research Institute","USA","NHGRI",600000000,"R01 HG123456" | ||
"National Institute of Arthritis and Musculoskeletal and Skin Diseases","USA","NIAMS",650000000,"R01 AR123456" | ||
"National Institute on Alcohol Abuse and Alcoholism","USA","NIAAA",550000000,"R01 AA123456" | ||
"National Institute of Dental and Craniofacial Research","USA","NIDCR",500000000,"R01 DE123456" | ||
"National Library of Medicine","USA","NLM",450000000,"R01 LM123456" | ||
"National Institute of Biomedical Imaging and Bioengineering","USA","NIBIB",400000000,"R01 EB123456" | ||
"National Institute on Minority Health and Health Disparities","USA","NIMHD",400000000,"R01 MD123456" | ||
"National Institute of Nursing Research","USA","NINR",180000000,"R01 NR123456" | ||
"National Center for Complementary and Integrative Health","USA","NCCIH",150000000,"R01 AT123456" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import sys | ||
import logging | ||
from tabulate import tabulate | ||
import argparse | ||
import itertools | ||
|
||
def setup_logging(log_level):
    """Configure the root logger from a textual level name.

    Args:
        log_level: Name of a ``logging`` level such as ``"DEBUG"`` or
            ``"info"``; the name is upper-cased before lookup.

    Raises:
        ValueError: If the upper-cased name is not an integer attribute of
            the ``logging`` module.
    """
    numeric_level = getattr(logging, log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f'Invalid log level: {log_level}')
    logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
|
||
def get_acronym(name):
    """Build an acronym from the capitalised words of ``name``.

    Words are split on whitespace; only words whose first character is
    upper-case contribute, so connectives such as "of" or "&" are skipped.

    Args:
        name: Full name, e.g. ``"National Institutes of Health"``.

    Returns:
        The concatenated initials (``"NIH"`` for the example above), or an
        empty string when no word starts with an upper-case character.
    """
    # str.split() never yields empty strings, so word[0] is always safe.
    return ''.join(word[0] for word in name.split() if word[0].isupper())
|
||
def print_results_table(results, percentages, source_acronyms):
    """Print a grid table of yearly counts and percentages for every source.

    The table has one row per year and, for each source, one "(Count)"
    column followed (after all count columns) by one "(%)" column.

    Args:
        results: Mapping ``source -> {year: count}``.
        percentages: Mapping ``source -> {year: percentage}``.
        source_acronyms: Mapping ``source -> acronym``; its key order fixes
            the column order of the table.
    """
    # Drive headers AND cells from the same ordering so a column can never be
    # labelled with another source's acronym.
    sources = list(source_acronyms)
    headers = (
        ["Year"]
        + [f"{source_acronyms[source]} (Count)" for source in sources]
        + [f"{source_acronyms[source]} (%)" for source in sources]
    )
    years = sorted({year for per_year in results.values() for year in per_year})

    table_data = []
    for year in years:
        counts = [results.get(source, {}).get(year, 0) for source in sources]
        shares = [
            f"{percentages.get(source, {}).get(year, 0):.2f}%" for source in sources
        ]
        table_data.append([year, *counts, *shares])

    print(tabulate(table_data, headers=headers, tablefmt="grid"))
|
||
def _rank_by_total(per_source):
    """Return a source-by-year frame, rows ordered by descending row sum."""
    frame = pd.DataFrame(per_source).T  # rows: sources, columns: years
    order = frame.sum(axis=1).sort_values(ascending=False).index
    # Keep the year columns chronological so the CSV reads left to right.
    return frame.loc[order].sort_index(axis=1)


def _plot_yearly_lines(frame, ylabel, title, png_filename, xticks=None):
    """Draw one line per row of ``frame`` against its year columns and save a PNG."""
    line_styles = itertools.cycle(['-', '--', '-.', ':'])
    years = sorted(frame.columns)

    plt.figure(figsize=(15, 10))
    for source in frame.index:
        values = [frame.loc[source, year] for year in years]
        plt.plot(years, values, marker='o', label=source, linestyle=next(line_styles))
    plt.xlabel('Year')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.grid(True)
    if xticks is not None:
        plt.xticks(xticks)
    plt.savefig(png_filename, dpi=300, bbox_inches='tight')
    plt.close()


def create_plots_and_csv(results, percentages, source_acronyms, output_filename_base):
    """Write count/percentage CSVs and line plots for every funding source.

    Four files are produced, all prefixed with ``output_filename_base``:
    ``_count.csv``, ``_count.png``, ``_percentage_2015_2020.csv`` and
    ``_percentage_2015_2020.png``. Rows (sources, labelled by acronym) are
    ordered by descending row total; the percentage outputs only include
    the years 2015 through 2020.

    Args:
        results: Mapping ``source -> {year: count}``.
        percentages: Mapping ``source -> {year: percentage}``.
        source_acronyms: Mapping ``source -> acronym`` used as row labels.
        output_filename_base: Path prefix for the generated files.
    """
    # --- Counts -----------------------------------------------------------
    count_df = _rank_by_total(
        {source_acronyms[source]: data for source, data in results.items()}
    )
    count_csv_filename = f'{output_filename_base}_count.csv'
    count_df.to_csv(count_csv_filename)
    logging.info(f"Count data has been saved to '{count_csv_filename}'")

    _plot_yearly_lines(
        count_df,
        ylabel='Number of TRUE values in is_code_pred',
        title='Funding Sources and Code Predictions Over Time (Count)',
        png_filename=f'{output_filename_base}_count.png',
    )

    # --- Percentages, restricted to 2015-2020 -----------------------------
    percentage_df = _rank_by_total({
        source_acronyms[source]: {
            year: perc for year, perc in data.items() if 2015 <= year <= 2020
        }
        for source, data in percentages.items()
    })
    percentage_csv_filename = f'{output_filename_base}_percentage_2015_2020.csv'
    percentage_df.to_csv(percentage_csv_filename)
    logging.info(f"Percentage data has been saved to '{percentage_csv_filename}'")

    _plot_yearly_lines(
        percentage_df,
        ylabel='Percentage of TRUE values in is_code_pred',
        title='Funding Sources and Code Predictions Over Time (Percentage, 2015-2020)',
        png_filename=f'{output_filename_base}_percentage_2015_2020.png',
        xticks=range(2015, 2021),
    )
|
||
def _load_matrix(csv_filename):
    """Read the input CSV, logging the problem and exiting with status 1 on failure."""
    try:
        # Columns 1-31 (by position) are parsed as booleans.
        df = pd.read_csv(csv_filename, dtype=dict.fromkeys(range(1, 32), bool))
    except FileNotFoundError:
        logging.error(f"File '{csv_filename}' not found.")
        sys.exit(1)
    except pd.errors.EmptyDataError:
        logging.error(f"File '{csv_filename}' is empty.")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An error occurred while reading the file: {str(e)}")
        sys.exit(1)
    logging.info(f"Successfully read CSV file. Shape: {df.shape}")
    return df


def _require_columns(df, columns):
    """Exit with status 1 (after logging) if any of ``columns`` is absent from ``df``."""
    missing = [col for col in columns if col not in df.columns]
    if missing:
        logging.error(f"Input is missing required column(s): {', '.join(missing)}")
        sys.exit(1)


def _with_numeric_year(df):
    """Return a copy of ``df`` with an integer ``year_numeric`` column; rows without a year are dropped."""
    df['year_numeric'] = pd.to_numeric(df['year'], errors='coerce')
    logging.info(f"Unique years after numeric conversion: {sorted(df['year_numeric'].dropna().unique())}")

    if df['year_numeric'].isna().all():
        logging.warning("No valid numeric years found. Attempting to extract year from 'pmid'.")
        _require_columns(df, ['pmid'])
        # Coerce instead of astype(float): a non-numeric prefix becomes NaN
        # (and is dropped below) rather than aborting the whole run.
        df['year_numeric'] = pd.to_numeric(df['pmid'].astype(str).str[:4], errors='coerce')
        logging.info(f"Unique years extracted from 'pmid': {sorted(df['year_numeric'].dropna().unique())}")

    # .copy() so the assignment below writes to an independent frame, not a slice.
    df = df.dropna(subset=['year_numeric']).copy()
    df['year_numeric'] = df['year_numeric'].astype(int)
    return df


def _tally_by_year(df, funding_sources, source_acronyms):
    """Return ``(results, percentages)``: per-source ``{year: value}`` dicts."""
    results = {source: {} for source in funding_sources}
    percentages = {source: {} for source in funding_sources}

    for year in sorted(df['year_numeric'].unique()):
        year = int(year)
        year_data = df[df['year_numeric'] == year]
        is_code = year_data['is_code_pred']
        logging.debug(f"Processing year {year}, {len(year_data)} rows")
        for source in funding_sources:
            funded = year_data[source]
            total_count = funded.sum()
            code_pred_count = int((funded & is_code).sum())
            results[source][year] = code_pred_count
            percentages[source][year] = (code_pred_count / total_count * 100) if total_count > 0 else 0
            logging.debug(f"{source_acronyms[source]} - {code_pred_count} TRUE values, {percentages[source][year]:.2f}%")

    return results, percentages


def _log_summary(df, funding_sources, source_acronyms):
    """Log overall totals per funding source and for ``is_code_pred`` as a whole."""
    logging.info(f"Total rows in DataFrame: {len(df)}")
    logging.info(f"Unique years: {sorted(df['year_numeric'].unique())}")
    logging.info("Funding sources summary:")
    for source in funding_sources:
        true_count = df[source].sum()
        code_pred_count = int((df[source] & df['is_code_pred']).sum())
        percentage = (code_pred_count / true_count * 100) if true_count > 0 else 0
        logging.info(f"  {source_acronyms[source]}: {code_pred_count} TRUE out of {true_count} ({percentage:.2f}%)")

    code_pred_count = df['is_code_pred'].sum()
    total_count = len(df)
    code_pred_percentage = (code_pred_count / total_count * 100) if total_count > 0 else 0
    logging.info(f"Total 'is_code_pred' TRUE values: {code_pred_count} out of {total_count} ({code_pred_percentage:.2f}%)")


def main(csv_filename, log_level):
    """Analyse, per funding source and year, how many rows have ``is_code_pred`` set.

    Reads ``csv_filename``, treats the columns at positions 1-31 as boolean
    funding-source flags, prints a results table, writes CSV/PNG outputs
    with the ``funding_sources_code_predictions`` prefix and logs a summary.
    Unrecoverable input problems are logged and terminate the process with
    exit status 1.

    Args:
        csv_filename: Path to the input CSV. It must contain ``year`` and
            ``is_code_pred`` columns; ``pmid`` is used as a fallback source
            for the year when ``year`` holds no numeric values.
        log_level: Logging level name passed to ``setup_logging``.
    """
    setup_logging(log_level)

    logging.info(f"Reading file '{csv_filename}'")
    df = _load_matrix(csv_filename)
    _require_columns(df, ['year', 'is_code_pred'])

    logging.debug("First few rows of the DataFrame:")
    logging.debug(df.head().to_string())
    logging.debug("Data types of columns:")
    logging.debug(df.dtypes)
    logging.debug(f"Unique values in 'year' column: {df['year'].unique()}")

    # Pick the funder columns before any helper column is appended, so the
    # positional slice can only ever cover columns that came from the file.
    funding_sources = df.columns[1:32]

    df = _with_numeric_year(df)
    logging.info(f"Shape after dropping NaN years: {df.shape}")

    if df.empty:
        logging.error("No valid data remaining after processing years. Please check the 'year' column in your CSV file.")
        sys.exit(1)

    source_acronyms = {source: get_acronym(source) for source in funding_sources}
    logging.debug(f"Funding sources: {', '.join(source_acronyms.values())}")

    results, percentages = _tally_by_year(df, funding_sources, source_acronyms)

    print("\nResults Table:")
    print_results_table(results, percentages, source_acronyms)

    if not any(results.values()):
        logging.warning("No data to plot. All counts are zero.")
    else:
        create_plots_and_csv(results, percentages, source_acronyms, 'funding_sources_code_predictions')
        logging.info("Graphs and CSV files have been saved.")

    _log_summary(df, funding_sources, source_acronyms)
|
||
if __name__ == "__main__":
    # Command-line entry point: one positional CSV path plus an optional log level.
    parser = argparse.ArgumentParser(description="Analyze funding sources and code predictions from CSV data.")
    parser.add_argument("csv_file", help="Path to the input CSV file")
    parser.add_argument("--log", default="INFO", help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)")
    args = parser.parse_args()

    main(args.csv_file, args.log)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
import pandas as pd | ||
import numpy as np | ||
import logging | ||
import sys | ||
import re | ||
|
||
# TODO(AGT, 2024-08-23): this version still needs to be tested and its logging revised.

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Funder reference table. It is read at import time from a path relative to
# the current working directory; its 'Name' and 'Acronym' columns are used
# when mapping funders.
funders_df = pd.read_csv('biomedical_research_funders.csv')

# Input columns that are cleaned and searched for funder mentions.
funding_columns = ['fund_text', 'fund_pmc_institute',
                   'fund_pmc_source', 'fund_pmc_anysource']
|
||
def validate_pmcid(pmcid):
    """Return True when ``pmcid`` is a positive integer.

    Only Python ``int`` and NumPy integer scalars qualify. Missing values,
    strings, floats (including integral ones such as ``12.0``), booleans and
    containers are all rejected.

    Args:
        pmcid: Candidate identifier of any type.

    Returns:
        A plain ``bool``.
    """
    # bool is a subclass of int, so rule it out explicitly: True is not an id.
    if isinstance(pmcid, (bool, np.bool_)):
        return False
    # Type check first: it also rejects None/NaN and keeps list/array input
    # from reaching a comparison whose truth value would be ambiguous.
    if not isinstance(pmcid, (int, np.integer)):
        return False
    return bool(pmcid > 0)
|
||
def data_cleaning_processing(df):
    """Strip punctuation and symbols from the funding text columns.

    Every character that is neither a word character nor whitespace is
    deleted from each column listed in ``funding_columns``; whitespace is
    kept. Columns whose dtype is not ``object`` are left untouched.

    Args:
        df: Frame containing all ``funding_columns``.

    Returns:
        A cleaned copy of ``df``. The input is not modified, so passing a
        filtered slice of another frame does not write back into it.
    """
    cleaned = df.copy()
    for col in funding_columns:
        if cleaned[col].dtype == 'object':  # Only process string columns
            # Raw string: '\w' / '\s' are regex classes, not string escapes.
            cleaned[col] = cleaned[col].str.replace(r'[^\w\s]', '', regex=True)
    return cleaned
|
||
def _log_matches(chunk, column, hits, label, pattern):
    """Log one INFO line per row of ``chunk`` flagged in ``hits`` for ``column``."""
    if not hits.any():
        return
    for pmcid, text in zip(chunk.loc[hits, 'pmcid'], chunk.loc[hits, column]):
        match = pattern.search(str(text))
        if match:
            logger.info("PMCID %s: Found %s in %s: %s", pmcid, label, column, match.group())


def funder_mapping(chunk, funder_names, funder_acronyms):
    """Flag, for each row of ``chunk``, which funders its funding text mentions.

    A funder is flagged when any ``object``-dtype column in
    ``funding_columns`` contains either its full name (literal,
    case-insensitive substring) or its acronym (case-insensitive, as a whole
    token). Every hit is also logged at INFO level.

    Args:
        chunk: Frame with a ``pmcid`` column and all ``funding_columns``.
        funder_names: Full funder names; each becomes an output column.
        funder_acronyms: Acronyms, parallel to ``funder_names``.

    Returns:
        A new frame with ``pmcid`` plus one boolean column per funder name.
    """
    output_chunk = chunk[['pmcid']].copy()
    text_columns = [col for col in funding_columns if chunk[col].dtype == 'object']

    for name, acronym in zip(funder_names, funder_acronyms):
        # Escape both so they are matched literally, never as regex syntax.
        name_regex = re.escape(name)
        # Acronyms must stand alone: as a bare substring "EC" would match
        # "research" and "NIA" would match "California".
        # NOTE(review): data_cleaning_processing deletes punctuation without
        # inserting a space, so "NIH/NCI" reaches this point as "NIHNCI" and
        # neither acronym matches - confirm this trade-off is acceptable.
        acronym_regex = rf"(?<!\w){re.escape(acronym)}(?!\w)"
        searches = (
            (name, name_regex, re.compile(name_regex, re.IGNORECASE)),
            (acronym, acronym_regex, re.compile(acronym_regex, re.IGNORECASE)),
        )

        matched = pd.Series(False, index=chunk.index)
        for column in text_columns:
            for label, regex, compiled in searches:
                hits = chunk[column].str.contains(regex, case=False, na=False, regex=True)
                matched |= hits
                _log_matches(chunk, column, hits, label, compiled)
        output_chunk[name] = matched

    return output_chunk
|
||
def process_chunks(chunk_size=10000):
    """Read a CSV from standard input in chunks and build the funder matrix.

    Each chunk is validated (``pmcid`` must be a positive integer), cleaned
    and mapped against the funders listed in ``funders_df``. A ``pmcid_pmc``
    column is accepted as an alias for ``pmcid``.

    Args:
        chunk_size: Number of CSV rows read per chunk.

    Returns:
        A frame with ``pmcid`` and one boolean column per funder name. It
        has no rows (but the full set of columns) when the input contained
        no data rows.

    Raises:
        KeyError: If the input has neither a ``pmcid`` nor a ``pmcid_pmc``
            column.
        ValueError: If any row has an invalid PMCID; each offending value is
            logged at ERROR level first.
    """
    funder_names = funders_df['Name'].tolist()
    funder_acronyms = funders_df['Acronym'].tolist()

    output_chunks = []
    invalid_pmcids = []

    for chunk in pd.read_csv(sys.stdin, chunksize=chunk_size):
        if 'pmcid_pmc' in chunk.columns:
            chunk = chunk.rename(columns={'pmcid_pmc': 'pmcid'})
        if 'pmcid' not in chunk.columns:
            raise KeyError("Input must contain a 'pmcid' or 'pmcid_pmc' column")

        # Validate PMCIDs; keep collecting across chunks so the log lists all of them.
        invalid_mask = ~chunk['pmcid'].apply(validate_pmcid).astype(bool)
        if invalid_mask.any():
            invalid_pmcids.extend(chunk.loc[invalid_mask, 'pmcid'].tolist())

        # Copy so later cleaning works on an independent frame, not a slice.
        chunk = chunk.loc[~invalid_mask].copy()

        if not chunk.empty:
            chunk = data_cleaning_processing(chunk)
            output_chunks.append(funder_mapping(chunk, funder_names, funder_acronyms))

    if invalid_pmcids:
        for pmcid in invalid_pmcids:
            logger.error(f"Invalid PMCID: '{pmcid}'")
        raise ValueError("Invalid PMCIDs found in the input. Please check the log for details.")

    if not output_chunks:
        # pd.concat([]) raises an opaque "No objects to concatenate" error.
        logger.warning("No data rows found in the input; the result is empty.")
        return pd.DataFrame(columns=['pmcid', *funder_names])

    return pd.concat(output_chunks, ignore_index=True)
|
||
def output_to_file(output_df: pd.DataFrame):
    """Write ``output_df`` to ``pmcid-funding-matrix.csv`` with TRUE/FALSE flags.

    Boolean values are written as the strings ``TRUE`` / ``FALSE``; all
    other values are written unchanged. The input frame is not modified.

    Args:
        output_df: Frame to serialise (no index column is written).
    """
    def as_flag(value):
        # Convert real booleans only, so numeric ids equal to 1 or 0 can
        # never be mistaken for True / False.
        if isinstance(value, (bool, np.bool_)):
            return 'TRUE' if value else 'FALSE'
        return value

    output = output_df.copy()
    for column in output.select_dtypes(include=['bool', 'object']).columns:
        output[column] = output[column].map(as_flag)
    output.to_csv('pmcid-funding-matrix.csv', index=False)
|
||
def main():
    """Build the funder matrix from standard input and write it to disk.

    Any exception raised while processing or writing is logged at ERROR
    level and the process exits with status 1.
    """
    try:
        output_df = process_chunks()
        output_to_file(output_df)
    except Exception as e:
        # Top-level boundary of the script: report the failure and stop.
        logger.error(f"Failed to process data: {e}")
        sys.exit(1)
|
||
if __name__ == "__main__":
    # Run only when executed as a script, not when imported.
    main()
Oops, something went wrong.