scripts to calc/graph funder data sharing by year

nimh-dsst · Aug 23, 2024 · 2eda432 · 2eda432
1 parent 61c6c81
commit 2eda432
Show file tree

Hide file tree

Showing 7 changed files with 592 additions and 0 deletions.
diff --git a/scripts/biomedical_research_funders.csv b/scripts/biomedical_research_funders.csv
@@ -0,0 +1,32 @@
+"Name","Country","Acronym","Estimated Yearly Funds (USD)","Grant Code Format Example"
+"National Institutes of Health","USA","NIH",41000000000,
+"European Commission","EU","EC",13500000000,101000000
+"National Natural Science Foundation of China","China","NSFC",4500000000,81630001
+"German Research Foundation","Germany","DFG",3500000000,"SFB 1361"
+"Japan Agency for Medical Research and Development","Japan","AMED",1500000000,JP20fk0108104
+"Wellcome Trust","UK","WT",1200000000,"209031/Z/17/Z"
+"Canadian Institutes of Health Research","Canada","CIHR",1000000000,"FDN-148477"
+"Medical Research Council","UK","MRC",900000000,"MR/N003713/1"
+"Howard Hughes Medical Institute","USA","HHMI",750000000,GT10178
+"Bill & Melinda Gates Foundation","USA","BMGF",5000000000,OPP1191684
+"National Cancer Institute","USA","NCI",6400000000,"R01 CA123456"
+"National Institute of Allergy and Infectious Diseases","USA","NIAID",6100000000,"R01 AI123456"
+"National Institute on Aging","USA","NIA",3900000000,"R01 AG123456"
+"National Heart Lung and Blood Institute","USA","NHLBI",3700000000,"R01 HL123456"
+"National Institute of General Medical Sciences","USA","NIGMS",3000000000,"R01 GM123456"
+"National Institute of Neurological Disorders and Stroke","USA","NINDS",2500000000,"R01 NS123456"
+"National Institute of Diabetes and Digestive and Kidney Diseases","USA","NIDDK",2200000000,"R01 DK123456"
+"National Institute of Mental Health","USA","NIMH",2100000000,"R01 MH123456-01A1"
+"National Institute of Child Health and Human Development","USA","NICHD",1600000000,"R01 HD123456"
+"National Institute on Drug Abuse","USA","NIDA",1500000000,"R01 DA123456"
+"National Institute of Environmental Health Sciences","USA","NIEHS",900000000,"R01 ES123456"
+"National Eye Institute","USA","NEI",800000000,"R01 EY123456"
+"National Human Genome Research Institute","USA","NHGRI",600000000,"R01 HG123456"
+"National Institute of Arthritis and Musculoskeletal and Skin Diseases","USA","NIAMS",650000000,"R01 AR123456"
+"National Institute on Alcohol Abuse and Alcoholism","USA","NIAAA",550000000,"R01 AA123456"
+"National Institute of Dental and Craniofacial Research","USA","NIDCR",500000000,"R01 DE123456"
+"National Library of Medicine","USA","NLM",450000000,"R01 LM123456"
+"National Institute of Biomedical Imaging and Bioengineering","USA","NIBIB",400000000,"R01 EB123456"
+"National Institute on Minority Health and Health Disparities","USA","NIMHD",400000000,"R01 MD123456"
+"National Institute of Nursing Research","USA","NINR",180000000,"R01 NR123456"
+"National Center for Complementary and Integrative Health","USA","NCCIH",150000000,"R01 AT123456"
diff --git a/scripts/funder-line-graph_v15.py b/scripts/funder-line-graph_v15.py
@@ -0,0 +1,185 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import sys
+import logging
+from tabulate import tabulate
+import argparse
+import itertools
+
+def setup_logging(log_level):
+    """Configure root logging from a level name such as ``"debug"`` or ``"INFO"``.
+
+    The name is upper-cased and looked up as an attribute of the ``logging``
+    module. A ``ValueError`` is raised when that lookup does not yield an
+    integer level.
+    """
+    level_name = log_level.upper()
+    resolved = getattr(logging, level_name, None)
+    if isinstance(resolved, int):
+        logging.basicConfig(level=resolved, format='%(levelname)s: %(message)s')
+        return
+    raise ValueError(f'Invalid log level: {log_level}')
+
+def get_acronym(name):
+    """Build an acronym from the capitalised words of ``name``.
+
+    Words whose first character is not upper case (for example "of", "and"
+    or "&") contribute nothing to the result.
+    """
+    initials = []
+    for token in name.split():
+        first = token[0]
+        if first.isupper():
+            initials.append(first.upper())
+    return ''.join(initials)
+
+def print_results_table(results, percentages, source_acronyms):
+    """Print a grid table of per-year counts and percentages for each source.
+
+    Args:
+        results: mapping of source name -> {year: count}.
+        percentages: mapping of source name -> {year: percentage}.
+        source_acronyms: mapping of source name -> acronym shown in the headers.
+
+    The column order is taken from ``source_acronyms`` for both the headers
+    and the cells, so a value cannot end up under another source's header
+    when the three mappings were built in different orders. A source or year
+    that is missing from ``results`` / ``percentages`` is shown as 0.
+    """
+    sources = list(source_acronyms)
+    headers = (
+        ["Year"]
+        + [f"{source_acronyms[src]} (Count)" for src in sources]
+        + [f"{source_acronyms[src]} (%)" for src in sources]
+    )
+    years = sorted({year for per_year in results.values() for year in per_year})
+
+    table_data = []
+    for year in years:
+        row = [year]
+        row.extend(results.get(src, {}).get(year, 0) for src in sources)
+        row.extend(f"{percentages.get(src, {}).get(year, 0):.2f}%" for src in sources)
+        table_data.append(row)
+
+    print(tabulate(table_data, headers=headers, tablefmt="grid"))
+
+def _rank_by_total(frame):
+    """Return ``frame`` with columns ascending and rows ordered by row sum, largest first."""
+    frame = frame.reindex(columns=sorted(frame.columns))
+    row_order = frame.sum(axis=1).sort_values(ascending=False).index
+    return frame.loc[row_order]
+
+
+def _save_line_plot(frame, ylabel, title, png_filename, xticks=None):
+    """Plot one line per row of ``frame`` (x axis = columns) and save it as a PNG."""
+    style_cycler = itertools.cycle(['-', '--', '-.', ':'])
+    x_values = list(frame.columns)
+
+    plt.figure(figsize=(15, 10))
+    for label, row in frame.iterrows():
+        plt.plot(x_values, row.tolist(), marker='o', label=label, linestyle=next(style_cycler))
+    plt.xlabel('Year')
+    plt.ylabel(ylabel)
+    plt.title(title)
+    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.grid(True)
+    if xticks is not None:
+        plt.xticks(xticks)
+    plt.tight_layout()
+    plt.savefig(png_filename, dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def create_plots_and_csv(results, percentages, source_acronyms, output_filename_base,
+                         start_year=2015, end_year=2020):
+    """Write count/percentage CSV files and line plots for every funding source.
+
+    Args:
+        results: mapping of source name -> {year: count}.
+        percentages: mapping of source name -> {year: percentage}.
+        source_acronyms: mapping of source name -> acronym used as row label
+            and legend entry.
+        output_filename_base: prefix of the four files that are written.
+        start_year: first year (inclusive) of the percentage window.
+        end_year: last year (inclusive) of the percentage window.
+
+    Files written (with the default window):
+        ``<base>_count.csv`` / ``<base>_count.png`` for all years, and
+        ``<base>_percentage_2015_2020.csv`` / ``.png`` for the window.
+
+    In both tables rows are sources ordered by their row total (largest
+    first) and columns are years in ascending order, so the CSV files and
+    the plotted lines are chronological regardless of the order in which
+    years were encountered in the input.
+    """
+    # Counts: all years.
+    count_df = _rank_by_total(
+        pd.DataFrame({source_acronyms[source]: data for source, data in results.items()}).T
+    )
+    count_csv_filename = f'{output_filename_base}_count.csv'
+    count_df.to_csv(count_csv_filename)
+    logging.info(f"Count data has been saved to '{count_csv_filename}'")
+    _save_line_plot(
+        count_df,
+        ylabel='Number of TRUE values in is_code_pred',
+        title='Funding Sources and Code Predictions Over Time (Count)',
+        png_filename=f'{output_filename_base}_count.png',
+    )
+
+    # Percentages: restricted to the requested year window.
+    window_tag = f'{start_year}_{end_year}'
+    percentage_data = {
+        source_acronyms[source]: {
+            year: perc for year, perc in data.items() if start_year <= year <= end_year
+        }
+        for source, data in percentages.items()
+    }
+    percentage_df = _rank_by_total(pd.DataFrame(percentage_data).T)
+    percentage_csv_filename = f'{output_filename_base}_percentage_{window_tag}.csv'
+    percentage_df.to_csv(percentage_csv_filename)
+    logging.info(f"Percentage data has been saved to '{percentage_csv_filename}'")
+    _save_line_plot(
+        percentage_df,
+        ylabel='Percentage of TRUE values in is_code_pred',
+        title=f'Funding Sources and Code Predictions Over Time (Percentage, {start_year}-{end_year})',
+        png_filename=f'{output_filename_base}_percentage_{window_tag}.png',
+        xticks=range(start_year, end_year + 1),
+    )
+
+def main(csv_filename, log_level):
+    """Summarise ``is_code_pred`` per funding source and year from a CSV file.
+
+    Args:
+        csv_filename: path of the input CSV. Columns 1-31 (by position) are
+            read as boolean funding-source flags; ``year`` and
+            ``is_code_pred`` columns are required.
+        log_level: logging level name passed to ``setup_logging``.
+
+    The function prints a results table, writes the plots/CSV files through
+    ``create_plots_and_csv`` and logs an overall summary. Any problem with
+    the input (unreadable file, missing column, no usable year) is logged as
+    an error and ends the process with exit status 1.
+    """
+    setup_logging(log_level)
+
+    logging.info(f"Reading file '{csv_filename}'")
+
+    try:
+        df = pd.read_csv(csv_filename, dtype={col: bool for col in range(1, 32)})
+        logging.info(f"Successfully read CSV file. Shape: {df.shape}")
+    except FileNotFoundError:
+        logging.error(f"File '{csv_filename}' not found.")
+        sys.exit(1)
+    except pd.errors.EmptyDataError:
+        logging.error(f"File '{csv_filename}' is empty.")
+        sys.exit(1)
+    except Exception as e:
+        logging.error(f"An error occurred while reading the file: {str(e)}")
+        sys.exit(1)
+
+    # Fail with a readable message instead of a KeyError traceback further down.
+    missing_columns = [col for col in ('year', 'is_code_pred') if col not in df.columns]
+    if missing_columns:
+        logging.error(f"Required column(s) missing from '{csv_filename}': {', '.join(missing_columns)}")
+        sys.exit(1)
+
+    logging.debug("First few rows of the DataFrame:")
+    logging.debug(df.head().to_string())
+
+    logging.debug("Data types of columns:")
+    logging.debug(df.dtypes)
+
+    logging.debug(f"Unique values in 'year' column: {df['year'].unique()}")
+
+    df['year_numeric'] = pd.to_numeric(df['year'], errors='coerce')
+    logging.info(f"Unique years after numeric conversion: {sorted(df['year_numeric'].dropna().unique())}")
+
+    if df['year_numeric'].isna().all():
+        if 'pmid' not in df.columns:
+            logging.error("No valid numeric years found and no 'pmid' column to fall back on.")
+            sys.exit(1)
+        logging.warning("No valid numeric years found. Attempting to extract year from 'pmid'.")
+        # NOTE(review): the first four characters of a PMID are not obviously a
+        # publication year - confirm this fallback is intended.
+        # errors='coerce' turns non-numeric prefixes into NaN (dropped below)
+        # instead of aborting with a ValueError.
+        df['year_numeric'] = pd.to_numeric(df['pmid'].astype(str).str[:4], errors='coerce')
+        logging.info(f"Unique years extracted from 'pmid': {sorted(df['year_numeric'].dropna().unique())}")
+
+    # .copy() gives an independent frame, so the assignment below is not a
+    # chained assignment on a slice of the original DataFrame.
+    df = df.dropna(subset=['year_numeric']).copy()
+    df['year_numeric'] = df['year_numeric'].astype(int)  # Convert to integer
+    logging.info(f"Shape after dropping NaN years: {df.shape}")
+
+    if df.empty:
+        logging.error("No valid data remaining after processing years. Please check the 'year' column in your CSV file.")
+        sys.exit(1)
+
+    # Same positional window as the dtype mapping passed to read_csv above.
+    funding_sources = df.columns[1:32]
+    source_acronyms = {source: get_acronym(source) for source in funding_sources}
+    logging.debug(f"Funding sources: {', '.join(source_acronyms.values())}")
+
+    results = {source: {} for source in funding_sources}
+    percentages = {source: {} for source in funding_sources}
+
+    for year in df['year_numeric'].unique():
+        year_data = df[df['year_numeric'] == year]
+        logging.debug(f"Processing year {year}, {len(year_data)} rows")
+        for source in funding_sources:
+            # Denominator: rows flagged for this source; numerator: those that
+            # are also flagged in is_code_pred.
+            total_count = year_data[source].sum()
+            code_pred_count = year_data[year_data[source] & year_data['is_code_pred']].shape[0]
+            results[source][int(year)] = code_pred_count
+            percentages[source][int(year)] = (code_pred_count / total_count * 100) if total_count > 0 else 0
+            logging.debug(f"{source_acronyms[source]} - {code_pred_count} TRUE values, {percentages[source][int(year)]:.2f}%")
+
+    print("\nResults Table:")
+    print_results_table(results, percentages, source_acronyms)
+
+    # NOTE(review): this tests for "no year recorded", not for "all counts are
+    # zero" as the message says - confirm which one is intended.
+    if all(len(data) == 0 for data in results.values()):
+        logging.warning("No data to plot. All counts are zero.")
+    else:
+        create_plots_and_csv(results, percentages, source_acronyms, 'funding_sources_code_predictions')
+        logging.info("Graphs and CSV files have been saved.")
+
+    logging.info(f"Total rows in DataFrame: {len(df)}")
+    logging.info(f"Unique years: {sorted(df['year_numeric'].unique())}")
+    logging.info("Funding sources summary:")
+    for source in funding_sources:
+        true_count = df[source].sum()
+        code_pred_count = df[df[source] & df['is_code_pred']].shape[0]
+        percentage = (code_pred_count / true_count * 100) if true_count > 0 else 0
+        logging.info(f"  {source_acronyms[source]}: {code_pred_count} TRUE out of {true_count} ({percentage:.2f}%)")
+
+    code_pred_count = df['is_code_pred'].sum()
+    total_count = len(df)
+    code_pred_percentage = (code_pred_count / total_count * 100) if total_count > 0 else 0
+    logging.info(f"Total 'is_code_pred' TRUE values: {code_pred_count} out of {total_count} ({code_pred_percentage:.2f}%)")
+
+if __name__ == "__main__":
+    # Command-line entry point: one positional CSV path plus an optional --log level.
+    cli = argparse.ArgumentParser(
+        description="Analyze funding sources and code predictions from CSV data."
+    )
+    cli.add_argument("csv_file", help="Path to the input CSV file")
+    cli.add_argument(
+        "--log",
+        default="INFO",
+        help="Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)",
+    )
+    options = cli.parse_args()
+
+    main(options.csv_file, options.log)
diff --git a/scripts/funder-mapping-chunks.py b/scripts/funder-mapping-chunks.py
@@ -0,0 +1,103 @@
+import pandas as pd
+import numpy as np
+import logging
+import sys
+import re
+
+# still need to test this version and modify logging AGT 2024-08-23
+
+# Configure the root logger (INFO level, timestamped format) at import time.
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Funder reference table, loaded at import time. The path is relative, so it
+# is resolved against the current working directory. The 'Name' and 'Acronym'
+# columns are the ones read by process_chunks().
+funders_df = pd.read_csv('biomedical_research_funders.csv')
+
+# Input columns that are cleaned and then searched for funder names/acronyms.
+funding_columns = ['fund_text', 'fund_pmc_institute',
+                   'fund_pmc_source', 'fund_pmc_anysource']
+
+def validate_pmcid(pmcid):
+    """Return True when ``pmcid`` is a positive integer.
+
+    Python ``int`` and NumPy integer scalars are accepted. Booleans are
+    rejected explicitly: ``bool`` is a subclass of ``int``, so ``True`` would
+    otherwise be accepted as the PMCID 1. Missing values, floats and strings
+    are invalid. The result is always a plain ``bool``.
+    """
+    if isinstance(pmcid, (bool, np.bool_)):
+        return False
+    if not isinstance(pmcid, (int, np.integer)):
+        return False
+    return bool(pmcid > 0)
+
+def data_cleaning_processing(df):
+    """Strip punctuation and symbols from the funding text columns.
+
+    Every character that is neither a word character nor whitespace is
+    removed from each object-typed column listed in ``funding_columns``;
+    whitespace itself is kept. ``df`` is modified in place and also returned.
+    """
+    for col in funding_columns:
+        if df[col].dtype == 'object':  # Only process string columns
+            # Raw string: \w and \s are regex classes, not Python string escapes.
+            df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
+    return df
+
+def _funder_patterns(name, acronym):
+    """Build the (name, acronym) regex patterns for one funder; ``None`` when unusable.
+
+    The name is split into word tokens that may be separated by any run of
+    non-word characters, so "Bill & Melinda Gates Foundation" matches both the
+    raw text and text whose punctuation has been stripped. The acronym must
+    appear as a whole word, which stops "EC" from matching "project" or "NCI"
+    from matching "financial". All literal text is regex-escaped.
+    """
+    name_pattern = None
+    if pd.notna(name):
+        tokens = re.sub(r'[^\w\s]', ' ', str(name)).split()
+        if tokens:
+            name_pattern = r'\b' + r'\W+'.join(re.escape(tok) for tok in tokens) + r'\b'
+
+    acronym_pattern = None
+    if pd.notna(acronym) and str(acronym).strip():
+        acronym_pattern = r'\b' + re.escape(str(acronym).strip()) + r'\b'
+
+    return name_pattern, acronym_pattern
+
+
+def _log_matches(chunk, column, mask, pattern, label):
+    """Log, for every row selected by ``mask``, the text in ``column`` that matched."""
+    compiled = re.compile(pattern, re.IGNORECASE)
+    for pmcid, text in zip(chunk.loc[mask, 'pmcid'], chunk.loc[mask, column]):
+        match = compiled.search(str(text))
+        if match:
+            logger.info(f"PMCID {pmcid}: Found {label} in {column}: {match.group()}")
+
+
+def funder_mapping(chunk, funder_names, funder_acronyms):
+    """Map funders for a chunk of data.
+
+    Args:
+        chunk: DataFrame with a ``pmcid`` column and the ``funding_columns``.
+        funder_names: funder names; each becomes a boolean output column.
+        funder_acronyms: acronyms, parallel to ``funder_names``.
+
+    Returns:
+        DataFrame with ``pmcid`` plus one boolean column per funder name that
+        is True when the name or the acronym was found in any text-typed
+        funding column of that row. Matching is case-insensitive; acronyms
+        only match as whole words. Every match is logged at INFO level.
+    """
+    output_chunk = pd.DataFrame(chunk['pmcid'])
+
+    # Accept classic object columns as well as dedicated string dtypes.
+    text_columns = [
+        column for column in funding_columns
+        if pd.api.types.is_object_dtype(chunk[column])
+        or pd.api.types.is_string_dtype(chunk[column])
+    ]
+
+    for name, acronym in zip(funder_names, funder_acronyms):
+        name_pattern, acronym_pattern = _funder_patterns(name, acronym)
+        found = pd.Series(False, index=chunk.index)
+
+        for column in text_columns:
+            for label, pattern in ((name, name_pattern), (acronym, acronym_pattern)):
+                if pattern is None:
+                    continue
+                matches = chunk[column].str.contains(pattern, case=False, na=False, regex=True)
+                matches = matches.astype(bool)
+                if matches.any():
+                    found |= matches
+                    _log_matches(chunk, column, matches, pattern, label)
+
+        output_chunk[name] = found
+
+    return output_chunk
+
+def process_chunks(chunk_size=10000):
+    """Read CSV data from standard input in chunks and map funders for each chunk.
+
+    Args:
+        chunk_size: number of rows read from stdin per chunk.
+
+    Returns:
+        One DataFrame with ``pmcid`` and a boolean column per funder name,
+        concatenated over all chunks. When the input holds no data rows the
+        result is an empty DataFrame with those same columns.
+
+    Raises:
+        ValueError: if any row has an invalid PMCID. Every offending value is
+            logged first; the error is raised only after all input was read.
+    """
+    funder_names = funders_df['Name'].tolist()
+    funder_acronyms = funders_df['Acronym'].tolist()
+
+    output_chunks = []
+    invalid_pmcids = []
+
+    for chunk in pd.read_csv(sys.stdin, chunksize=chunk_size):
+        if 'pmcid_pmc' in chunk.columns:
+            chunk = chunk.rename(columns={'pmcid_pmc': 'pmcid'})
+
+        # Validate PMCIDs; astype(bool) keeps the mask usable for indexing
+        # whatever dtype apply() inferred.
+        valid_mask = chunk['pmcid'].apply(validate_pmcid).astype(bool)
+        if not valid_mask.all():
+            invalid_pmcids.extend(chunk.loc[~valid_mask, 'pmcid'].tolist())
+
+        # Copy so the in-place cleaning below works on an independent frame
+        # rather than on a slice of the chunk that was read.
+        chunk = chunk[valid_mask].copy()
+
+        if not chunk.empty:
+            chunk = data_cleaning_processing(chunk)
+            output_chunk = funder_mapping(chunk, funder_names, funder_acronyms)
+            output_chunks.append(output_chunk)
+
+    if invalid_pmcids:
+        for pmcid in invalid_pmcids:
+            logger.error(f"Invalid PMCID: '{pmcid}'")
+        raise ValueError("Invalid PMCIDs found in the input. Please check the log for details.")
+
+    if not output_chunks:
+        # pd.concat([]) raises; return an empty matrix with the expected columns.
+        return pd.DataFrame(columns=['pmcid'] + funder_names)
+
+    return pd.concat(output_chunks, ignore_index=True)
+
+def output_to_file(output_df: pd.DataFrame):
+    """Write the matrix to 'pmcid-funding-matrix.csv' with booleans spelled TRUE/FALSE."""
+    bool_labels = {True: 'TRUE', False: 'FALSE'}
+    labelled = output_df.replace(bool_labels)
+    labelled.to_csv('pmcid-funding-matrix.csv', index=False)
+
+def main():
+    """Run the mapping pipeline; on any error, log it and exit with status 1."""
+    try:
+        output_to_file(process_chunks())
+    except Exception as err:
+        logger.error(f"Failed to process data: {err}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()