From 6d6afcb2edf7d849cc34c352330d4d7d2de550b6 Mon Sep 17 00:00:00 2001 From: JulioAPeraza Date: Tue, 15 Oct 2024 18:24:29 -0700 Subject: [PATCH] Add scripts to link nv data to pmids --- gen_nimare_dset.py | 105 ++++++++++++++ get_nv_collections.py | 319 ++++++++++++++++++++++++++++++++++++++++++ get_nv_images.py | 236 +++++++++++++++++++++++++++++++ 3 files changed, 660 insertions(+) create mode 100644 gen_nimare_dset.py create mode 100644 get_nv_collections.py create mode 100644 get_nv_images.py diff --git a/gen_nimare_dset.py b/gen_nimare_dset.py new file mode 100644 index 0000000..75c9a71 --- /dev/null +++ b/gen_nimare_dset.py @@ -0,0 +1,105 @@ +import argparse +import os.path as op + +import pandas as pd +from nimare.dataset import Dataset +from nimare.io import DEFAULT_MAP_TYPE_CONVERSION +from nimare.transforms import ImageTransformer + + +def _get_parser(): + parser = argparse.ArgumentParser(description="Generate NiMARE dataset from NeuroVault data") + parser.add_argument( + "--project_dir", + dest="project_dir", + required=True, + help="Path to project directory", + ) + return parser + + +def convert_to_nimare_dataset(images_df, text_df, img_dir, suffix=""): + suffix = f"-{suffix}" if suffix else "" + + images_df["pmid"] = images_df["pmid"].astype(int).astype(str) + + dataset_dict = {} + for _, image in images_df.iterrows(): + id_ = image["pmid"] + image_id = image["image_id"] + collection_id = image["collection_id"] + map_type = f"{image['map_type']} map" + new_contrast_name = f"{collection_id}-{image_id}" + suffix + + if id_ not in dataset_dict: + dataset_dict[id_] = {} + + if "contrasts" not in dataset_dict[id_]: + dataset_dict[id_]["contrasts"] = {} + + text_df_row = text_df[text_df["pmid"] == int(id_)] + if text_df_row.shape[0] == 0: + title, keywords, abstract, body = None, None, None, None + else: + title = text_df_row["title"].values[0] + keywords = text_df_row["keywords"].values[0] + abstract = text_df_row["abstract"].values[0] + body = text_df_row["body"].values[0] + + dataset_dict[id_]["contrasts"][new_contrast_name] = { + "metadata": { + "sample_sizes": [image["number_of_subjects"]], + "pmid": image["pmid"], + "pmcid": image["pmcid"], + "collection_id": collection_id, + "image_id": image_id, + "map_type": image["map_type"], + "cognitive_paradigm_cogatlas_id": image["cognitive_paradigm_cogatlas_id"], + "cognitive_contrast_cogatlas_id": image["cognitive_contrast_cogatlas_id"], + "contrast_definition": image["contrast_definition"], + "cognitive_paradigm_cogatlas_name": image["cognitive_paradigm_cogatlas_name"], + "cognitive_contrast_cogatlas_name": image["cognitive_contrast_cogatlas_name"], + "image_name": image["image_name"], + "image_file": image["image_file"], + }, + "text": { + "title": title, + "keywords": keywords, + "abstract": abstract, + "body": body, + }, + "images": { + DEFAULT_MAP_TYPE_CONVERSION[map_type]: op.join(img_dir, image["image_path"]) + }, + } + + return Dataset(dataset_dict) + + +def main(project_dir): + data_dir = op.join(project_dir, "data") + image_dir = op.join(data_dir, "neurovault", "images") + + nv_collections_images_df = pd.read_csv(op.join(data_dir, "nv_all_collections_images.csv")) + nv_text_df = pd.read_csv(op.join(data_dir, "pmid_text.csv")) + dset_nv_fn = op.join(data_dir, "neurovault_all_dataset.pkl") + + print(f"Creating full dataset {nv_collections_images_df.shape[0]}", flush=True) + dset_nv = convert_to_nimare_dataset( + nv_collections_images_df, + nv_text_df, + image_dir, + ) + dset_nv = ImageTransformer("z").transform(dset_nv) + 
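+    # ImageTransformer("z") fills in z-statistic maps wherever they can be derived from
+    # the available images and metadata (e.g., a t map plus sample size); the "t" pass
+    # below does the same for t maps.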
dset_nv = ImageTransformer("t").transform(dset_nv) + dset_nv.save(dset_nv_fn) + + +def _main(argv=None): + option = _get_parser().parse_args(argv) + kwargs = vars(option) + main(**kwargs) + + +if __name__ == "__main__": + _main() diff --git a/get_nv_collections.py b/get_nv_collections.py new file mode 100644 index 0000000..3b815ff --- /dev/null +++ b/get_nv_collections.py @@ -0,0 +1,319 @@ +"""Get NeuroVault Collections linked to PubMed articles.""" + +import argparse +import os.path as op +import re +import urllib.parse + +import numpy as np +import pandas as pd +import requests + +NEUROSCOUT_OWNER_ID = 5761 + + +def _get_parser(): + parser = argparse.ArgumentParser(description="Download NeuroVault data") + parser.add_argument( + "--project_dir", + dest="project_dir", + required=True, + help="Path to project directory", + ) + parser.add_argument( + "--neurovault_version", + dest="neurovault_version", + required=False, + default="february_2024", + help="NeuroVault version", + ) + parser.add_argument( + "--pg_query_id", + dest="pg_query_id", + required=False, + default="a444c1d1cc79f746a519d97ce9672089", + help="Pubget query ID", + ) + return parser + + +def get_pmid_from_doi(doi): + """Query PubMed for the PMID of a paper based on its DOI.""" + url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&" + f'term="{doi}"&retmode=json' + ) + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"PubMed API returned status code {response.status_code} for {url}") + data = response.json() + if data["esearchresult"]["idlist"]: + return data["esearchresult"]["idlist"][0] + else: + return None + + +def get_pmcid_from_pmid(pmid): + """Query PubMed for the PMC ID of a paper based on its PMID.""" + url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={pmid}&format=json" + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"PubMed API returned status code {response.status_code} for {url}") + data = response.json() + if data["records"] and "pmcid" in data["records"][0]: + pmcid = data["records"][0]["pmcid"] + return pmcid[3:] if pmcid.startswith("PMC") else pmcid + else: + return None + + +def get_pmid_pmcid_from_doi(doi): + pmid = get_pmid_from_doi(doi) + if pmid is None: + return pmid, None + + pmcid = get_pmcid_from_pmid(pmid) + + return pmid, pmcid + + +def _check_string(s): + return all(c.isdigit() for c in s) + + +def _convert_collection_id(collection_id, collections_df): + if str(collection_id).isalpha(): + matches = collections_df[collections_df.private_token == collection_id] + return matches.id.values[0] if matches.size > 0 else None + else: + return int(collection_id) if _check_string(str(collection_id)) else None + + +def _look_up_doi(row): + doi_regex = re.compile(r"10.\d{4,9}/[-._;()/:a-zA-Z0-9]+") + + if isinstance(row.description, str): + dois = re.findall(doi_regex, row.description) + if dois: + doi = dois[0] + while doi.endswith((")", ".")): # Check if the string ends with ")" or "." 
+ doi = doi[:-1] # Remove the last character + return doi + + return np.nan + + +def search_by_title(title): + title_encoded = urllib.parse.quote_plus(title) + term = f'"{title_encoded}"[Title:~1]' + url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&" + f"term={term}&retmode=json" + ) + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"PubMed API returned status code {response.status_code} for {url}") + + data = response.json() + id_list = data.get("esearchresult", {}).get("idlist", []) + + return id_list[0] if id_list else None + + +def _add_pmid_pmcid(data_df): + # Get PMIDs and PMCIDs + # Drop collections without PMIDs. It either means the DOI is invalid or the paper is not + # indexed in PubMed. + data_df["pmid"] = data_df.doi.apply(get_pmid_from_doi) + data_df = data_df[data_df.pmid.notnull()] + data_df["pmcid"] = data_df.pmid.apply(get_pmcid_from_pmid) + + return data_df + + +def _get_col_doi(data_df): + collections_with_dois = data_df[data_df["DOI"].notnull()][["id", "name", "DOI"]] + collections_with_dois = collections_with_dois.rename( + columns={"id": "collection_id", "name": "collection_name", "DOI": "doi"} + ) + + collections_with_dois = _add_pmid_pmcid(collections_with_dois) + collections_with_dois["source"] = "neurovault" + + return collections_with_dois + + +def _get_col_doi_meta(data_df): + # Find DOI in collection description + data_df["DOI"] = data_df.apply(_look_up_doi, axis=1) + data_df = data_df.dropna(subset="DOI") + + data_df = data_df[["id", "name", "DOI"]] + data_df = data_df.rename( + columns={"id": "collection_id", "name": "collection_name", "DOI": "doi"} + ) + + data_df = _add_pmid_pmcid(data_df) + data_df["source"] = "metadata" + + return data_df + + +def _get_col_pmid_title(data_df): + # Drop collections with names that are too short + data_df = data_df[data_df.name.notnull()] + data_df = data_df[data_df["name"].str.len() > 40] + + # Get PMIDs and PMCIDs from title + data_df["pmid"] = data_df.name.apply(search_by_title) + + data_df = data_df[data_df.pmid.notnull()][["id", "name", "pmid"]] + data_df = data_df.rename(columns={"id": "collection_id", "name": "collection_name"}) + + data_df["pmcid"] = data_df.pmid.apply(get_pmcid_from_pmid) + data_df["source"] = "pubmed" + + return data_df + + +def _get_col_pubget(collections_df, data_df, pubget_nv_df, pubget_metadata_df): + # Convert private_token to collection_id + collection_ids = pubget_nv_df["collection_id"].to_list() + pubget_nv_df["collection_id"] = [ + _convert_collection_id(id_, collections_df) for id_ in collection_ids + ] + + # Get PMIDs and PMCIDs from metadata + pubget_nv_df = pd.merge(pubget_nv_df, pubget_metadata_df[["pmcid", "pmid", "doi"]], on="pmcid") + pubget_nv_df = pubget_nv_df.reindex(columns=["pmid", "pmcid", "doi", "collection_id"]) + pubget_nv_df = pubget_nv_df.rename(columns={"doi": "secondary_doi"}) + pubget_nv_df["pmid"] = pubget_nv_df["pmid"].astype("Int64") + + # Some private collections couldnt be mapped to public ones + pubget_nv_df = pubget_nv_df.dropna(subset=["collection_id"]) + + # Get collections found by pubget + nv_coll = data_df["collection_id"].to_list() + pubget_nv_coll = pubget_nv_df["collection_id"].to_list() + matching_ids = np.intersect1d(nv_coll, pubget_nv_coll) + + pubget_mask = ~pubget_nv_df["collection_id"].isin(matching_ids) + pubget_nv_df = pubget_nv_df[pubget_mask] + + # Select unique collections + pubget_nv_df = pubget_nv_df.sort_values("pmid") + pubget_nv_df = 
pubget_nv_df.drop_duplicates("collection_id", keep="first") + + # Get collection names + pubget_nv_df = pd.merge( + pubget_nv_df, collections_df[["id", "name"]], left_on="collection_id", right_on="id" + ) + pubget_nv_df = pubget_nv_df.rename(columns={"name": "collection_name"}) + pubget_nv_df = pubget_nv_df.drop(columns="id") + pubget_nv_df["source"] = "pubget" + + return pubget_nv_df + + +def main(project_dir, neurovault_version, pg_query_id): + data_dir = op.join(project_dir, "data") + nv_data_dir = op.join(data_dir, "neurovault", neurovault_version) + pubget_dir = op.join(data_dir, "pubget_data") + pubget_query = op.join(pubget_dir, f"query_{pg_query_id}") + + # Load NV data + collections_df = pd.read_csv(op.join(nv_data_dir, "statmaps_collection.csv")) + print(f"Found {collections_df.shape[0]} collections") + + # Load pubget data + pubget_metadata_fn = op.join(pubget_query, "subset_allArticles_extractedData", "metadata.csv") + pubget_nv_fn = op.join( + pubget_query, + "subset_allArticles_extractedData", + "neurovault_collections.csv", + ) + pubget_nv_df = pd.read_csv(pubget_nv_fn) + pubget_metadata_df = pd.read_csv(pubget_metadata_fn) + + # 0. Remove Neuroscout collections + collections_df = collections_df[collections_df.owner_id != NEUROSCOUT_OWNER_ID] + print(f"Found {collections_df.shape[0]} collections after removing Neuroscout collections") + + # 1. Get collections with DOIs + # ================================= + collections_with_dois = _get_col_doi(collections_df) + print(f"Found {collections_with_dois.shape[0]} collections with DOIs") + + # 2. Find DOI for NeuroVault collections using the metadata + # ====================================================== + # Get the collections without DOI links + collections_without_dois = collections_df[ + ~collections_df["id"].isin(collections_with_dois["collection_id"]) + ] + collections_without_dois = _get_col_doi_meta(collections_without_dois) + print(f"Found {collections_without_dois.shape[0]} new collections with DOIs from metadata") + + # Concatenate the collections + collections_with_pmid = pd.concat( + [collections_with_dois, collections_without_dois], ignore_index=True, sort=False + ) + + # 3. Find PMID for NeuroVault collections using the collection name + # ====================================================== + collections_missing = collections_df[ + ~collections_df["id"].isin(collections_with_pmid["collection_id"]) + ] + collections_missing = _get_col_pmid_title(collections_missing) + print(f"Found {collections_missing.shape[0]} new collections with using the collection name") + + collections_with_pmid = pd.concat( + [collections_with_pmid, collections_missing], ignore_index=True, sort=False + ) + + # 4. 
Find NeuroVault collections using pubget search + # ====================================================== + # Load Pubget data + pubget_nv_df = _get_col_pubget( + collections_df, + collections_with_pmid, + pubget_nv_df, + pubget_metadata_df, + ) + print(f"Found {pubget_nv_df.shape[0]} new collections with using the pubget search") + + # Concatenate the collections + collections_with_pmid = pd.concat( + [collections_with_pmid, pubget_nv_df], ignore_index=True, sort=False + ) + + # Add missing collections + collections_missing = collections_df[ + ~collections_df["id"].isin(collections_with_pmid["collection_id"]) + ][["id", "name"]] + collections_missing["source"] = "missing" + collections_missing["pmid"] = np.nan + collections_missing["pmcid"] = np.nan + collections_missing["doi"] = np.nan + collections_missing = collections_missing.rename( + columns={"id": "collection_id", "name": "collection_name"} + ) + + collections_final_df = pd.concat( + [collections_with_pmid, collections_missing], ignore_index=True, sort=False + ) + + collections_with_pmid.to_csv(op.join(data_dir, "nv_pmid_collections.csv"), index=False) + collections_final_df.to_csv(op.join(data_dir, "nv_all_collections.csv"), index=False) + + pmcids = collections_with_pmid["pmcid"].dropna().astype(int).astype(str).unique() + np.savetxt(op.join(data_dir, "neurovault", "nv-pmcids.txt"), pmcids, fmt="%s") + + +def _main(argv=None): + option = _get_parser().parse_args(argv) + kwargs = vars(option) + main(**kwargs) + + +if __name__ == "__main__": + _main() diff --git a/get_nv_images.py b/get_nv_images.py new file mode 100644 index 0000000..5484bee --- /dev/null +++ b/get_nv_images.py @@ -0,0 +1,236 @@ +"""Get NeuroVault images for collections linked to PubMed articles.""" + +import argparse +import os +import os.path as op + +import nibabel as nib +import pandas as pd +import requests + +NEUROSCOUT_OWNER_ID = 5761 +NV_VERSION = "february_2024" + +KEEP_IMG_COLUMNS = [ + "image_name", + "map_type", + "image_file", + "collection_id", + "image_id", + "number_of_subjects", + "cognitive_paradigm_cogatlas_id", + "cognitive_contrast_cogatlas_id", + "contrast_definition", +] + +KEEP_COL_COLUMNS = [ + "pmid", + "pmcid", + "doi", + "secondary_doi", + "collection_id", + "collection_name", + "source", +] + + +def _get_parser(): + parser = argparse.ArgumentParser(description="Download NeuroVault data") + parser.add_argument( + "--project_dir", + dest="project_dir", + required=True, + help="Path to project directory", + ) + return parser + + +def download_images(image_ids, output_directory): + # Ensure the output directory exists + os.makedirs(output_directory, exist_ok=True) + + image_info_dict = {"image_id": [], "image_path": []} + for image_id in image_ids: + # Construct the NeuroVault API URL for image info + image_info_url = f"https://neurovault.org/api/images/{image_id}/" + + try: + # Make a GET request to fetch image info + response = requests.get(image_info_url) + + if response.status_code == 200: + image_info = response.json() + + # Download the image file + image_url = image_info["file"] + collection_id = image_info["collection_id"] + image_filename = os.path.basename(image_url) + rel_path = f"{collection_id}-{image_id}_{image_filename}" + image_path = os.path.join(output_directory, rel_path) + if not os.path.exists(image_path): + # Download the image + response = requests.get(image_url) + with open(image_path, "wb") as image_file: + image_file.write(response.content) + + try: + # Some image may be invalid + nib.load(image_path) + except 
Exception:
+                    pass
+                else:
+                    # Append image info to the list
+                    image_info_dict["image_id"].append(image_id)
+                    image_info_dict["image_path"].append(rel_path)
+
+        except Exception as e:
+            print(
+                f"An error occurred while processing image ID {image_id}: {str(e)}",
+                flush=True,
+            )
+
+    return image_info_dict
+
+
+def derive_map_type(row):
+    """Infer whether an "Other" map is a Z or T map from its name, file, or description."""
+    if row["map_type"] == "Other":
+        row["description"] = str(row["description"])
+
+        if "zstat" in row["name"]:
+            map_type = "Z"
+        elif "tstat" in row["name"]:
+            map_type = "T"
+        elif "zstat" in row["file"]:
+            map_type = "Z"
+        elif "tstat" in row["file"]:
+            map_type = "T"
+        elif "Z_" in row["description"]:
+            map_type = "Z"
+        elif "T_" in row["description"]:
+            map_type = "T"
+        else:
+            map_type = None
+    else:
+        map_type = row["map_type"]
+
+    return map_type
+
+
+def main(project_dir):
+    data_dir = op.join(project_dir, "data")
+    nv_data_dir = op.join(data_dir, "neurovault", NV_VERSION)
+    image_dir = op.join(data_dir, "neurovault", "images")
+    os.makedirs(image_dir, exist_ok=True)
+
+    basecollectionitem = pd.read_csv(op.join(nv_data_dir, "statmaps_basecollectionitem.csv"))
+    image = pd.read_csv(op.join(nv_data_dir, "statmaps_image.csv"))
+    statisticmap = pd.read_csv(op.join(nv_data_dir, "statmaps_statisticmap.csv"))
+    cogat_tasks = pd.read_csv(op.join(nv_data_dir, "statmaps_cognitiveatlastask.csv"))
+    cogat_contrasts = pd.read_csv(op.join(nv_data_dir, "statmaps_cognitiveatlascontrast.csv"))[
+        ["name", "cog_atlas_id"]
+    ]
+
+    # Load the file with NeuroVault collections linked to PubMed articles
+    # (created by get_nv_collections.py)
+    collections_pmid_df = pd.read_csv(op.join(data_dir, "nv_all_collections.csv"))
+    image_merged = pd.merge(
+        image, basecollectionitem, left_on="basecollectionitem_ptr_id", right_on="id"
+    )
+    statisticmap_merged = pd.merge(
+        statisticmap, image_merged, left_on="image_ptr_id", right_on="basecollectionitem_ptr_id"
+    )
+
+    # Remove rows with missing cognitive_paradigm_cogatlas_id and number_of_subjects
+    # We need both to perform IBMA
+    statisticmap_merged = statisticmap_merged.dropna(
+        subset=["cognitive_paradigm_cogatlas_id", "number_of_subjects"]
+    )
+
+    # Keep only rows with collection_id in collections_pmid_df
+    # Skip this for now, as we want to keep all collections for the Baseline model
+    # statisticmap_merged = statisticmap_merged[
+    #     statisticmap_merged.collection_id.isin(collections_pmid_df.collection_id)
+    # ]
+
+    # Filter the statisticmap_merged DataFrame
+    statisticmap_filtered = statisticmap_merged.query(
+        'modality == "fMRI-BOLD"'
+        ' & analysis_level == "G"'
+        ' & is_thresholded == "f"'
+        ' & (map_type == "Z" | map_type == "Other" | map_type == "T")'
+        " & brain_coverage > 40"
+        " & number_of_subjects > 10"
+        ' & cognitive_paradigm_cogatlas_id != "trm_4c8a834779883"'  # rest eyes open
+        ' & cognitive_paradigm_cogatlas_id != "trm_54e69c642d89b"'  # rest eyes closed
+        ' & not_mni == "f"'
+    )
+
+    # Relabel the "Other" map type as "Z" or "T" based on the file name and description
+    statisticmap_filtered["map_type"] = statisticmap_filtered.apply(derive_map_type, axis=1)
+    statisticmap_filtered = statisticmap_filtered[statisticmap_filtered.map_type.notnull()]
+
+    # Rename columns before merging with collections_pmid_df
+    statisticmap_filtered = statisticmap_filtered.rename(
+        columns={"image_ptr_id": "image_id", "file": "image_file", "name": "image_name"}
+    )
+    statisticmap_filtered = statisticmap_filtered[KEEP_IMG_COLUMNS]
+
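+    # Merge the image-level table with the collection-to-PMID table produced by
+    # get_nv_collections.py. The left merge deliberately keeps images whose collection
+    # has no PMID; their pmid is filled with the 99999999 placeholder further below.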
how="left", + on="collection_id", + ) + sorted_columns = KEEP_COL_COLUMNS + KEEP_IMG_COLUMNS + statisticmap_colelctions = statisticmap_colelctions[sorted_columns] + + # Get the cognitive paradigm names + statisticmap_colelctions = pd.merge( + statisticmap_colelctions, + cogat_tasks, + how="left", + left_on="cognitive_paradigm_cogatlas_id", + right_on="cog_atlas_id", + ) + statisticmap_colelctions = statisticmap_colelctions.rename( + columns={"name": "cognitive_paradigm_cogatlas_name"} + ) + statisticmap_colelctions = statisticmap_colelctions.drop(columns=["cog_atlas_id"]) + + # Get the cognitive contrast names + statisticmap_colelctions = pd.merge( + statisticmap_colelctions, + cogat_contrasts, + how="left", + left_on="cognitive_contrast_cogatlas_id", + right_on="cog_atlas_id", + ) + statisticmap_colelctions = statisticmap_colelctions.rename( + columns={"name": "cognitive_contrast_cogatlas_name"} + ) + statisticmap_colelctions = statisticmap_colelctions.drop(columns=["cog_atlas_id"]) + + # Keep downloaded images only + image_ids = statisticmap_colelctions["image_id"].unique() + usable_images_dict = download_images(image_ids, image_dir) + usable_images_df = pd.DataFrame(usable_images_dict) + print(f"Usable images: {len(usable_images_df)}/{len(image_ids)}") + + nv_collections_images_df = pd.merge(statisticmap_colelctions, usable_images_df, on="image_id") + + # Add "99999999" to collections with no PMIDs + nv_collections_images_df["pmid"] = nv_collections_images_df["pmid"].fillna(99999999) + + nv_collections_images_df.to_csv( + op.join(data_dir, "nv_all_collections_images.csv"), index=False + ) + + +def _main(argv=None): + option = _get_parser().parse_args(argv) + kwargs = vars(option) + main(**kwargs) + + +if __name__ == "__main__": + _main()