From 6d6afcb2edf7d849cc34c352330d4d7d2de550b6 Mon Sep 17 00:00:00 2001 From: JulioAPeraza Date: Tue, 15 Oct 2024 18:24:29 -0700 Subject: [PATCH] Add scripts to link nv data to pmids --- gen_nimare_dset.py | 105 ++++++++++++++ get_nv_collections.py | 319 ++++++++++++++++++++++++++++++++++++++++++ get_nv_images.py | 236 +++++++++++++++++++++++++++++++ 3 files changed, 660 insertions(+) create mode 100644 gen_nimare_dset.py create mode 100644 get_nv_collections.py create mode 100644 get_nv_images.py diff --git a/gen_nimare_dset.py b/gen_nimare_dset.py new file mode 100644 index 0000000..75c9a71 --- /dev/null +++ b/gen_nimare_dset.py @@ -0,0 +1,105 @@ +import argparse +import os.path as op + +import pandas as pd +from nimare.dataset import Dataset +from nimare.io import DEFAULT_MAP_TYPE_CONVERSION +from nimare.transforms import ImageTransformer + + +def _get_parser(): + parser = argparse.ArgumentParser(description="Generate NiMARE dataset from NeuroVault data") + parser.add_argument( + "--project_dir", + dest="project_dir", + required=True, + help="Path to project directory", + ) + return parser + + +def convert_to_nimare_dataset(images_df, text_df, img_dir, suffix=""): + suffix = f"-{suffix}" if suffix else "" + + images_df["pmid"] = images_df["pmid"].astype(int).astype(str) + + dataset_dict = {} + for _, image in images_df.iterrows(): + id_ = image["pmid"] + image_id = image["image_id"] + collection_id = image["collection_id"] + map_type = f"{image['map_type']} map" + new_contrast_name = f"{collection_id}-{image_id}" + suffix + + if id_ not in dataset_dict: + dataset_dict[id_] = {} + + if "contrasts" not in dataset_dict[id_]: + dataset_dict[id_]["contrasts"] = {} + + text_df_row = text_df[text_df["pmid"] == int(id_)] + if text_df_row.shape[0] == 0: + title, keywords, abstract, body = None, None, None, None + else: + title = text_df_row["title"].values[0] + keywords = text_df_row["keywords"].values[0] + abstract = text_df_row["abstract"].values[0] + body = text_df_row["body"].values[0] + + dataset_dict[id_]["contrasts"][new_contrast_name] = { + "metadata": { + "sample_sizes": [image["number_of_subjects"]], + "pmid": image["pmid"], + "pmcid": image["pmcid"], + "collection_id": collection_id, + "image_id": image_id, + "map_type": image["map_type"], + "cognitive_paradigm_cogatlas_id": image["cognitive_paradigm_cogatlas_id"], + "cognitive_contrast_cogatlas_id": image["cognitive_contrast_cogatlas_id"], + "contrast_definition": image["contrast_definition"], + "cognitive_paradigm_cogatlas_name": image["cognitive_paradigm_cogatlas_name"], + "cognitive_contrast_cogatlas_name": image["cognitive_contrast_cogatlas_name"], + "image_name": image["image_name"], + "image_file": image["image_file"], + }, + "text": { + "title": title, + "keywords": keywords, + "abstract": abstract, + "body": body, + }, + "images": { + DEFAULT_MAP_TYPE_CONVERSION[map_type]: op.join(img_dir, image["image_path"]) + }, + } + + return Dataset(dataset_dict) + + +def main(project_dir): + data_dir = op.join(project_dir, "data") + image_dir = op.join(data_dir, "neurovault", "images") + + nv_collections_images_df = pd.read_csv(op.join(data_dir, "nv_all_collections_images.csv")) + nv_text_df = pd.read_csv(op.join(data_dir, "pmid_text.csv")) + dset_nv_fn = op.join(data_dir, "neurovault_all_dataset.pkl") + + print(f"Creating full dataset {nv_collections_images_df.shape[0]}", flush=True) + dset_nv = convert_to_nimare_dataset( + nv_collections_images_df, + nv_text_df, + image_dir, + ) + dset_nv = ImageTransformer("z").transform(dset_nv) + 
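+    # ImageTransformer("z") fills in z-statistic maps wherever they can be derived from
+    # the available images and metadata (e.g., a t map plus sample size); the "t" pass
+    # below does the same for t maps.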
dset_nv = ImageTransformer("t").transform(dset_nv) + dset_nv.save(dset_nv_fn) + + +def _main(argv=None): + option = _get_parser().parse_args(argv) + kwargs = vars(option) + main(**kwargs) + + +if __name__ == "__main__": + _main() diff --git a/get_nv_collections.py b/get_nv_collections.py new file mode 100644 index 0000000..3b815ff --- /dev/null +++ b/get_nv_collections.py @@ -0,0 +1,319 @@ +"""Get NeuroVault Collections linked to PubMed articles.""" + +import argparse +import os.path as op +import re +import urllib.parse + +import numpy as np +import pandas as pd +import requests + +NEUROSCOUT_OWNER_ID = 5761 + + +def _get_parser(): + parser = argparse.ArgumentParser(description="Download NeuroVault data") + parser.add_argument( + "--project_dir", + dest="project_dir", + required=True, + help="Path to project directory", + ) + parser.add_argument( + "--neurovault_version", + dest="neurovault_version", + required=False, + default="february_2024", + help="NeuroVault version", + ) + parser.add_argument( + "--pg_query_id", + dest="pg_query_id", + required=False, + default="a444c1d1cc79f746a519d97ce9672089", + help="Pubget query ID", + ) + return parser + + +def get_pmid_from_doi(doi): + """Query PubMed for the PMID of a paper based on its DOI.""" + url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&" + f'term="{doi}"&retmode=json' + ) + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"PubMed API returned status code {response.status_code} for {url}") + data = response.json() + if data["esearchresult"]["idlist"]: + return data["esearchresult"]["idlist"][0] + else: + return None + + +def get_pmcid_from_pmid(pmid): + """Query PubMed for the PMC ID of a paper based on its PMID.""" + url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={pmid}&format=json" + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"PubMed API returned status code {response.status_code} for {url}") + data = response.json() + if data["records"] and "pmcid" in data["records"][0]: + pmcid = data["records"][0]["pmcid"] + return pmcid[3:] if pmcid.startswith("PMC") else pmcid + else: + return None + + +def get_pmid_pmcid_from_doi(doi): + pmid = get_pmid_from_doi(doi) + if pmid is None: + return pmid, None + + pmcid = get_pmcid_from_pmid(pmid) + + return pmid, pmcid + + +def _check_string(s): + return all(c.isdigit() for c in s) + + +def _convert_collection_id(collection_id, collections_df): + if str(collection_id).isalpha(): + matches = collections_df[collections_df.private_token == collection_id] + return matches.id.values[0] if matches.size > 0 else None + else: + return int(collection_id) if _check_string(str(collection_id)) else None + + +def _look_up_doi(row): + doi_regex = re.compile(r"10.\d{4,9}/[-._;()/:a-zA-Z0-9]+") + + if isinstance(row.description, str): + dois = re.findall(doi_regex, row.description) + if dois: + doi = dois[0] + while doi.endswith((")", ".")): # Check if the string ends with ")" or "." 
+ doi = doi[:-1] # Remove the last character + return doi + + return np.nan + + +def search_by_title(title): + title_encoded = urllib.parse.quote_plus(title) + term = f'"{title_encoded}"[Title:~1]' + url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&" + f"term={term}&retmode=json" + ) + response = requests.get(url) + if response.status_code != 200: + raise Exception(f"PubMed API returned status code {response.status_code} for {url}") + + data = response.json() + id_list = data.get("esearchresult", {}).get("idlist", []) + + return id_list[0] if id_list else None + + +def _add_pmid_pmcid(data_df): + # Get PMIDs and PMCIDs + # Drop collections without PMIDs. It either means the DOI is invalid or the paper is not + # indexed in PubMed. + data_df["pmid"] = data_df.doi.apply(get_pmid_from_doi) + data_df = data_df[data_df.pmid.notnull()] + data_df["pmcid"] = data_df.pmid.apply(get_pmcid_from_pmid) + + return data_df + + +def _get_col_doi(data_df): + collections_with_dois = data_df[data_df["DOI"].notnull()][["id", "name", "DOI"]] + collections_with_dois = collections_with_dois.rename( + columns={"id": "collection_id", "name": "collection_name", "DOI": "doi"} + ) + + collections_with_dois = _add_pmid_pmcid(collections_with_dois) + collections_with_dois["source"] = "neurovault" + + return collections_with_dois + + +def _get_col_doi_meta(data_df): + # Find DOI in collection description + data_df["DOI"] = data_df.apply(_look_up_doi, axis=1) + data_df = data_df.dropna(subset="DOI") + + data_df = data_df[["id", "name", "DOI"]] + data_df = data_df.rename( + columns={"id": "collection_id", "name": "collection_name", "DOI": "doi"} + ) + + data_df = _add_pmid_pmcid(data_df) + data_df["source"] = "metadata" + + return data_df + + +def _get_col_pmid_title(data_df): + # Drop collections with names that are too short + data_df = data_df[data_df.name.notnull()] + data_df = data_df[data_df["name"].str.len() > 40] + + # Get PMIDs and PMCIDs from title + data_df["pmid"] = data_df.name.apply(search_by_title) + + data_df = data_df[data_df.pmid.notnull()][["id", "name", "pmid"]] + data_df = data_df.rename(columns={"id": "collection_id", "name": "collection_name"}) + + data_df["pmcid"] = data_df.pmid.apply(get_pmcid_from_pmid) + data_df["source"] = "pubmed" + + return data_df + + +def _get_col_pubget(collections_df, data_df, pubget_nv_df, pubget_metadata_df): + # Convert private_token to collection_id + collection_ids = pubget_nv_df["collection_id"].to_list() + pubget_nv_df["collection_id"] = [ + _convert_collection_id(id_, collections_df) for id_ in collection_ids + ] + + # Get PMIDs and PMCIDs from metadata + pubget_nv_df = pd.merge(pubget_nv_df, pubget_metadata_df[["pmcid", "pmid", "doi"]], on="pmcid") + pubget_nv_df = pubget_nv_df.reindex(columns=["pmid", "pmcid", "doi", "collection_id"]) + pubget_nv_df = pubget_nv_df.rename(columns={"doi": "secondary_doi"}) + pubget_nv_df["pmid"] = pubget_nv_df["pmid"].astype("Int64") + + # Some private collections couldnt be mapped to public ones + pubget_nv_df = pubget_nv_df.dropna(subset=["collection_id"]) + + # Get collections found by pubget + nv_coll = data_df["collection_id"].to_list() + pubget_nv_coll = pubget_nv_df["collection_id"].to_list() + matching_ids = np.intersect1d(nv_coll, pubget_nv_coll) + + pubget_mask = ~pubget_nv_df["collection_id"].isin(matching_ids) + pubget_nv_df = pubget_nv_df[pubget_mask] + + # Select unique collections + pubget_nv_df = pubget_nv_df.sort_values("pmid") + pubget_nv_df = 
pubget_nv_df.drop_duplicates("collection_id", keep="first") + + # Get collection names + pubget_nv_df = pd.merge( + pubget_nv_df, collections_df[["id", "name"]], left_on="collection_id", right_on="id" + ) + pubget_nv_df = pubget_nv_df.rename(columns={"name": "collection_name"}) + pubget_nv_df = pubget_nv_df.drop(columns="id") + pubget_nv_df["source"] = "pubget" + + return pubget_nv_df + + +def main(project_dir, neurovault_version, pg_query_id): + data_dir = op.join(project_dir, "data") + nv_data_dir = op.join(data_dir, "neurovault", neurovault_version) + pubget_dir = op.join(data_dir, "pubget_data") + pubget_query = op.join(pubget_dir, f"query_{pg_query_id}") + + # Load NV data + collections_df = pd.read_csv(op.join(nv_data_dir, "statmaps_collection.csv")) + print(f"Found {collections_df.shape[0]} collections") + + # Load pubget data + pubget_metadata_fn = op.join(pubget_query, "subset_allArticles_extractedData", "metadata.csv") + pubget_nv_fn = op.join( + pubget_query, + "subset_allArticles_extractedData", + "neurovault_collections.csv", + ) + pubget_nv_df = pd.read_csv(pubget_nv_fn) + pubget_metadata_df = pd.read_csv(pubget_metadata_fn) + + # 0. Remove Neuroscout collections + collections_df = collections_df[collections_df.owner_id != NEUROSCOUT_OWNER_ID] + print(f"Found {collections_df.shape[0]} collections after removing Neuroscout collections") + + # 1. Get collections with DOIs + # ================================= + collections_with_dois = _get_col_doi(collections_df) + print(f"Found {collections_with_dois.shape[0]} collections with DOIs") + + # 2. Find DOI for NeuroVault collections using the metadata + # ====================================================== + # Get the collections without DOI links + collections_without_dois = collections_df[ + ~collections_df["id"].isin(collections_with_dois["collection_id"]) + ] + collections_without_dois = _get_col_doi_meta(collections_without_dois) + print(f"Found {collections_without_dois.shape[0]} new collections with DOIs from metadata") + + # Concatenate the collections + collections_with_pmid = pd.concat( + [collections_with_dois, collections_without_dois], ignore_index=True, sort=False + ) + + # 3. Find PMID for NeuroVault collections using the collection name + # ====================================================== + collections_missing = collections_df[ + ~collections_df["id"].isin(collections_with_pmid["collection_id"]) + ] + collections_missing = _get_col_pmid_title(collections_missing) + print(f"Found {collections_missing.shape[0]} new collections with using the collection name") + + collections_with_pmid = pd.concat( + [collections_with_pmid, collections_missing], ignore_index=True, sort=False + ) + + # 4. 
Find NeuroVault collections using pubget search + # ====================================================== + # Load Pubget data + pubget_nv_df = _get_col_pubget( + collections_df, + collections_with_pmid, + pubget_nv_df, + pubget_metadata_df, + ) + print(f"Found {pubget_nv_df.shape[0]} new collections with using the pubget search") + + # Concatenate the collections + collections_with_pmid = pd.concat( + [collections_with_pmid, pubget_nv_df], ignore_index=True, sort=False + ) + + # Add missing collections + collections_missing = collections_df[ + ~collections_df["id"].isin(collections_with_pmid["collection_id"]) + ][["id", "name"]] + collections_missing["source"] = "missing" + collections_missing["pmid"] = np.nan + collections_missing["pmcid"] = np.nan + collections_missing["doi"] = np.nan + collections_missing = collections_missing.rename( + columns={"id": "collection_id", "name": "collection_name"} + ) + + collections_final_df = pd.concat( + [collections_with_pmid, collections_missing], ignore_index=True, sort=False + ) + + collections_with_pmid.to_csv(op.join(data_dir, "nv_pmid_collections.csv"), index=False) + collections_final_df.to_csv(op.join(data_dir, "nv_all_collections.csv"), index=False) + + pmcids = collections_with_pmid["pmcid"].dropna().astype(int).astype(str).unique() + np.savetxt(op.join(data_dir, "neurovault", "nv-pmcids.txt"), pmcids, fmt="%s") + + +def _main(argv=None): + option = _get_parser().parse_args(argv) + kwargs = vars(option) + main(**kwargs) + + +if __name__ == "__main__": + _main() diff --git a/get_nv_images.py b/get_nv_images.py new file mode 100644 index 0000000..5484bee --- /dev/null +++ b/get_nv_images.py @@ -0,0 +1,236 @@ +"""Get NeuroVault images for collections linked to PubMed articles.""" + +import argparse +import os +import os.path as op + +import nibabel as nib +import pandas as pd +import requests + +NEUROSCOUT_OWNER_ID = 5761 +NV_VERSION = "february_2024" + +KEEP_IMG_COLUMNS = [ + "image_name", + "map_type", + "image_file", + "collection_id", + "image_id", + "number_of_subjects", + "cognitive_paradigm_cogatlas_id", + "cognitive_contrast_cogatlas_id", + "contrast_definition", +] + +KEEP_COL_COLUMNS = [ + "pmid", + "pmcid", + "doi", + "secondary_doi", + "collection_id", + "collection_name", + "source", +] + + +def _get_parser(): + parser = argparse.ArgumentParser(description="Download NeuroVault data") + parser.add_argument( + "--project_dir", + dest="project_dir", + required=True, + help="Path to project directory", + ) + return parser + + +def download_images(image_ids, output_directory): + # Ensure the output directory exists + os.makedirs(output_directory, exist_ok=True) + + image_info_dict = {"image_id": [], "image_path": []} + for image_id in image_ids: + # Construct the NeuroVault API URL for image info + image_info_url = f"https://neurovault.org/api/images/{image_id}/" + + try: + # Make a GET request to fetch image info + response = requests.get(image_info_url) + + if response.status_code == 200: + image_info = response.json() + + # Download the image file + image_url = image_info["file"] + collection_id = image_info["collection_id"] + image_filename = os.path.basename(image_url) + rel_path = f"{collection_id}-{image_id}_{image_filename}" + image_path = os.path.join(output_directory, rel_path) + if not os.path.exists(image_path): + # Download the image + response = requests.get(image_url) + with open(image_path, "wb") as image_file: + image_file.write(response.content) + + try: + # Some image may be invalid + nib.load(image_path) + except 
Exception:
+                    pass
+                else:
+                    # Append image info to the list
+                    image_info_dict["image_id"].append(image_id)
+                    image_info_dict["image_path"].append(rel_path)
+
+        except Exception as e:
+            print(
+                f"An error occurred while processing image ID {image_id}: {str(e)}",
+                flush=True,
+            )
+
+    return image_info_dict
+
+
+def derive_map_type(row):
+    """Infer whether an "Other" map is a Z or T map from its name, file, or description."""
+    if row["map_type"] == "Other":
+        row["description"] = str(row["description"])
+
+        if "zstat" in row["name"]:
+            map_type = "Z"
+        elif "tstat" in row["name"]:
+            map_type = "T"
+        elif "zstat" in row["file"]:
+            map_type = "Z"
+        elif "tstat" in row["file"]:
+            map_type = "T"
+        elif "Z_" in row["description"]:
+            map_type = "Z"
+        elif "T_" in row["description"]:
+            map_type = "T"
+        else:
+            map_type = None
+    else:
+        map_type = row["map_type"]
+
+    return map_type
+
+
+def main(project_dir):
+    data_dir = op.join(project_dir, "data")
+    nv_data_dir = op.join(data_dir, "neurovault", NV_VERSION)
+    image_dir = op.join(data_dir, "neurovault", "images")
+    os.makedirs(image_dir, exist_ok=True)
+
+    basecollectionitem = pd.read_csv(op.join(nv_data_dir, "statmaps_basecollectionitem.csv"))
+    image = pd.read_csv(op.join(nv_data_dir, "statmaps_image.csv"))
+    statisticmap = pd.read_csv(op.join(nv_data_dir, "statmaps_statisticmap.csv"))
+    cogat_tasks = pd.read_csv(op.join(nv_data_dir, "statmaps_cognitiveatlastask.csv"))
+    cogat_contrasts = pd.read_csv(op.join(nv_data_dir, "statmaps_cognitiveatlascontrast.csv"))[
+        ["name", "cog_atlas_id"]
+    ]
+
+    # Load the file with NeuroVault collections linked to PubMed articles
+    # (created by get_nv_collections.py)
+    collections_pmid_df = pd.read_csv(op.join(data_dir, "nv_all_collections.csv"))
+    image_merged = pd.merge(
+        image, basecollectionitem, left_on="basecollectionitem_ptr_id", right_on="id"
+    )
+    statisticmap_merged = pd.merge(
+        statisticmap, image_merged, left_on="image_ptr_id", right_on="basecollectionitem_ptr_id"
+    )
+
+    # Remove rows with missing cognitive_paradigm_cogatlas_id and number_of_subjects
+    # We need both to perform IBMA
+    statisticmap_merged = statisticmap_merged.dropna(
+        subset=["cognitive_paradigm_cogatlas_id", "number_of_subjects"]
+    )
+
+    # Keep only rows with collection_id in collections_pmid_df
+    # Skip this for now, as we want to keep all collections for the Baseline model
+    # statisticmap_merged = statisticmap_merged[
+    #     statisticmap_merged.collection_id.isin(collections_pmid_df.collection_id)
+    # ]
+
+    # Filter the statisticmap_merged DataFrame
+    statisticmap_filtered = statisticmap_merged.query(
+        'modality == "fMRI-BOLD"'
+        ' & analysis_level == "G"'
+        ' & is_thresholded == "f"'
+        ' & (map_type == "Z" | map_type == "Other" | map_type == "T")'
+        " & brain_coverage > 40"
+        " & number_of_subjects > 10"
+        ' & cognitive_paradigm_cogatlas_id != "trm_4c8a834779883"'  # rest eyes open
+        ' & cognitive_paradigm_cogatlas_id != "trm_54e69c642d89b"'  # rest eyes closed
+        ' & not_mni == "f"'
+    )
+
+    # Relabel the "Other" map type as "Z" or "T" based on the file name and description
+    statisticmap_filtered["map_type"] = statisticmap_filtered.apply(derive_map_type, axis=1)
+    statisticmap_filtered = statisticmap_filtered[statisticmap_filtered.map_type.notnull()]
+
+    # Rename columns before merging with collections_pmid_df
+    statisticmap_filtered = statisticmap_filtered.rename(
+        columns={"image_ptr_id": "image_id", "file": "image_file", "name": "image_name"}
+    )
+    statisticmap_filtered = statisticmap_filtered[KEEP_IMG_COLUMNS]
+
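+    # Merge the image-level table with the collection-to-PMID table produced by
+    # get_nv_collections.py. The left merge deliberately keeps images whose collection
+    # has no PMID; their pmid is filled with the 99999999 placeholder further below.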
how="left", + on="collection_id", + ) + sorted_columns = KEEP_COL_COLUMNS + KEEP_IMG_COLUMNS + statisticmap_colelctions = statisticmap_colelctions[sorted_columns] + + # Get the cognitive paradigm names + statisticmap_colelctions = pd.merge( + statisticmap_colelctions, + cogat_tasks, + how="left", + left_on="cognitive_paradigm_cogatlas_id", + right_on="cog_atlas_id", + ) + statisticmap_colelctions = statisticmap_colelctions.rename( + columns={"name": "cognitive_paradigm_cogatlas_name"} + ) + statisticmap_colelctions = statisticmap_colelctions.drop(columns=["cog_atlas_id"]) + + # Get the cognitive contrast names + statisticmap_colelctions = pd.merge( + statisticmap_colelctions, + cogat_contrasts, + how="left", + left_on="cognitive_contrast_cogatlas_id", + right_on="cog_atlas_id", + ) + statisticmap_colelctions = statisticmap_colelctions.rename( + columns={"name": "cognitive_contrast_cogatlas_name"} + ) + statisticmap_colelctions = statisticmap_colelctions.drop(columns=["cog_atlas_id"]) + + # Keep downloaded images only + image_ids = statisticmap_colelctions["image_id"].unique() + usable_images_dict = download_images(image_ids, image_dir) + usable_images_df = pd.DataFrame(usable_images_dict) + print(f"Usable images: {len(usable_images_df)}/{len(image_ids)}") + + nv_collections_images_df = pd.merge(statisticmap_colelctions, usable_images_df, on="image_id") + + # Add "99999999" to collections with no PMIDs + nv_collections_images_df["pmid"] = nv_collections_images_df["pmid"].fillna(99999999) + + nv_collections_images_df.to_csv( + op.join(data_dir, "nv_all_collections_images.csv"), index=False + ) + + +def _main(argv=None): + option = _get_parser().parse_args(argv) + kwargs = vars(option) + main(**kwargs) + + +if __name__ == "__main__": + _main()