data processing notebook

neurodata · Apr 13, 2024 · a1d4aab · a1d4aab
1 parent 755c0e8
commit a1d4aab
Showing 1 changed file with 261 additions and 0 deletions.
diff --git a/docs/paper/data.ipynb b/docs/paper/data.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Preprocessing\n",
+    "\n",
+    "The outputs from the `m2g` pipeline are available in our open-access AWS S3 bucket: `s3://open-neurodata/m2g`. You can browse the outputs using the file tree at [http://open-neurodata.s3-website-us-east-1.amazonaws.com/](http://open-neurodata.s3-website-us-east-1.amazonaws.com/)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/j1c/miniconda3/envs/m2g/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import boto3\n",
+    "from botocore import UNSIGNED\n",
+    "from botocore.client import Config\n",
+    "\n",
+    "from pathlib import Path\n",
+    "import numpy as np\n",
+    "\n",
+    "from graspologic.utils import import_edgelist, pass_to_ranks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dataset abbreviations processed for each imaging modality, in processing order.\n",
+    "datasets = {\n",
+    "    \"Diffusion\": [\n",
+    "        \"SWU4\", \"HNU1\", \"NKIENH\", \"XHCUMS\", \"BNU1\",\n",
+    "        \"BNU3\", \"NKI1\", \"NKI24\", \"IPCAS8\", \"MRN_1\",\n",
+    "    ],\n",
+    "    \"Functional\": [\n",
+    "        \"NYU_2\", \"SWU4\", \"HNU1\", \"XHCUMS\", \"UPSM_1\",\n",
+    "        \"BNU3\", \"IPCAS7\", \"SWU1\", \"IPCAS1\", \"BNU1\",\n",
+    "    ],\n",
+    "}\n",
+    "\n",
+    "# Convenience names bound to the same lists and keys.\n",
+    "modalities = list(datasets)\n",
+    "diffusion_datasets = datasets[\"Diffusion\"]\n",
+    "functional_datasets = datasets[\"Functional\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fetch from S3 and Download to Local\n",
+    "\n",
+    "The files will be stored in the `m2g/docs/paper/data/` directory (`./data/` relative to this notebook)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading m2g/Diffusion/SWU4-8-27-20-m2g-native-csa-det/... Total files: 422\n",
+      "Downloading m2g/Diffusion/HNU1-8-27-20-m2g-native-csa-det/... Total files: 300\n",
+      "Downloading m2g/Diffusion/NKIENH-11-01-20-m2g-native-csa-det/... Total files: 129\n",
+      "Downloading m2g/Diffusion/XHCUMS-8-27-20-m2g-native-csa-det/... Total files: 117\n",
+      "Downloading m2g/Diffusion/BNU1-8-27-20-m2g-native-csa-det/... Total files: 114\n",
+      "Downloading m2g/Diffusion/BNU3-11-01-20-m2g-native-csa-det/... Total files: 47\n",
+      "Downloading m2g/Diffusion/NKI1-8-24-20-m2g-native-csa-det/... Total files: 40\n",
+      "Downloading m2g/Diffusion/NKI24-11-01-20-m2g-native-csa-det/... Total files: 38\n",
+      "Downloading m2g/Diffusion/IPCAS8-8-27-20-m2g-native-csa-det/... Total files: 26\n",
+      "Downloading m2g/Diffusion/MRN_1-8-27-20-m2g-native-csa-det/... Total files: 19\n",
+      "Downloading m2g/Functional/NYU_2-11-27-20-m2g-func/... Total files: 494\n",
+      "Downloading m2g/Functional/SWU4-11-12-20-m2g-func/... Total files: 425\n",
+      "Downloading m2g/Functional/HNU1-11-12-20-m2g-func/... Total files: 300\n",
+      "Downloading m2g/Functional/XHCUMS-11-27-20-m2g-func/... Total files: 247\n",
+      "Downloading m2g/Functional/UPSM_1-11-27-20-m2g-func/... Total files: 230\n",
+      "Downloading m2g/Functional/BNU3-11-12-20-m2g-func/... Total files: 144\n",
+      "Downloading m2g/Functional/IPCAS7-11-27-20-m2g-func/... Total files: 144\n",
+      "Downloading m2g/Functional/SWU1-11-27-20-m2g-func/... Total files: 119\n",
+      "Downloading m2g/Functional/IPCAS1-11-27-20-m2g-func/... Total files: 118\n",
+      "Downloading m2g/Functional/BNU1-11-12-20-m2g-func/... Total files: 106\n"
+     ]
+    }
+   ],
+   "source": [
+    "bucket = \"open-neurodata\"\n",
+    "\n",
+    "# The connectome folder is named differently for each modality.\n",
+    "parcellations = {\n",
+    "    \"Diffusion\": \"DKT_space-MNI152NLin6_res-2x2x2\",\n",
+    "    \"Functional\": \"DKT_space-MNI152NLin6_res-2x2x2.nii.gz\",\n",
+    "}\n",
+    "\n",
+    "# One unsigned (anonymous) client is reused for every request.\n",
+    "s3 = boto3.client(\"s3\", config=Config(signature_version=UNSIGNED))\n",
+    "# A single list_objects_v2 call returns at most 1,000 entries, so every\n",
+    "# listing is paginated instead of silently dropping the remainder.\n",
+    "paginator = s3.get_paginator(\"list_objects_v2\")\n",
+    "\n",
+    "for modality in modalities:\n",
+    "    parcellation = parcellations[modality]\n",
+    "\n",
+    "    # Dataset folders directly under m2g/<modality>/\n",
+    "    dataset_prefixes = [\n",
+    "        common[\"Prefix\"]\n",
+    "        for page in paginator.paginate(\n",
+    "            Bucket=bucket, Prefix=f\"m2g/{modality}/\", Delimiter=\"/\"\n",
+    "        )\n",
+    "        for common in page.get(\"CommonPrefixes\", [])\n",
+    "    ]\n",
+    "\n",
+    "    for dset_abbrev in datasets[modality]:\n",
+    "        # Resolve each abbreviation to its own folder so the S3 source and the\n",
+    "        # local directory can never be paired with different datasets. Folder\n",
+    "        # names start with \"<abbrev>-\", e.g. \"SWU4-8-27-20-m2g-native-csa-det\".\n",
+    "        matches = [\n",
+    "            pre\n",
+    "            for pre in dataset_prefixes\n",
+    "            if pre.rstrip(\"/\").rsplit(\"/\", 1)[-1].startswith(f\"{dset_abbrev}-\")\n",
+    "        ]\n",
+    "        if len(matches) != 1:\n",
+    "            raise ValueError(\n",
+    "                f\"Expected exactly one S3 folder for {modality} {dset_abbrev}, \"\n",
+    "                f\"found {len(matches)}: {matches}\"\n",
+    "            )\n",
+    "        dset = matches[0]\n",
+    "\n",
+    "        keys = [\n",
+    "            obj[\"Key\"]\n",
+    "            for page in paginator.paginate(\n",
+    "                Bucket=bucket,\n",
+    "                Prefix=f\"{dset}Connectomes/{parcellation}/\",\n",
+    "                Delimiter=\"/\",\n",
+    "            )\n",
+    "            # \"Contents\" is absent from a page when the prefix holds no objects\n",
+    "            for obj in page.get(\"Contents\", [])\n",
+    "        ]\n",
+    "\n",
+    "        # Keep CSV files only; for functional data additionally require \"abs\" in the key\n",
+    "        files = [\n",
+    "            key\n",
+    "            for key in keys\n",
+    "            if key.endswith(\".csv\") and (modality != \"Functional\" or \"abs\" in key)\n",
+    "        ]\n",
+    "\n",
+    "        print(f\"Downloading {dset}... Total files: {len(files)}\")\n",
+    "\n",
+    "        # Save to data folder\n",
+    "        p = Path(f\"./data/{modality}/{dset_abbrev}\")\n",
+    "        p.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "        # Download files, skipping any already on disk so re-runs are cheap\n",
+    "        for key in files:\n",
+    "            out = p / Path(key).name\n",
+    "            if not out.exists():\n",
+    "                # download_file documents Filename as a string path\n",
+    "                s3.download_file(bucket, key, str(out))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute Mean Connectomes\n",
+    "\n",
+    "This data will be used for plotting in Figure 2."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing mean graph for Diffusion SWU4... Total files: 422\n",
+      "Computing mean graph for Diffusion HNU1... Total files: 300\n",
+      "Computing mean graph for Diffusion NKIENH... Total files: 129\n",
+      "Computing mean graph for Diffusion XHCUMS... Total files: 117\n",
+      "Computing mean graph for Diffusion BNU1... Total files: 114\n",
+      "Computing mean graph for Diffusion BNU3... Total files: 47\n",
+      "Computing mean graph for Diffusion NKI1... Total files: 40\n",
+      "Computing mean graph for Diffusion NKI24... Total files: 38\n",
+      "Computing mean graph for Diffusion IPCAS8... Total files: 26\n",
+      "Computing mean graph for Diffusion MRN_1... Total files: 19\n",
+      "Computing mean graph for Functional NYU_2... Total files: 494\n",
+      "Computing mean graph for Functional SWU4... Total files: 425\n",
+      "Computing mean graph for Functional HNU1... Total files: 300\n",
+      "Computing mean graph for Functional XHCUMS... Total files: 247\n",
+      "Computing mean graph for Functional UPSM_1... Total files: 230\n",
+      "Computing mean graph for Functional BNU3... Total files: 144\n",
+      "Computing mean graph for Functional IPCAS7... Total files: 144\n",
+      "Computing mean graph for Functional SWU1... Total files: 119\n",
+      "Computing mean graph for Functional IPCAS1... Total files: 118\n",
+      "Computing mean graph for Functional BNU1... Total files: 106\n"
+     ]
+    }
+   ],
+   "source": [
+    "out_dir = Path(\"./data/mean_connectomes/\")\n",
+    "out_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "for modality, dsets in datasets.items():\n",
+    "    # Only CSV files are averaged: a bare \"*\" would also match any stray\n",
+    "    # non-CSV file sitting in the data directory. Functional data additionally\n",
+    "    # keeps only files with \"abs\" in the name.\n",
+    "    keyword = \"*abs*.csv\" if modality == \"Functional\" else \"*.csv\"\n",
+    "\n",
+    "    for dset in dsets:\n",
+    "        p = Path(f\"./data/{modality}/{dset}\")\n",
+    "        # Sorted so the input order does not depend on the filesystem\n",
+    "        files = sorted(p.glob(keyword))\n",
+    "\n",
+    "        print(\n",
+    "            f\"Computing mean graph for {modality} {dset}... Total files: {len(files)}\"\n",
+    "        )\n",
+    "\n",
+    "        if not files:\n",
+    "            raise FileNotFoundError(\n",
+    "                f\"No files matching {keyword!r} in {p}; run the download cell first.\"\n",
+    "            )\n",
+    "\n",
+    "        graphs = import_edgelist(files, \"csv\")\n",
+    "        graphs = [pass_to_ranks(g) for g in graphs]\n",
+    "\n",
+    "        # Compute mean graph (element-wise average over all graphs)\n",
+    "        mean_graph = np.array(graphs).mean(axis=0)\n",
+    "\n",
+    "        # Save mean graph; the file name starts with the zero-padded file count\n",
+    "        np.save(out_dir / f\"{len(files):>03}_{modality}_{dset}\", mean_graph)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "m2g",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}