From a1d4aabfb914e285cd60511e477b9a63bbb33b4f Mon Sep 17 00:00:00 2001 From: j1c Date: Sat, 13 Apr 2024 00:52:21 -0400 Subject: [PATCH] data processing notebook --- docs/paper/data.ipynb | 261 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 docs/paper/data.ipynb diff --git a/docs/paper/data.ipynb b/docs/paper/data.ipynb new file mode 100644 index 00000000..a7149c10 --- /dev/null +++ b/docs/paper/data.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Preprocessing\n", + "\n", + "The outputs from the `m2g` pipeline are available in our open-access AWS S3 bucket: `s3://open-neurodata/m2g`. You can use the file tree to browse the outputs [http://open-neurodata.s3-website-us-east-1.amazonaws.com/](http://open-neurodata.s3-website-us-east-1.amazonaws.com/)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/j1c/miniconda3/envs/m2g/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import boto3\n", + "from botocore import UNSIGNED\n", + "from botocore.client import Config\n", + "\n", + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "from graspologic.utils import import_edgelist, pass_to_ranks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "modalities = [\"Diffusion\", \"Functional\"]\n", + "diffusion_datasets = [\n", + " \"SWU4\",\n", + " \"HNU1\",\n", + " \"NKIENH\",\n", + " \"XHCUMS\",\n", + " \"BNU1\",\n", + " \"BNU3\",\n", + " \"NKI1\",\n", + " \"NKI24\",\n", + " \"IPCAS8\",\n", + " \"MRN_1\",\n", + "]\n", + "functional_datasets = [\n", + " \"NYU_2\",\n", + " \"SWU4\",\n", + " \"HNU1\",\n", + " \"XHCUMS\",\n", + " \"UPSM_1\",\n", + " \"BNU3\",\n", + " \"IPCAS7\",\n", + " \"SWU1\",\n", + " \"IPCAS1\",\n", + " \"BNU1\",\n", + "]\n", + "\n", + "datasets = {\"Diffusion\": diffusion_datasets, \"Functional\": functional_datasets}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch from S3 and Download to Local\n", + "\n", + "The files will be stored in the `m2g/docs/paper/data/` directory." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading m2g/Diffusion/SWU4-8-27-20-m2g-native-csa-det/... Total files: 422\n", + "Downloading m2g/Diffusion/HNU1-8-27-20-m2g-native-csa-det/... Total files: 300\n", + "Downloading m2g/Diffusion/NKIENH-11-01-20-m2g-native-csa-det/... Total files: 129\n", + "Downloading m2g/Diffusion/XHCUMS-8-27-20-m2g-native-csa-det/... Total files: 117\n", + "Downloading m2g/Diffusion/BNU1-8-27-20-m2g-native-csa-det/... Total files: 114\n", + "Downloading m2g/Diffusion/BNU3-11-01-20-m2g-native-csa-det/... 
Total files: 47\n", + "Downloading m2g/Diffusion/NKI1-8-24-20-m2g-native-csa-det/... Total files: 40\n", + "Downloading m2g/Diffusion/NKI24-11-01-20-m2g-native-csa-det/... Total files: 38\n", + "Downloading m2g/Diffusion/IPCAS8-8-27-20-m2g-native-csa-det/... Total files: 26\n", + "Downloading m2g/Diffusion/MRN_1-8-27-20-m2g-native-csa-det/... Total files: 19\n", + "Downloading m2g/Functional/NYU_2-11-27-20-m2g-func/... Total files: 494\n", + "Downloading m2g/Functional/SWU4-11-12-20-m2g-func/... Total files: 425\n", + "Downloading m2g/Functional/HNU1-11-12-20-m2g-func/... Total files: 300\n", + "Downloading m2g/Functional/XHCUMS-11-27-20-m2g-func/... Total files: 247\n", + "Downloading m2g/Functional/UPSM_1-11-27-20-m2g-func/... Total files: 230\n", + "Downloading m2g/Functional/BNU3-11-12-20-m2g-func/... Total files: 144\n", + "Downloading m2g/Functional/IPCAS7-11-27-20-m2g-func/... Total files: 144\n", + "Downloading m2g/Functional/SWU1-11-27-20-m2g-func/... Total files: 119\n", + "Downloading m2g/Functional/IPCAS1-11-27-20-m2g-func/... Total files: 118\n", + "Downloading m2g/Functional/BNU1-11-12-20-m2g-func/... 
Total files: 106\n" + ] + } + ], + "source": [ + "parcellation = \"DKT_space-MNI152NLin6_res-2x2x2\"\n", + "bucket = \"open-neurodata\"\n", + "\n", + "for modality in modalities:\n", + " if modality == \"Diffusion\":\n", + " parcellation = \"DKT_space-MNI152NLin6_res-2x2x2\"\n", + " else:\n", + " parcellation = \"DKT_space-MNI152NLin6_res-2x2x2.nii.gz\"\n", + "\n", + " prefix = f\"m2g/{modality}/\"\n", + "\n", + " s3 = boto3.client(\"s3\", config=Config(signature_version=UNSIGNED))\n", + " resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter=\"/\")\n", + "\n", + " dataset_fullnames = []\n", + " for dset in datasets[modality]:\n", + " for r in resp.get(\"CommonPrefixes\"):\n", + " if dset in r.get(\"Prefix\"):\n", + " dataset_fullnames.append(r.get(\"Prefix\"))\n", + "\n", + " for dset, dset_abbrev in zip(dataset_fullnames, datasets[modality]):\n", + " prefix = f\"{dset}Connectomes/{parcellation}/\"\n", + "\n", + " resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter=\"/\")\n", + " contents = resp[\"Contents\"]\n", + "\n", + " files = []\n", + " for obj in contents:\n", + " key = obj[\"Key\"]\n", + " if modality == \"Functional\":\n", + " if key.endswith(\".csv\") and \"abs\" in key:\n", + " files.append(key)\n", + " else:\n", + " if key.endswith(\".csv\"):\n", + " files.append(key)\n", + "\n", + " print(f\"Downloading {dset}... Total files: {len(files)}\")\n", + "\n", + " # Save to data folder\n", + " p = Path(f\"./data/{modality}/{dset_abbrev}\")\n", + " p.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Download files\n", + " for f in files:\n", + " out = p / Path(f).name\n", + " if not out.exists():\n", + " s3.download_file(bucket, f, out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compute mean connectomes\n", + "\n", + "This data will be used for plotting in Figure 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing mean graph for Diffusion SWU4... Total files: 422\n", + "Computing mean graph for Diffusion HNU1... Total files: 300\n", + "Computing mean graph for Diffusion NKIENH... Total files: 129\n", + "Computing mean graph for Diffusion XHCUMS... Total files: 117\n", + "Computing mean graph for Diffusion BNU1... Total files: 114\n", + "Computing mean graph for Diffusion BNU3... Total files: 47\n", + "Computing mean graph for Diffusion NKI1... Total files: 40\n", + "Computing mean graph for Diffusion NKI24... Total files: 38\n", + "Computing mean graph for Diffusion IPCAS8... Total files: 26\n", + "Computing mean graph for Diffusion MRN_1... Total files: 19\n", + "Computing mean graph for Functional NYU_2... Total files: 494\n", + "Computing mean graph for Functional SWU4... Total files: 425\n", + "Computing mean graph for Functional HNU1... Total files: 300\n", + "Computing mean graph for Functional XHCUMS... Total files: 247\n", + "Computing mean graph for Functional UPSM_1... Total files: 230\n", + "Computing mean graph for Functional BNU3... Total files: 144\n", + "Computing mean graph for Functional IPCAS7... Total files: 144\n", + "Computing mean graph for Functional SWU1... Total files: 119\n", + "Computing mean graph for Functional IPCAS1... Total files: 118\n", + "Computing mean graph for Functional BNU1... 
Total files: 106\n" + ] + } + ], + "source": [ + "out_dir = Path(f\"./data/mean_connectomes/\")\n", + "out_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "for modality, dsets in datasets.items():\n", + " if modality == \"Functional\":\n", + " keyword = \"*abs*\"\n", + " else:\n", + " keyword = \"*\"\n", + "\n", + " for dset in dsets:\n", + " p = Path(f\"./data/{modality}/{dset}\")\n", + " files = list(p.glob(keyword))\n", + "\n", + " print(\n", + " f\"Computing mean graph for {modality} {dset}... Total files: {len(files)}\"\n", + " )\n", + "\n", + " graphs = import_edgelist(files, \"csv\")\n", + " graphs = [pass_to_ranks(g) for g in graphs]\n", + "\n", + " # Compute mean graph\n", + " mean_graph = np.array(graphs).mean(axis=0)\n", + "\n", + " # Save mean graph\n", + " np.save(out_dir / f\"{len(files):>03}_{modality}_{dset}\", mean_graph)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "m2g", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}