add tutorial notebooks (PR 7 of N) #10

Open · wants to merge 1 commit into base: bkmartinjr/add-shuffling
8 changes: 6 additions & 2 deletions .github/workflows/python-tiledbsoma-ml.yml
@@ -3,10 +3,14 @@ name: python-tiledbsoma-ml CI
 on:
   pull_request:
     branches: ["**"]
-    paths-ignore: ['scripts/**']
+    paths-ignore:
+      - "scripts/**"
+      - "notebooks/**"
   push:
     branches: [main]
-    paths-ignore: ['scripts/**']
+    paths-ignore:
+      - "scripts/**"
+      - "notebooks/**"
   workflow_dispatch:
 
 jobs:
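For reference, GitHub Actions skips a `push` or `pull_request` trigger when every changed file matches a `paths-ignore` pattern, so commits that only touch `scripts/` or the new `notebooks/` directory will no longer run CI. A sketch of the resulting `pull_request` trigger block (reconstructed from the hunk above):

on:
  pull_request:
    branches: ["**"]
    paths-ignore:
      - "scripts/**"    # helper scripts, not exercised by CI
      - "notebooks/**"  # tutorial notebooks added in this PR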
242 changes: 242 additions & 0 deletions notebooks/tutorial_lightning.ipynb
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training a model with PyTorch Lightning\n",
"\n",
"This tutorial provides a quick overview of training a toy model with Lightning, using the `tiledbsoma_ml.ExperimentAxisQueryIterableDataset` class, on data from the [CZI CELLxGENE Census](https://chanzuckerberg.github.io/cellxgene-census/) dataset. This is intended only to demonstrate the use of the `ExperimentAxisQueryIterableDataset`, and not as an example of how to train a biologically useful model.\n",
"\n",
"For more information on these API, please refer to the [`tutorial_pytorch` notebook](tutorial_pytorch.ipynb).\n",
"\n",
"**Prerequesites**\n",
"\n",
"Install `tiledbsoma_ml` and `scikit-learn`, for example:\n",
[Review comment from a Member] Does pytorch_lightning need to be listed here too?

"\n",
"> pip install tiledbsoma_ml scikit-learn\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize SOMA Experiment query as training data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/torchdata/datapipes/__init__.py:18: UserWarning: \n",
"################################################################################\n",
"WARNING!\n",
"The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a\n",
"future torchdata release! Please see https://github.com/pytorch/data/issues/1196\n",
"to learn more and leave feedback.\n",
"################################################################################\n",
"\n",
" deprecation_warning()\n"
]
}
],
"source": [
"import pytorch_lightning as pl\n",
"import tiledbsoma as soma\n",
"import torch\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"import tiledbsoma_ml as soma_ml\n",
"\n",
"CZI_Census_Homo_Sapiens_URL = \"s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/\"\n",
"\n",
"experiment = soma.open(\n",
" CZI_Census_Homo_Sapiens_URL,\n",
" context=soma.SOMATileDBContext(tiledb_config={\"vfs.s3.region\": \"us-west-2\"}),\n",
")\n",
"obs_value_filter = \"tissue_general == 'tongue' and is_primary_data == True\"\n",
"\n",
"with experiment.axis_query(\n",
" measurement_name=\"RNA\", obs_query=soma.AxisQuery(value_filter=obs_value_filter)\n",
") as query:\n",
" obs_df = query.obs(column_names=[\"cell_type\"]).concat().to_pandas()\n",
" cell_type_encoder = LabelEncoder().fit(obs_df[\"cell_type\"].unique())\n",
"\n",
" experiment_dataset = soma_ml.ExperimentAxisQueryIterableDataset(\n",
" query,\n",
" X_name=\"raw\",\n",
" obs_column_names=[\"cell_type\"],\n",
" batch_size=128,\n",
" shuffle=True,\n",
" )"
]
},
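{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick aside (not part of the original tutorial), each item yielded by the dataset is a tuple of an `X` batch (a dense NumPy array of expression values) and an `obs` batch (a pandas DataFrame with the requested `obs` columns), as the sketch below assumes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at one shuffled batch to confirm its structure (sketch; shapes depend on the query)\n",
"X_batch, obs_batch = next(iter(experiment_dataset))\n",
"print(X_batch.shape)  # (batch_size, num_genes)\n",
"obs_batch.head()  # DataFrame with a cell_type column"
]
},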
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Lightning module"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class LogisticRegressionLightning(pl.LightningModule):\n",
" def __init__(self, input_dim, output_dim, cell_type_encoder, learning_rate=1e-5):\n",
" super(LogisticRegressionLightning, self).__init__()\n",
" self.linear = torch.nn.Linear(input_dim, output_dim)\n",
" self.cell_type_encoder = cell_type_encoder\n",
" self.learning_rate = learning_rate\n",
" self.loss_fn = torch.nn.CrossEntropyLoss()\n",
"\n",
" def forward(self, x):\n",
" outputs = torch.sigmoid(self.linear(x))\n",
" return outputs\n",
"\n",
" def training_step(self, batch, batch_idx):\n",
" X_batch, y_batch = batch\n",
" # X_batch = X_batch.float()\n",
" X_batch = torch.from_numpy(X_batch).float().to(self.device)\n",
"\n",
" # Perform prediction\n",
" outputs = self(X_batch)\n",
"\n",
" # Determine the predicted label\n",
" probabilities = torch.nn.functional.softmax(outputs, 1)\n",
" predictions = torch.argmax(probabilities, axis=1)\n",
"\n",
" # Compute loss\n",
" y_batch = torch.from_numpy(\n",
" self.cell_type_encoder.transform(y_batch[\"cell_type\"])\n",
" ).to(self.device)\n",
" loss = self.loss_fn(outputs, y_batch.long())\n",
"\n",
" # Compute accuracy\n",
" train_correct = (predictions == y_batch).sum().item()\n",
" train_accuracy = train_correct / len(predictions)\n",
"\n",
" # Log loss and accuracy\n",
" self.log(\"train_loss\", loss, prog_bar=True)\n",
" self.log(\"train_accuracy\", train_accuracy, prog_bar=True)\n",
"\n",
" return loss\n",
"\n",
" def configure_optimizers(self):\n",
" optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n",
" return optimizer"
]
},
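{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sanity check (a hedged aside, not part of the original tutorial), the module can be smoke-tested on random data; the dimensions below are arbitrary placeholders:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical smoke test: forward a random batch through an untrained module\n",
"toy_model = LogisticRegressionLightning(input_dim=50, output_dim=3, cell_type_encoder=None)\n",
"print(toy_model(torch.randn(4, 50)).shape)  # expected: torch.Size([4, 3])"
]
},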
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"GPU available: True (cuda), used: True\n",
"TPU available: False, using: 0 TPU cores\n",
"HPU available: False, using: 0 HPUs\n",
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n",
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
"\n",
" | Name | Type | Params | Mode \n",
"-----------------------------------------------------\n",
"0 | linear | Linear | 726 K | train\n",
"1 | loss_fn | CrossEntropyLoss | 0 | train\n",
"-----------------------------------------------------\n",
"726 K Trainable params\n",
"0 Non-trainable params\n",
"726 K Total params\n",
"2.905 Total estimated model params size (MB)\n",
"2 Modules in train mode\n",
"0 Modules in eval mode\n",
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.\n",
"/home/bruce/miniforge3/envs/toymodel/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:122: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.31it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"`Trainer.fit` stopped: `max_epochs=20` reached.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 19: 100%|██████████| 118/118 [00:08<00:00, 14.28it/s, v_num=5, train_loss=1.670, train_accuracy=0.977]\n"
]
}
],
"source": [
"dataloader = soma_ml.experiment_dataloader(experiment_dataset)\n",
"\n",
"# The size of the input dimension is the number of genes\n",
"input_dim = experiment_dataset.shape[1]\n",
"\n",
"# The size of the output dimension is the number of distinct cell_type values\n",
"output_dim = len(cell_type_encoder.classes_)\n",
"\n",
"# Initialize the PyTorch Lightning model\n",
"model = LogisticRegressionLightning(\n",
" input_dim, output_dim, cell_type_encoder=cell_type_encoder\n",
")\n",
"\n",
"# Define the PyTorch Lightning Trainer\n",
"trainer = pl.Trainer(max_epochs=20)\n",
"\n",
"# set precision\n",
"torch.set_float32_matmul_precision(\"high\")\n",
"\n",
"# Train the model\n",
"trainer.fit(model, train_dataloaders=dataloader)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "toymodel",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
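
Once training completes, the fitted model can be used for inference. The following is a minimal sketch (not part of the notebook above) that assumes the `model`, `experiment_dataset`, and `cell_type_encoder` objects defined in the tutorial; `inverse_transform` maps predicted class indices back to cell-type labels:

import torch

model.eval()
with torch.no_grad():
    # Draw one batch; the dataset yields (NumPy X batch, pandas obs batch) tuples
    X_batch, obs_batch = next(iter(experiment_dataset))
    X_tensor = torch.from_numpy(X_batch).float().to(model.device)
    outputs = model(X_tensor)
    predictions = torch.argmax(outputs, dim=1).cpu().numpy()
    # Map class indices back to cell-type labels and compare with ground truth
    predicted = cell_type_encoder.inverse_transform(predictions)
    print(list(zip(predicted[:5], obs_batch["cell_type"][:5])))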