From 4354e81a07a3656868d204bda7d53f9198a19a0b Mon Sep 17 00:00:00 2001 From: David Ackerman Date: Tue, 19 Mar 2024 01:34:37 -0400 Subject: [PATCH 1/4] fix: switch to using cellmap fork of funlib persistence for multiscale compatibility --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5a5ad22aa..4a2766ad7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,9 +45,8 @@ dependencies = [ "funlib.math>=0.1", "funlib.geometry>=0.2", "mwatershed>=0.1", - "funlib.persistence", "cellmap-models", - # "funlib.persistence @ git+https://github.com/janelia-cellmap/funlib.persistence", + "funlib.persistence @ git+https://github.com/janelia-cellmap/funlib.persistence", "funlib.evaluate @ git+https://github.com/pattonw/funlib.evaluate", "gunpowder>=1.3", # "lsds>=0.1.3", From 1cf3993b546fd397b374ec66d3efccb7a811e3d1 Mon Sep 17 00:00:00 2001 From: David Ackerman Date: Tue, 19 Mar 2024 03:00:58 -0400 Subject: [PATCH 2/4] update example scripts --- .../examples/distance_task/cosem_example.py | 74 +------- .../distance_task/cosem_finetune_example.py | 89 +-------- .../distance_task/synthetic_example.py | 169 ++++++++++-------- 3 files changed, 106 insertions(+), 226 deletions(-) diff --git a/dacapo/examples/distance_task/cosem_example.py b/dacapo/examples/distance_task/cosem_example.py index b1103db7e..30dc262eb 100644 --- a/dacapo/examples/distance_task/cosem_example.py +++ b/dacapo/examples/distance_task/cosem_example.py @@ -1,59 +1,3 @@ -# %% [markdown] -# # Dacapo -# -# DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images. -# -# DaCapo has 4 major configurable components: -# 1. **dacapo.datasplits.DataSplit** -# -# 2. **dacapo.tasks.Task** -# -# 3. **dacapo.architectures.Architecture** -# -# 4. **dacapo.trainers.Trainer** -# -# These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train). - -# %% [markdown] -# ## Environment setup -# If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip. -# -# ```bash -# conda create -n dacapo python=3.10 -# conda activate dacapo -# ``` -# -# Then, you can install DaCapo using pip, via GitHub: -# -# ```bash -# pip install git+https://github.com/janelia-cellmap/dacapo.git -# ``` -# -# Or you can clone the repository and install it locally: -# -# ```bash -# git clone https://github.com/janelia-cellmap/dacapo.git -# cd dacapo -# pip install -e . -# ``` -# -# Be sure to select this environment in your Jupyter notebook or JupyterLab. - -# %% [markdown] -# ## Config Store -# To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template: -# -# ```yaml -# type: files -# runs_base_dir: /path/to/my/data/storage -# ``` -# The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file: -# -# ```yaml -# ... 
-# mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/ -# mongodbname: dacapo - # %% # First we need to create a config store to store our configurations from dacapo.store.create_store import create_config_store @@ -75,7 +19,9 @@ input_resolution = Coordinate(8, 8, 8) output_resolution = Coordinate(4, 4, 4) datasplit_config = DataSplitGenerator.generate_from_csv( - "cosem_example.csv", input_resolution, output_resolution + "/misc/public/dacapo_learnathon/datasplit_csvs/cosem_example.csv", + input_resolution, + output_resolution, ).compute() datasplit = datasplit_config.datasplit_type(datasplit_config) @@ -94,7 +40,7 @@ task_config = DistanceTaskConfig( name="cosem_distance_task_4nm", - channels=["labels"], + channels=["mito"], clip_distance=40.0, tol_distance=40.0, scale_factor=80.0, @@ -177,7 +123,7 @@ # ) iterations = 2000 -validation_interval = 50 +validation_interval = iterations // 2 repetitions = 1 for i in range(repetitions): run_config = RunConfig( @@ -221,13 +167,3 @@ run = Run(config_store.retrieve_run_config("cosem_distance_run_4nm")) train_run(run) - -# %% [markdown] -# If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements. - -# # %% -# from dacapo.validate import validate - -# # validate(run_config.name, iterations, num_workers=32) -# validate("cosem_distance_run", 1500, num_workers=10) -# # %% diff --git a/dacapo/examples/distance_task/cosem_finetune_example.py b/dacapo/examples/distance_task/cosem_finetune_example.py index 804cf6404..b77fe2d01 100644 --- a/dacapo/examples/distance_task/cosem_finetune_example.py +++ b/dacapo/examples/distance_task/cosem_finetune_example.py @@ -1,59 +1,3 @@ -# %% [markdown] -# # Dacapo -# -# DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images. -# -# DaCapo has 4 major configurable components: -# 1. **dacapo.datasplits.DataSplit** -# -# 2. **dacapo.tasks.Task** -# -# 3. **dacapo.architectures.Architecture** -# -# 4. **dacapo.trainers.Trainer** -# -# These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train). - -# %% [markdown] -# ## Environment setup -# If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip. -# -# ```bash -# conda create -n dacapo python=3.10 -# conda activate dacapo -# ``` -# -# Then, you can install DaCapo using pip, via GitHub: -# -# ```bash -# pip install git+https://github.com/janelia-cellmap/dacapo.git -# ``` -# -# Or you can clone the repository and install it locally: -# -# ```bash -# git clone https://github.com/janelia-cellmap/dacapo.git -# cd dacapo -# pip install -e . -# ``` -# -# Be sure to select this environment in your Jupyter notebook or JupyterLab. - -# %% [markdown] -# ## Config Store -# To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template: -# -# ```yaml -# type: files -# runs_base_dir: /path/to/my/data/storage -# ``` -# The `runs_base_dir` defines where your on-disk data will be stored. 
The `type` setting determines the database backend. The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file: -# -# ```yaml -# ... -# mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/ -# mongodbname: dacapo - # %% # First we need to create a config store to store our configurations from dacapo.store.create_store import create_config_store @@ -75,7 +19,9 @@ input_resolution = Coordinate(8, 8, 8) output_resolution = Coordinate(4, 4, 4) datasplit_config = DataSplitGenerator.generate_from_csv( - "cosem_example.csv", input_resolution, output_resolution + "/misc/public/dacapo_learnathon/datasplit_csvs/cosem_example.csv", + input_resolution, + output_resolution, ).compute() datasplit = datasplit_config.datasplit_type(datasplit_config) @@ -94,7 +40,7 @@ task_config = DistanceTaskConfig( name="cosem_distance_task_4nm", - channels=["labels"], + channels=["mito"], clip_distance=40.0, tol_distance=40.0, scale_factor=80.0, @@ -168,31 +114,19 @@ from dacapo.experiments import RunConfig from dacapo.experiments.run import Run -start_config = None - -# Uncomment to start from a pretrained model from dacapo.experiments.starts import CosemStartConfig +# We will now download a pretrained cosem model and finetune from that model. It will only have to download the first time it is used. + start_config = CosemStartConfig("setup04", "1820500") start_config.start_type(start_config).check() + iterations = 2000 -validation_interval = 50 +validation_interval = iterations // 2 repetitions = 1 for i in range(repetitions): run_config = RunConfig( name="cosem_distance_run_4nm_finetune", - # # NOTE: This is a template for the name of the run. You can customize it as you see fit. - # name=("_").join( - # [ - # "example", - # "scratch" if start_config is None else "finetuned", - # datasplit_config.name, - # task_config.name, - # architecture_config.name, - # trainer_config.name, - # ] - # ) - # + f"__{i}", datasplit_config=datasplit_config, task_config=task_config, architecture_config=architecture_config, @@ -223,10 +157,3 @@ # %% [markdown] # If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements. - -# # %% -# from dacapo.validate import validate - -# # validate(run_config.name, iterations, num_workers=32) -# validate("cosem_distance_run", 1500, num_workers=10) -# # %% diff --git a/dacapo/examples/distance_task/synthetic_example.py b/dacapo/examples/distance_task/synthetic_example.py index 3612d35d2..de38f7da9 100644 --- a/dacapo/examples/distance_task/synthetic_example.py +++ b/dacapo/examples/distance_task/synthetic_example.py @@ -60,8 +60,10 @@ config_store = create_config_store() + +# %% +# Then let's make sure we have data to train on. If this is already provided, you can skip to the Datasplit section. 
# %% -# Then let's make sure we have data to train on from pathlib import Path from dacapo import Options from dacapo.examples.utils import get_viewer @@ -71,13 +73,13 @@ options = Options.instance() runs_base_dir = options.runs_base_dir -force = False +force_example_creation = False num_workers = 32 # First for training data train_data_path = Path(runs_base_dir, "example_train.zarr") try: - assert not force + assert not force_example_creation raw_array = open_ds(str(train_data_path), "raw") labels_array = open_ds(str(train_data_path), "labels") except: @@ -98,7 +100,7 @@ # Then for validation data validate_data_path = Path(runs_base_dir, "example_validate.zarr") try: - assert not force + assert not force_example_creation raw_array = open_ds(str(validate_data_path), "raw") labels_array = open_ds(str(validate_data_path), "labels") except: @@ -121,76 +123,97 @@ # NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs. # %% -from dacapo.experiments.datasplits.datasets.arrays import ( - BinarizeArrayConfig, - ZarrArrayConfig, - IntensitiesArrayConfig, -) -from dacapo.experiments.datasplits import TrainValidateDataSplitConfig -from dacapo.experiments.datasplits.datasets import RawGTDatasetConfig from pathlib import Path -from dacapo import Options - -options = Options.instance() -runs_base_dir = options.runs_base_dir +from dacapo.experiments.datasplits import DataSplitGenerator +from funlib.geometry import Coordinate -datasplit_config = TrainValidateDataSplitConfig( - name="synthetic_datasplit_config", - train_configs=[ - RawGTDatasetConfig( - name="train_data", - weight=1, - raw_config=IntensitiesArrayConfig( - name="raw_train_data", - source_array_config=ZarrArrayConfig( - name="raw_train_data_uint8", - file_name=Path(runs_base_dir, "example_train.zarr"), - dataset="raw", - ), - min=0.0, - max=255.0, - ), - gt_config=BinarizeArrayConfig( - name="gt_train_data", - source_array_config=ZarrArrayConfig( - name="gt_train_data_zarr", - file_name=Path(runs_base_dir, "example_train.zarr"), - dataset="labels", - ), - groupings=[("labels", [])], - ), - ) - ], - validate_configs=[ - RawGTDatasetConfig( - name="validate_data", - weight=1, - raw_config=IntensitiesArrayConfig( - name="raw_validate_data", - source_array_config=ZarrArrayConfig( - name="raw_validate_data_uint8", - file_name=Path(runs_base_dir, "example_validate.zarr"), - dataset="raw", - ), - min=0.0, - max=255.0, - ), - gt_config=BinarizeArrayConfig( - name="gt_validate_data", - source_array_config=ZarrArrayConfig( - name="gt_validate_data_zarr", - file_name=Path(runs_base_dir, "example_validate.zarr"), - dataset="labels", - ), - groupings=[("labels", [])], - ), - ), - ], -) +input_resolution = Coordinate(8, 8, 8) +output_resolution = Coordinate(8, 8, 8) +datasplit_config = DataSplitGenerator.generate_from_csv( + "/misc/public/dacapo_learnathon/datasplit_csvs/synthetic_example.csv", + input_resolution, + output_resolution, +).compute() -config_store.store_datasplit_config(datasplit_config) datasplit = datasplit_config.datasplit_type(datasplit_config) viewer = datasplit._neuroglancer() +config_store.store_datasplit_config(datasplit_config) + +# %% [markdown] +# The above datasplit_generator automates a lot of the heavy lifting for configuring data to set up a 
run. The following shows everything that it is doing, and an equivalent way to set up the datasplit. +# ```python +# datasplit_config = TrainValidateDataSplitConfig( +# name="synthetic_example_semantic_['labels']_8nm", +# train_configs=[ +# RawGTDatasetConfig( +# name="example_train_[labels]_['labels']_8nm", +# weight=1, +# raw_config=IntensitiesArrayConfig( +# name="raw_example_train_uint8", +# source_array_config=ZarrArrayConfig( +# name="raw_example_train_uint8", +# file_name=Path( +# "/misc/public/dacapo_learnathon/synthetic/example_train.zarr" +# ), +# dataset="raw", +# ), +# min=0, +# max=255, +# ), +# gt_config=BinarizeArrayConfig( +# name="example_train_[labels]_labels_8nm_binarized", +# source_array_config=ZarrArrayConfig( +# name="gt_example_train_labels_uint8", +# file_name=Path( +# "/misc/public/dacapo_learnathon/synthetic/example_train.zarr" +# ), +# dataset="labels", +# ), +# groupings=[("labels", [])], +# background=0, +# ), +# mask_config=None, +# sample_points=None, +# ) +# ], +# validate_configs=[ +# RawGTDatasetConfig( +# name="example_validate_[labels]_['labels']_8nm", +# weight=1, +# raw_config=IntensitiesArrayConfig( +# name="raw_example_validate_uint8", +# source_array_config=ZarrArrayConfig( +# name="raw_example_validate_uint8", +# file_name=Path( +# "/misc/public/dacapo_learnathon/synthetic/example_validate.zarr" +# ), +# dataset="raw", +# ), +# min=0, +# max=255, +# ), +# gt_config=BinarizeArrayConfig( +# name="example_validate_[labels]_labels_8nm_binarized", +# source_array_config=ZarrArrayConfig( +# name="gt_example_validate_labels_uint8", +# file_name=Path( +# "/misc/public/dacapo_learnathon/synthetic/example_validate.zarr" +# ), +# dataset="labels", +# ), +# groupings=[("labels", [])], +# background=0, +# ), +# mask_config=None, +# sample_points=None, +# ) +# ], +# ) +# config_store.store_datasplit_config(datasplit_config) +# datasplit = datasplit_config.datasplit_type(datasplit_config) +# viewer = datasplit._neuroglancer() +# ``` + # %% [markdown] # ## Task @@ -363,7 +386,7 @@ # First let's make some test data test_data_path = Path(runs_base_dir, "example_test.zarr") try: - assert not force + assert not force_example_creation raw_array = open_ds(str(test_data_path), "raw") labels_array = open_ds(str(test_data_path), "labels") except: @@ -392,9 +415,3 @@ output_dtype="float32", output_roi=raw_array.roi, ) -# %% -from dacapo.validate import validate_run - -validate_run(run.name, 50, num_workers=32) - -# %% From 21c2976fc25d25ee686d1e440dea23f766061b6d Mon Sep 17 00:00:00 2001 From: David Ackerman Date: Tue, 19 Mar 2024 03:01:32 -0400 Subject: [PATCH 3/4] add corresponding jupyter notebooks for scripts --- .../distance_task/cosem_example.ipynb | 260 ++++++++ .../cosem_finetune_example.ipynb | 256 ++++++++ .../distance_task/synthetic_example.ipynb | 576 ++++++++++++++++++ 3 files changed, 1092 insertions(+) create mode 100644 dacapo/examples/distance_task/cosem_example.ipynb create mode 100644 dacapo/examples/distance_task/cosem_finetune_example.ipynb create mode 100644 dacapo/examples/distance_task/synthetic_example.ipynb diff --git a/dacapo/examples/distance_task/cosem_example.ipynb b/dacapo/examples/distance_task/cosem_example.ipynb new file mode 100644 index 000000000..916cb7c51 --- /dev/null +++ b/dacapo/examples/distance_task/cosem_example.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + 
"from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "input_resolution = Coordinate(8, 8, 8)\n", + "output_resolution = Coordinate(4, 4, 4)\n", + "datasplit_config = DataSplitGenerator.generate_from_csv(\n", + " \"/misc/public/dacapo_learnathon/datasplit_csvs/cosem_example.csv\",\n", + " input_resolution,\n", + " output_resolution,\n", + ").compute()\n", + "\n", + "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", + "viewer = datasplit._neuroglancer()\n", + "config_store.store_datasplit_config(datasplit_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "task_config = DistanceTaskConfig(\n", + " name=\"cosem_distance_task_4nm\",\n", + " channels=[\"mito\"],\n", + " clip_distance=40.0,\n", + " tol_distance=40.0,\n", + " scale_factor=80.0,\n", + ")\n", + "config_store.store_task_config(task_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"upsample_unet\",\n", + " input_shape=Coordinate(216, 216, 216),\n", + " eval_shape_increase=Coordinate(72, 72, 72),\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmaps_out=72,\n", + " fmap_inc_factor=6,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " constant_upsample=True,\n", + " upsample_factors=[(2, 2, 2)],\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"cosem\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", + " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", + " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "config_store.store_trainer_config(trainer_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "start_config = None\n", + "\n", + "# Uncomment to start from a pretrained model\n", + "# start_config = StartConfig(\n", + "# \"setup04\",\n", + "# \"best\",\n", + "# )\n", + "\n", + "iterations = 2000\n", + "validation_interval = iterations // 2\n", + "repetitions = 1\n", + "for i in range(repetitions):\n", + " run_config = RunConfig(\n", + " name=\"cosem_distance_run_4nm\",\n", + " # # NOTE: This is a template for the name of the run. 
You can customize it as you see fit.\n", + " # name=(\"_\").join(\n", + " # [\n", + " # \"example\",\n", + " # \"scratch\" if start_config is None else \"finetuned\",\n", + " # datasplit_config.name,\n", + " # task_config.name,\n", + " # architecture_config.name,\n", + " # trainer_config.name,\n", + " # ]\n", + " # )\n", + " # + f\"__{i}\",\n", + " datasplit_config=datasplit_config,\n", + " task_config=task_config,\n", + " architecture_config=architecture_config,\n", + " trainer_config=trainer_config,\n", + " num_iterations=iterations,\n", + " validation_interval=validation_interval,\n", + " repetition=i,\n", + " start_config=start_config,\n", + " )\n", + "\n", + " print(run_config.name)\n", + " config_store.store_run_config(run_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm\"))\n", + "train_run(run)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dacapo/examples/distance_task/cosem_finetune_example.ipynb b/dacapo/examples/distance_task/cosem_finetune_example.ipynb new file mode 100644 index 000000000..3517d96e4 --- /dev/null +++ b/dacapo/examples/distance_task/cosem_finetune_example.ipynb @@ -0,0 +1,256 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "input_resolution = Coordinate(8, 8, 8)\n", + "output_resolution = Coordinate(4, 4, 4)\n", + "datasplit_config = DataSplitGenerator.generate_from_csv(\n", + " \"/misc/public/dacapo_learnathon/datasplit_csvs/cosem_example.csv\",\n", + " input_resolution,\n", + " output_resolution,\n", + ").compute()\n", + "\n", + "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", + "viewer = datasplit._neuroglancer()\n", + "config_store.store_datasplit_config(datasplit_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "task_config = DistanceTaskConfig(\n", + " name=\"cosem_distance_task_4nm\",\n", + " channels=[\"mito\"],\n", + " clip_distance=40.0,\n", + " tol_distance=40.0,\n", + " scale_factor=80.0,\n", + ")\n", + "config_store.store_task_config(task_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"upsample_unet\",\n", + " input_shape=Coordinate(216, 216, 216),\n", + " eval_shape_increase=Coordinate(72, 72, 72),\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmaps_out=72,\n", + " fmap_inc_factor=6,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " constant_upsample=True,\n", + " upsample_factors=[(2, 2, 2)],\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." 
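+    ,
+    "\n",
+    "\n",
+    " As a concrete example of what these augments do, the `IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0)` below applies x -> 2x - 1 to the normalized raw data, rescaling it from [0, 1] to [-1, 1] before it reaches the network."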
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"cosem_finetune\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", + " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", + " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "config_store.store_trainer_config(trainer_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "from dacapo.experiments.starts import CosemStartConfig\n", + "\n", + "# We will now download a pretrained cosem model and finetune from that model. It will only have to download the first time it is used.\n", + "\n", + "start_config = CosemStartConfig(\"setup04\", \"1820500\")\n", + "start_config.start_type(start_config).check()\n", + "\n", + "iterations = 2000\n", + "validation_interval = iterations // 2\n", + "repetitions = 1\n", + "for i in range(repetitions):\n", + " run_config = RunConfig(\n", + " name=\"cosem_distance_run_4nm_finetune\",\n", + " datasplit_config=datasplit_config,\n", + " task_config=task_config,\n", + " architecture_config=architecture_config,\n", + " trainer_config=trainer_config,\n", + " num_iterations=iterations,\n", + " validation_interval=validation_interval,\n", + " repetition=i,\n", + " start_config=start_config,\n", + " )\n", + "\n", + " print(run_config.name)\n", + " config_store.store_run_config(run_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. 
This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "run = Run(config_store.retrieve_run_config(\"cosem_distance_run_4nm_finetune\"))\n", + "train_run(run)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements." + ] + } + ], + "nbformat": 4, + "nbformat_minor": 2, + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + } + } +} \ No newline at end of file diff --git a/dacapo/examples/distance_task/synthetic_example.ipynb b/dacapo/examples/distance_task/synthetic_example.ipynb new file mode 100644 index 000000000..5673ea26e --- /dev/null +++ b/dacapo/examples/distance_task/synthetic_example.ipynb @@ -0,0 +1,576 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " # Dacapo\n", + "\n", + " DaCapo is a framework that allows for easy configuration and execution of established machine learning techniques on arbitrarily large volumes of multi-dimensional images.\n", + "\n", + " DaCapo has 4 major configurable components:\n", + " 1. **dacapo.datasplits.DataSplit**\n", + "\n", + " 2. **dacapo.tasks.Task**\n", + "\n", + " 3. **dacapo.architectures.Architecture**\n", + "\n", + " 4. **dacapo.trainers.Trainer**\n", + "\n", + " These are then combined in a single **dacapo.experiments.Run** that includes your starting point (whether you want to start training from scratch or continue off of a previously trained model) and stopping criterion (the number of iterations you want to train)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Environment setup\n", + " If you have not already done so, you will need to install DaCapo. You can do this by first creating a new environment and then installing DaCapo using pip.\n", + "\n", + " ```bash\n", + " conda create -n dacapo python=3.10\n", + " conda activate dacapo\n", + " ```\n", + "\n", + " Then, you can install DaCapo using pip, via GitHub:\n", + "\n", + " ```bash\n", + " pip install git+https://github.com/janelia-cellmap/dacapo.git\n", + " ```\n", + "\n", + " Or you can clone the repository and install it locally:\n", + "\n", + " ```bash\n", + " git clone https://github.com/janelia-cellmap/dacapo.git\n", + " cd dacapo\n", + " pip install -e .\n", + " ```\n", + "\n", + " Be sure to select this environment in your Jupyter notebook or JupyterLab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Config Store\n", + " To define where the data goes, create a dacapo.yaml configuration file either in `~/.config/dacapo/dacapo.yaml` or in `./dacapo.yaml`. Here is a template:\n", + "\n", + " ```yaml\n", + " type: files\n", + " runs_base_dir: /path/to/my/data/storage\n", + " ```\n", + " The `runs_base_dir` defines where your on-disk data will be stored. The `type` setting determines the database backend. 
The default is `files`, which stores the data in a file tree on disk. Alternatively, you can use `mongodb` to store the data in a MongoDB database. To use MongoDB, you will need to provide a `mongodbhost` and `mongodbname` in the configuration file:\n", + "\n", + " ```yaml\n", + " ...\n", + " mongodbhost: mongodb://dbuser:dbpass@dburl:dbport/\n", + " mongodbname: dacapo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Then let's make sure we have data to train on. If this is already provided, you can skip to the Datasplit section." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from dacapo import Options\n", + "from dacapo.examples.utils import get_viewer\n", + "from dacapo.examples.synthetic_source_worker import generate_synthetic_dataset\n", + "from funlib.geometry import Coordinate\n", + "from funlib.persistence import open_ds\n", + "\n", + "options = Options.instance()\n", + "runs_base_dir = options.runs_base_dir\n", + "force_example_creation = False\n", + "num_workers = 32\n", + "\n", + "# First for training data\n", + "train_data_path = Path(runs_base_dir, \"example_train.zarr\")\n", + "try:\n", + " assert not force_example_creation\n", + " raw_array = open_ds(str(train_data_path), \"raw\")\n", + " labels_array = open_ds(str(train_data_path), \"labels\")\n", + "except:\n", + " train_shape = Coordinate((512, 512, 512))\n", + " generate_synthetic_dataset(\n", + " train_data_path,\n", + " shape=train_shape,\n", + " overwrite=True,\n", + " num_workers=num_workers,\n", + " write_shape=Coordinate((128, 128, 128)),\n", + " )\n", + " raw_array = open_ds(str(train_data_path), \"raw\")\n", + " labels_array = open_ds(str(train_data_path), \"labels\")\n", + "\n", + "get_viewer(raw_array, labels_array)\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Then for validation data\n", + "validate_data_path = Path(runs_base_dir, \"example_validate.zarr\")\n", + "try:\n", + " assert not force_example_creation\n", + " raw_array = open_ds(str(validate_data_path), \"raw\")\n", + " labels_array = open_ds(str(validate_data_path), \"labels\")\n", + "except:\n", + " validate_shape = Coordinate((152, 152, 152)) * 3\n", + " generate_synthetic_dataset(\n", + " validate_data_path,\n", + " shape=validate_shape,\n", + " write_shape=Coordinate((152, 152, 152)),\n", + " overwrite=True,\n", + " num_workers=num_workers,\n", + " )\n", + "\n", + "get_viewer(raw_array, labels_array)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. 
The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "input_resolution = Coordinate(8, 8, 8)\n", + "output_resolution = Coordinate(8, 8, 8)\n", + "datasplit_config = DataSplitGenerator.generate_from_csv(\n", + " \"/misc/public/dacapo_learnathon/datasplit_csvs/synthetic_example.csv\",\n", + " input_resolution,\n", + " output_resolution,\n", + ").compute()\n", + "\n", + "datasplit = datasplit_config.datasplit_type(datasplit_config)\n", + "viewer = datasplit._neuroglancer()\n", + "config_store.store_datasplit_config(datasplit_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " The above datasplit_generator automates a lot of the heavy lifting for configuring data to set up a run. The following shows everything that it is doing, and an equivalent way to set up the datasplit.\n", + " ```python\n", + " datasplit_config = TrainValidateDataSplitConfig(\n", + " name=\"synthetic_example_semantic_['labels']_8nm\",\n", + " train_configs=[\n", + " RawGTDatasetConfig(\n", + " name=\"example_train_[labels]_['labels']_8nm\",\n", + " weight=1,\n", + " raw_config=IntensitiesArrayConfig(\n", + " name=\"raw_example_train_uint8\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"raw_example_train_uint8\",\n", + " file_name=Path(\n", + " \"/misc/public/dacapo_learnathon/synthetic/example_train.zarr\"\n", + " ),\n", + " dataset=\"raw\",\n", + " ),\n", + " min=0,\n", + " max=255,\n", + " ),\n", + " gt_config=BinarizeArrayConfig(\n", + " name=\"example_train_[labels]_labels_8nm_binarized\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"gt_example_train_labels_uint8\",\n", + " file_name=Path(\n", + " \"/misc/public/dacapo_learnathon/synthetic/example_train.zarr\"\n", + " ),\n", + " dataset=\"labels\",\n", + " ),\n", + " groupings=[(\"labels\", [])],\n", + " background=0,\n", + " ),\n", + " mask_config=None,\n", + " sample_points=None,\n", + " )\n", + " ],\n", + " validate_configs=[\n", + " RawGTDatasetConfig(\n", + " name=\"example_validate_[labels]_['labels']_8nm\",\n", + " weight=1,\n", + " raw_config=IntensitiesArrayConfig(\n", + " name=\"raw_example_validate_uint8\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"raw_example_validate_uint8\",\n", + " file_name=Path(\n", + " \"/misc/public/dacapo_learnathon/synthetic/example_validate.zarr\"\n", + " ),\n", + " dataset=\"raw\",\n", + " ),\n", + " min=0,\n", + " max=255,\n", + " ),\n", + " gt_config=BinarizeArrayConfig(\n", + " name=\"example_validate_[labels]_labels_8nm_binarized\",\n", + " source_array_config=ZarrArrayConfig(\n", + " name=\"gt_example_validate_labels_uint8\",\n", + " file_name=Path(\n", + " \"/misc/public/dacapo_learnathon/synthetic/example_validate.zarr\"\n", + " ),\n", + " dataset=\"labels\",\n", + " ),\n", + " groupings=[(\"labels\", [])],\n", + " background=0,\n", + " ),\n", + " mask_config=None,\n", + " sample_points=None,\n", + " )\n", + " ],\n", + " )\n", + " config_store.store_datasplit_config(datasplit_config)\n", + " datasplit = datasplit_config.datasplit_type(datasplit_config)\n", + " viewer = 
datasplit._neuroglancer()\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "task_config = DistanceTaskConfig(\n", + " name=\"example_distance_task\",\n", + " channels=[\"labels\"],\n", + " clip_distance=80.0,\n", + " tol_distance=80.0,\n", + " scale_factor=160.0,\n", + ")\n", + "config_store.store_task_config(task_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"example-mini_unet\",\n", + " input_shape=(172, 172, 172),\n", + " fmaps_out=24,\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmap_inc_factor=2,\n", + " downsample_factors=[(2, 2, 2), (2, 2, 2), (2, 2, 2)],\n", + " eval_shape_increase=(72, 72, 72),\n", + ")\n", + "try:\n", + " config_store.store_architecture_config(architecture_config)\n", + "except:\n", + " config_store.delete_architecture_config(architecture_config.name)\n", + " config_store.store_architecture_config(architecture_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"synthetic_distance_trainer\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " IntensityAugmentConfig(scale=(0.25, 1.75), shift=(-0.5, 0.35), clip=True),\n", + " GammaAugmentConfig(gamma_range=(0.5, 2.0)),\n", + " IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "config_store.store_trainer_config(trainer_config)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments import RunConfig\n", + "from dacapo.experiments.run import Run\n", + "\n", + "start_config = None\n", + "\n", + "# Uncomment to start from a pretrained model\n", + "# start_config = StartConfig(\n", + "# \"setup04\",\n", + "# \"best\",\n", + "# )\n", + "\n", + "iterations = 2000\n", + "validation_interval = iterations // 2\n", + "repetitions = 1\n", + "for i in range(repetitions):\n", + " run_config = RunConfig(\n", + " name=\"example_synthetic_distance_run\",\n", + " # # NOTE: This is a template for the name of the run. You can customize it as you see fit.\n", + " # name=(\"_\").join(\n", + " # [\n", + " # \"example\",\n", + " # \"scratch\" if start_config is None else \"finetuned\",\n", + " # datasplit_config.name,\n", + " # task_config.name,\n", + " # architecture_config.name,\n", + " # trainer_config.name,\n", + " # ]\n", + " # )\n", + " # + f\"__{i}\",\n", + " datasplit_config=datasplit_config,\n", + " task_config=task_config,\n", + " architecture_config=architecture_config,\n", + " trainer_config=trainer_config,\n", + " num_iterations=iterations,\n", + " validation_interval=validation_interval,\n", + " repetition=i,\n", + " start_config=start_config,\n", + " )\n", + "\n", + " print(run_config.name)\n", + " try:\n", + " config_store.store_run_config(run_config)\n", + " except:\n", + " config_store.delete_run_config(run_config.name)\n", + " config_store.store_run_config(run_config)\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Train\n", + " To train one of the runs, you can either do it by first creating a **Run** directly from the run config\n", + " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files, and the run won't start from scratch. 
This may cause errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.train import train_run\n", + "from dacapo.experiments.run import Run\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "config_store = create_config_store()\n", + "\n", + "run = Run(config_store.retrieve_run_config(run_config.name))\n", + "train_run(run)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " If you want to start your run on some compute cluster, you might want to use the command line interface: dacapo train -r {run_config.name}. This makes it particularly convenient to run on compute nodes where you can specify specific compute requirements." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Validate\n", + " Once you have trained your model, you can validate it on the validation datasets used during training. You can use the `dacapo.validate` function to do this. You can also use the command line interface to validate a run: dacapo validate -r {run_config.name} -i {iteration}\n", + " Generally we setup training to automatically validate at a set interval and the model checkpoints are saved at these intervals." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.validate import validate\n", + "\n", + "validate(run_config.name, iterations, num_workers=16, overwrite=True)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Predict\n", + " Once you have trained and validated your model, you can use it to predict on new data. You can use the `dacapo.predict` function to do this. You can also use the command line interface to predict on a run: dacapo predict -r {run_config.name} -i {iteration} -ic {input_container} -id {input_dataset} -op {output_path}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First let's make some test data\n", + "test_data_path = Path(runs_base_dir, \"example_test.zarr\")\n", + "try:\n", + " assert not force_example_creation\n", + " raw_array = open_ds(str(test_data_path), \"raw\")\n", + " labels_array = open_ds(str(test_data_path), \"labels\")\n", + "except:\n", + " test_shape = Coordinate((152, 152, 152)) * 5\n", + " generate_synthetic_dataset(\n", + " test_data_path,\n", + " shape=test_shape,\n", + " overwrite=True,\n", + " write_shape=Coordinate((152, 152, 152)),\n", + " num_workers=num_workers,\n", + " )\n", + "\n", + "get_viewer(raw_array, labels_array)\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.predict import predict\n", + "\n", + "predict(\n", + " run_config.name,\n", + " iterations,\n", + " test_data_path,\n", + " \"raw\",\n", + " test_data_path,\n", + " num_workers=32,\n", + " overwrite=True,\n", + " output_dtype=\"float32\",\n", + " output_roi=raw_array.roi,\n", + ")\n", + "" + ] + } + ], + "nbformat": 4, + "nbformat_minor": 2, + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + } + } +} \ No newline at end of file From 25bf0c36b18c9cab0be55703601f88f9f85b6757 Mon Sep 17 00:00:00 2001 From: David Ackerman Date: Tue, 19 Mar 2024 
03:01:57 -0400 Subject: [PATCH 4/4] add fill in the blank tutorial example --- .../cosem_example_fill_in_the_blank.ipynb | 235 ++++++++++++++++++ .../cosem_example_fill_in_the_blank.py | 143 +++++++++++ 2 files changed, 378 insertions(+) create mode 100644 dacapo/examples/distance_task/cosem_example_fill_in_the_blank.ipynb create mode 100644 dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py diff --git a/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.ipynb b/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.ipynb new file mode 100644 index 000000000..639a5f77c --- /dev/null +++ b/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First we need to create a config store to store our configurations\n", + "from dacapo.store.create_store import create_config_store\n", + "\n", + "# create the config store\n", + "config_store = ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Datasplit\n", + " Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?\n", + " We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.\n", + " NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. The config names are unique and will throw an error if you try to store a config with the same name as an existing config. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.datasplits import DataSplitGenerator\n", + "from funlib.geometry import Coordinate\n", + "\n", + "# We will be working with cosem data and we want to work with 8nm isotropic input resolution for the raw data and output at 4 nm resolution.\n", + "# Create these resolutions as Coordinates.\n", + "input_resolution = ...\n", + "output_resolution = ...\n", + "\n", + "# Create the datasplit config using the cosem_example.csv located in the shared learnathon examples\n", + "datasplit_config = ...\n", + "\n", + "# Create the datasplit, produce the neuroglancer link and store the datasplit\n", + "datasplit = ...\n", + "viewer = ...\n", + "config_store...\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Task\n", + " What do you want to learn? An instance segmentation? If so, how? Affinities,\n", + " Distance Transform, Foreground/Background, etc. Each of these tasks are commonly learned\n", + " and evaluated with specific loss functions and evaluation metrics. Some tasks may\n", + " also require specific non-linearities or output formats from your model." 
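+    ,
+    "\n",
+    "\n",
+    " Hint: the output resolution chosen above is 4 nm, so 10x the output resolution is 40.0 and 20x is 80.0; compare the filled-in values in `cosem_example.py`."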
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.tasks import DistanceTaskConfig\n", + "\n", + "# Create a distance task config where the clip_distance=tol_distance=10x the output resolution,\n", + "# and scale_factor = 20x the output resolution\n", + "task_config = \n", + "config_store....\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Architecture\n", + "\n", + " The setup of the network you will train. Biomedical image to image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.architectures import CNNectomeUNetConfig\n", + "\n", + "architecture_config = CNNectomeUNetConfig(\n", + " name=\"upsample_unet\",\n", + " input_shape=Coordinate(216, 216, 216),\n", + " eval_shape_increase=Coordinate(72, 72, 72),\n", + " fmaps_in=1,\n", + " num_fmaps=12,\n", + " fmaps_out=72,\n", + " fmap_inc_factor=6,\n", + " downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],\n", + " constant_upsample=True,\n", + " upsample_factors=[(2, 2, 2)],\n", + ")\n", + "config_store.store_architecture_config(architecture_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Trainer\n", + "\n", + " How do you want to train? This config defines the training loop and how the other three components work together. What sort of augmentations to apply during training, what learning rate and optimizer to use, what batch size to train with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dacapo.experiments.trainers import GunpowderTrainerConfig\n", + "from dacapo.experiments.trainers.gp_augments import (\n", + " ElasticAugmentConfig,\n", + " GammaAugmentConfig,\n", + " IntensityAugmentConfig,\n", + " IntensityScaleShiftAugmentConfig,\n", + ")\n", + "\n", + "trainer_config = GunpowderTrainerConfig(\n", + " name=\"cosem\",\n", + " batch_size=1,\n", + " learning_rate=0.0001,\n", + " num_data_fetchers=20,\n", + " augments=[\n", + " ElasticAugmentConfig(\n", + " control_point_spacing=[100, 100, 100],\n", + " control_point_displacement_sigma=[10.0, 10.0, 10.0],\n", + " rotation_interval=(0.0, 1.5707963267948966),\n", + " subsample=8,\n", + " uniform_3d_rotation=True,\n", + " ),\n", + " # Create an intensity augment config scaling from .25 to 1.25, shifting from -.5 to .35, and with clipping\n", + " ...,\n", + " # Create a gamma augment config with range .5 to 2\n", + " ...,\n", + " # Create an intensity scale shift agument config to rescale data from the range 0->1 to -1->1\n", + " ...,\n", + " ],\n", + " snapshot_interval=10000,\n", + " min_masked=0.05,\n", + " clip_raw=True,\n", + ")\n", + "# Store the trainer\n", + "config_store....\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## Run\n", + " Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum." 
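+    ,
+    "\n",
+    "\n",
+    " Hint: a complete `RunConfig` takes the datasplit, task, architecture, and trainer configs created above, plus `num_iterations`, `validation_interval`, `repetition`, and `start_config`; see the `RunConfig` call in `cosem_example.py` for a filled-in version."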
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from dacapo.experiments import RunConfig\n",
+        "from dacapo.experiments.run import Run\n",
+        "\n",
+        "start_config = None\n",
+        "\n",
+        "# Uncomment to start from a pretrained model\n",
+        "# start_config = StartConfig(\n",
+        "#     \"setup04\",\n",
+        "#     \"best\",\n",
+        "# )\n",
+        "\n",
+        "iterations = 2000\n",
+        "validation_interval = iterations // 2\n",
+        "# Set up a run using all of the configs and settings you created above\n",
+        "run_config = ...\n",
+        "\n",
+        "print(run_config.name)\n",
+        "config_store...\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        " ## Train\n",
+        " To train one of the runs, you can create a **Run** directly from the run config and train it.\n",
+        " NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files and the run won't start from scratch, which may cause errors."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from dacapo.train import train_run\n",
+        "from dacapo.experiments.run import Run\n",
+        "\n",
+        "# load the run and train it\n",
+        "run = Run(config_store...)\n",
+        "train_run(run)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "DaCapo Learnathon",
+      "language": "python",
+      "name": "dacapo_learnathon"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.13"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+}

diff --git a/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py b/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py
new file mode 100644
index 000000000..9a7fbaf77
--- /dev/null
+++ b/dacapo/examples/distance_task/cosem_example_fill_in_the_blank.py
@@ -0,0 +1,143 @@
+# %%
+# First we need to create a config store to store our configurations
+from dacapo.store.create_store import create_config_store
+
+# create the config store
+config_store = ...
+# %% [markdown]
+# ## Datasplit
+# Where can you find your data? What format is it in? Does it need to be normalized? What data do you want to use for validation?
+
+# We'll assume your data is in a zarr file, and that you have a raw and a ground truth dataset, all stored in your `runs_base_dir` as `example_{type}.zarr` where `{type}` is either `train` or `validate`.
+# NOTE: You may need to delete old config stores if you are re-running this cell with modifications to the configs. Config names must be unique, so storing a config under a name that already exists will throw an error. For the `files` backend, you can delete the `runs_base_dir/configs` directory to remove all stored configs.
+
+# %%
+from dacapo.experiments.datasplits import DataSplitGenerator
+from funlib.geometry import Coordinate
+
+# We will be working with cosem data and we want to work with 8 nm isotropic input resolution for the raw data and output at 4 nm resolution.
+# Create these resolutions as Coordinates.
+input_resolution = ...
+output_resolution = ...
+
+# Create the datasplit config using the cosem_example.csv located in the shared learnathon examples
+datasplit_config = ...
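
For reference, here is one plausible way to fill in the datasplit blanks in this cell, including the three assignments just below. This is a sketch modeled on the companion cosem_example.py updated earlier in this patch series; the CSV path is a stand-in for wherever your copy of the learnathon CSV lives, and the `generate_from_csv`/`store_datasplit_config` calls are assumptions about the DaCapo API rather than the only valid answer:

```python
from dacapo.experiments.datasplits import DataSplitGenerator
from dacapo.store.create_store import create_config_store
from funlib.geometry import Coordinate

config_store = create_config_store()

# 8 nm isotropic input, 4 nm output, as the comments above specify
input_resolution = Coordinate(8, 8, 8)
output_resolution = Coordinate(4, 4, 4)

# Build the datasplit config from the learnathon CSV (path is a placeholder)
datasplit_config = DataSplitGenerator.generate_from_csv(
    "cosem_example.csv", input_resolution, output_resolution
).compute()

# Instantiate the datasplit, get a neuroglancer viewer link, and store the config
datasplit = datasplit_config.datasplit_type(datasplit_config)
viewer = datasplit._neuroglancer()
config_store.store_datasplit_config(datasplit_config)
```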
+
+# Create the datasplit, produce the neuroglancer link and store the datasplit
+datasplit = ...
+viewer = ...
+config_store...
+
+# %% [markdown]
+# ## Task
+# What do you want to learn? An instance segmentation? If so, how? Affinities,
+# Distance Transform, Foreground/Background, etc. Each of these tasks is commonly learned
+# and evaluated with specific loss functions and evaluation metrics. Some tasks may
+# also require specific non-linearities or output formats from your model.
+
+# %%
+from dacapo.experiments.tasks import DistanceTaskConfig
+
+# Create a distance task config where the clip_distance=tol_distance=10x the output resolution,
+# and scale_factor = 20x the output resolution
+task_config = ...
+config_store...
+
+# %% [markdown]
+# ## Architecture
+#
+# The setup of the network you will train. Biomedical image-to-image translation often utilizes a UNet, but even after choosing a UNet you still need to provide some additional parameters. How much do you want to downsample? How many convolutional layers do you want?
+
+# %%
+from dacapo.experiments.architectures import CNNectomeUNetConfig
+
+architecture_config = CNNectomeUNetConfig(
+    name="upsample_unet",
+    input_shape=Coordinate(216, 216, 216),
+    eval_shape_increase=Coordinate(72, 72, 72),
+    fmaps_in=1,
+    num_fmaps=12,
+    fmaps_out=72,
+    fmap_inc_factor=6,
+    downsample_factors=[(2, 2, 2), (3, 3, 3), (3, 3, 3)],
+    constant_upsample=True,
+    upsample_factors=[(2, 2, 2)],
+)
+config_store.store_architecture_config(architecture_config)
+
+# %% [markdown]
+# ## Trainer
+#
+# How do you want to train? This config defines the training loop and how the other three components work together: what sort of augmentations to apply during training, what learning rate and optimizer to use, and what batch size to train with.
+
+# %%
+from dacapo.experiments.trainers import GunpowderTrainerConfig
+from dacapo.experiments.trainers.gp_augments import (
+    ElasticAugmentConfig,
+    GammaAugmentConfig,
+    IntensityAugmentConfig,
+    IntensityScaleShiftAugmentConfig,
+)
+
+trainer_config = GunpowderTrainerConfig(
+    name="cosem",
+    batch_size=1,
+    learning_rate=0.0001,
+    num_data_fetchers=20,
+    augments=[
+        ElasticAugmentConfig(
+            control_point_spacing=[100, 100, 100],
+            control_point_displacement_sigma=[10.0, 10.0, 10.0],
+            rotation_interval=(0.0, 1.5707963267948966),
+            subsample=8,
+            uniform_3d_rotation=True,
+        ),
+        # Create an intensity augment config scaling from .25 to 1.25, shifting from -.5 to .35, and with clipping
+        ...,
+        # Create a gamma augment config with range .5 to 2
+        ...,
+        # Create an intensity scale shift augment config to rescale data from the range 0->1 to -1->1
+        ...,
+    ],
+    snapshot_interval=10000,
+    min_masked=0.05,
+    clip_raw=True,
+)
+# Store the trainer
+config_store...
+
+# %% [markdown]
+# ## Run
+# Now that we have our components configured, we just need to combine them into a run and start training. We can have multiple repetitions of a single set of configs in order to increase our chances of finding an optimum.
+
+# %%
+from dacapo.experiments import RunConfig
+from dacapo.experiments.run import Run
+
+start_config = None
+
+# Uncomment to start from a pretrained model
+# start_config = StartConfig(
+#     "setup04",
+#     "best",
+# )
+
+iterations = 2000
+validation_interval = iterations // 2
+# Set up a run using all of the configs and settings you created above
+run_config = ...
+
+print(run_config.name)
+config_store...
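
Similarly, here is one possible completion of the task, augment, and run blanks above, assuming the variables defined in the preceding cells (`trainer_config`, `architecture_config`, `datasplit_config`, `iterations`, `validation_interval`, `start_config`) and the config-class signatures used in the companion cosem_example.py. The channel list and run name are hypothetical, and with a 4 nm output resolution the 10x and 20x factors work out to 40 and 80:

```python
from dacapo.experiments.tasks import DistanceTaskConfig
from dacapo.experiments import RunConfig
from dacapo.experiments.trainers.gp_augments import (
    GammaAugmentConfig,
    IntensityAugmentConfig,
    IntensityScaleShiftAugmentConfig,
)

# clip_distance = tol_distance = 10 * 4 nm, scale_factor = 20 * 4 nm
task_config = DistanceTaskConfig(
    name="cosem_distance",
    channels=["mito"],  # hypothetical label channel; match your ground truth
    clip_distance=40.0,
    tol_distance=40.0,
    scale_factor=80.0,
)
config_store.store_task_config(task_config)

# The three augment blanks in the trainer's `augments` list, in order;
# drop these in place of the `...` entries after the ElasticAugmentConfig
intensity_augments = [
    IntensityAugmentConfig(scale=(0.25, 1.25), shift=(-0.5, 0.35), clip=True),
    GammaAugmentConfig(gamma_range=(0.5, 2.0)),
    # assuming scale-then-shift semantics: x * 2.0 - 1.0 maps [0, 1] to [-1, 1]
    IntensityScaleShiftAugmentConfig(scale=2.0, shift=-1.0),
]
config_store.store_trainer_config(trainer_config)

# Combine the pieces defined above into a single run and store it
run_config = RunConfig(
    name="example_cosem_distance_run",
    datasplit_config=datasplit_config,
    task_config=task_config,
    architecture_config=architecture_config,
    trainer_config=trainer_config,
    num_iterations=iterations,
    validation_interval=validation_interval,
    repetition=0,
    start_config=start_config,
)
config_store.store_run_config(run_config)
```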
+
+# %% [markdown]
+# ## Train
+
+# To train one of the runs, you can create a **Run** directly from the run config and train it.
+# NOTE: The run stats are stored in the `runs_base_dir/stats` directory. You can delete this directory to remove all stored stats if you want to re-run training. Otherwise, the stats will be appended to the existing files and the run won't start from scratch, which may cause errors.
+# %%
+from dacapo.train import train_run
+from dacapo.experiments.run import Run
+# load the run and train it
+run = Run(config_store...)
+train_run(run)
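
Finally, a sketch of the last blank: pull the stored run config back out of the config store and train it. `retrieve_run_config` is the retrieval method one would expect the DaCapo config store to expose, but treat it as an assumption and verify it against your installed version:

```python
from dacapo.train import train_run
from dacapo.experiments.run import Run

# Rebuild the Run from the stored config and launch training
run = Run(config_store.retrieve_run_config(run_config.name))
train_run(run)
```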