feat: ✨ Add more extensive pipeline visualization for GunpowderTrainer/Run. #287

Merged · 3 commits · Sep 12, 2024 · Changes from all commits
4 changes: 3 additions & 1 deletion .gitignore
@@ -30,4 +30,6 @@ tmp/
daisy_logs/

*.csv
*.private
*.private

user_experiments/*
@@ -438,7 +438,8 @@ def _neuroglancer_source(self):
This method is used to return the source array for neuroglancer.
"""
# return self._source_array._neuroglancer_source()
return np.ones_like(self.source_array.data, dtype=np.uint64) * self._constant
shape = self.source_array[self.source_array.roi].shape
return np.ones(shape, dtype=np.uint64) * self._constant

def _combined_neuroglancer_source(self) -> neuroglancer.LocalVolume:
"""
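For context, a minimal sketch of the fixed behavior, assuming a funlib-style array where indexing with the array's own ROI returns only the data inside that region (the stub class below is illustrative, not dacapo's API):

    import numpy as np

    class StubArray:
        # stand-in for the source array: indexing with an ROI slices the data
        def __init__(self, data, roi):
            self.data = data
            self.roi = roi

        def __getitem__(self, roi):
            return self.data[roi]

    source = StubArray(np.zeros(32, dtype=np.uint64), roi=slice(8, 24))
    constant = 7
    shape = source[source.roi].shape  # (16,): only the ROI's extent
    volume = np.ones(shape, dtype=np.uint64) * constant
    # the old np.ones_like(source.data, ...) spanned all 32 voxels instead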
37 changes: 23 additions & 14 deletions dacapo/experiments/datasplits/datasplit_generator.py
@@ -439,10 +439,10 @@ class DataSplitGenerator:
The minimum raw value.
raw_max : int
The maximum raw value.
classes_separator_caracter : str
classes_separator_character : str
The classes separator character.
Methods:
__init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_caracter)
__init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character)
Initializes the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character.
__str__(self)
A method to get the string representation of the class.
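For orientation, the example script further down in this PR drives this class through generate_from_csv; a condensed sketch of that call (the csv path is illustrative):

    from pathlib import Path
    from funlib.geometry import Coordinate
    from dacapo.experiments.datasplits import DataSplitGenerator

    datasplit_config = DataSplitGenerator.generate_from_csv(
        Path("synthetic_example.csv"),  # illustrative path to the crops csv
        Coordinate(8, 8, 8),            # input resolution
        Coordinate(8, 8, 8),            # output resolution
        binarize_gt=True,               # new flag introduced in this PR
    ).compute()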
@@ -482,8 +482,9 @@ def __init__(
min_training_volume_size=8_000, # 20**3
raw_min=0,
raw_max=255,
classes_separator_caracter="&",
classes_separator_character="&",
use_negative_class=False,
binarize_gt=False,
):
"""
Initializes the DataSplitGenerator class with the specified:
@@ -503,6 +504,8 @@ def __init__(
- minimum raw value
- maximum raw value
- classes separator character
- use negative class
- binarize ground truth

Args:
name : str
@@ -535,15 +538,19 @@ def __init__(
The minimum raw value.
raw_max : int
The maximum raw value.
classes_separator_caracter : str
classes_separator_character : str
The classes separator character.
use_negative_class : bool
Whether to use negative classes.
binarize_gt : bool
Whether to binarize the ground truth as part of preprocessing. Use this if you are doing semantic segmentation on instance labels (where each object has a unique ID).
Returns:
obj : The DataSplitGenerator class.
Raises:
ValueError
If the class name is already set, a ValueError is raised.
Examples:
>>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_caracter)
>>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character)
Notes:
This function is used to initialize the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character.
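To make the new binarize_gt flag concrete: instance labels carry one unique ID per object, and binarizing collapses every nonzero ID into a single foreground class. A hedged numpy sketch of the intended effect (the exact semantics live in BinarizeArrayConfig, which is only partially shown in this diff):

    import numpy as np

    instance_labels = np.array([[0, 3, 3],
                                [7, 7, 0]], dtype=np.uint64)  # objects 3 and 7
    semantic_mask = (instance_labels > 0).astype(np.uint8)
    # [[0 1 1]
    #  [1 1 0]]  (a single binary foreground channel)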

@@ -571,8 +578,9 @@ def __init__(
self.min_training_volume_size = min_training_volume_size
self.raw_min = raw_min
self.raw_max = raw_max
self.classes_separator_caracter = classes_separator_caracter
self.classes_separator_character = classes_separator_character
self.use_negative_class = use_negative_class
self.binarize_gt = binarize_gt
if use_negative_class:
if targets is None:
raise ValueError(
@@ -671,7 +679,7 @@ def check_class_name(self, class_name):

"""
datasets, classes = format_class_name(
class_name, self.classes_separator_caracter, self.targets
class_name, self.classes_separator_character, self.targets
)
if self.class_name is None:
self.class_name = classes
@@ -736,7 +744,7 @@ def __generate_semantic_seg_datasplit(self):
mask_config,
) = self.__generate_semantic_seg_dataset_crop(dataset)
if type(self.class_name) == list:
classes = self.classes_separator_caracter.join(self.class_name)
classes = self.classes_separator_character.join(self.class_name)
else:
classes = self.class_name
if dataset.dataset_type == DatasetType.train:
@@ -821,7 +829,7 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec):
organelle_arrays = {}
# classes_datasets, classes = self.check_class_name(gt_dataset)
classes_datasets, classes = format_class_name(
gt_dataset, self.classes_separator_caracter, self.targets
gt_dataset, self.classes_separator_character, self.targets
)
for current_class_dataset, current_class_name in zip(classes_datasets, classes):
if not (gt_path / current_class_dataset).exists():
@@ -843,11 +851,12 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec):
self.output_resolution,
"gt",
)
# gt_config = BinarizeArrayConfig(
# f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized",
# source_array_config=gt_config,
# groupings=[(current_class_name, [])],
# )
if self.binarize_gt:
gt_config = BinarizeArrayConfig(
f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized",
source_array_config=gt_config,
groupings=[(current_class_name, [])],
)
organelle_arrays[current_class_name] = gt_config

if self.targets is None:
26 changes: 25 additions & 1 deletion dacapo/experiments/run.py
@@ -1,7 +1,7 @@
from .datasplits import DataSplit
from .tasks.task import Task
from .architectures import Architecture
from .trainers import Trainer
from .trainers import Trainer, GunpowderTrainer
from .training_stats import TrainingStats
from .validation_scores import ValidationScores
from .starts import Start
@@ -215,3 +215,27 @@ def move_optimizer(

def __str__(self):
return self.name

def visualize_pipeline(self):
"""
Visualizes the pipeline for the run, including all produced arrays.

Examples:
>>> run.visualize_pipeline()

"""
if not isinstance(self.trainer, GunpowderTrainer):
raise NotImplementedError(
"Only GunpowderTrainer is supported for visualization"
)
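# build the batch provider on the fly if it has not been constructed yet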
if not hasattr(self.trainer, "_pipeline"):
from ..store.create_store import create_array_store

array_store = create_array_store()
self.trainer.build_batch_provider(
self.datasplit.train,
self.model,
self.task,
array_store.snapshot_container(self.name),
)
self.trainer.visualize_pipeline()
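A hedged usage sketch, mirroring the example script at the end of this PR (the run name and exact import paths are illustrative):

    from dacapo.experiments import Run
    from dacapo.store.create_store import create_config_store

    config_store = create_config_store()
    run = Run(config_store.retrieve_run_config("my_run"))  # "my_run" is illustrative
    run.visualize_pipeline()  # builds the batch provider if needed, then serves the viewer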
87 changes: 87 additions & 0 deletions dacapo/experiments/trainers/gunpowder_trainer.py
@@ -489,3 +489,90 @@ def can_train(self, datasets) -> bool:

"""
return all([dataset.gt is not None for dataset in datasets])

def visualize_pipeline(self):
if self._pipeline is None:
raise ValueError("Pipeline not initialized!")

import neuroglancer

# self.iteration = 0

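# dig past the outer wrapper nodes to reach the underlying source pipeline;
# with more than one data fetcher there is one extra level to unwrap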
pipeline = self._pipeline.children[0].children[0].copy()
if self.num_data_fetchers > 1:
pipeline = pipeline.children[0]

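# stack a single sample so every array gains the batch dimension used in training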
pipeline += gp.Stack(1)

request = self._request
# raise Exception(request)

def batch_generator():
with gp.build(pipeline):
while True:
yield pipeline.request_batch(request)

batch_gen = batch_generator()

def load_batch(event):
print("fetching_batch")
batch = next(batch_gen)

with viewer.txn() as s:
while len(s.layers) > 0:
del s.layers[0]

# reverse order for raw so we can set opacity to 1, this
# way higher res raw replaces low res when available
for name, array in batch.arrays.items():
print(name)
data = array.data[0]

channel_dims = len(data.shape) - len(array.spec.voxel_size)
assert channel_dims <= 1

dims = neuroglancer.CoordinateSpace(
names=["c^", "z", "y", "x"][-len(data.shape) :],
units="nm",
scales=tuple([1] * channel_dims) + tuple(array.spec.voxel_size),
)

local_vol = neuroglancer.LocalVolume(
data=data,
voxel_offset=tuple([0] * channel_dims)
+ tuple((-array.spec.roi.shape / 2) / array.spec.voxel_size),
dimensions=dims,
)

if name == self._gt_key:
s.layers[str(name)] = neuroglancer.SegmentationLayer(
source=local_vol
)
else:
s.layers[str(name)] = neuroglancer.ImageLayer(source=local_vol)

s.layout = neuroglancer.row_layout(
[
neuroglancer.column_layout(
[
neuroglancer.LayerGroupViewer(
layers=[str(k) for k, v in batch.items()]
),
]
)
]
)

neuroglancer.set_server_bind_address("0.0.0.0")

viewer = neuroglancer.Viewer()

viewer.actions.add("load_batch", load_batch)

with viewer.config_state.txn() as s:
s.input_event_bindings.data_view["keyt"] = "load_batch"

print(viewer)
load_batch(None)

input("Enter to quit!")
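Usage note: "keyt" binds the load_batch action to the t key, so pressing t in the open neuroglancer window fetches and renders a fresh batch; the array matching the ground-truth key is displayed as a segmentation layer and all other arrays as image layers.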
27 changes: 26 additions & 1 deletion examples/distance_task/synthetic_example.py
@@ -157,12 +157,23 @@
from dacapo.experiments.datasplits import DataSplitGenerator
from funlib.geometry import Coordinate

csv_path = Path(runs_base_dir, "synthetic_example.csv")
if not csv_path.exists():
# Create a csv file with the paths to the zarr files
with open(csv_path, "w") as f:
f.write(
f"train,{train_data_path},raw,{train_data_path},[labels]\n"
f"val,{validate_data_path},raw,{validate_data_path},[labels]\n"
# f"test,{test_data_path},raw,{test_data_path},[labels]\n"
)
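# Note: each row appears to encode dataset type (train/val), raw container path,
# raw dataset name, gt container path, and gt dataset name(s) in brackets
# (an inference from this example, not a documented schema).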

input_resolution = Coordinate(8, 8, 8)
output_resolution = Coordinate(8, 8, 8)
datasplit_config = DataSplitGenerator.generate_from_csv(
"/misc/public/dacapo_learnathon/datasplit_csvs/synthetic_example.csv",
csv_path,
input_resolution,
output_resolution,
binarize_gt=True, # Binarize the ground truth data to convert from instance segmentation to semantic segmentation
).compute()

datasplit = datasplit_config.datasplit_type(datasplit_config)
@@ -390,6 +401,20 @@
config_store = create_config_store()
run = Run(config_store.retrieve_run_config(run_config.name))

# First visualize all the steps in the data preprocessing pipeline
from dacapo.store.create_store import create_array_store

array_store = create_array_store()
run.trainer.build_batch_provider(
run.datasplit.train,
run.model,
run.task,
array_store.snapshot_container(run.name),
)
run.trainer.visualize_pipeline()

# %% Now let's train!

# Visualize as we go
run_viewer = NeuroglancerRunViewer(run)
run_viewer.start()