feat: ✨ Add more extensive pipeline visualization for GunpowderTrainer/Run. #287

Merged · 3 commits · Sep 12, 2024 · Changes from all commits
4 changes: 3 additions & 1 deletion .gitignore
@@ -30,4 +30,6 @@ tmp/
daisy_logs/

*.csv
*.private
*.private

user_experiments/*
@@ -438,7 +438,8 @@ def _neuroglancer_source(self):
This method is used to return the source array for neuroglancer.
"""
# return self._source_array._neuroglancer_source()
return np.ones_like(self.source_array.data, dtype=np.uint64) * self._constant
shape = self.source_array[self.source_array.roi].shape
return np.ones(shape, dtype=np.uint64) * self._constant

def _combined_neuroglancer_source(self) -> neuroglancer.LocalVolume:
"""
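For context, a minimal sketch of the fixed behavior, assuming a funlib-style array where indexing with the array's own ROI returns only the data inside that region (the stub class below is illustrative, not dacapo's API):

    import numpy as np

    class StubArray:
        # stand-in for the source array: indexing with an ROI slices the data
        def __init__(self, data, roi):
            self.data = data
            self.roi = roi

        def __getitem__(self, roi):
            return self.data[roi]

    source = StubArray(np.zeros(32, dtype=np.uint64), roi=slice(8, 24))
    constant = 7
    shape = source[source.roi].shape  # (16,): only the ROI's extent
    volume = np.ones(shape, dtype=np.uint64) * constant
    # the old np.ones_like(source.data, ...) spanned all 32 voxels instead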
37 changes: 23 additions & 14 deletions dacapo/experiments/datasplits/datasplit_generator.py
@@ -439,10 +439,10 @@ class DataSplitGenerator:
The minimum raw value.
raw_max : int
The maximum raw value.
classes_separator_caracter : str
classes_separator_character : str
The classes separator character.
Methods:
__init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_caracter)
__init__(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character)
Initializes the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character.
__str__(self)
A method to get the string representation of the class.
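For orientation, the example script further down in this PR drives this class through generate_from_csv; a condensed sketch of that call (the csv path is illustrative):

    from pathlib import Path
    from funlib.geometry import Coordinate
    from dacapo.experiments.datasplits import DataSplitGenerator

    datasplit_config = DataSplitGenerator.generate_from_csv(
        Path("synthetic_example.csv"),  # illustrative path to the crops csv
        Coordinate(8, 8, 8),            # input resolution
        Coordinate(8, 8, 8),            # output resolution
        binarize_gt=True,               # new flag introduced in this PR
    ).compute()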
@@ -482,8 +482,9 @@ def __init__(
min_training_volume_size=8_000, # 20**3
raw_min=0,
raw_max=255,
classes_separator_caracter="&",
classes_separator_character="&",
use_negative_class=False,
binarize_gt=False,
):
"""
Initializes the DataSplitGenerator class with the specified:
@@ -503,6 +504,8 @@ def __init__(
- minimum raw value
- maximum raw value
- classes separator character
- use negative class
- binarize ground truth

Args:
name : str
@@ -535,15 +538,19 @@ def __init__(
The minimum raw value.
raw_max : int
The maximum raw value.
classes_separator_caracter : str
classes_separator_character : str
The classes separator character.
use_negative_class : bool
Whether to use negative classes.
binarize_gt : bool
Whether to binarize the ground truth as part of preprocessing. Use this if you are doing semantic segmentation on instance labels (where each object has a unique ID).
Returns:
obj : The DataSplitGenerator class.
Raises:
ValueError
If the class name is already set, a ValueError is raised.
Examples:
>>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_caracter)
>>> DataSplitGenerator(name, datasets, input_resolution, output_resolution, targets, segmentation_type, max_gt_downsample, max_gt_upsample, max_raw_training_downsample, max_raw_training_upsample, max_raw_validation_downsample, max_raw_validation_upsample, min_training_volume_size, raw_min, raw_max, classes_separator_character)
Notes:
This function is used to initialize the DataSplitGenerator class with the specified name, datasets, input resolution, output resolution, targets, segmentation type, maximum ground truth downsample, maximum ground truth upsample, maximum raw training downsample, maximum raw training upsample, maximum raw validation downsample, maximum raw validation upsample, minimum training volume size, minimum raw value, maximum raw value, and classes separator character.
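To make the new binarize_gt flag concrete: instance labels carry one unique ID per object, and binarizing collapses every nonzero ID into a single foreground class. A hedged numpy sketch of the intended effect (the exact semantics live in BinarizeArrayConfig, which is only partially shown in this diff):

    import numpy as np

    instance_labels = np.array([[0, 3, 3],
                                [7, 7, 0]], dtype=np.uint64)  # objects 3 and 7
    semantic_mask = (instance_labels > 0).astype(np.uint8)
    # [[0 1 1]
    #  [1 1 0]]  (a single binary foreground channel)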

@@ -571,8 +578,9 @@ def __init__(
self.min_training_volume_size = min_training_volume_size
self.raw_min = raw_min
self.raw_max = raw_max
self.classes_separator_caracter = classes_separator_caracter
self.classes_separator_character = classes_separator_character
self.use_negative_class = use_negative_class
self.binarize_gt = binarize_gt
if use_negative_class:
if targets is None:
raise ValueError(
@@ -671,7 +679,7 @@ def check_class_name(self, class_name):

"""
datasets, classes = format_class_name(
class_name, self.classes_separator_caracter, self.targets
class_name, self.classes_separator_character, self.targets
)
if self.class_name is None:
self.class_name = classes
@@ -736,7 +744,7 @@ def __generate_semantic_seg_datasplit(self):
mask_config,
) = self.__generate_semantic_seg_dataset_crop(dataset)
if type(self.class_name) == list:
classes = self.classes_separator_caracter.join(self.class_name)
classes = self.classes_separator_character.join(self.class_name)
else:
classes = self.class_name
if dataset.dataset_type == DatasetType.train:
@@ -821,7 +829,7 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec):
organelle_arrays = {}
# classes_datasets, classes = self.check_class_name(gt_dataset)
classes_datasets, classes = format_class_name(
gt_dataset, self.classes_separator_caracter, self.targets
gt_dataset, self.classes_separator_character, self.targets
)
for current_class_dataset, current_class_name in zip(classes_datasets, classes):
if not (gt_path / current_class_dataset).exists():
@@ -843,11 +851,12 @@ def __generate_semantic_seg_dataset_crop(self, dataset: DatasetSpec):
self.output_resolution,
"gt",
)
# gt_config = BinarizeArrayConfig(
# f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized",
# source_array_config=gt_config,
# groupings=[(current_class_name, [])],
# )
if self.binarize_gt:
gt_config = BinarizeArrayConfig(
f"{dataset}_{current_class_name}_{self.output_resolution[0]}nm_binarized",
source_array_config=gt_config,
groupings=[(current_class_name, [])],
)
organelle_arrays[current_class_name] = gt_config

if self.targets is None:
26 changes: 25 additions & 1 deletion dacapo/experiments/run.py
@@ -1,7 +1,7 @@
from .datasplits import DataSplit
from .tasks.task import Task
from .architectures import Architecture
from .trainers import Trainer
from .trainers import Trainer, GunpowderTrainer
from .training_stats import TrainingStats
from .validation_scores import ValidationScores
from .starts import Start
@@ -215,3 +215,27 @@ def move_optimizer(

def __str__(self):
return self.name

def visualize_pipeline(self):
"""
Visualizes the pipeline for the run, including all produced arrays.

Examples:
>>> run.visualize_pipeline()

"""
if not isinstance(self.trainer, GunpowderTrainer):
raise NotImplementedError(
"Only GunpowderTrainer is supported for visualization"
)
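# build the batch provider on the fly if it has not been constructed yet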
if not hasattr(self.trainer, "_pipeline"):
from ..store.create_store import create_array_store

array_store = create_array_store()
self.trainer.build_batch_provider(
self.datasplit.train,
self.model,
self.task,
array_store.snapshot_container(self.name),
)
self.trainer.visualize_pipeline()
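A hedged usage sketch, mirroring the example script at the end of this PR (the run name and exact import paths are illustrative):

    from dacapo.experiments import Run
    from dacapo.store.create_store import create_config_store

    config_store = create_config_store()
    run = Run(config_store.retrieve_run_config("my_run"))  # "my_run" is illustrative
    run.visualize_pipeline()  # builds the batch provider if needed, then serves the viewer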
87 changes: 87 additions & 0 deletions dacapo/experiments/trainers/gunpowder_trainer.py
@@ -489,3 +489,90 @@ def can_train(self, datasets) -> bool:

"""
return all([dataset.gt is not None for dataset in datasets])

def visualize_pipeline(self):
if self._pipeline is None:
raise ValueError("Pipeline not initialized!")

import neuroglancer

# self.iteration = 0

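# dig past the outer wrapper nodes to reach the underlying source pipeline;
# with more than one data fetcher there is one extra level to unwrap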
pipeline = self._pipeline.children[0].children[0].copy()
if self.num_data_fetchers > 1:
pipeline = pipeline.children[0]

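# stack a single sample so every array gains the batch dimension used in training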
pipeline += gp.Stack(1)

request = self._request
# raise Exception(request)

def batch_generator():
with gp.build(pipeline):
while True:
yield pipeline.request_batch(request)

batch_gen = batch_generator()

def load_batch(event):
print("fetching_batch")
batch = next(batch_gen)

with viewer.txn() as s:
while len(s.layers) > 0:
del s.layers[0]

# reverse order for raw so we can set opacity to 1, this
# way higher res raw replaces low res when available
for name, array in batch.arrays.items():
print(name)
data = array.data[0]

channel_dims = len(data.shape) - len(array.spec.voxel_size)
assert channel_dims <= 1

dims = neuroglancer.CoordinateSpace(
names=["c^", "z", "y", "x"][-len(data.shape) :],
units="nm",
scales=tuple([1] * channel_dims) + tuple(array.spec.voxel_size),
)

local_vol = neuroglancer.LocalVolume(
data=data,
voxel_offset=tuple([0] * channel_dims)
+ tuple((-array.spec.roi.shape / 2) / array.spec.voxel_size),
dimensions=dims,
)

if name == self._gt_key:
s.layers[str(name)] = neuroglancer.SegmentationLayer(
source=local_vol
)
else:
s.layers[str(name)] = neuroglancer.ImageLayer(source=local_vol)

s.layout = neuroglancer.row_layout(
[
neuroglancer.column_layout(
[
neuroglancer.LayerGroupViewer(
layers=[str(k) for k, v in batch.items()]
),
]
)
]
)

neuroglancer.set_server_bind_address("0.0.0.0")

viewer = neuroglancer.Viewer()

viewer.actions.add("load_batch", load_batch)

with viewer.config_state.txn() as s:
s.input_event_bindings.data_view["keyt"] = "load_batch"

print(viewer)
load_batch(None)

input("Enter to quit!")
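Usage note: "keyt" binds the load_batch action to the t key, so pressing t in the open neuroglancer window fetches and renders a fresh batch; the array matching the ground-truth key is displayed as a segmentation layer and all other arrays as image layers.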
27 changes: 26 additions & 1 deletion examples/distance_task/synthetic_example.py
@@ -157,12 +157,23 @@
from dacapo.experiments.datasplits import DataSplitGenerator
from funlib.geometry import Coordinate

csv_path = Path(runs_base_dir, "synthetic_example.csv")
if not csv_path.exists():
# Create a csv file with the paths to the zarr files
with open(csv_path, "w") as f:
f.write(
f"train,{train_data_path},raw,{train_data_path},[labels]\n"
f"val,{validate_data_path},raw,{validate_data_path},[labels]\n"
# f"test,{test_data_path},raw,{test_data_path},[labels]\n"
)
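# Note: each row appears to encode dataset type (train/val), raw container path,
# raw dataset name, gt container path, and gt dataset name(s) in brackets
# (an inference from this example, not a documented schema).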

input_resolution = Coordinate(8, 8, 8)
output_resolution = Coordinate(8, 8, 8)
datasplit_config = DataSplitGenerator.generate_from_csv(
"/misc/public/dacapo_learnathon/datasplit_csvs/synthetic_example.csv",
csv_path,
input_resolution,
output_resolution,
binarize_gt=True, # Binarize the ground truth data to convert from instance segmentation to semantic segmentation
).compute()

datasplit = datasplit_config.datasplit_type(datasplit_config)
@@ -390,6 +401,20 @@
config_store = create_config_store()
run = Run(config_store.retrieve_run_config(run_config.name))

# First visualize all the steps in the data preprocessing pipeline
from dacapo.store.create_store import create_array_store

array_store = create_array_store()
run.trainer.build_batch_provider(
run.datasplit.train,
run.model,
run.task,
array_store.snapshot_container(run.name),
)
run.trainer.visualize_pipeline()

# %% Now let's train!

# Visualize as we go
run_viewer = NeuroglancerRunViewer(run)
run_viewer.start()