diff --git a/dacapo/experiments/tasks/evaluators/instance_evaluator.py b/dacapo/experiments/tasks/evaluators/instance_evaluator.py index 0f3427a40..d2fc91678 100644 --- a/dacapo/experiments/tasks/evaluators/instance_evaluator.py +++ b/dacapo/experiments/tasks/evaluators/instance_evaluator.py @@ -3,22 +3,48 @@ from .evaluator import Evaluator from .instance_evaluation_scores import InstanceEvaluationScores -from funlib.evaluate import rand_voi +from funlib.evaluate import rand_voi, detection_scores + +try: + from funlib.segment.arrays import relabel + + iou = True +except ImportError: + iou = False import numpy as np class InstanceEvaluator(Evaluator): - criteria = ["voi_merge", "voi_split", "voi"] + criteria = ["voi_merge", "voi_split", "voi", "avg_iou"] def evaluate(self, output_array_identifier, evaluation_array): output_array = ZarrArray.open_from_array_identifier(output_array_identifier) evaluation_data = evaluation_array[evaluation_array.roi].astype(np.uint64) output_data = output_array[output_array.roi].astype(np.uint64) results = rand_voi(evaluation_data, output_data) + if iou: + try: + output_data, _ = relabel(output_data) + results.update( + detection_scores( + evaluation_data, + output_data, + matching_score="iou", + ) + ) + except Exception: + results["avg_iou"] = 0 + logger.warning( + "Could not compute IoU because of an unknown error. Sorry about that." + ) + else: + results["avg_iou"] = 0 return InstanceEvaluationScores( - voi_merge=results["voi_merge"], voi_split=results["voi_split"] + voi_merge=results["voi_merge"], + voi_split=results["voi_split"], + avg_iou=results["avg_iou"], ) @property diff --git a/dacapo/experiments/validation_scores.py b/dacapo/experiments/validation_scores.py index 8fba05687..17727cc22 100644 --- a/dacapo/experiments/validation_scores.py +++ b/dacapo/experiments/validation_scores.py @@ -113,7 +113,7 @@ def get_best( best value in two seperate arrays. """ if "criteria" in data.coords.keys(): - if len(data.coords["criteria"].shape) == 1: + if len(data.coords["criteria"].shape) > 1: criteria_bests: List[Tuple[xr.DataArray, xr.DataArray]] = [] for criterion in data.coords["criteria"].values: if self.evaluation_scores.higher_is_better(criterion.item()): @@ -142,7 +142,10 @@ def get_best( return (da_best_indexes, da_best_scores) else: if self.evaluation_scores.higher_is_better( - data.coords["criteria"].item() + list(data.coords["criteria"].values)[ + 0 + ] # TODO: what is the intended behavior here? (hot fix in place) + # data.coords["criteria"].item() ): return ( data.idxmax(dim, skipna=True, fill_value=None), diff --git a/dacapo/validate.py b/dacapo/validate.py index a1cf9da7d..39b231b8e 100644 --- a/dacapo/validate.py +++ b/dacapo/validate.py @@ -12,6 +12,7 @@ import torch from pathlib import Path +from reloading import reloading import logging logger = logging.getLogger(__name__) @@ -47,6 +48,7 @@ def validate( return validate_run(run, iteration, compute_context=compute_context) +# @reloading # allows us to fix validation bugs without interrupting training def validate_run( run: Run, iteration: int, compute_context: ComputeContext = LocalTorch() ): @@ -54,85 +56,97 @@ def validate_run( load the weights of that iteration, it is assumed that the model is already loaded correctly. 
Returns the best parameters and scores for this iteration.""" - # set benchmark flag to True for performance - torch.backends.cudnn.benchmark = True - run.model.eval() - - if ( - run.datasplit.validate is None - or len(run.datasplit.validate) == 0 - or run.datasplit.validate[0].gt is None - ): - logger.info("Cannot validate run %s. Continuing training!", run.name) - return None, None - - # get array and weight store - weights_store = create_weights_store() - array_store = create_array_store() - iteration_scores = [] - - # get post processor and evaluator - post_processor = run.task.post_processor - evaluator = run.task.evaluator - - # Initialize the evaluator with the best scores seen so far - evaluator.set_best(run.validation_scores) - - for validation_dataset in run.datasplit.validate: - assert ( - validation_dataset.gt is not None - ), "We do not yet support validating on datasets without ground truth" - logger.info( - "Validating run %s on dataset %s", run.name, validation_dataset.name - ) + try: # we don't want this to hold up training + # set benchmark flag to True for performance + torch.backends.cudnn.benchmark = True + run.model.to(compute_context.device) + run.model.eval() - ( - input_raw_array_identifier, - input_gt_array_identifier, - ) = array_store.validation_input_arrays(run.name, validation_dataset.name) if ( - not Path( - f"{input_raw_array_identifier.container}/{input_raw_array_identifier.dataset}" - ).exists() - or not Path( - f"{input_gt_array_identifier.container}/{input_gt_array_identifier.dataset}" - ).exists() + run.datasplit.validate is None + or len(run.datasplit.validate) == 0 + or run.datasplit.validate[0].gt is None ): - logger.info("Copying validation inputs!") - input_voxel_size = validation_dataset.raw.voxel_size - output_voxel_size = run.model.scale(input_voxel_size) - input_shape = run.model.eval_input_shape - input_size = input_voxel_size * input_shape - output_shape = run.model.compute_output_shape(input_shape)[1] - output_size = output_voxel_size * output_shape - context = (input_size - output_size) / 2 - output_roi = validation_dataset.gt.roi - - input_roi = ( - output_roi.grow(context, context) - .snap_to_grid(validation_dataset.raw.voxel_size, mode="grow") - .intersect(validation_dataset.raw.roi) + logger.info("Cannot validate run %s. 
Continuing training!", run.name) + return None, None + + # get array and weight store + weights_store = create_weights_store() + array_store = create_array_store() + iteration_scores = [] + + # get post processor and evaluator + post_processor = run.task.post_processor + evaluator = run.task.evaluator + + # Initialize the evaluator with the best scores seen so far + evaluator.set_best(run.validation_scores) + + for validation_dataset in run.datasplit.validate: + if validation_dataset.gt is None: + logger.error( + "We do not yet support validating on datasets without ground truth" + ) + raise NotImplementedError + + logger.info( + "Validating run %s on dataset %s", run.name, validation_dataset.name ) - input_raw = ZarrArray.create_from_array_identifier( + + ( input_raw_array_identifier, - validation_dataset.raw.axes, - input_roi, - validation_dataset.raw.num_channels, - validation_dataset.raw.voxel_size, - validation_dataset.raw.dtype, - name=f"{run.name}_validation_raw", - write_size=input_size, - ) - input_raw[input_roi] = validation_dataset.raw[input_roi] - input_gt = ZarrArray.create_from_array_identifier( input_gt_array_identifier, - validation_dataset.gt.axes, - output_roi, - validation_dataset.gt.num_channels, - validation_dataset.gt.voxel_size, - validation_dataset.gt.dtype, - name=f"{run.name}_validation_gt", - write_size=output_size, + ) = array_store.validation_input_arrays(run.name, validation_dataset.name) + if ( + not Path( + f"{input_raw_array_identifier.container}/{input_raw_array_identifier.dataset}" + ).exists() + or not Path( + f"{input_gt_array_identifier.container}/{input_gt_array_identifier.dataset}" + ).exists() + ): + logger.info("Copying validation inputs!") + input_voxel_size = validation_dataset.raw.voxel_size + output_voxel_size = run.model.scale(input_voxel_size) + input_shape = run.model.eval_input_shape + input_size = input_voxel_size * input_shape + output_shape = run.model.compute_output_shape(input_shape)[1] + output_size = output_voxel_size * output_shape + context = (input_size - output_size) / 2 + output_roi = validation_dataset.gt.roi + + input_roi = ( + output_roi.grow(context, context) + .snap_to_grid(validation_dataset.raw.voxel_size, mode="grow") + .intersect(validation_dataset.raw.roi) + ) + input_raw = ZarrArray.create_from_array_identifier( + input_raw_array_identifier, + validation_dataset.raw.axes, + input_roi, + validation_dataset.raw.num_channels, + validation_dataset.raw.voxel_size, + validation_dataset.raw.dtype, + name=f"{run.name}_validation_raw", + write_size=input_size, + ) + input_raw[input_roi] = validation_dataset.raw[input_roi] + input_gt = ZarrArray.create_from_array_identifier( + input_gt_array_identifier, + validation_dataset.gt.axes, + output_roi, + validation_dataset.gt.num_channels, + validation_dataset.gt.voxel_size, + validation_dataset.gt.dtype, + name=f"{run.name}_validation_gt", + write_size=output_size, + ) + input_gt[output_roi] = validation_dataset.gt[output_roi] + else: + logger.info("validation inputs already copied!") + + prediction_array_identifier = array_store.validation_prediction_array( + run.name, iteration, validation_dataset ) input_gt[output_roi] = validation_dataset.gt[output_roi] else: @@ -160,58 +174,126 @@ def validate_run( run.name, iteration, parameters, validation_dataset ) - post_processed_array = post_processor.process( - parameters, output_array_identifier - ) + post_processor.set_prediction(prediction_array_identifier) - scores = evaluator.evaluate(output_array_identifier, validation_dataset.gt) 
+ dataset_iteration_scores = [] + # set up dict for overall best scores + overall_best_scores = {} for criterion in run.validation_scores.criteria: - # replace predictions in array with the new better predictions - if evaluator.is_best( - validation_dataset, - parameters, - criterion, - scores, - ): - best_array_identifier = array_store.best_validation_array( - run.name, criterion, index=validation_dataset.name - ) - best_array = ZarrArray.create_from_array_identifier( - best_array_identifier, - post_processed_array.axes, - post_processed_array.roi, - post_processed_array.num_channels, - post_processed_array.voxel_size, - post_processed_array.dtype, + overall_best_scores[criterion] = evaluator.get_overall_best( + validation_dataset, criterion + ) + + any_overall_best = False + output_array_identifiers = [] + for parameters in post_processor.enumerate_parameters(): + output_array_identifier = array_store.validation_output_array( + run.name, iteration, parameters, validation_dataset + ) + output_array_identifiers.append(output_array_identifier) + post_processed_array = post_processor.process( + parameters, output_array_identifier + ) + + try: + scores = evaluator.evaluate( + output_array_identifier, validation_dataset.gt ) - best_array[best_array.roi] = post_processed_array[ - post_processed_array.roi - ] - best_array.add_metadata( - { - "iteration": iteration, - criterion: getattr(scores, criterion), - "parameters_id": parameters.id, - } - ) - weights_store.store_best( - run, iteration, validation_dataset.name, criterion + for criterion in run.validation_scores.criteria: + # replace predictions in array with the new better predictions + if evaluator.is_best( + validation_dataset, + parameters, + criterion, + scores, + ): + # then this is the current best score for this parameter, but not necessarily the overall best + higher_is_better = scores.higher_is_better(criterion) + # initial_best_score = overall_best_scores[criterion] + current_score = getattr(scores, criterion) + if not overall_best_scores[ + criterion + ] or ( # TODO: should be in evaluator + ( + higher_is_better + and current_score > overall_best_scores[criterion] + ) + or ( + not higher_is_better + and current_score < overall_best_scores[criterion] + ) + ): + any_overall_best = True + overall_best_scores[criterion] = current_score + + # For example, if parameter 2 did better this round than it did in other rounds, but it was still worse than parameter 1 + # the code would have overwritten it below since all parameters write to the same file. Now each parameter will be its own file + # Either we do that, or we only write out the overall best, regardless of parameters + best_array_identifier = ( + array_store.best_validation_array( + run.name, + criterion, + index=validation_dataset.name, + ) + ) + best_array = ZarrArray.create_from_array_identifier( + best_array_identifier, + post_processed_array.axes, + post_processed_array.roi, + post_processed_array.num_channels, + post_processed_array.voxel_size, + post_processed_array.dtype, + ) + best_array[best_array.roi] = post_processed_array[ + post_processed_array.roi + ] + best_array.add_metadata( + { + "iteration": iteration, + criterion: getattr(scores, criterion), + "parameters_id": parameters.id, + } + ) + weights_store.store_best( + run, iteration, validation_dataset.name, criterion + ) + except: + logger.error( + f"Could not evaluate run {run.name} on dataset {validation_dataset.name} with parameters {parameters}.", + exc_info=True, ) - # delete current output. 
We only keep the best outputs as determined by
-        # the evaluator
-        array_store.remove(output_array_identifier)
+                dataset_iteration_scores.append(
+                    [getattr(scores, criterion) for criterion in scores.criteria]
+                )
 
-        dataset_iteration_scores.append(
-            [getattr(scores, criterion) for criterion in scores.criteria]
-        )
+            if not any_overall_best:
+                # We only keep the best outputs as determined by the evaluator
+                array_store.remove(prediction_array_identifier)
+                for output_array_identifier in output_array_identifiers:
+                    array_store.remove(output_array_identifier)
 
-        iteration_scores.append(dataset_iteration_scores)
-        array_store.remove(prediction_array_identifier)
+            iteration_scores.append(dataset_iteration_scores)
 
-    run.validation_scores.add_iteration_scores(
-        ValidationIterationScores(iteration, iteration_scores)
-    )
-    stats_store = create_stats_store()
-    stats_store.store_validation_iteration_scores(run.name, run.validation_scores)
+        run.validation_scores.add_iteration_scores(
+            ValidationIterationScores(iteration, iteration_scores)
+        )
+        stats_store = create_stats_store()
+        stats_store.store_validation_iteration_scores(run.name, run.validation_scores)
+    except Exception as e:
+        logger.error(
+            f"Validation failed for run {run.name} at iteration {iteration}.",
+            exc_info=e,
+        )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("run_name", type=str)
+    parser.add_argument("iteration", type=int)
+    args = parser.parse_args()
+
+    validate(args.run_name, args.iteration)
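
For reference, a minimal self-contained sketch of the optional-dependency guard that instance_evaluator.py now uses for the IoU metric. The helper name `add_iou_score` and the flag name `can_compute_iou` are illustrative (the patch uses a module-level `iou` flag); like the patch, it assumes `detection_scores(..., matching_score="iou")` returns an `avg_iou` entry.

```python
import logging

logger = logging.getLogger(__name__)

try:
    # `relabel` is only needed for the optional IoU metric.
    from funlib.segment.arrays import relabel

    can_compute_iou = True
except ImportError:
    can_compute_iou = False


def add_iou_score(results: dict, evaluation_data, output_data) -> dict:
    """Attach an `avg_iou` entry to `results`, defaulting to 0 when IoU is unavailable."""
    if not can_compute_iou:
        results["avg_iou"] = 0
        return results
    try:
        from funlib.evaluate import detection_scores

        relabeled, _ = relabel(output_data)
        results.update(
            detection_scores(evaluation_data, relabeled, matching_score="iou")
        )
    except Exception:
        # Any failure in the optional metric should not break validation.
        results["avg_iou"] = 0
        logger.warning("Could not compute IoU; reporting 0.", exc_info=True)
    return results
```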
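
The single-criterion branch of `get_best()` in validation_scores.py reduces to a pair of xarray calls. A tiny standalone example with made-up scores (criterion `avg_iou`, where higher is better) shows what `idxmax`/`max` return along the `iterations` dimension:

```python
import numpy as np
import xarray as xr

# Made-up validation scores: one criterion, three iterations.
scores = xr.DataArray(
    np.array([[0.7, 0.9, 0.8]]),
    coords={"criteria": ["avg_iou"], "iterations": [1000, 2000, 3000]},
    dims=["criteria", "iterations"],
)

best_iteration = scores.idxmax("iterations", skipna=True)  # coordinate of the best score -> 2000
best_score = scores.max("iterations", skipna=True)         # the best score itself -> 0.9
print(int(best_iteration), float(best_score))
```

For a lower-is-better criterion such as `voi`, the `higher_is_better` check selects the `idxmin`/`min` pair in the same way.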
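
The TODO next to the overall-best check in validate.py suggests moving that comparison into the evaluator. One possible shape for it, sketched with an illustrative name (`is_new_overall_best`); it uses an explicit `None` check so a legitimate best score of 0 is not treated as "no previous best", which the patch's truthiness test (`not overall_best_scores[criterion]`) would do.

```python
from typing import Optional


def is_new_overall_best(
    current_score: float, previous_best: Optional[float], higher_is_better: bool
) -> bool:
    """True if current_score improves on previous_best for this criterion."""
    if previous_best is None:
        # Nothing recorded yet for this criterion.
        return True
    if higher_is_better:
        return current_score > previous_best
    return current_score < previous_best


# e.g. for a lower-is-better criterion such as voi:
assert is_new_overall_best(0.2, 0.5, higher_is_better=False)
```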
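
With the new `__main__` block, validation can also be launched directly from the command line, e.g. `python dacapo/validate.py my_run 50000` (run name and iteration are placeholders), which calls `validate(run_name, iteration)` with its default compute context.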