From 88c0c575267855c39bcb8cc22a0e736b6acd711a Mon Sep 17 00:00:00 2001 From: Philipp Otto Date: Mon, 10 Aug 2020 14:22:15 +0200 Subject: [PATCH] Add check-equality task (#231) * implement check equality task to compare two datasets * update readme * add test for check equality task * format * format * clean up * remove unused var * remove unused verify flag * update test script name in github action * Merge branch 'master' into check-equality --- .github/workflows/main.yml | 4 +- README.md | 4 + test.sh | 2 +- tests/scripts/compression.sh | 9 -- tests/scripts/compression_and_verification.sh | 30 ++++ wkcuber/api/Properties/LayerProperties.py | 5 + wkcuber/check_equality.py | 149 ++++++++++++++++++ wkcuber/compress.py | 9 +- 8 files changed, 196 insertions(+), 16 deletions(-) delete mode 100755 tests/scripts/compression.sh create mode 100755 tests/scripts/compression_and_verification.sh create mode 100644 wkcuber/check_equality.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e6ed9f8c3..50de01b11 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -79,8 +79,8 @@ jobs: - name: Test anisotropic downsampling run: tests/scripts/anisotropic_downsampling.sh - - name: Test compression - run: tests/scripts/compression.sh + - name: Test compression and verification + run: tests/scripts/compression_and_verification.sh - name: Test in-place compression run: tests/scripts/in_place_compression.sh diff --git a/README.md b/README.md index 3b5365204..d05b42520 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Created with [Python3](https://www.python.org/). * `wkcuber.compress`: Compress WKW cubes for efficient file storage (especially useful for segmentation data) * `wkcuber.metadata`: Create (or refresh) metadata (with guessing of most parameters) * `wkcuber.recubing`: Read existing WKW cubes in and write them again specifying the WKW file length. Useful when dataset was written e.g. with file length 1. +* `wkcuber.check_equality`: Compare two WKW datasets to check whether they are equal (e.g., after compressing a dataset, this task can be useful to double-check that the compressed dataset contains the same data). * Most modules support multiprocessing ## Supported input formats @@ -89,6 +90,9 @@ python -m wkcuber.metadata --refresh data/target # Recubing an existing dataset python -m wkcuber.recubing --layer_name color --dtype uint8 /data/source/wkw /data/target + +# Check two datasets for equality +python -m wkcuber.check_equality /data/source /data/target ``` ### Parallelization diff --git a/test.sh b/test.sh index e6adc4051..1bf312d9a 100755 --- a/test.sh +++ b/test.sh @@ -12,7 +12,7 @@ tests/scripts/simple_tiff_cubing_no_compression.sh tests/scripts/knossos_conversion.sh tests/scripts/decompress_reference_mag.sh tests/scripts/downsampling.sh -tests/scripts/compression.sh +tests/scripts/compression_and_verification.sh tests/scripts/in_place_compression.sh tests/scripts/meta_generation.sh tests/scripts/simple_anisotropic_tiff_cubing.sh diff --git a/tests/scripts/compression.sh b/tests/scripts/compression.sh deleted file mode 100755 index c832ad0f4..000000000 --- a/tests/scripts/compression.sh +++ /dev/null @@ -1,9 +0,0 @@ -set -xe -python -m wkcuber.compress \ - --jobs 2 \ - --layer_name color \ - testoutput/tiff testoutput/tiff_compress -[ -d testoutput/tiff_compress/color/1 ] -[ -d testoutput/tiff_compress/color/2 ] -[ -d testoutput/tiff_compress/color/4 ] -[ -d testoutput/tiff_compress/color/8 ] \ No newline at end of file diff --git a/tests/scripts/compression_and_verification.sh b/tests/scripts/compression_and_verification.sh new file mode 100755 index 000000000..e5df7fc6a --- /dev/null +++ b/tests/scripts/compression_and_verification.sh @@ -0,0 +1,30 @@ +set -xe +python -m wkcuber.compress \ + --jobs 2 \ + --layer_name color \ + testoutput/tiff testoutput/tiff_compress +[ -d testoutput/tiff_compress/color/1 ] +[ -d testoutput/tiff_compress/color/2 ] +[ -d testoutput/tiff_compress/color/4 ] +[ -d testoutput/tiff_compress/color/8 ] + +echo "Generate metadata" +python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 testoutput/tiff +python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 testoutput/tiff_compress + +echo "Check equality for uncompressed and compressed dataset" +python -m wkcuber.check_equality testoutput/tiff testoutput/tiff_compress + +echo "Create broken copy of dataset" +rm -rf testoutput/tiff_compress-broken +cp -R testoutput/tiff_compress{,-broken} +rm -r testoutput/tiff_compress-broken/color/1/z0/y0/x0.wkw + +echo "Compare original dataset to broken one and expect to determine difference" +if python -m wkcuber.check_equality testoutput/tiff testoutput/tiff_compress-broken ; then + echo "Equality check did not fail even though the dataset is broken." + exit 1 +else + echo "Equality check failed as expected for broken dataset." + exit 0 +fi \ No newline at end of file diff --git a/wkcuber/api/Properties/LayerProperties.py b/wkcuber/api/Properties/LayerProperties.py index 1f0e12859..9e8f6473c 100644 --- a/wkcuber/api/Properties/LayerProperties.py +++ b/wkcuber/api/Properties/LayerProperties.py @@ -3,6 +3,7 @@ from wkw import wkw from wkcuber.mag import Mag +from wkcuber.api.bounding_box import BoundingBox def extract_num_channels(num_channels_in_properties, path, layer, mag): @@ -107,6 +108,10 @@ def _add_resolution(self, resolution): def _delete_resolution(self, resolution): self._wkw_magnifications.delete(resolution) + def get_bounding_box(self) -> BoundingBox: + + return BoundingBox(self.get_bounding_box_offset(), self.get_bounding_box_size()) + def get_bounding_box_size(self) -> tuple: return ( self.bounding_box["width"], diff --git a/wkcuber/check_equality.py b/wkcuber/check_equality.py new file mode 100644 index 000000000..0ec2c6e1e --- /dev/null +++ b/wkcuber/check_equality.py @@ -0,0 +1,149 @@ +import logging +from argparse import ArgumentParser +from wkcuber.api.Dataset import WKDataset +from wkcuber.api.bounding_box import BoundingBox +import numpy as np + +from .utils import ( + add_verbose_flag, + open_wkw, + WkwDatasetInfo, + add_distribution_flags, + get_executor_for_args, + wait_and_ensure_success, + setup_logging, +) +from .metadata import detect_resolutions, detect_bbox, detect_layers +import functools +from .compress import BACKUP_EXT + +CHUNK_SIZE = 1024 + + +def named_partial(func, *args, **kwargs): + # Propagate __name__ and __doc__ attributes to partial function + partial_func = functools.partial(func, *args, **kwargs) + functools.update_wrapper(partial_func, func) + if hasattr(func, "__annotations__"): + # Generic types cannot be pickled in Python <= 3.6, see https://github.com/python/typing/issues/511 + partial_func.__annotations__ = {} + return partial_func + + +def create_parser(): + parser = ArgumentParser() + + parser.add_argument("source_path", help="Path to input WKW dataset") + + parser.add_argument( + "target_path", help="WKW dataset with which to compare the input dataset." + ) + + parser.add_argument( + "--layer_name", + "-l", + help="Name of the layer to compare (if not provided, all layers are compared)", + default=None, + ) + + add_verbose_flag(parser) + add_distribution_flags(parser) + + return parser + + +def assert_equality_for_chunk( + source_path: str, target_path: str, layer_name: str, mag, sub_box +): + wk_dataset = WKDataset(source_path) + layer = wk_dataset.layers[layer_name] + backup_wkw_info = WkwDatasetInfo(target_path, layer_name, mag, header=None) + with open_wkw(backup_wkw_info) as backup_wkw: + mag_ds = layer.get_mag(mag) + logging.info(f"Checking sub_box: {sub_box}") + + data = mag_ds.read(sub_box.size, sub_box.topleft) + backup_data = backup_wkw.read(sub_box.topleft, sub_box.size) + assert np.all( + data == backup_data + ), f"Data differs in bounding box {sub_box} for layer {layer_name} with mag {mag}" + + +def check_equality(source_path: str, target_path: str, args=None): + + logging.info(f"Comparing {source_path} with {target_path}") + + wk_src_dataset = WKDataset(source_path) + src_layer_names = wk_src_dataset.layers.keys() + target_layer_names = [ + layer["name"] for layer in detect_layers(target_path, 0, False) + ] + assert set(src_layer_names) == set( + target_layer_names + ), f"The provided input datasets have different layers: {src_layer_names} != {target_layer_names}" + + existing_layer_names = src_layer_names + + if args.layer_name is not None: + assert ( + args.layer_name in existing_layer_names + ), f"Provided layer {args.layer_name} does not exist in input dataset." + existing_layer_names = [args.layer_name] + + for layer_name in existing_layer_names: + + logging.info(f"Checking layer_name: {layer_name}") + + source_mags = list(detect_resolutions(source_path, layer_name)) + target_mags = list(detect_resolutions(target_path, layer_name)) + source_mags.sort() + target_mags.sort() + mags = source_mags + + assert ( + source_mags == target_mags + ), f"The mags between {source_path}/{layer_name} and {target_path}/{layer_name} are not equal: {source_mags} != {target_mags}" + + layer_properties = wk_src_dataset.properties.data_layers[layer_name] + + official_bbox = layer_properties.get_bounding_box() + + for mag in mags: + inferred_src_bbox = BoundingBox.from_auto( + detect_bbox(source_path, layer_name, mag) + ) + inferred_target_bbox = BoundingBox.from_auto( + detect_bbox(target_path, layer_name, mag) + ) + + bbox = inferred_src_bbox.extended_by(inferred_target_bbox).extended_by( + official_bbox + ) + logging.info(f"Start verification of {layer_name} in mag {mag} in {bbox}") + + with get_executor_for_args(args) as executor: + boxes = list( + bbox.chunk([CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE], [CHUNK_SIZE]) + ) + assert_fn = named_partial( + assert_equality_for_chunk, source_path, target_path, layer_name, mag + ) + + wait_and_ensure_success(executor.map_to_futures(assert_fn, boxes)) + + logging.info( + f"The following datasets seem to be equal (with regard to the layers: {existing_layer_names}):" + ) + logging.info(source_path) + logging.info(target_path) + + +if __name__ == "__main__": + args = create_parser().parse_args() + setup_logging(args) + + if args.target_path is None: + target_path = args.source_path + BACKUP_EXT + else: + target_path = args.target_path + check_equality(args.source_path, target_path, args) diff --git a/wkcuber/compress.py b/wkcuber/compress.py index 34e07a7d1..0c1c48e3b 100644 --- a/wkcuber/compress.py +++ b/wkcuber/compress.py @@ -16,10 +16,11 @@ wait_and_ensure_success, setup_logging, ) -from .metadata import detect_resolutions -from .metadata import convert_element_class_to_dtype +from .metadata import detect_resolutions, convert_element_class_to_dtype from typing import List +BACKUP_EXT = ".bak" + def create_parser(): parser = ArgumentParser() @@ -128,11 +129,11 @@ def compress_mags( compress_mag(source_path, layer_name, target_path, mag, args) if with_tmp_dir: - makedirs(path.join(source_path + ".bak", layer_name), exist_ok=True) + makedirs(path.join(source_path + BACKUP_EXT, layer_name), exist_ok=True) for mag in mags: shutil.move( path.join(source_path, layer_name, str(mag)), - path.join(source_path + ".bak", layer_name, str(mag)), + path.join(source_path + BACKUP_EXT, layer_name, str(mag)), ) shutil.move( path.join(target_path, layer_name, str(mag)),