Add check-equality task (#231)
* implement check equality task to compare two datasets
* update readme
* add test for check equality task
* format
* format
* clean up
* remove unused var
* remove unused verify flag
* update test script name in github action
* Merge branch 'master' into check-equality
philippotto authored Aug 10, 2020
1 parent 6870a17 commit 88c0c57
Showing 8 changed files with 196 additions and 16 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
@@ -79,8 +79,8 @@ jobs:
       - name: Test anisotropic downsampling
         run: tests/scripts/anisotropic_downsampling.sh
 
-      - name: Test compression
-        run: tests/scripts/compression.sh
+      - name: Test compression and verification
+        run: tests/scripts/compression_and_verification.sh
 
       - name: Test in-place compression
         run: tests/scripts/in_place_compression.sh
4 changes: 4 additions & 0 deletions README.md
@@ -20,6 +20,7 @@ Created with [Python3](https://www.python.org/).
 * `wkcuber.compress`: Compress WKW cubes for efficient file storage (especially useful for segmentation data)
 * `wkcuber.metadata`: Create (or refresh) metadata (with guessing of most parameters)
 * `wkcuber.recubing`: Read existing WKW cubes in and write them again, specifying the WKW file length. Useful when a dataset was written with, e.g., file length 1.
+* `wkcuber.check_equality`: Compare two WKW datasets to check whether they are equal (e.g., after compressing a dataset, this task can be useful to double-check that the compressed dataset contains the same data).
 * Most modules support multiprocessing
 
 ## Supported input formats
@@ -89,6 +90,9 @@ python -m wkcuber.metadata --refresh data/target
 # Recubing an existing dataset
 python -m wkcuber.recubing --layer_name color --dtype uint8 /data/source/wkw /data/target
+
+# Check two datasets for equality
+python -m wkcuber.check_equality /data/source /data/target
 ```
 
 ### Parallelization
2 changes: 1 addition & 1 deletion test.sh
@@ -12,7 +12,7 @@ tests/scripts/simple_tiff_cubing_no_compression.sh
 tests/scripts/knossos_conversion.sh
 tests/scripts/decompress_reference_mag.sh
 tests/scripts/downsampling.sh
-tests/scripts/compression.sh
+tests/scripts/compression_and_verification.sh
 tests/scripts/in_place_compression.sh
 tests/scripts/meta_generation.sh
 tests/scripts/simple_anisotropic_tiff_cubing.sh
9 changes: 0 additions & 9 deletions tests/scripts/compression.sh

This file was deleted.

30 changes: 30 additions & 0 deletions tests/scripts/compression_and_verification.sh
@@ -0,0 +1,30 @@
set -xe
python -m wkcuber.compress \
  --jobs 2 \
  --layer_name color \
  testoutput/tiff testoutput/tiff_compress
[ -d testoutput/tiff_compress/color/1 ]
[ -d testoutput/tiff_compress/color/2 ]
[ -d testoutput/tiff_compress/color/4 ]
[ -d testoutput/tiff_compress/color/8 ]

echo "Generate metadata"
python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 testoutput/tiff
python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 testoutput/tiff_compress

echo "Check equality for uncompressed and compressed dataset"
python -m wkcuber.check_equality testoutput/tiff testoutput/tiff_compress

echo "Create broken copy of dataset"
rm -rf testoutput/tiff_compress-broken
cp -R testoutput/tiff_compress{,-broken}
rm -r testoutput/tiff_compress-broken/color/1/z0/y0/x0.wkw

echo "Compare original dataset to broken one and expect to determine difference"
if python -m wkcuber.check_equality testoutput/tiff testoutput/tiff_compress-broken ; then
  echo "Equality check did not fail even though the dataset is broken."
  exit 1
else
  echo "Equality check failed as expected for broken dataset."
  exit 0
fi
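
The script only exercises the CLI; the comparison it tests can also be sketched directly against the WKDataset API that the new module (further down) builds on. This is an illustrative snippet, not part of the commit: the paths, layer name, magnification, and box coordinates are placeholders, and the `read(size, offset)` argument order mirrors the call used in `check_equality.py` below.

```python
import numpy as np
from wkcuber.api.Dataset import WKDataset

# Open the same layer and magnification in both datasets.
source_mag = WKDataset("testoutput/tiff").layers["color"].get_mag("1")
target_mag = WKDataset("testoutput/tiff_compress").layers["color"].get_mag("1")

# Read an identical sub-box from each and compare voxel-wise.
size, offset = (256, 256, 256), (0, 0, 0)
assert np.array_equal(
    source_mag.read(size, offset), target_mag.read(size, offset)
), "compressed copy differs from the original"
```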
5 changes: 5 additions & 0 deletions wkcuber/api/Properties/LayerProperties.py
@@ -3,6 +3,7 @@
 from wkw import wkw
 
 from wkcuber.mag import Mag
+from wkcuber.api.bounding_box import BoundingBox
 
 
 def extract_num_channels(num_channels_in_properties, path, layer, mag):
@@ -107,6 +108,10 @@ def _add_resolution(self, resolution):
     def _delete_resolution(self, resolution):
         self._wkw_magnifications.delete(resolution)
 
+    def get_bounding_box(self) -> BoundingBox:
+        return BoundingBox(self.get_bounding_box_offset(), self.get_bounding_box_size())
+
     def get_bounding_box_size(self) -> tuple:
         return (
             self.bounding_box["width"],
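The new `get_bounding_box` helper simply pairs the existing offset and size accessors into a single object: for a hypothetical layer with offset (0, 0, 0) and size (1024, 1024, 512), it would return `BoundingBox((0, 0, 0), (1024, 1024, 512))`. `check_equality` below uses this value as the officially declared extent of each layer.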
149 changes: 149 additions & 0 deletions wkcuber/check_equality.py
@@ -0,0 +1,149 @@
import logging
from argparse import ArgumentParser
from wkcuber.api.Dataset import WKDataset
from wkcuber.api.bounding_box import BoundingBox
import numpy as np

from .utils import (
    add_verbose_flag,
    open_wkw,
    WkwDatasetInfo,
    add_distribution_flags,
    get_executor_for_args,
    wait_and_ensure_success,
    setup_logging,
)
from .metadata import detect_resolutions, detect_bbox, detect_layers
import functools
from .compress import BACKUP_EXT

CHUNK_SIZE = 1024


def named_partial(func, *args, **kwargs):
    # Propagate __name__ and __doc__ attributes to partial function
    partial_func = functools.partial(func, *args, **kwargs)
    functools.update_wrapper(partial_func, func)
    if hasattr(func, "__annotations__"):
        # Generic types cannot be pickled in Python <= 3.6, see https://github.com/python/typing/issues/511
        partial_func.__annotations__ = {}
    return partial_func
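
# Illustrative note (hypothetical example, not from the commit): executors may
# log or pickle tasks by __name__, which a bare functools.partial lacks. For a
# function add(a, b), functools.partial(add, 1) has no __name__ attribute,
# whereas named_partial(add, 1).__name__ == "add".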


def create_parser():
    parser = ArgumentParser()

    parser.add_argument("source_path", help="Path to input WKW dataset")

    parser.add_argument(
        "target_path", help="WKW dataset with which to compare the input dataset."
    )

    parser.add_argument(
        "--layer_name",
        "-l",
        help="Name of the layer to compare (if not provided, all layers are compared)",
        default=None,
    )

    add_verbose_flag(parser)
    add_distribution_flags(parser)

    return parser


def assert_equality_for_chunk(
    source_path: str, target_path: str, layer_name: str, mag, sub_box
):
    wk_dataset = WKDataset(source_path)
    layer = wk_dataset.layers[layer_name]
    backup_wkw_info = WkwDatasetInfo(target_path, layer_name, mag, header=None)
    with open_wkw(backup_wkw_info) as backup_wkw:
        mag_ds = layer.get_mag(mag)
        logging.info(f"Checking sub_box: {sub_box}")

        data = mag_ds.read(sub_box.size, sub_box.topleft)
        backup_data = backup_wkw.read(sub_box.topleft, sub_box.size)
        assert np.all(
            data == backup_data
        ), f"Data differs in bounding box {sub_box} for layer {layer_name} with mag {mag}"


def check_equality(source_path: str, target_path: str, args=None):

    logging.info(f"Comparing {source_path} with {target_path}")

    wk_src_dataset = WKDataset(source_path)
    src_layer_names = wk_src_dataset.layers.keys()
    target_layer_names = [
        layer["name"] for layer in detect_layers(target_path, 0, False)
    ]
    assert set(src_layer_names) == set(
        target_layer_names
    ), f"The provided input datasets have different layers: {src_layer_names} != {target_layer_names}"

    existing_layer_names = src_layer_names

    if args.layer_name is not None:
        assert (
            args.layer_name in existing_layer_names
        ), f"Provided layer {args.layer_name} does not exist in input dataset."
        existing_layer_names = [args.layer_name]

    for layer_name in existing_layer_names:

        logging.info(f"Checking layer_name: {layer_name}")

        source_mags = list(detect_resolutions(source_path, layer_name))
        target_mags = list(detect_resolutions(target_path, layer_name))
        source_mags.sort()
        target_mags.sort()
        mags = source_mags

        assert (
            source_mags == target_mags
        ), f"The mags between {source_path}/{layer_name} and {target_path}/{layer_name} are not equal: {source_mags} != {target_mags}"

        layer_properties = wk_src_dataset.properties.data_layers[layer_name]

        official_bbox = layer_properties.get_bounding_box()

        for mag in mags:
            inferred_src_bbox = BoundingBox.from_auto(
                detect_bbox(source_path, layer_name, mag)
            )
            inferred_target_bbox = BoundingBox.from_auto(
                detect_bbox(target_path, layer_name, mag)
            )

            bbox = inferred_src_bbox.extended_by(inferred_target_bbox).extended_by(
                official_bbox
            )
            logging.info(f"Start verification of {layer_name} in mag {mag} in {bbox}")

            with get_executor_for_args(args) as executor:
                boxes = list(
                    bbox.chunk([CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE], [CHUNK_SIZE])
                )
                assert_fn = named_partial(
                    assert_equality_for_chunk, source_path, target_path, layer_name, mag
                )

                wait_and_ensure_success(executor.map_to_futures(assert_fn, boxes))
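                # Illustrative: with CHUNK_SIZE = 1024, a 2048^3 bounding box is
                # split into eight 1024^3 sub-boxes, each compared in parallel.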

    logging.info(
        f"The following datasets seem to be equal (with regard to the layers: {existing_layer_names}):"
    )
    logging.info(source_path)
    logging.info(target_path)


if __name__ == "__main__":
    args = create_parser().parse_args()
    setup_logging(args)

    if args.target_path is None:
        target_path = args.source_path + BACKUP_EXT
    else:
        target_path = args.target_path
    check_equality(args.source_path, target_path, args)
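
For reference, a minimal sketch (not from the commit) of driving the new task from Python rather than the shell. It reuses `create_parser` so the accepted flags stay in sync with the CLI; the paths are illustrative, and `--jobs` is assumed to come from `add_distribution_flags`, matching its use with `wkcuber.compress` in the test script above.

```python
from wkcuber.check_equality import check_equality, create_parser

# Parse the same arguments the command line would receive.
args = create_parser().parse_args(
    ["testoutput/tiff", "testoutput/tiff_compress", "--jobs", "2"]
)
check_equality(args.source_path, args.target_path, args)
```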
9 changes: 5 additions & 4 deletions wkcuber/compress.py
@@ -16,10 +16,11 @@
     wait_and_ensure_success,
     setup_logging,
 )
-from .metadata import detect_resolutions
-from .metadata import convert_element_class_to_dtype
+from .metadata import detect_resolutions, convert_element_class_to_dtype
 from typing import List
 
+BACKUP_EXT = ".bak"
+
 
 def create_parser():
     parser = ArgumentParser()
@@ -128,11 +129,11 @@ def compress_mags(
         compress_mag(source_path, layer_name, target_path, mag, args)
 
     if with_tmp_dir:
-        makedirs(path.join(source_path + ".bak", layer_name), exist_ok=True)
+        makedirs(path.join(source_path + BACKUP_EXT, layer_name), exist_ok=True)
         for mag in mags:
             shutil.move(
                 path.join(source_path, layer_name, str(mag)),
-                path.join(source_path + ".bak", layer_name, str(mag)),
+                path.join(source_path + BACKUP_EXT, layer_name, str(mag)),
             )
             shutil.move(
                 path.join(target_path, layer_name, str(mag)),
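The extracted `BACKUP_EXT` constant replaces the hard-coded `".bak"` strings and is the same suffix that `check_equality`'s `__main__` block falls back to, so both tasks agree on where `compress_mags` parks the uncompressed originals. A hypothetical illustration (the path is a placeholder):

```python
from wkcuber.compress import BACKUP_EXT

source = "/data/source"
backup = source + BACKUP_EXT  # "/data/source.bak", written by compress_mags
# check_equality can then compare the compressed "/data/source" against it.
```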
