From 29821eeabdb06e4ea5d6bf931017c59177910fc2 Mon Sep 17 00:00:00 2001
From: Philipp Otto <philippotto@users.noreply.github.com>
Date: Fri, 18 Mar 2022 10:47:25 +0100
Subject: [PATCH] Fix downsampling/compression of segmentation layers + misc
 (#657)

* add test for cubing/downsampling/compressing (end-to-end) for a color and segmentation layer

* fix downsampling and compressing segmentation layers (cast largest_segment_id to int if necessary and safe; also pass when creating copy of layer)

* improve logging (e.g., point out deprecated caller location) and remove some deprecation warnings

* upgrade zarr in wkcuber to be in sync with webknossos package

* add/clean up ./lint.sh, ./typecheck.sh and ./format.sh for wkcuber

* remove isort from wkcuber for now

* format

* fix typing

* update changelog

Co-authored-by: Norman Rzepka <code@normanrz.com>
---
 .github/workflows/ci.yml                  |  6 +--
 webknossos/Changelog.md                   |  2 +-
 webknossos/webknossos/dataset/layer.py    | 12 +++++
 webknossos/webknossos/dataset/mag_view.py |  1 +
 webknossos/webknossos/utils.py            |  4 +-
 wkcuber/format.sh                         |  8 +++
 wkcuber/lint.sh                           |  4 ++
 wkcuber/poetry.lock                       |  8 +--
 wkcuber/pyproject.toml                    |  2 +-
 wkcuber/tests/test_main.py                | 65 +++++++++++++++++++++++
 wkcuber/tests/test_raw_conversion.py      |  2 +-
 wkcuber/typecheck.sh                      |  4 +-
 wkcuber/wkcuber/converter.py              |  2 +-
 wkcuber/wkcuber/cubing.py                 |  4 +-
 14 files changed, 108 insertions(+), 16 deletions(-)
 create mode 100755 wkcuber/format.sh
 create mode 100755 wkcuber/lint.sh
 create mode 100644 wkcuber/tests/test_main.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 12a558aa3..a4162a191 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -194,15 +194,15 @@ jobs:
       run: tar -xzvf testdata/WT1_wkw.tar.gz
 
     - name: Check formatting
-      run: poetry run black --check .
+      run: ./format.sh check
       if: ${{ needs.changes.outputs.wkcuber == 'true' }}
 
     - name: Lint code
-      run: poetry run pylint -j4 wkcuber
+      run: ./lint.sh
       if: ${{ needs.changes.outputs.wkcuber == 'true' }}
 
     - name: Check typing
-      run: poetry run ./typecheck.sh
+      run: ./typecheck.sh
 
     - name: Python tests
       run: poetry run pytest tests
diff --git a/webknossos/Changelog.md b/webknossos/Changelog.md
index 3097ab975..c85766235 100644
--- a/webknossos/Changelog.md
+++ b/webknossos/Changelog.md
@@ -32,7 +32,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
 - Dataset: `block_len` and `file_len` attributes are now deprecated, but still available for backwards compatibility. Use `chunk_size` and `chunks_per_shard` instead. These new attributes are `Vec3Int`, so they can be set non-uniformly. However, WKW-backed layers still require uniform `chunk_size` and `chunks_per_shard`. [#627](https://github.com/scalableminds/webknossos-libs/pull/627)
 
 ### Fixed
-
+- Fixed crash during downsampling and compression of segmentation layers. [#657](https://github.com/scalableminds/webknossos-libs/pull/657)
 
 ## [0.9.11](https://github.com/scalableminds/webknossos-libs/releases/tag/v0.9.11) - 2022-03-16
 [Commits](https://github.com/scalableminds/webknossos-libs/compare/v0.9.10...v0.9.11)
diff --git a/webknossos/webknossos/dataset/layer.py b/webknossos/webknossos/dataset/layer.py
index 5f29d3445..035d3446c 100644
--- a/webknossos/webknossos/dataset/layer.py
+++ b/webknossos/webknossos/dataset/layer.py
@@ -998,6 +998,10 @@ def dtype_per_layer(self) -> str:
             self.dtype_per_channel, self.num_channels
         )
 
+    def _get_largest_segment_id_maybe(self) -> Optional[int]:
+        """Generic layers have no largest segment id; subclasses may override."""
+        return None
+
 
 class SegmentationLayer(Layer):
 
@@ -1009,9 +1013,25 @@ def largest_segment_id(self) -> int:
 
     @largest_segment_id.setter
     def largest_segment_id(self, largest_segment_id: int) -> None:
+        # Callers may pass integer-like values that are not exactly `int`
+        # (presumably e.g. numpy scalars); normalize them to a plain `int`
+        # before storing, but never silently truncate a fractional value.
+        if type(largest_segment_id) is not int:
+            # Raise instead of `assert`: assertions are stripped under
+            # `python -O`, which would let the lossy cast below go through.
+            if largest_segment_id != int(largest_segment_id):
+                raise ValueError(
+                    f"A non-integer value was passed for largest_segment_id ({largest_segment_id})."
+                )
+            largest_segment_id = int(largest_segment_id)
+
         self._properties.largest_segment_id = largest_segment_id
         self.dataset._export_as_json()
 
     @property
     def category(self) -> LayerCategoryType:
         return SEGMENTATION_CATEGORY
+
+    def _get_largest_segment_id_maybe(self) -> Optional[int]:
+        """Return this layer's largest segment id."""
+        return self.largest_segment_id
diff --git a/webknossos/webknossos/dataset/mag_view.py b/webknossos/webknossos/dataset/mag_view.py
index 0f12dd7b8..a54e12a9b 100644
--- a/webknossos/webknossos/dataset/mag_view.py
+++ b/webknossos/webknossos/dataset/mag_view.py
@@ -289,6 +289,7 @@ def compress(
             dtype_per_channel=self.layer.dtype_per_channel,
             num_channels=self.layer.num_channels,
             data_format=self.layer.data_format,
+            largest_segment_id=self.layer._get_largest_segment_id_maybe(),
         ).get_or_add_mag(
             mag=self.mag,
             chunk_size=self.info.chunk_size,
diff --git a/webknossos/webknossos/utils.py b/webknossos/webknossos/utils.py
index 1b9c4e948..7d6d0bfff 100644
--- a/webknossos/webknossos/utils.py
+++ b/webknossos/webknossos/utils.py
@@ -9,6 +9,7 @@
 from concurrent.futures import as_completed
 from concurrent.futures._base import Future
 from datetime import datetime
+from inspect import getframeinfo, stack
 from multiprocessing import cpu_count
 from os.path import relpath
 from pathlib import Path
@@ -199,7 +200,13 @@ def get_rich_progress() -> Progress:
 
 
 def warn_deprecated(deprecated_item: str, alternative_item: str) -> None:
+    """Emit a DeprecationWarning for `deprecated_item` that names its call site."""
+    # Frame 0 is this helper and frame 1 is assumed to be the deprecated API,
+    # so frame 2 is its caller. `context=0` skips loading source lines.
+    frames = stack(context=0)
+    # Clamp the index so a shallower-than-expected stack cannot raise IndexError.
+    caller = getframeinfo(frames[min(2, len(frames) - 1)][0], context=0)
     warnings.warn(
-        f"[DEPRECATION] `{deprecated_item}` is deprecated, please use `{alternative_item}` instead.",
+        f"[DEPRECATION] `{deprecated_item}` is deprecated, please use `{alternative_item}` instead (see {caller.filename}:{caller.lineno})",
         DeprecationWarning,
     )
diff --git a/wkcuber/format.sh b/wkcuber/format.sh
new file mode 100755
index 000000000..91b356252
--- /dev/null
+++ b/wkcuber/format.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -eEuo pipefail
+
+# Usage: ./format.sh [check]
+# Without arguments, black rewrites the files in place; with the single
+# argument "check" it only reports files that would be reformatted.
+black_args=(.)
+if [ $# -eq 1 ] && [ "$1" = "check" ]; then
+    black_args=(--check .)
+fi
+
+poetry run black "${black_args[@]}"
diff --git a/wkcuber/lint.sh b/wkcuber/lint.sh
new file mode 100755
index 000000000..89548a2d8
--- /dev/null
+++ b/wkcuber/lint.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -eEuo pipefail
+
+# Lint the wkcuber package with pylint, using four parallel jobs.
+poetry run pylint -j4 wkcuber
diff --git a/wkcuber/poetry.lock b/wkcuber/poetry.lock
index 5ee878c25..a44fbc260 100644
--- a/wkcuber/poetry.lock
+++ b/wkcuber/poetry.lock
@@ -1121,7 +1121,7 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
 
 [[package]]
 name = "zarr"
-version = "2.10.3"
+version = "2.11.1"
 description = "An implementation of chunked, compressed, N-dimensional arrays for Python."
 category = "main"
 optional = false
@@ -1151,7 +1151,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7,>=3.7.1"
-content-hash = "108f4e0afeae30d19508bbc9dc0a7486ce5db4126cb2ebe28fbfcca9b2bdd80f"
+content-hash = "c305b92a4b584885b5a455422cc48b34b66b8dd6dde157584d589b6430a25adb"
 
 [metadata.files]
 anyio = [
@@ -2149,8 +2149,8 @@ wrapt = [
     {file = "wrapt-1.13.3.tar.gz", hash = "sha256:1fea9cd438686e6682271d36f3481a9f3636195578bab9ca3382e2f5f01fc185"},
 ]
 zarr = [
-    {file = "zarr-2.10.3-py3-none-any.whl", hash = "sha256:1354d6de15683a3f7ea9c47e7bfa5772da445d25298988bacc8e499db8896186"},
-    {file = "zarr-2.10.3.tar.gz", hash = "sha256:76932665c2146ebdf15f6dba254f9e0030552fbfcf9322dea822bff96fbce693"},
+    {file = "zarr-2.11.1-py3-none-any.whl", hash = "sha256:126cf3fe6d0276f64a1590eb6e18edf5e7c903cc4a879829b3ebdc85238b7894"},
+    {file = "zarr-2.11.1.tar.gz", hash = "sha256:11b628f42dec36e0147879e8bd471524b59b238094b9b21e3c35be78399c115e"},
 ]
 zipp = [
     {file = "zipp-3.7.0-py3-none-any.whl", hash = "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"},
diff --git a/wkcuber/pyproject.toml b/wkcuber/pyproject.toml
index 5c9aa209b..277e5ea80 100644
--- a/wkcuber/pyproject.toml
+++ b/wkcuber/pyproject.toml
@@ -28,7 +28,7 @@ scipy = "^1.6.0"
 tifffile = "^2020.11.26"
 webknossos = { path = "../webknossos/", develop = true }
 wkw = "1.1.11"
-zarr = "^2.10.3"
+zarr = "^2.11.0"
 
 [tool.poetry.dev-dependencies]
 black = "^20.8b1"
diff --git a/wkcuber/tests/test_main.py b/wkcuber/tests/test_main.py
new file mode 100644
index 000000000..df3c6a663
--- /dev/null
+++ b/wkcuber/tests/test_main.py
@@ -0,0 +1,77 @@
+from pathlib import Path
+from shutil import rmtree
+import numpy as np
+import pytest
+from wkcuber.utils import (
+    setup_logging,
+)
+from webknossos import Dataset
+from wkcuber.__main__ import create_parser, cube_with_args
+from tifffile import TiffWriter
+
+TESTOUTPUT_DIR = Path("testoutput")
+
+
+@pytest.mark.parametrize("category", ["color", "segmentation"])
+def test_main(category: str) -> None:
+    """End-to-end check: cube a synthetic TIFF stack through the wkcuber CLI.
+
+    `category` names the input sub-folder and selects which kind of layer is
+    read back from the resulting dataset.
+    """
+    # Every parametrized case gets its own, freshly emptied directory. With
+    # shared paths the second case would see the first case's input and fail
+    # on `output_path.mkdir()` because the output folder already exists.
+    case_dir = TESTOUTPUT_DIR / f"test_main_{category}"
+    rmtree(case_dir, ignore_errors=True)
+
+    input_folder = case_dir / "raw_dataset" / category
+    input_folder.mkdir(parents=True)
+
+    raw_file = input_folder / "input.tif"
+
+    input_dtype = "uint32"
+    shape = 64, 128, 256
+    data = np.arange(np.prod(shape), dtype=input_dtype).reshape(shape)
+    with TiffWriter(raw_file) as tif:
+        tif.write(data.transpose([2, 1, 0]))
+
+    output_path = case_dir / "output_2"
+    output_path.mkdir()
+
+    args_list = [
+        str(case_dir / "raw_dataset"),
+        str(output_path),
+        "--jobs",
+        "1",
+        "--scale",
+        "11,11,11",
+        "--max_mag",
+        "4",
+    ]
+
+    args = create_parser().parse_args(args_list)
+    cube_with_args(args)
+
+    dataset = Dataset.open(output_path)
+    if category == "color":
+        layer = dataset.get_color_layers()[0]
+    else:
+        layer = dataset.get_segmentation_layers()[0]
+    mag_view = layer.get_mag(1)
+    view = mag_view.get_view()
+    read_data = view.read()
+
+    assert view.size == shape
+    assert view.get_dtype() == data.dtype
+    assert np.array_equal(
+        read_data[0],
+        data,
+    )
+
+
+if __name__ == "__main__":
+    from argparse import Namespace
+
+    setup_logging(Namespace(verbose=False))
+    test_main("color")
diff --git a/wkcuber/tests/test_raw_conversion.py b/wkcuber/tests/test_raw_conversion.py
index e63aa2040..a32b001a1 100644
--- a/wkcuber/tests/test_raw_conversion.py
+++ b/wkcuber/tests/test_raw_conversion.py
@@ -43,7 +43,7 @@ def test_main(order: str, flip_axes: Optional[Tuple[int, int]]) -> None:
     main(args)
 
     dataset = Dataset.open(output_path)
-    layer = dataset.get_color_layer()
+    layer = dataset.get_color_layers()[0]
     mag_view = layer.get_mag(1)
     view = mag_view.get_view()
     read_data = view.read()
diff --git a/wkcuber/typecheck.sh b/wkcuber/typecheck.sh
index 1f8f7876f..c17eb79a6 100755
--- a/wkcuber/typecheck.sh
+++ b/wkcuber/typecheck.sh
@@ -2,7 +2,7 @@
 set -eEuo pipefail
 
 echo "Typecheck wkcuber module..."
-python -m mypy -p wkcuber --disallow-untyped-defs --show-error-codes --strict-equality --namespace-packages --no-implicit-optional
+poetry run python -m mypy -p wkcuber --disallow-untyped-defs --show-error-codes --strict-equality --namespace-packages --no-implicit-optional
 
 echo "Typecheck tests..."
-python -m mypy -p tests --disallow-untyped-defs --show-error-codes --strict-equality --namespace-packages --no-implicit-optional
+poetry run python -m mypy -p tests --disallow-untyped-defs --show-error-codes --strict-equality --namespace-packages --no-implicit-optional
diff --git a/wkcuber/wkcuber/converter.py b/wkcuber/wkcuber/converter.py
index 1d94ca5d8..5f2aee1df 100644
--- a/wkcuber/wkcuber/converter.py
+++ b/wkcuber/wkcuber/converter.py
@@ -541,7 +541,7 @@ def main(args: Namespace) -> None:
         exit(1)
     elif len(matching_converters) > 1:
         logger.info(
-            "Multiple converters found. Check if your source path contains multiple datasets."
+            f"Multiple converters found. Check if your source path contains multiple datasets. Converters: {matching_converters}"
         )
         exit(1)
 
diff --git a/wkcuber/wkcuber/cubing.py b/wkcuber/wkcuber/cubing.py
index 56850fe61..8021215c0 100644
--- a/wkcuber/wkcuber/cubing.py
+++ b/wkcuber/wkcuber/cubing.py
@@ -220,7 +220,7 @@ def cubing_job(
                 # Image shape will be (x, y, channel_count, z=1)
                 image = read_image_file(
                     file_name,
-                    target_view.header.voxel_type,
+                    target_view.info.voxel_type,
                     z,
                     channel_index,
                     sample_index,
@@ -378,7 +378,7 @@ def cubing(
     )
 
     target_mag_view = target_layer.get_or_add_mag(
-        target_mag, file_len=wkw_file_len, block_len=BLOCK_LEN
+        target_mag, chunks_per_shard=wkw_file_len, chunk_size=BLOCK_LEN
     )
 
     interpolation_mode = parse_interpolation_mode(