From 59f208977d6a10c720e16edb5bf283bd6d9e2fcc Mon Sep 17 00:00:00 2001 From: Joel Jaeschke Date: Tue, 30 Jul 2024 23:26:39 +0200 Subject: [PATCH 1/5] Add warning for quantized variables. This lets a user know that the variable in question is stored as a quantized integer, but loaded as a float, which will cause the calculation of the bitinformation to yield bogus results depending on the data. --- tests/test_get_bitinformation.py | 15 +++++++++++++++ xbitinfo/xbitinfo.py | 21 +++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tests/test_get_bitinformation.py b/tests/test_get_bitinformation.py index 68b53804..74543403 100644 --- a/tests/test_get_bitinformation.py +++ b/tests/test_get_bitinformation.py @@ -5,6 +5,7 @@ import numpy as np import pytest import xarray as xr +import warnings from numpy.testing import assert_allclose, assert_equal from xarray.core import formatting from xarray.core.dataarray import DataArray @@ -258,3 +259,17 @@ def test_implementations_agree(ds, dim, axis, request): masked_value=None, ) bitinfo_assert_allclose(bi_python, bi_julia, rtol=1e-4) + + +@pytest.mark.parametrize("implementation", ["python", "julia"]) +@pytest.mark.parametrize("dataset_name", ["air_temperature", "eraint_uvz"]) +def test_warn_on_quantized_variables(dataset_name, implementation): + ds_quantized = xr.tutorial.load_dataset(dataset_name) + ds_raw = xr.tutorial.load_dataset(dataset_name, mask_and_scale=False) + + with pytest.warns(UserWarning): + _ = xb.get_bitinformation(ds_quantized, implementation=implementation) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + _ = xb.get_bitinformation(ds_raw, implementation=implementation) \ No newline at end of file diff --git a/xbitinfo/xbitinfo.py b/xbitinfo/xbitinfo.py index d5d6b543..2f125990 100644 --- a/xbitinfo/xbitinfo.py +++ b/xbitinfo/xbitinfo.py @@ -1,6 +1,7 @@ import json import logging import os +import warnings import numpy as np import xarray as xr @@ -233,6 +234,15 @@ def 
get_bitinformation( # noqa: C901 pbar = tqdm(ds.data_vars) for var in pbar: pbar.set_description(f"Processing var: {var} for dim: {dim}") + + if _quantized_variable_is_scaled(ds, var): + loaded_dtype = ds[var].dtype + quantized_storage_dtype = ds[var].encoding["dtype"] + warnings.warn( + f"Variable {var} is quantized as {quantized_storage_dtype}, but loaded as {loaded_dtype}. Consider reopening using `mask_and_scale=False` to get sensible results", + category=UserWarning + ) + if implementation == "julia": info_per_bit_var = _jl_get_bitinformation(ds, var, axis, dim, kwargs) if info_per_bit_var is None: @@ -260,6 +270,17 @@ def get_bitinformation( # noqa: C901 return info_per_bit +def _quantized_variable_is_scaled(ds: xr.DataArray, var: str) -> bool: + loaded_dtype = ds[var].dtype + quantized_storage_dtype = ds[var].encoding["dtype"] + has_scale_or_offset = any(["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding]) + + if has_scale_or_offset and quantized_storage_dtype != loaded_dtype: + return True + else: + return False + + def _jl_get_bitinformation(ds, var, axis, dim, kwargs={}): X = ds[var].values Main.X = X From 89519607331b30a8e6aa31c817bbfff7511edf39 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 21:32:39 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_get_bitinformation.py | 4 ++-- xbitinfo/xbitinfo.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_get_bitinformation.py b/tests/test_get_bitinformation.py index 74543403..5498e494 100644 --- a/tests/test_get_bitinformation.py +++ b/tests/test_get_bitinformation.py @@ -1,11 +1,11 @@ """Tests for `xbitinfo` package.""" import os +import warnings import numpy as np import pytest import xarray as xr -import warnings from numpy.testing import assert_allclose, assert_equal from 
xarray.core import formatting from xarray.core.dataarray import DataArray @@ -272,4 +272,4 @@ def test_warn_on_quantized_variables(dataset_name, implementation): with warnings.catch_warnings(): warnings.simplefilter("error") - _ = xb.get_bitinformation(ds_raw, implementation=implementation) \ No newline at end of file + _ = xb.get_bitinformation(ds_raw, implementation=implementation) diff --git a/xbitinfo/xbitinfo.py b/xbitinfo/xbitinfo.py index 2f125990..f2518164 100644 --- a/xbitinfo/xbitinfo.py +++ b/xbitinfo/xbitinfo.py @@ -240,7 +240,7 @@ def get_bitinformation( # noqa: C901 quantized_storage_dtype = ds[var].encoding["dtype"] warnings.warn( f"Variable {var} is quantized as {quantized_storage_dtype}, but loaded as {loaded_dtype}. Consider reopening using `mask_and_scale=False` to get sensible results", - category=UserWarning + category=UserWarning, ) if implementation == "julia": @@ -273,7 +273,9 @@ def get_bitinformation( # noqa: C901 def _quantized_variable_is_scaled(ds: xr.DataArray, var: str) -> bool: loaded_dtype = ds[var].dtype quantized_storage_dtype = ds[var].encoding["dtype"] - has_scale_or_offset = any(["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding]) + has_scale_or_offset = any( + ["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding] + ) if has_scale_or_offset and quantized_storage_dtype != loaded_dtype: return True From e0aa4338629b3e7f9c35501ca60cc0842b3af859 Mon Sep 17 00:00:00 2001 From: Joel Jaeschke Date: Wed, 31 Jul 2024 10:52:21 +0200 Subject: [PATCH 3/5] Make testing for stored and loaded dtype less aggressive --- xbitinfo/xbitinfo.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/xbitinfo/xbitinfo.py b/xbitinfo/xbitinfo.py index f2518164..93921e91 100644 --- a/xbitinfo/xbitinfo.py +++ b/xbitinfo/xbitinfo.py @@ -271,17 +271,20 @@ def get_bitinformation( # noqa: C901 def _quantized_variable_is_scaled(ds: xr.DataArray, var: str) -> bool: + has_scale_or_offset = 
any(["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding]) + + if not has_scale_or_offset: + return False + loaded_dtype = ds[var].dtype - quantized_storage_dtype = ds[var].encoding["dtype"] - has_scale_or_offset = any( - ["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding] - ) + storage_dtype = ds[var].encoding.get("dtype", None) + assert storage_dtype is not None, f"Variable {var} is likely quantized, but does not have a storage dtype" - if has_scale_or_offset and quantized_storage_dtype != loaded_dtype: - return True - else: + if loaded_dtype == storage_dtype: return False + return True + def _jl_get_bitinformation(ds, var, axis, dim, kwargs={}): X = ds[var].values From 63ed2a958f61a3a5d18e7806d765125f9741da39 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Jul 2024 08:56:13 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xbitinfo/xbitinfo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xbitinfo/xbitinfo.py b/xbitinfo/xbitinfo.py index 93921e91..c3c3e86b 100644 --- a/xbitinfo/xbitinfo.py +++ b/xbitinfo/xbitinfo.py @@ -271,14 +271,18 @@ def get_bitinformation( # noqa: C901 def _quantized_variable_is_scaled(ds: xr.DataArray, var: str) -> bool: - has_scale_or_offset = any(["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding]) + has_scale_or_offset = any( + ["add_offset" in ds[var].encoding, "scale_factor" in ds[var].encoding] + ) if not has_scale_or_offset: return False loaded_dtype = ds[var].dtype storage_dtype = ds[var].encoding.get("dtype", None) - assert storage_dtype is not None, f"Variable {var} is likely quantized, but does not have a storage dtype" + assert ( + storage_dtype is not None + ), f"Variable {var} is likely quantized, but does not have a storage dtype" if loaded_dtype == storage_dtype: return False From 
b7c57901f083cdeabc4f6cbeb5ca0f92bf934d35 Mon Sep 17 00:00:00 2001 From: Hauke Schulz <43613877+observingClouds@users.noreply.github.com> Date: Thu, 15 Aug 2024 10:05:17 +0200 Subject: [PATCH 5/5] Update CHANGELOG.rst --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e2dd6ab1..db2a2d37 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ CHANGELOG X.X.X (unreleased) ------------------ +* Add warning for quantized variables (:pr:`286`, :issue:`202`) `Joel Jaeschke`_. * Update BitInformation.jl version to v0.6.3 (:pr:`292`) `Hauke Schulz`_ * Improve test/docs environment separation (:pr:`275`, :issue:`267`) `Aryan Bakliwal`_. * Set default masked value to None for integers (:pr:`289`) `Hauke Schulz`_.