Fix dtype conversion on DatapointsArray (#1699)
haakonvt authored Apr 1, 2024
1 parent 0f73ab1 commit 51eb721
Showing 5 changed files with 58 additions and 22 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,11 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

+## [7.32.4] - 2024-03-28
+### Fixed
+- Several methods for `DatapointsArray` that previously failed for string datapoints due to bad handling
+  of numpy `dtype`-to-native conversion.

## [7.32.3] - 2024-03-27
### Removed
- Support for `protobuf==3.*` was dropped.
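
The root cause is easy to demonstrate with numpy alone: numeric arrays yield numpy scalars, which expose `.item()` for native conversion, while string time series are backed by object-dtype arrays holding plain python `str` references, which do not. A minimal sketch of the failure mode (plain numpy behaviour, not SDK code):

    import numpy as np

    # Numeric arrays yield numpy scalars; .item() converts them to native python types:
    num = np.array([1.5, 2.5])[0]
    assert type(num) is np.float64 and type(num.item()) is float

    # Object-dtype (string) arrays yield plain python str, which has no .item(),
    # so calling .item() unconditionally raises AttributeError -- the bug fixed here:
    s = np.array(["a", "b"], dtype=np.object_)[0]
    assert type(s) is str and not hasattr(s, "item")
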
2 changes: 1 addition & 1 deletion cognite/client/_version.py
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "7.32.3"
__version__ = "7.32.4"
__api_subversion__ = "20230101"
49 changes: 29 additions & 20 deletions cognite/client/data_classes/datapoints.py
@@ -1,16 +1,13 @@
from __future__ import annotations

-import operator as op
import typing
import warnings
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
-from functools import cached_property
from typing import (
    TYPE_CHECKING,
    Any,
-    Callable,
    Collection,
    Iterator,
    Literal,
@@ -20,7 +17,7 @@

from cognite.client.data_classes._base import CogniteResource, CogniteResourceList
from cognite.client.utils import _json
-from cognite.client.utils._auxiliary import find_duplicates, no_op
+from cognite.client.utils._auxiliary import find_duplicates
from cognite.client.utils._identifier import Identifier
from cognite.client.utils._importing import local_import
from cognite.client.utils._pandas_helpers import (
@@ -71,6 +68,17 @@
NumpyObjArray = npt.NDArray[np.object_]


+def numpy_dtype_fix(element: np.float64 | str) -> float | str:
+    try:
+        # Using .item() on numpy scalars gives us vanilla python types:
+        return element.item()  # type: ignore [union-attr]
+    except AttributeError:
+        # No conversion needed: string arrays hold references to vanilla python objects:
+        if isinstance(element, str):
+            return element
+        raise
+
+
@dataclass(frozen=True)
class LatestDatapointQuery:
"""Parameters describing a query for the latest datapoint from a time series.
@@ -292,8 +300,8 @@ def __getitem__(self, item: int | slice) -> Datapoint | DatapointsArray:
            return self._slice(item)
        attrs, arrays = self._data_fields()
        return Datapoint(
-            timestamp=self._dtype_fix(arrays[0][item]) // 1_000_000,
-            **{attr: self._dtype_fix(arr[item]) for attr, arr in zip(attrs[1:], arrays[1:])},
+            timestamp=arrays[0][item].item() // 1_000_000,
+            **{attr: numpy_dtype_fix(arr[item]) for attr, arr in zip(attrs[1:], arrays[1:])},  # type: ignore [arg-type]
        )

    def _slice(self, part: slice) -> DatapointsArray:
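
In practice, integer indexing on a string `DatapointsArray` no longer raises. A small usage sketch, assuming `dps` is a `DatapointsArray` for a string time series (illustrative names):

    dp = dps[0]                        # __getitem__ with an int builds a single Datapoint
    assert type(dp.timestamp) is int   # epoch milliseconds, converted via .item()
    assert type(dp.value) is str       # plain str, passed through by numpy_dtype_fix
    sub = dps[0:2]                     # a slice still returns a DatapointsArray
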
@@ -307,25 +315,26 @@ def __iter__(self) -> Iterator[Datapoint]:
        For efficient storage, datapoints are not stored as a sequence of (singular) Datapoint
        objects, so these are created on demand while iterating (slow).

-        Returns:
-            Iterator[Datapoint]: No description."""
-        # Let's not create a single Datapoint more than we have to:
+        Yields:
+            Datapoint: No description.
+        """
        warnings.warn(
            "Iterating through a DatapointsArray is very inefficient. Tip: Access the arrays directly and use "
            "vectorised numpy ops on those. E.g. `dps.average` for the 'average' aggregate, `dps.value` for the "
            "raw datapoints or `dps.timestamp` for the timestamps. You may also convert to a pandas DataFrame using "
            "`dps.to_pandas()`.",
            UserWarning,
        )
        attrs, arrays = self._data_fields()
-        return (
+        # Let's not create a single Datapoint more than we have to:
+        yield from (
            Datapoint(
-                timestamp=self._dtype_fix(row[0]) // 1_000_000, **dict(zip(attrs[1:], map(self._dtype_fix, row[1:])))
+                timestamp=row[0].item() // 1_000_000,
+                **dict(zip(attrs[1:], map(numpy_dtype_fix, row[1:]))),  # type: ignore [arg-type]
            )
            for row in zip(*arrays)
        )

-    @cached_property
-    def _dtype_fix(self) -> Callable:
-        if self.is_string:
-            # Return no-op as array contains just references to vanilla python objects:
-            return no_op
-        # Using .item() on numpy scalars gives us vanilla python types:
-        return op.methodcaller("item")

    def _data_fields(self) -> tuple[list[str], list[npt.NDArray]]:
        data_field_tuples = [
            (attr, arr)
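
The warning above points to the efficient alternative: operate on the underlying arrays directly. A sketch of that pattern, assuming `client` is a configured `CogniteClient` and `ts_id` identifies a numeric time series (both hypothetical):

    dps = client.time_series.data.retrieve_arrays(id=ts_id, limit=1000)

    timestamps = dps.timestamp    # numpy array of timestamps
    values = dps.value            # numpy array of raw values
    mean_value = values.mean()    # one vectorised numpy op instead of a python loop
    df = dps.to_pandas()          # or convert everything to a pandas DataFrame
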
@@ -356,7 +365,7 @@ def dump(self, camel_case: bool = True, convert_timestamps: bool = False) -> dict[str, Any]:
        if camel_case:
            attrs = list(map(to_camel_case, attrs))

-        dumped = {**self._ts_info, "datapoints": [dict(zip(attrs, map(self._dtype_fix, row))) for row in zip(*arrays)]}
+        dumped = {**self._ts_info, "datapoints": [dict(zip(attrs, map(numpy_dtype_fix, row))) for row in zip(*arrays)]}
        if camel_case:
            dumped = convert_all_keys_to_camel_case(dumped)
        return {k: v for k, v in dumped.items() if v is not None}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "cognite-sdk"

version = "7.32.3"
version = "7.32.4"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
22 changes: 22 additions & 0 deletions tests/tests_integration/test_api/test_datapoints.py
@@ -672,6 +672,28 @@ def test_unit_external_id__is_overridden_if_converted(
        assert res[1].unit_external_id == "temperature:deg_f"
        assert res[2].unit_external_id == "temperature:k"

+    def test_numpy_dtypes_conversions_for_string_and_numeric(self, cognite_client, all_test_time_series):
+        # Bug prior to 7.32.4: several methods on DatapointsArray would fail due to a bad
+        # conversion of numpy dtypes to native.
+        str_ts = all_test_time_series[1]
+        # We only test retrieve_arrays since that is the method backed by numpy arrays:
+        dps_arr = cognite_client.time_series.data.retrieve_arrays(id=str_ts.id, limit=3)
+        # Test __iter__
+        for dp in dps_arr:
+            assert type(dp.timestamp) is int  # noqa: E721
+            assert type(dp.value) is str  # noqa: E721
+        # Test __getitem__ of non-slices
+        dp = dps_arr[0]
+        assert type(dp.timestamp) is int  # noqa: E721
+        assert type(dp.value) is str  # noqa: E721
+        # Test dump()
+        dumped = dps_arr.dump(camel_case=False)
+        dp_dumped = dumped["datapoints"][0]
+        assert dumped["is_string"] is True
+        assert dp_dumped == {"timestamp": 0, "value": "2"}
+        assert type(dp_dumped["timestamp"]) is int  # noqa: E721
+        assert type(dp_dumped["value"]) is str  # noqa: E721


class TestRetrieveAggregateDatapointsAPI:
    @pytest.mark.parametrize(
