Fix dtype conversion on DatapointsArray (#1699)
haakonvt authored Apr 1, 2024
1 parent 0f73ab1 commit 51eb721
Showing 5 changed files with 58 additions and 22 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,11 @@ Changes are grouped as follows
- `Fixed` for any bug fixes.
- `Security` in case of vulnerabilities.

+## [7.32.4] - 2024-03-28
+### Fixed
+- Several methods for `DatapointsArray` that previously failed for string datapoints due to bad handling
+  of numpy `dtype`-to-native conversion.

## [7.32.3] - 2024-03-27
### Removed
- Support for `protobuf==3.*` was dropped.
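
The root cause is easy to demonstrate with numpy alone: numeric arrays yield numpy scalars, which expose `.item()` for native conversion, while string time series are backed by object-dtype arrays holding plain python `str` references, which do not. A minimal sketch of the failure mode (plain numpy behaviour, not SDK code):

    import numpy as np

    # Numeric arrays yield numpy scalars; .item() converts them to native python types:
    num = np.array([1.5, 2.5])[0]
    assert type(num) is np.float64 and type(num.item()) is float

    # Object-dtype (string) arrays yield plain python str, which has no .item(),
    # so calling .item() unconditionally raises AttributeError -- the bug fixed here:
    s = np.array(["a", "b"], dtype=np.object_)[0]
    assert type(s) is str and not hasattr(s, "item")
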
2 changes: 1 addition & 1 deletion cognite/client/_version.py
@@ -1,4 +1,4 @@
from __future__ import annotations

__version__ = "7.32.3"
__version__ = "7.32.4"
__api_subversion__ = "20230101"
49 changes: 29 additions & 20 deletions cognite/client/data_classes/datapoints.py
@@ -1,16 +1,13 @@
from __future__ import annotations

-import operator as op
import typing
import warnings
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
-from functools import cached_property
from typing import (
    TYPE_CHECKING,
    Any,
-    Callable,
    Collection,
    Iterator,
    Literal,
@@ -20,7 +17,7 @@

from cognite.client.data_classes._base import CogniteResource, CogniteResourceList
from cognite.client.utils import _json
-from cognite.client.utils._auxiliary import find_duplicates, no_op
+from cognite.client.utils._auxiliary import find_duplicates
from cognite.client.utils._identifier import Identifier
from cognite.client.utils._importing import local_import
from cognite.client.utils._pandas_helpers import (
@@ -71,6 +68,17 @@
NumpyObjArray = npt.NDArray[np.object_]


+def numpy_dtype_fix(element: np.float64 | str) -> float | str:
+    try:
+        # Using .item() on numpy scalars gives us vanilla python types:
+        return element.item()  # type: ignore [union-attr]
+    except AttributeError:
+        # No conversion needed: string arrays hold references to vanilla python objects:
+        if isinstance(element, str):
+            return element
+        raise
+
+
@dataclass(frozen=True)
class LatestDatapointQuery:
"""Parameters describing a query for the latest datapoint from a time series.
@@ -292,8 +300,8 @@ def __getitem__(self, item: int | slice) -> Datapoint | DatapointsArray:
            return self._slice(item)
        attrs, arrays = self._data_fields()
        return Datapoint(
-            timestamp=self._dtype_fix(arrays[0][item]) // 1_000_000,
-            **{attr: self._dtype_fix(arr[item]) for attr, arr in zip(attrs[1:], arrays[1:])},
+            timestamp=arrays[0][item].item() // 1_000_000,
+            **{attr: numpy_dtype_fix(arr[item]) for attr, arr in zip(attrs[1:], arrays[1:])},  # type: ignore [arg-type]
        )

    def _slice(self, part: slice) -> DatapointsArray:
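
In practice, integer indexing on a string `DatapointsArray` no longer raises. A small usage sketch, assuming `dps` is a `DatapointsArray` for a string time series (illustrative names):

    dp = dps[0]                        # __getitem__ with an int builds a single Datapoint
    assert type(dp.timestamp) is int   # epoch milliseconds, converted via .item()
    assert type(dp.value) is str       # plain str, passed through by numpy_dtype_fix
    sub = dps[0:2]                     # a slice still returns a DatapointsArray
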
@@ -307,25 +315,26 @@ def __iter__(self) -> Iterator[Datapoint]:
        For efficient storage, datapoints are not stored as a sequence of (singular) Datapoint
        objects, so these are created on demand while iterating (slow).

-        Returns:
-            Iterator[Datapoint]: No description."""
-        # Let's not create a single Datapoint more than we have to:
+        Yields:
+            Datapoint: No description.
+        """
        warnings.warn(
            "Iterating through a DatapointsArray is very inefficient. Tip: Access the arrays directly and use "
            "vectorised numpy ops on those. E.g. `dps.average` for the 'average' aggregate, `dps.value` for the "
            "raw datapoints or `dps.timestamp` for the timestamps. You may also convert to a pandas DataFrame using "
            "`dps.to_pandas()`.",
            UserWarning,
        )
        attrs, arrays = self._data_fields()
-        return (
+        # Let's not create a single Datapoint more than we have to:
+        yield from (
            Datapoint(
-                timestamp=self._dtype_fix(row[0]) // 1_000_000, **dict(zip(attrs[1:], map(self._dtype_fix, row[1:])))
+                timestamp=row[0].item() // 1_000_000,
+                **dict(zip(attrs[1:], map(numpy_dtype_fix, row[1:]))),  # type: ignore [arg-type]
            )
            for row in zip(*arrays)
        )

-    @cached_property
-    def _dtype_fix(self) -> Callable:
-        if self.is_string:
-            # Return no-op as array contains just references to vanilla python objects:
-            return no_op
-        # Using .item() on numpy scalars gives us vanilla python types:
-        return op.methodcaller("item")

    def _data_fields(self) -> tuple[list[str], list[npt.NDArray]]:
        data_field_tuples = [
            (attr, arr)
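
The warning above points to the efficient alternative: operate on the underlying arrays directly. A sketch of that pattern, assuming `client` is a configured `CogniteClient` and `ts_id` identifies a numeric time series (both hypothetical):

    dps = client.time_series.data.retrieve_arrays(id=ts_id, limit=1000)

    timestamps = dps.timestamp    # numpy array of timestamps
    values = dps.value            # numpy array of raw values
    mean_value = values.mean()    # one vectorised numpy op instead of a python loop
    df = dps.to_pandas()          # or convert everything to a pandas DataFrame
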
@@ -356,7 +365,7 @@ def dump(self, camel_case: bool = True, convert_timestamps: bool = False) -> dict[str, Any]:
        if camel_case:
            attrs = list(map(to_camel_case, attrs))

-        dumped = {**self._ts_info, "datapoints": [dict(zip(attrs, map(self._dtype_fix, row))) for row in zip(*arrays)]}
+        dumped = {**self._ts_info, "datapoints": [dict(zip(attrs, map(numpy_dtype_fix, row))) for row in zip(*arrays)]}
        if camel_case:
            dumped = convert_all_keys_to_camel_case(dumped)
        return {k: v for k, v in dumped.items() if v is not None}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "cognite-sdk"

version = "7.32.3"
version = "7.32.4"
description = "Cognite Python SDK"
readme = "README.md"
documentation = "https://cognite-sdk-python.readthedocs-hosted.com"
22 changes: 22 additions & 0 deletions tests/tests_integration/test_api/test_datapoints.py
@@ -672,6 +672,28 @@ def test_unit_external_id__is_overridden_if_converted(
        assert res[1].unit_external_id == "temperature:deg_f"
        assert res[2].unit_external_id == "temperature:k"

+    def test_numpy_dtypes_conversions_for_string_and_numeric(self, cognite_client, all_test_time_series):
+        # Bug prior to 7.32.4: several methods on DatapointsArray would fail due to a bad
+        # conversion of numpy dtypes to native.
+        str_ts = all_test_time_series[1]
+        # We only test retrieve_arrays since that is the method backed by numpy arrays:
+        dps_arr = cognite_client.time_series.data.retrieve_arrays(id=str_ts.id, limit=3)
+        # Test __iter__
+        for dp in dps_arr:
+            assert type(dp.timestamp) is int  # noqa: E721
+            assert type(dp.value) is str  # noqa: E721
+        # Test __getitem__ of non-slices
+        dp = dps_arr[0]
+        assert type(dp.timestamp) is int  # noqa: E721
+        assert type(dp.value) is str  # noqa: E721
+        # Test dump()
+        dumped = dps_arr.dump(camel_case=False)
+        dp_dumped = dumped["datapoints"][0]
+        assert dumped["is_string"] is True
+        assert dp_dumped == {"timestamp": 0, "value": "2"}
+        assert type(dp_dumped["timestamp"]) is int  # noqa: E721
+        assert type(dp_dumped["value"]) is str  # noqa: E721


class TestRetrieveAggregateDatapointsAPI:
    @pytest.mark.parametrize(
