Exclude always-NA columns in ProductionRuns DataFrame.

The `ProductionRun` dataclass contains some fields that are themselves dataclasses. When such fields are turned into DataFrame columns, they are flattened so that every field of the nested object becomes its own column in the final DataFrame. Because some of these nested fields are optional, the field name itself also turns into a column, and in practice that column only ever contains `None`. This PR removes such useless columns from the DataFrame.
enlyze · Jan 24, 2024 · c2f6eba · c2f6eba
1 parent e79438d
commit c2f6eba
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 7 deletions.
diff --git a/src/enlyze/models.py b/src/enlyze/models.py
@@ -1,4 +1,5 @@
-from dataclasses import asdict, dataclass
+import typing
+from dataclasses import asdict, dataclass, is_dataclass
 from datetime import date, datetime, timedelta, timezone
 from enum import Enum
 from itertools import chain
@@ -8,6 +9,30 @@
 import pandas
 
 
def _get_optional_dataclass_fields(cls: object) -> set[str]:
    """Return the names of fields typed as an optional dataclass.

    A field qualifies when its resolved type hint is a union of exactly one
    dataclass type and ``None``: ``Optional[D]``, ``Union[D, None]`` or, on
    Python 3.10+, ``D | None``. Unions with more than one non-``None`` member
    and optionals wrapping a non-dataclass type are ignored.

    :param cls: Object whose type hints are inspected. When a dataclass
        instance is passed, its class is inspected instead.

    :returns: Names of all fields typed as an optional dataclass.

    """
    # Function-scope import: only needed to recognise PEP 604 unions.
    import types

    # ``typing.get_type_hints`` merges annotations of base classes only when
    # it is given a class, so inspect the class of a dataclass instance.
    if is_dataclass(cls) and not isinstance(cls, type):
        cls = type(cls)

    # ``D | None`` may report ``types.UnionType`` instead of ``typing.Union``
    # as its origin; ``types.UnionType`` does not exist before Python 3.10.
    union_origins = (typing.Union, getattr(types, "UnionType", typing.Union))
    none_type = type(None)

    optional_fields = set()
    for name, hint in typing.get_type_hints(cls).items():
        if typing.get_origin(hint) not in union_origins:
            continue

        members = typing.get_args(hint)
        if none_type not in members:
            continue

        # Exactly one member besides ``None`` may remain, and it has to be a
        # dataclass for the field to count.
        remaining = [member for member in members if member is not none_type]
        if len(remaining) == 1 and is_dataclass(remaining[0]):
            optional_fields.add(name)

    return optional_fields
+
+
 @dataclass(frozen=True)
 class Site:
  """Representation of a :ref:`site <site>` in the ENLYZE platform.
@@ -289,24 +314,83 @@ class ProductionRun:
  #: Aggregate OEE score that comprises availability, performance and quality.
  productivity: Optional[OEEComponent]
 
+ def to_dict(self, exclude_unset_objects: bool = False) -> dict[str, Any]:
+ """Convert to Python dictionary.
+
+ The ``start`` and ``end`` fields will be represented as
+ :ref:`timezone-aware <python:datetime-naive-aware>`
+ :py:class:`datetime.datetime` localized in UTC.
+
+ :param exclude_unset_objects: Exclude fields that are typed as optional
+ dataclasses and set to ``None``.
+
+ :returns: Production run represented as Python dictionary.
+
+ """
+ data = asdict(
+ self,
+ dict_factory=lambda items: {
+ k: v
+ for k, v in items
+ if not (
+ exclude_unset_objects
+ and k in _get_optional_dataclass_fields(self)
+ and v is None
+ )
+ },
+ )
+
+ data["start"] = data["start"].astimezone(timezone.utc)
+ if data["end"]:
+ data["end"] = data["end"].astimezone(timezone.utc)
+
+ return data
+
+ def to_dataframe(self) -> pandas.DataFrame:
+ """Convert to :py:class:`pandas.DataFrame`
+
+ The ``start`` and ``end`` fields will be represented as
+ :ref:`timezone-aware <python:datetime-naive-aware>`
+ :py:class:`datetime.datetime` localized in UTC.
+
+ :returns: DataFrame with a single row representing the production run.
+ """
+ return pandas.json_normalize(self.to_dict(exclude_unset_objects=True))
+
 
 class ProductionRuns(list[ProductionRun]):
  """Representation of multiple production runs."""
 
+ def to_dicts(self, exclude_unset_objects: bool = False) -> list[dict[str, Any]]:
+ """Convert to Python dictionaries.
+
+ The ``start`` and ``end`` fields will be represented as
+ :ref:`timezone-aware <python:datetime-naive-aware>`
+ :py:class:`datetime.datetime` localized in UTC.
+
+ :param exclude_unset_objects: Exclude fields that are typed as optional
+ dataclasses and set to ``None``.
+
+ :returns: List of Production runs represented as Python dictionaries.
+
+ """
+ return [
+ run.to_dict(
+ exclude_unset_objects=exclude_unset_objects,
+ )
+ for run in self
+ ]
+
  def to_dataframe(self) -> pandas.DataFrame:
  """Convert production runs into :py:class:`pandas.DataFrame`
 
  Each row in the dataframe represents one production run. The ``start`` and
  ``end`` of every production run will be represented as :ref:`timezone-aware
  <python:datetime-naive-aware>` :py:class:`datetime.datetime` localized in UTC.
 
- :returns: DataFrame with production runs
-
+ :returns: DataFrame with production runs.
  """
  if not self:
  return pandas.DataFrame()
 
- df = pandas.json_normalize([asdict(run) for run in self])
- df.start = pandas.to_datetime(df.start, utc=True, format="ISO8601")
- df.end = pandas.to_datetime(df.end, utc=True, format="ISO8601")
- return df
+ return pandas.json_normalize(self.to_dicts(exclude_unset_objects=True))
diff --git a/tests/enlyze/test_models.py b/tests/enlyze/test_models.py
@@ -0,0 +1,30 @@
+from dataclasses import replace
+from datetime import datetime
+
+import hypothesis.strategies as st
+from hypothesis import given
+
+from enlyze.models import ProductionRun, ProductionRuns, _get_optional_dataclass_fields
+
# Keep generated datetimes inside the range described under "Timestamp
# limitations" in the pandas user guide:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
st.register_type_strategy(
    datetime,
    st.datetimes(
        min_value=datetime(year=1677, month=9, day=21, hour=0, minute=12, second=44),
        max_value=datetime(year=2262, month=4, day=11, hour=23, minute=47, second=16),
    ),
)
+
+
@given(run=st.from_type(ProductionRun))
def test_production_run_to_dict_exclude_unset_objects(run: ProductionRun):
    """An unset ``quality`` is dropped only when exclusion is requested."""
    without_quality = replace(run, quality=None)

    complete = without_quality.to_dict(exclude_unset_objects=False)
    assert "quality" in complete

    reduced = without_quality.to_dict(exclude_unset_objects=True)
    assert "quality" not in reduced
+
+
@given(runs=st.lists(st.from_type(ProductionRun), max_size=10))
def test_production_runs_to_dataframe(runs: list[ProductionRun]):
    """No dataframe column is named after an optional dataclass field."""
    frame = ProductionRuns(runs).to_dataframe()
    column_names = set(frame)
    assert column_names.isdisjoint(_get_optional_dataclass_fields(ProductionRun))