Skip to content

Commit

Permalink
Exclude always-NA columns in ProductionRuns DataFrame.
Browse files Browse the repository at this point in the history
The `ProductionRun` dataclass contains some fields that are themselves
dataclasses. When such fields are turned into DataFrame columns, they are
flattened so that every field of the nested object becomes a column in the
final DataFrame. Since some of these fields are optional, the field name
itself also becomes a column, which in practice is always `None`. This PR
removes such useless columns from the DataFrame.
  • Loading branch information
daniel-k committed Jan 24, 2024
1 parent e79438d commit c2f6eba
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 7 deletions.
98 changes: 91 additions & 7 deletions src/enlyze/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dataclasses import asdict, dataclass
import typing
from dataclasses import asdict, dataclass, is_dataclass
from datetime import date, datetime, timedelta, timezone
from enum import Enum
from itertools import chain
Expand All @@ -8,6 +9,30 @@
import pandas


def _get_optional_dataclass_fields(cls: object) -> set[str]:
    """Return the names of fields annotated as an optional dataclass.

    A field qualifies when its resolved type hint is a union of exactly one
    dataclass type and ``None``, spelled either ``Optional[X]`` /
    ``Union[X, None]`` or ``X | None``. Unions with more than one non-``None``
    member, and optional non-dataclass types, are ignored.

    :param cls: Object whose annotations are resolved with
        :py:func:`typing.get_type_hints`.

    :returns: Set of field names typed as optional dataclasses.

    """
    import types

    # ``X | None`` (PEP 604) reports ``types.UnionType`` as its origin on the
    # Python versions that distinguish it from ``typing.Union``; fall back to
    # ``typing.Union`` where ``types.UnionType`` does not exist.
    union_origins = (typing.Union, getattr(types, "UnionType", typing.Union))
    none_type = type(None)

    optional_fields = set()
    for field, typ in typing.get_type_hints(cls).items():
        if typing.get_origin(typ) not in union_origins:
            continue

        members = typing.get_args(typ)
        if none_type not in members:
            continue

        # Only ``Union[X, None]`` with a single remaining member counts.
        remaining = [member for member in members if member is not none_type]
        if len(remaining) == 1 and is_dataclass(remaining[0]):
            optional_fields.add(field)

    return optional_fields


@dataclass(frozen=True)
class Site:
"""Representation of a :ref:`site <site>` in the ENLYZE platform.
Expand Down Expand Up @@ -289,24 +314,83 @@ class ProductionRun:
#: Aggregate OEE score that comprises availability, performance and quality.
productivity: Optional[OEEComponent]

def to_dict(self, exclude_unset_objects: bool = False) -> dict[str, Any]:
    """Convert to Python dictionary.

    The ``start`` and ``end`` fields will be represented as
    :ref:`timezone-aware <python:datetime-naive-aware>`
    :py:class:`datetime.datetime` localized in UTC. ``end`` is only converted
    when it is set.

    :param exclude_unset_objects: Exclude fields that are typed as optional
        dataclasses and set to ``None``.

    :returns: Production run represented as Python dictionary.

    """
    # Resolve the droppable names once per call instead of once per key inside
    # the dict factory. The class is inspected rather than the instance so
    # that type hints are resolved against the class definition.
    droppable = (
        _get_optional_dataclass_fields(type(self)) if exclude_unset_objects else set()
    )

    # NOTE: ``asdict`` applies ``dict_factory`` to nested dataclasses as well,
    # so the name check below is not limited to top-level fields.
    data = asdict(
        self,
        dict_factory=lambda items: {
            k: v for k, v in items if not (v is None and k in droppable)
        },
    )

    data["start"] = data["start"].astimezone(timezone.utc)
    if data["end"]:
        data["end"] = data["end"].astimezone(timezone.utc)

    return data

def to_dataframe(self) -> pandas.DataFrame:
    """Convert to :py:class:`pandas.DataFrame`

    The ``start`` and ``end`` fields will be represented as
    :ref:`timezone-aware <python:datetime-naive-aware>`
    :py:class:`datetime.datetime` localized in UTC.

    :returns: DataFrame with a single row representing the production run.

    """
    # Unset optional objects are dropped before flattening.
    record = self.to_dict(exclude_unset_objects=True)
    return pandas.json_normalize(record)


class ProductionRuns(list[ProductionRun]):
    """Representation of multiple production runs."""

    def to_dicts(self, exclude_unset_objects: bool = False) -> list[dict[str, Any]]:
        """Convert to Python dictionaries.

        The ``start`` and ``end`` fields will be represented as
        :ref:`timezone-aware <python:datetime-naive-aware>`
        :py:class:`datetime.datetime` localized in UTC.

        :param exclude_unset_objects: Exclude fields that are typed as optional
            dataclasses and set to ``None``.

        :returns: List of Production runs represented as Python dictionaries.

        """
        return [
            run.to_dict(exclude_unset_objects=exclude_unset_objects) for run in self
        ]

    def to_dataframe(self) -> pandas.DataFrame:
        """Convert production runs into :py:class:`pandas.DataFrame`

        Each row in the dataframe represents one production run. The ``start`` and
        ``end`` of every production run will be represented as :ref:`timezone-aware
        <python:datetime-naive-aware>` :py:class:`datetime.datetime` localized in UTC.

        :returns: DataFrame with production runs.

        """
        if not self:
            return pandas.DataFrame()

        # Build the frame from ``to_dicts`` so that unset optional objects are
        # dropped before flattening and do not become always-``None`` columns.
        return pandas.json_normalize(self.to_dicts(exclude_unset_objects=True))
30 changes: 30 additions & 0 deletions tests/enlyze/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from dataclasses import replace
from datetime import datetime

import hypothesis.strategies as st
from hypothesis import given

from enlyze.models import ProductionRun, ProductionRuns, _get_optional_dataclass_fields

# Generate ``datetime`` values only between the bounds below whenever
# hypothesis builds a ``datetime`` from its type. The bounds follow the pandas
# timestamp limitations documented at:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
st.register_type_strategy(
datetime,
st.datetimes(
min_value=datetime(1677, 9, 21, 0, 12, 44),
max_value=datetime(2262, 4, 11, 23, 47, 16),
),
)


@given(run=st.from_type(ProductionRun))
def test_production_run_to_dict_exclude_unset_objects(run: ProductionRun):
    """An unset ``quality`` is kept by default and dropped on request."""
    run = replace(run, quality=None)

    kept = run.to_dict(exclude_unset_objects=False)
    assert "quality" in kept

    dropped = run.to_dict(exclude_unset_objects=True)
    assert "quality" not in dropped


@given(runs=st.lists(st.from_type(ProductionRun), max_size=10))
def test_production_runs_to_dataframe(runs: list[ProductionRun]):
    """No DataFrame column is named after an optional dataclass field."""
    runs = ProductionRuns(runs)
    df = runs.to_dataframe()

    column_names = set(df)
    optional_names = _get_optional_dataclass_fields(ProductionRun)
    assert not column_names & optional_names

0 comments on commit c2f6eba

Please sign in to comment.