feat: add Granularity support for base load calculations, feat: cleaned up base_load demo to be neat and packaged. chore: added some performance testing.
EnergieID · Dec 9, 2024 · 5cbb5fc · 5cbb5fc
1 parent fc1874b
commit 5cbb5fc
Show file tree

Hide file tree

Showing 8 changed files with 276 additions and 1,924 deletions.
diff --git a/demo_baseLoad.ipynb b/demo_baseLoad.ipynb
diff --git a/openenergyid/baseload/__init__.py b/openenergyid/baseload/__init__.py
@@ -5,11 +5,13 @@
     EnergySchema,
     load_data,
     calculate_base_load,
+    Granularity,
 )
 
 __all__ = [
     "BaseLoadMetrics",
     "EnergySchema",
     "load_data",
     "calculate_base_load",
+    "Granularity",
 ]
diff --git a/openenergyid/baseload/main.py b/openenergyid/baseload/main.py
@@ -10,26 +10,27 @@
     load_data(path: str) -> pl.LazyFrame:
         Loads and validates energy usage data from an NDJSON file.
 
-    calculate_base_load(lf: pl.LazyFrame, timeframe: TimeFrame = TimeFrame.DAILY) -> pl.DataFrame:
-        Calculates base load metrics from energy usage data aggregated by the specified timeframe.
+    calculate_base_load(lf: pl.LazyFrame, granularity: Granularity = Granularity.P1D) -> pl.DataFrame:
+        Calculates base load metrics from energy usage data aggregated by the specified granularity.
 
-    main(file_path: str, timeframe: TimeFrame) -> pl.DataFrame:
-        Processes energy data and returns base load metrics for the specified timeframe.
+    main(file_path: str, granularity: Granularity) -> pl.DataFrame:
+        Processes energy data and returns base load metrics for the specified granularity.
 """
 
-from enum import Enum
 from typing import NamedTuple
 import polars as pl
 import pandera.polars as pa
+from openenergyid.enums import Granularity
 ## VERY important to use pandera.polars instead of pandera to avoid pandas errors
 
-
-class TimeFrame(Enum):
-    HOURLY = "1h"
-    DAILY = "1d"
-    WEEKLY = "1w"
-    MONTHLY = "1mo"
-    YEARLY = "1y"
+# Map each supported Granularity to the polars duration string passed as `every=` to group_by_dynamic
+GRANULARITY_TO_POLARS = {
+    Granularity.PT15M: "15m",
+    Granularity.PT1H: "1h",
+    Granularity.P1D: "1d",
+    Granularity.P1M: "1mo",
+    Granularity.P1Y: "1y",
+}
 
 
 class BaseLoadMetrics(NamedTuple):
@@ -74,12 +75,15 @@ def load_data(path: str) -> pl.LazyFrame:
     return pl.LazyFrame(validated_df)
 
 
-def calculate_base_load(lf: pl.LazyFrame, timeframe: TimeFrame = TimeFrame.DAILY) -> pl.DataFrame:
-    """Calculate base load metrics aggregated by specified timeframe"""
+def calculate_base_load(
+    lf: pl.LazyFrame, granularity: Granularity = Granularity.P1D
+) -> pl.DataFrame:
+    """Calculate base load metrics aggregated by specified granularity"""
+    polars_interval = GRANULARITY_TO_POLARS[granularity]
     return (
         lf.filter(pl.col("total") >= 0)
         .sort("timestamp")
-        .group_by_dynamic("timestamp", every=timeframe.value)
+        .group_by_dynamic("timestamp", every=polars_interval)
         .agg(
             [
                 pl.col("total").sum().alias("total_usage"),
@@ -98,12 +102,12 @@ def calculate_base_load(lf: pl.LazyFrame, timeframe: TimeFrame = TimeFrame.DAILY
     )
 
 
-def main(file_path: str, timeframe: TimeFrame) -> pl.DataFrame:
-    """Process energy data and return base load metrics for specified timeframe"""
-    return calculate_base_load(load_data(file_path), timeframe)
+def main(file_path: str, granularity: Granularity) -> pl.DataFrame:
+    """Process energy data and return base load metrics for specified granularity"""
+    return calculate_base_load(load_data(file_path), granularity)
 
 
 # Example usage:
 if __name__ == "__main__":
-    results = main("data/energy_use.ndjson", TimeFrame.MONTHLY)
+    results = main("data/PP/energy_use_test1.ndjson", Granularity.P1M)
     print(results)
diff --git a/performance_testing.ipynb b/performance_testing.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# General Performance Testing\n",
+    "\n",
+    "Here we test and try out some general things for the codebase,\n",
+    "e.g. the efficiency of polars. We try to document and reference relevant docs where needed to keep it peer reviewed."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Some speed tests of reading files/frames into polars\n",
+    "\n",
+    "references:\n",
+    "* [pandas vs. polars speed test, Apr 2023](https://medium.com/cuenex/pandas-2-0-vs-polars-the-ultimate-battle-a378eb75d6d1)\n",
+    "* [input/output in polars](https://docs.pola.rs/api/python/stable/reference/io.html)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test 1: reading in a newline-delimited JSON file to check efficiency\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "9.57 μs ± 218 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "energy_use_df = pl.scan_ndjson(\n",
+    "    \"data/PP/energy_use_test1.ndjson\",\n",
+    "    schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"total\": pl.Float64},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div><style>\n",
+       ".dataframe > thead > tr,\n",
+       ".dataframe > tbody > tr {\n",
+       "  text-align: right;\n",
+       "  white-space: pre-wrap;\n",
+       "}\n",
+       "</style>\n",
+       "<small>shape: (5, 2)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>timestamp</th><th>total</th></tr><tr><td>datetime[μs, Europe/Brussels]</td><td>f64</td></tr></thead><tbody><tr><td>2023-01-01 00:00:00 CET</td><td>0.025</td></tr><tr><td>2023-01-01 00:15:00 CET</td><td>0.017</td></tr><tr><td>2023-01-01 00:30:00 CET</td><td>0.023</td></tr><tr><td>2023-01-01 00:45:00 CET</td><td>0.024</td></tr><tr><td>2023-01-01 01:00:00 CET</td><td>0.023</td></tr></tbody></table></div>"
+      ],
+      "text/plain": [
+       "shape: (5, 2)\n",
+       "┌───────────────────────────────┬───────┐\n",
+       "│ timestamp                     ┆ total │\n",
+       "│ ---                           ┆ ---   │\n",
+       "│ datetime[μs, Europe/Brussels] ┆ f64   │\n",
+       "╞═══════════════════════════════╪═══════╡\n",
+       "│ 2023-01-01 00:00:00 CET       ┆ 0.025 │\n",
+       "│ 2023-01-01 00:15:00 CET       ┆ 0.017 │\n",
+       "│ 2023-01-01 00:30:00 CET       ┆ 0.023 │\n",
+       "│ 2023-01-01 00:45:00 CET       ┆ 0.024 │\n",
+       "│ 2023-01-01 01:00:00 CET       ┆ 0.023 │\n",
+       "└───────────────────────────────┴───────┘"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "energy_use_lf_1 = pl.scan_ndjson(\n",
+    "    \"data/PP/energy_use_test1.ndjson\",\n",
+    "    schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"total\": pl.Float64},\n",
+    ")\n",
+    "energy_use_lf_1.collect().head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test 2: reading in the \"smaller version of the JSON\" and transforming it into a polars DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "34.5 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "# Read the JSON file\n",
+    "with open(\"data/PP/energy_use.json\", \"r\") as file:\n",
+    "    data = json.load(file)\n",
+    "\n",
+    "# Convert the data into a list of dictionaries\n",
+    "data_list = [{\"timestamp\": int(k), \"value\": v} for k, v in data.items()]\n",
+    "\n",
+    "# Create a DataFrame from the list\n",
+    "df = pl.DataFrame(\n",
+    "    data_list, schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"value\": pl.Float64}\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openenergyid-Nm3FK_LY-py3.11",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -69,3 +69,4 @@ energyid = "^0.0.17"
 snakeviz = "^2.2.0"
 plotly = "^5.24.1"
 vegafusion = {version = ">=1.5.0", extras = ["embed"]}
+vl-convert-python = "^1.7.0"
diff --git a/vis/KDE of EnUsage.png b/vis/KDE of EnUsage.png
diff --git a/vis/heatmap.png b/vis/heatmap.png