Skip to content

Commit

Permalink
Merge pull request #103 from ImperialCollegeLondon/optimise-dsr-api
Browse files Browse the repository at this point in the history
Use HDF5 file upload for DSR API data
  • Loading branch information
AdrianDAlessandro authored Jul 25, 2023
2 parents 25ebdcc + 11bbd02 commit e8eb8a7
Show file tree
Hide file tree
Showing 7 changed files with 184 additions and 84 deletions.
72 changes: 45 additions & 27 deletions datahub/dsr.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
"""This module defines the data structures for the MEDUSA Demand Simulator model."""
import numpy as np
from fastapi import HTTPException
from numpy.typing import NDArray
from pydantic import BaseModel, Field


class DSRModel(BaseModel):
"""Define required key values for Demand Side Response data."""

amount: list = Field(alias="Amount", size=(13,))
cost: list = Field(alias="Cost", size=(1440, 13))
kwh_cost: list = Field(alias="kWh Cost", size=(2,))
activities: list = Field(alias="Activities", size=(1440, 7))
amount: list = Field(alias="Amount", shape=(13,))
cost: list = Field(alias="Cost", shape=(1440, 13))
kwh_cost: list = Field(alias="kWh Cost", shape=(2,))
activities: list = Field(alias="Activities", shape=(1440, 7))
activities_outside_home: list = Field(
alias="Activities Outside Home", size=(1440, 7)
alias="Activities Outside Home", shape=(1440, 7)
)
activity_types: list = Field(alias="Activity Types", size=(7,))
ev_id_matrix: list = Field(alias="EV ID Matrix", default=None, size=(1440, 4329))
ev_dt: list = Field(alias="EV DT", size=(1440, 2))
ev_locations: list = Field(alias="EV Locations", default=None, size=(1440, 4329))
ev_battery: list = Field(alias="EV Battery", default=None, size=(1440, 4329))
ev_state: list = Field(alias="EV State", size=(1440, 4329))
ev_mask: list = Field(alias="EV Mask", default=None, size=(1440, 4329))
baseline_ev: list = Field(alias="Baseline EV", size=(1440,))
baseline_non_ev: list = Field(alias="Baseline Non-EV", size=(1440,))
actual_ev: list = Field(alias="Actual EV", size=(1440,))
actual_non_ev: list = Field(alias="Actual Non-EV", size=(1440,))
activity_types: list = Field(alias="Activity Types", shape=(7,))
ev_id_matrix: list = Field(alias="EV ID Matrix", default=None, shape=(1440, 4329))
ev_dt: list = Field(alias="EV DT", shape=(1440, 2))
ev_locations: list = Field(alias="EV Locations", default=None, shape=(1440, 4329))
ev_battery: list = Field(alias="EV Battery", default=None, shape=(1440, 4329))
ev_state: list = Field(alias="EV State", shape=(1440, 4329))
ev_mask: list = Field(alias="EV Mask", default=None, shape=(1440, 4329))
baseline_ev: list = Field(alias="Baseline EV", shape=(1440,))
baseline_non_ev: list = Field(alias="Baseline Non-EV", shape=(1440,))
actual_ev: list = Field(alias="Actual EV", shape=(1440,))
actual_non_ev: list = Field(alias="Actual Non-EV", shape=(1440,))
name: str = Field(alias="Name", default="")
warn: str = Field(alias="Warn", default="")

Expand All @@ -33,24 +35,40 @@ class Config:
allow_population_by_field_name = True


def validate_dsr_arrays(data: dict[str, str | list]) -> list[str]:
"""Validate the sizes of the arrays in the DSR data.
def validate_dsr_data(data: dict[str, NDArray]) -> None:
"""Validate the shapes of the arrays in the DSR data.
Args:
data: The dictionary representation of the DSR Data. The keys are field aliases.
It is generated with the data.dict(by_alias=True) where data is a DSRModel.
Returns:
An empty list if there are no issues. A list of the failing fields if there are.
    Raises:
        HTTPException: If there are missing or invalid fields.
"""
missing_fields = [
field for field in DSRModel.schema()["required"] if field not in data.keys()
]
if missing_fields:
raise HTTPException(
status_code=422,
detail=f"Missing required fields: {', '.join(missing_fields)}.",
)

aliases = []
for alias, field in DSRModel.schema()["properties"].items():
if field["type"] == "array":
try:
array = np.array(data[alias])
except ValueError:
try:
array = data[alias]
except KeyError:
if field:
aliases.append(alias)
continue
if array.shape != field["size"] or array.dtype != np.float64:
continue
if field["type"] == "array":
if array.shape != field["shape"] or not np.issubdtype(
array.dtype, np.number
):
aliases.append(alias)
return aliases
if aliases:
raise HTTPException(
status_code=422,
detail=f"Invalid size for: {', '.join(aliases)}.",
)
66 changes: 47 additions & 19 deletions datahub/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""Script for running Datahub API."""
from typing import Any, Hashable

from fastapi import FastAPI, HTTPException
import h5py # type: ignore
from fastapi import FastAPI, HTTPException, UploadFile
from pydantic import BaseModel

from . import data as dt
from . import log
from .dsr import DSRModel, validate_dsr_arrays
from .dsr import validate_dsr_data
from .opal import OpalModel
from .wesim import get_wesim

Expand Down Expand Up @@ -62,7 +63,6 @@ def get_opal_data( # type: ignore[misc]
Args:
start: Starting index for exported Dataframe
end: Last index that will be included in exported Dataframe
Returns:
Expand All @@ -88,31 +88,60 @@ def get_opal_data( # type: ignore[misc]


@app.post("/dsr")
def update_dsr_data(data: DSRModel) -> dict[str, str]:
"""POST method function for appending data to the DSR list.
def upload_dsr(file: UploadFile) -> dict[str, str | None]:
"""POST method for appending data to the DSR list.
    This takes an HDF5 file as input. This file has a flat structure, with each dataset
available at the top level.
The required fields (datasets) are:
- Amount (13 x 1)
- Cost (1440 x 13)
- kWh Cost (2 x 1)
- Activities (1440 x 7)
- Activities Outside Home (1440 x 7)
- Activity Types (7 x 1)
- EV DT (1440 x 2)
- EV State (1440 x 4329)
- Baseline EV (1440 x 1)
- Baseline Non-EV (1440 x 1)
- Actual EV (1440 x 1)
- Actual Non-EV (1440 x 1)
The optional fields are:
- EV ID Matrix (1440 x 4329)
- EV Locations (1440 x 4329)
- EV Battery (1440 x 4329)
- EV Mask (1440 x 4329)
- Name (str)
- Warn (str)
Further details for the DSR data specification can be found in
[the GitHub wiki.](https://github.com/ImperialCollegeLondon/gridlington-datahub/wiki/Agent-model-data#output)
\f
Args:
data: The DSR Data
Returns:
A dictionary with a success message
        file (UploadFile): An HDF5 file with the DSR data.
Raises:
A HTTPException if the data is invalid
"""
HTTPException: If the data is invalid
Returns:
dict[str, str]: dictionary with the filename
""" # noqa: D301
    log.info("Received DSR data.")
data_dict = data.dict(by_alias=True)
if alias := validate_dsr_arrays(data_dict):
message = f"Invalid size for: {', '.join(alias)}."
log.error(message)
raise HTTPException(status_code=400, detail=message)
with h5py.File(file.file, "r") as h5file:
data = {key: value[...] for key, value in h5file.items()}

validate_dsr_data(data)

log.info("Appending new data...")
log.debug(f"Current DSR data length: {len(dt.dsr_data)}")
dt.dsr_data.append(data_dict)
dt.dsr_data.append(data)
log.debug(f"Updated DSR data length: {len(dt.dsr_data)}")

return {"message": "Data submitted successfully."}
return {"filename": file.filename}


@app.get("/dsr")
Expand All @@ -123,7 +152,6 @@ def get_dsr_data( # type: ignore[misc]
Args:
start: Starting index for exported list
end: Last index that will be included in exported list
Returns:
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ requires-python = ">=3.10"
dependencies = [
"pandas[excel]",
"fastapi",
"uvicorn"
"uvicorn",
"python-multipart",
"h5py"
]

[project.optional-dependencies]
Expand Down
5 changes: 5 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ h11==0.14.0
# via
# httpcore
# uvicorn
h5py==3.9.0
# via datahub (pyproject.toml)
httpcore==0.17.2
# via httpx
httpx==0.24.1
Expand All @@ -71,6 +73,7 @@ nodeenv==1.8.0
# via pre-commit
numpy==1.25.0
# via
# h5py
# pandas
# pandas-stubs
odfpy==1.4.1
Expand Down Expand Up @@ -116,6 +119,8 @@ pytest-mypy==0.10.3
# via datahub (pyproject.toml)
python-dateutil==2.8.2
# via pandas
python-multipart==0.0.6
# via datahub (pyproject.toml)
pytz==2023.3
# via pandas
pyxlsb==1.0.10
Expand Down
8 changes: 7 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@ fastapi==0.99.1
# via datahub (pyproject.toml)
h11==0.14.0
# via uvicorn
h5py==3.9.0
# via datahub (pyproject.toml)
idna==3.4
# via anyio
numpy==1.25.0
# via pandas
# via
# h5py
# pandas
odfpy==1.4.1
# via pandas
openpyxl==3.1.2
Expand All @@ -32,6 +36,8 @@ pydantic==1.10.10
# via fastapi
python-dateutil==2.8.2
# via pandas
python-multipart==0.0.6
# via datahub (pyproject.toml)
pytz==2023.3
# via pandas
pyxlsb==1.0.10
Expand Down
38 changes: 30 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import random

import h5py # type: ignore
import numpy as np
import pytest
from fastapi.testclient import TestClient
Expand Down Expand Up @@ -38,12 +39,33 @@ def opal_data_array():


@pytest.fixture
def dsr_data():
"""Pytest Fixture for random Opal data input."""
data = {}
for field in list(DSRModel.__fields__.values()):
if field.annotation == str:
data[field.alias] = "Name or Warning"
else:
data[field.alias] = np.random.rand(*field.field_info.extra["size"]).tolist()
def dsr_data(dsr_data_path):
"""Pytest Fixture for DSR data as a dictionary."""
with h5py.File(dsr_data_path, "r") as h5file:
data = {key: value[...] for key, value in h5file.items()}
return data


@pytest.fixture
def dsr_data_path(tmp_path):
"""The path to a temporary HDF5 file with first-time-only generated DSR data."""
# Define the file path within the temporary directory
file_path = tmp_path / "data.h5"

# Check if the file already exists
if file_path.is_file():
# If the file exists, return its path
return file_path

# Otherwise, create and write data to the file
with h5py.File(file_path, "w") as h5file:
for field in list(DSRModel.__fields__.values()):
if field.annotation == str:
h5file[field.alias] = "Name or Warning"
else:
h5file[field.alias] = np.random.rand(
*field.field_info.extra["shape"]
).astype("float16")

# Return the path to the file
return file_path
Loading

0 comments on commit e8eb8a7

Please sign in to comment.