Refactor capacity input model, add default threshold, update cap analysis logic

- Refactored the capacity input model in the `models.py` file to include a default value for the `threshold` field. The `threshold` field now has a default value of 2.5 and, as before, must be greater than or equal to 0.
- Updated the `main.py` file to import the `pandera.typing` module for type annotations.
- Modified the `find_peaks_with_surroundings` method in the `CapacityAnalysis` class to skip peaks that are within a certain time window of previously found peaks.
- Made other minor code improvements and optimizations.
EnergieID · Aug 13, 2024 · 75cacb9 · 75cacb9
1 parent b61c3fb
commit 75cacb9
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 69 deletions.
diff --git a/demo_capacity_analysis.ipynb b/demo_capacity_analysis.ipynb
diff --git a/openenergyid/capacity/main.py b/openenergyid/capacity/main.py
@@ -1,7 +1,9 @@
 """Main module for capacity analysis."""
 
 import datetime as dt
+import typing
 import pandas as pd
+import pandera.typing as pdt
 
 
 class CapacityAnalysis:
@@ -21,7 +23,7 @@ class CapacityAnalysis:
 
     def __init__(
         self,
-        data: pd.Series,
+        data: pdt.Series,
         threshold: float = 2.5,
         window: str = "MS",  # Default to month start
         x_padding: int = 4,
@@ -50,11 +52,12 @@ def find_peaks(self) -> pd.Series:
         """
         # Group by the specified window (default is month start)
         grouped = self.data.groupby(pd.Grouper(freq=self.window))
+
         # Find the index (timestamp) of the maximum value in each group
         peak_indices = grouped.idxmax()
+
         # Get the corresponding peak values
         peaks = self.data.loc[peak_indices][self.data > self.threshold]
-
         return peaks
 
     def find_peaks_with_surroundings(
@@ -69,12 +72,20 @@ def find_peaks_with_surroundings(
         Returns:
             List[tuple[dt.datetime,float,pd.Series]]: A list of tuples containing peak time, peak value, and surrounding data.
         """
-        peaks = self.data.sort_values(ascending=False).head(num_peaks)
+        peaks = self.data.nlargest(num_peaks * 2)
         peaks = peaks[peaks > self.threshold]
         if peaks.empty:
             return []
+
         result = []
+        window_size = dt.timedelta(minutes=15 * (2 * self.x_padding + 1))
+
         for peak_time, peak_value in peaks.items():
+            peak_time = typing.cast(pd.Timestamp, peak_time)
+
+            if any(abs(peak_time - prev_peak[0]) < window_size for prev_peak in result):
+                continue
+
             start_time = peak_time - dt.timedelta(minutes=15 * self.x_padding)
             end_time = peak_time + dt.timedelta(minutes=15 * (self.x_padding + 1))
             surrounding_data = self.data[start_time:end_time]
@@ -86,5 +97,6 @@ def find_peaks_with_surroundings(
                     surrounding_data,
                 ]
             )
-
+            if len(result) == num_peaks:
+                break
         return result
diff --git a/openenergyid/capacity/models.py b/openenergyid/capacity/models.py
@@ -10,7 +10,7 @@ class CapacityInput(BaseModel):
 
     timezone: str = Field(alias="timeZone")
     series: TimeSeries
-    threshold: float = Field(ge=0)
+    threshold: float = Field(default=2.5, ge=0)
 
 
 class PeakDetail(BaseModel):