add long conversions

NREL · Sep 5, 2024 · 129cfe4 · 129cfe4
1 parent bdc95e0
commit 129cfe4
Show file tree

Hide file tree

Showing 2 changed files with 199 additions and 55 deletions.
diff --git a/flasc/flasc_dataframe.py b/flasc/flasc_dataframe.py
@@ -1,4 +1,5 @@
 """FLASC DataFrame module."""
+
 from pandas import DataFrame
 
 
@@ -34,29 +35,48 @@ class FlascDataFrame(DataFrame):
     # Attributes to pickle must be in this list
     _metadata = ["name_map", "_user_format"]
 
-    def __init__(self, *args, name_map=None, **kwargs):
+    def __init__(self, *args, name_map=None, in_flasc_format=True, user_format="wide", **kwargs):
         """Initialize the FlascDataFrame class, a subclass of pandas.DataFrame.
 
         Args:
             *args: arguments to pass to the DataFrame constructor
             name_map (dict): Dictionary of column names to map from the user format to the FLASC
                 format, where the key string is the user format and the value string is the FLASC
                 equivalent. Defaults to None.
+            in_flasc_format (bool): Whether the data is in FLASC format. Defaults to True.
+            user_format (str): The format that the user expects the data to be in. Must be one of
+                'long', 'semiwide', or 'wide'. Defaults to 'wide'.
             **kwargs: keyword arguments to pass to the DataFrame constructor
         """
         super().__init__(*args, **kwargs)
 
-        self._user_format = "wide"  # or "long" or "semiwide"
-
         # check that name_map dictionary is valid
         if name_map is not None:
             if not isinstance(name_map, dict):
                 raise ValueError("name_map must be a dictionary")
             if not all(isinstance(k, str) and isinstance(v, str) for k, v in name_map.items()):
                 raise ValueError("name_map must be a dictionary of strings")
         self.name_map = name_map
-        # Apply the name_map
-        self.convert_to_flasc_format(inplace=True)  # Do we want to do this here?
+
+        # Save the reversed name_map (FLASC name -> user name) for converting back to user format
+        self._name_map_to_user = (
+            {v: k for k, v in name_map.items()} if name_map is not None else None
+        )
+
+        # Record whether the data is currently in FLASC format (flag only, no conversion)
+        self._in_flasc_format = in_flasc_format
+
+        # Validate and save the user format
+        if user_format not in ["long", "semiwide", "wide"]:
+            raise ValueError("user_format must be one of 'long', 'semiwide', 'wide'")
+        self._user_format = user_format
+
+        # NOTE: no conversion happens in the constructor, so data supplied in user format
+        # stays in user format until convert_to_flasc_format() is called explicitly.
+        # (Previously the constructor always called convert_to_flasc_format(inplace=True).)
+        # NOTE(review): _name_map_to_user and _in_flasc_format are not in _metadata, so
+        # presumably they are reset to constructor defaults on frames derived via
+        # pandas operations (e.g. drop) - confirm this is intended.
 
     @property
     def _constructor(self):
@@ -95,55 +115,110 @@ def convert_to_user_format(self, inplace=False):
         """Convert the DataFrame to the format that the user expects, given the name_map."""
         # Convert the format
         if self._user_format == "long":
-            self._convert_wide_to_long()  # Should this be assigned to something?
+            df_user = self._convert_wide_to_long()
         elif self._user_format == "semiwide":
-            self._convert_wide_to_semiwide()  # Should this be assigned to something?
+            df_user = self._convert_wide_to_semiwide()
         elif self._user_format == "wide":
-            pass
-
-        # Set the flag
-        self._in_flasc_format = False
-
-        # Convert column names and return
-        if self.name_map is not None:
-            return self.rename(columns={v: k for k, v in self.name_map.items()}, inplace=inplace)
+            df_user = self.copy()
+
+            # Wide to wide only renames columns; df_user is already a copy, so rename it in place
+            if self.name_map is not None:
+                df_user.rename(columns={v: k for k, v in self.name_map.items()}, inplace=True)
+
+        # Re-initialise self with the converted data, or return the converted frame
+        if inplace:
+            self.__init__(
+                df_user,
+                name_map=self.name_map,
+                in_flasc_format=False,
+                user_format=self._user_format,
+            )
         else:
-            return None if inplace else self.copy()
+            # Force in flasc format to False
+            df_user._in_flasc_format = False
+
+            return df_user
 
     def convert_to_flasc_format(self, inplace=False):
         """Convert the DataFrame to the format that FLASC expects."""
         # Convert the format
         if self._user_format == "long":
-            self._convert_long_to_wide()  # Should this be assigned to something?
+            df_flasc = self._convert_long_to_wide()
         elif self._user_format == "semiwide":
-            self._convert_semiwide_to_wide()  # Should this be assigned to something?
+            df_flasc = self._convert_semiwide_to_wide()
         elif self._user_format == "wide":
-            pass
-
-        # Set the flag
-        self._in_flasc_format = True
-
-        # Convert column names and return
-        if self.name_map is not None:
-            return self.rename(columns=self.name_map, inplace=inplace)
+            df_flasc = self.copy()
+
+            # Wide to wide only renames columns; df_flasc is already a copy, so rename it in place
+            if self.name_map is not None:
+                df_flasc.rename(columns=self.name_map, inplace=True)
+
+        # Re-initialise self with the converted data, or return the converted frame
+        if inplace:
+            self.__init__(
+                df_flasc,
+                name_map=self.name_map,
+                in_flasc_format=True,
+                user_format=self._user_format,
+            )
         else:
-            return None if inplace else self.copy()
+            # Force in flasc format to True
+            df_flasc._in_flasc_format = True
+
+            return df_flasc
 
     def _convert_long_to_wide(self):
         """Convert a long format DataFrame to a wide format DataFrame."""
-        # raise NotImplementedError("TO DO")
-        pass
+        # Translate user variable names to FLASC names; names not in name_map are kept as-is
+        df_wide = self.copy()
+        if df_wide.name_map is not None:
+            df_wide["variable"] = df_wide["variable"].map(lambda v: df_wide.name_map.get(v, v))
+
+        # Pivot the table so the variable column becomes the column names with time
+        # kept as the first column and value as the values
+        df_wide = df_wide.pivot(index="time", columns="variable", values="value").reset_index()
+
+        # Clear the "variable" label that the pivot leaves on the columns index
+        df_wide.columns.name = None
+
+        # Re-wrap as a FlascDataFrame so the name map and format settings carry over
+        return FlascDataFrame(
+            df_wide,
+            name_map=self.name_map,
+            in_flasc_format=self._in_flasc_format,
+            user_format=self._user_format,
+        )
 
     def _convert_semiwide_to_wide(self):
         """Convert a semiwide format DataFrame to a wide format DataFrame."""
         raise NotImplementedError("TO DO")
 
     def _convert_wide_to_long(self):
-        """Convert a wide format DataFrame to a long format DataFrame."""
+        """Convert a wide format DataFrame to a long format DataFrame.
+
+        Returns:
+            FlascDataFrame: Long format FlascDataFrame
+
+        """
         if "time" not in self.columns:
             raise ValueError("Column 'time' must be present in the DataFrame")
 
-        return self.melt(id_vars="time", var_name="variable", value_name="value")
+        df_long = self.melt(id_vars="time", var_name="variable", value_name="value").sort_values(
+            ["time", "variable"]
+        )
+
+        if self.name_map is not None:
+            to_user = {v: k for k, v in self.name_map.items()}  # FLASC -> user; unmapped kept
+            df_long["variable"] = df_long["variable"].map(lambda v: to_user.get(v, v))
+
+        df_long = df_long.reset_index(drop=True)  # tidy index after the sort above
+
+        return FlascDataFrame(
+            df_long,
+            name_map=self.name_map,
+            in_flasc_format=self._in_flasc_format,
+            user_format=self._user_format,
+        )
 
     def _convert_wide_to_semiwide(self):
         """Convert a wide format DataFrame to a semiwide format DataFrame."""

diff --git a/tests/flasc_dataframe_test.py b/tests/flasc_dataframe_test.py
@@ -5,27 +5,56 @@
 
 from flasc.flasc_dataframe import FlascDataFrame
 
-test_data_dict = {"time": [0, 10, 20], "a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
-
-test_name_map = {"a": "AA"}
+# Define the same data in each format; the formats relate through test_name_map (user -> FLASC)
+test_wide_dict = {
+    "time": [0, 10, 20],
+    "pow_000": [0, 100, 200],
+    "ws_000": [8, 8, 8],
+    "pow_001": [50, 150, 250],
+    "ws_001": [9, 9, 9],
+}
+
+test_name_map = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001", "T2WS": "ws_001"}
+
+
+test_semi_wide_dict = {
+    "time": [0, 0, 10, 10, 20, 20],
+    "turbine_id": [0, 1, 0, 1, 0, 1],
+    "pow": [0, 50, 100, 150, 200, 250],
+    "ws": [8, 9, 8, 9, 8, 9],
+}
+
+test_long_dict = {
+    "time": [0, 0, 0, 0, 10, 10, 10, 10, 20, 20, 20, 20],
+    "variable": ["T1PWR", "T2PWR", "T1WS", "T2WS"] * 3,
+    "value": [0, 50, 8, 9, 100, 150, 8, 9, 200, 250, 8, 9],
+}
+
+test_wide_user_dict = {
+    "time": [0, 10, 20],
+    "T1PWR": [0, 100, 200],
+    "T1WS": [8, 8, 8],
+    "T2PWR": [50, 150, 250],
+    "T2WS": [9, 9, 9],
+}
 
 
 def test_type():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
     assert isinstance(df, FlascDataFrame)
 
-    df2 = df.drop(columns="c")  # Modifies the dataframe, returns a copy
+    df2 = df.drop(columns="ws_001")  # Modifies the dataframe, returns a copy
     assert isinstance(df2, FlascDataFrame)
 
     # Assert df is a pandas DataFrame
     assert isinstance(df, pd.DataFrame)
 
 
 def test__metadata():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
     df._user_format = "long"
     df._in_flasc_format = False
-    df2 = df.drop(columns="c")  # Modifies the dataframe, returns a copy
+    df2 = df.drop(columns="ws_001")  # Modifies the dataframe, returns a copy
     assert hasattr(df2, "name_map")
     assert df2.name_map == test_name_map
     assert hasattr(df2, "_user_format")
@@ -37,7 +66,7 @@ def test__metadata():
 
 
 def test_printout():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
     df._in_flasc_format = True
     print(df)
     print("\n")
@@ -48,7 +77,7 @@ def test_printout():
 
 
 def test_check_flasc_format():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
 
     # Should not raise an error
     df.check_flasc_format()
@@ -61,19 +90,63 @@ def test_check_flasc_format():
 
 
 def test_convert_to_long_format():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
-    df._user_format = "long"  # Should be detected internally
-    df.convert_to_user_format(inplace=True)  # Should not pass
+    df_wide = FlascDataFrame(test_wide_dict, name_map=test_name_map)
+    df_long_test = pd.DataFrame(test_long_dict)
+
+    # Test conversion with return
+    df_wide._user_format = "long"  # TODO: should be detected internally, not set by hand
+    df_wide_copy = df_wide.copy()
+    df_long = df_wide.convert_to_user_format(inplace=False)
+
+    # Test df_long is not in flasc format
+    assert not df_long._in_flasc_format
+
+    # Test returned frame is matched to expected value
+    pd.testing.assert_frame_equal(df_long, df_long_test)
+
+    # Test original frame is unchanged
+    pd.testing.assert_frame_equal(df_wide, df_wide_copy)
+
+    # Now test in place conversion
+    df_wide.convert_to_user_format(inplace=True)
+    pd.testing.assert_frame_equal(df_wide, df_long_test)
+
+    # Assert not in flasc format
+    assert not df_wide._in_flasc_format
+
+    # Now test the back conversion
+    df_back_to_wide = df_wide.convert_to_flasc_format(inplace=False)
+
+    # Reorder the columns to match the original wide frame
+    df_back_to_wide = df_back_to_wide[df_wide_copy.columns]
+
+    pd.testing.assert_frame_equal(df_back_to_wide, df_wide_copy)
+
+    # Assert is in flasc format
+    assert df_back_to_wide._in_flasc_format
+
+    # Test in place version
+    df_wide.convert_to_flasc_format(inplace=True)
+
+    # Reorder the columns to match the original wide frame
+    df_wide = df_wide[df_wide_copy.columns]
+
+    pd.testing.assert_frame_equal(df_wide, df_wide_copy)
 
     # Check operation not allowed if no "time" column
-    df.convert_to_flasc_format(inplace=True)
-    df.drop(columns="time", inplace=True)
+    df_wide.drop(columns="time", inplace=True)
     with pytest.raises(ValueError):
-        df.convert_to_user_format(inplace=True)
+        df_wide.convert_to_user_format(inplace=True)
+
+
+def test_convert_to_wide_format():
+    # TODO: test wide to wide conversion (column renaming via name_map, copy and inplace)
+    # Placeholder: this test currently asserts nothing
+    pass
 
 
 def test_pickle():
-    df = FlascDataFrame(test_data_dict)
+    df = FlascDataFrame(test_wide_dict)
     df.name_map = test_name_map
     df.to_pickle("test_pickle.pkl")
 
@@ -85,7 +158,7 @@ def test_pickle():
 
 
 def test_feather():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
     df.to_feather("test_feather.ftr")
 
     df2 = pd.read_feather("test_feather.ftr")
@@ -98,7 +171,7 @@ def test_feather():
 
 
 def test_csv():
-    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
     df.to_csv("test_csv.csv")
 
     df2 = pd.read_csv("test_csv.csv")
@@ -112,15 +185,11 @@ def test_csv():
 
 def test_n_turbines():
     # Currently, n_turbines based only on number of pow columns
-    name_map = {"a": "pow_000", "b": "pow_001", "c": "ws_000"}
-    df = FlascDataFrame(test_data_dict, name_map=name_map)
+    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
     assert df.n_turbines == 2
 
-    name_map = {"a": "pow_000", "b": "ws_000", "c": "ws_001"}
-    df = FlascDataFrame(test_data_dict, name_map=name_map)
-    assert df.n_turbines == 1
-
     # Check n_turbines not valid if not in flasc format
+    df._user_format = "long"
     df.convert_to_user_format(inplace=True)
     with pytest.raises(ValueError):
         df.n_turbines