Skip to content

Commit

Permalink
add long conversions
Browse files Browse the repository at this point in the history
  • Loading branch information
paulf81 committed Sep 5, 2024
1 parent bdc95e0 commit 129cfe4
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 55 deletions.
137 changes: 106 additions & 31 deletions flasc/flasc_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""FLASC DataFrame module."""

from pandas import DataFrame


Expand Down Expand Up @@ -34,29 +35,48 @@ class FlascDataFrame(DataFrame):
# Attributes to pickle must be in this list
_metadata = ["name_map", "_user_format"]

def __init__(self, *args, name_map=None, in_flasc_format=True, user_format="wide", **kwargs):
    """Initialize the FlascDataFrame class, a subclass of pandas.DataFrame.

    The data is stored exactly as passed in; no conversion between the user
    format and the FLASC format is performed here.

    Args:
        *args: arguments to pass to the DataFrame constructor
        name_map (dict): Dictionary of column names to map from the user format to the FLASC
            format, where the key string is the user format and the value string is the FLASC
            equivalent. Defaults to None.
        in_flasc_format (bool): Whether the data is in FLASC format. Defaults to True.
        user_format (str): The format that the user expects the data to be in. Must be one of
            'long', 'semiwide', or 'wide'. Defaults to 'wide'.
        **kwargs: keyword arguments to pass to the DataFrame constructor

    Raises:
        ValueError: If name_map is not a dictionary of strings, or if user_format is not one
            of 'long', 'semiwide', 'wide'.
    """
    super().__init__(*args, **kwargs)

    # Validate every argument before touching any attribute, so a failed
    # (re-)initialization never leaves the object half-configured.
    if name_map is not None:
        if not isinstance(name_map, dict):
            raise ValueError("name_map must be a dictionary")
        if not all(isinstance(k, str) and isinstance(v, str) for k, v in name_map.items()):
            raise ValueError("name_map must be a dictionary of strings")
    if user_format not in ("long", "semiwide", "wide"):
        raise ValueError("user_format must be one of 'long', 'semiwide', 'wide'")

    self.name_map = name_map

    # Save the reversed name_map (FLASC name -> user name)
    self._name_map_to_user = (
        {v: k for k, v in name_map.items()} if name_map is not None else None
    )

    # Record which format the stored data is currently in, and which format
    # the user ultimately expects
    self._in_flasc_format = in_flasc_format
    self._user_format = user_format

@property
def _constructor(self):
def convert_to_user_format(self, inplace=False):
    """Convert the DataFrame to the format that the user expects, given the name_map.

    Args:
        inplace (bool): If True, re-initialize this object with the converted data and
            return None. If False, leave this object unchanged and return the converted
            copy. Defaults to False.

    Returns:
        FlascDataFrame or None: The converted frame when inplace is False, otherwise None.

    Raises:
        ValueError: If the stored user format is not one of 'long', 'semiwide', 'wide'.
    """
    # Convert the format
    if self._user_format == "long":
        df_user = self._convert_wide_to_long()
    elif self._user_format == "semiwide":
        df_user = self._convert_wide_to_semiwide()
    elif self._user_format == "wide":
        df_user = self.copy()

        # In wide to wide conversion, only need to rename the columns.
        # rename() must target the columns explicitly and its result must be
        # kept: with inplace=False it returns a new frame instead of mutating.
        if self.name_map is not None:
            name_map_to_user = {v: k for k, v in self.name_map.items()}
            df_user = df_user.rename(columns=name_map_to_user)
    else:
        raise ValueError("user_format must be one of 'long', 'semiwide', 'wide'")

    # Assign to self or return
    if inplace:
        self.__init__(
            df_user,
            name_map=self.name_map,
            in_flasc_format=False,
            user_format=self._user_format,
        )
        return None

    # Force in flasc format to False
    df_user._in_flasc_format = False
    return df_user

def convert_to_flasc_format(self, inplace=False):
    """Convert the DataFrame to the format that FLASC expects.

    Args:
        inplace (bool): If True, re-initialize this object with the converted data and
            return None. If False, leave this object unchanged and return the converted
            copy. Defaults to False.

    Returns:
        FlascDataFrame or None: The converted frame when inplace is False, otherwise None.

    Raises:
        ValueError: If the stored user format is not one of 'long', 'semiwide', 'wide'.
    """
    # Convert the format
    if self._user_format == "long":
        df_flasc = self._convert_long_to_wide()
    elif self._user_format == "semiwide":
        df_flasc = self._convert_semiwide_to_wide()
    elif self._user_format == "wide":
        df_flasc = self.copy()

        # In wide to wide conversion, only need to rename the columns.
        # Keep the returned frame: rename(inplace=False) does not mutate.
        if self.name_map is not None:
            df_flasc = df_flasc.rename(columns=self.name_map)
    else:
        raise ValueError("user_format must be one of 'long', 'semiwide', 'wide'")

    # Assign to self or return
    if inplace:
        self.__init__(
            df_flasc,
            name_map=self.name_map,
            in_flasc_format=True,
            user_format=self._user_format,
        )
        return None

    # Force in flasc format to True
    df_flasc._in_flasc_format = True
    return df_flasc

def _convert_long_to_wide(self):
    """Convert a long format DataFrame to a wide format DataFrame.

    Reads the 'time', 'variable' and 'value' columns of this frame.

    Returns:
        FlascDataFrame: Wide format FlascDataFrame with 'time' as a regular column and one
            column per distinct variable name.
    """
    df_long = self.copy()

    # Start by converting the variable names to their FLASC equivalents.
    # Names without an entry in name_map are kept as they are rather than
    # being turned into NaN (which is what Series.map(dict) would do).
    if self.name_map is not None:
        name_map = self.name_map
        df_long["variable"] = df_long["variable"].map(lambda name: name_map.get(name, name))

    # Pivot the table so the variable column becomes the column names, with
    # value as the values; reset_index turns time back into a regular column
    df_wide = df_long.pivot(index="time", columns="variable", values="value").reset_index()

    # Remove the leftover "variable" label on the column index
    df_wide.columns.name = None

    return FlascDataFrame(
        df_wide,
        name_map=self.name_map,
        in_flasc_format=self._in_flasc_format,
        user_format=self._user_format,
    )

def _convert_semiwide_to_wide(self):
"""Convert a semiwide format DataFrame to a wide format DataFrame."""
raise NotImplementedError("TO DO")

def _convert_wide_to_long(self):
"""Convert a wide format DataFrame to a long format DataFrame."""
"""Convert a wide format DataFrame to a long format DataFrame.
Returns:
FlascDataFrame: Long format FlascDataFrame
"""
if "time" not in self.columns:
raise ValueError("Column 'time' must be present in the DataFrame")

return self.melt(id_vars="time", var_name="variable", value_name="value")
df_long = self.melt(id_vars="time", var_name="variable", value_name="value").sort_values(
["time", "variable"]
)

if self.name_map is not None:
df_long["variable"] = df_long["variable"].map(self._name_map_to_user)

# Reset index for cleanliness
df_long = df_long.reset_index(drop=True)

return FlascDataFrame(
df_long,
name_map=self.name_map,
in_flasc_format=self._in_flasc_format,
user_format=self._user_format,
)

def _convert_wide_to_semiwide(self):
"""Convert a wide format DataFrame to a semiwide format DataFrame."""
Expand Down
117 changes: 93 additions & 24 deletions tests/flasc_dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,56 @@

from flasc.flasc_dataframe import FlascDataFrame

# Small generic fixture: a time column plus three arbitrary data columns.
test_data_dict = {"time": [0, 10, 20], "a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}

# NOTE(review): this assignment is superseded by the test_name_map defined
# further down in this module; confirm whether it is still needed.
test_name_map = {"a": "AA"}
# Define dataframes in each format that relate through the test name map
# Wide layout using pow_XXX / ws_XXX column names: one row per time stamp.
test_wide_dict = {
"time": [0, 10, 20],
"pow_000": [0, 100, 200],
"ws_000": [8, 8, 8],
"pow_001": [50, 150, 250],
"ws_001": [9, 9, 9],
}

# Keys are the user's column names, values are the matching wide-layout names above.
test_name_map = {"T1PWR": "pow_000", "T1WS": "ws_000", "T2PWR": "pow_001", "T2WS": "ws_001"}


# Same data as test_wide_dict with one row per (time, turbine_id) pair.
test_semi_wide_dict = {
"time": [0, 0, 10, 10, 20, 20],
"turbine_id": [0, 1, 0, 1, 0, 1],
"pow": [0, 50, 100, 150, 200, 250],
"ws": [8, 9, 8, 9, 8, 9],
}

# Same data in long layout with user variable names: one row per (time, variable).
# Within each time stamp the rows follow the order pow_000, pow_001, ws_000, ws_001
# of the corresponding wide-layout names.
test_long_dict = {
"time": [0, 0, 0, 0, 10, 10, 10, 10, 20, 20, 20, 20],
"variable": ["T1PWR", "T2PWR", "T1WS", "T2WS"] * 3,
"value": [0, 50, 8, 9, 100, 150, 8, 9, 200, 250, 8, 9],
}

# Same data as test_wide_dict, with the columns carrying the user's names.
test_wide_user_dict = {
"time": [0, 10, 20],
"T1PWR": [0, 100, 200],
"T1WS": [8, 8, 8],
"T2PWR": [50, 150, 250],
"T2WS": [9, 9, 9],
}


def test_type():
    """FlascDataFrame instances, and frames derived from them, keep the subclass type."""
    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
    assert isinstance(df, FlascDataFrame)

    # drop() without inplace leaves df untouched and returns a new frame
    df2 = df.drop(columns="ws_001")
    assert isinstance(df2, FlascDataFrame)

    # Assert df is a pandas DataFrame
    assert isinstance(df, pd.DataFrame)


def test__metadata():
df = FlascDataFrame(test_data_dict, name_map=test_name_map)
df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
df._user_format = "long"
df._in_flasc_format = False
df2 = df.drop(columns="c") # Modifies the dataframe, returns a copy
df2 = df.drop(columns="ws_001") # Modifies the dataframe, returns a copy
assert hasattr(df2, "name_map")
assert df2.name_map == test_name_map
assert hasattr(df2, "_user_format")
Expand All @@ -37,7 +66,7 @@ def test__metadata():


def test_printout():
df = FlascDataFrame(test_data_dict, name_map=test_name_map)
df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
df._in_flasc_format = True
print(df)
print("\n")
Expand All @@ -48,7 +77,7 @@ def test_printout():


def test_check_flasc_format():
df = FlascDataFrame(test_data_dict, name_map=test_name_map)
df = FlascDataFrame(test_wide_dict, name_map=test_name_map)

# Should not raise an error
df.check_flasc_format()
Expand All @@ -61,19 +90,63 @@ def test_check_flasc_format():


def test_convert_to_long_format():
    """Round-trip wide -> long -> wide, both with a returned copy and in place."""
    df_wide = FlascDataFrame(test_wide_dict, name_map=test_name_map)
    df_long_test = pd.DataFrame(test_long_dict)

    # Test conversion with return
    df_wide._user_format = "long"  # Should be detected internally
    df_wide_copy = df_wide.copy()
    df_long = df_wide.convert_to_user_format(inplace=False)

    # Test df_long is not in flasc format
    assert not df_long._in_flasc_format

    # Test returned frame is matched to expected value
    pd.testing.assert_frame_equal(df_long, df_long_test)

    # Test original frame is unchanged
    pd.testing.assert_frame_equal(df_wide, df_wide_copy)

    # Now test in place conversion
    df_wide.convert_to_user_format(inplace=True)
    pd.testing.assert_frame_equal(df_wide, df_long_test)

    # Assert not in flasc format
    assert not df_wide._in_flasc_format

    # Now test the back conversion
    df_back_to_wide = df_wide.convert_to_flasc_format(inplace=False)

    # Resort the columns to match
    df_back_to_wide = df_back_to_wide[df_wide_copy.columns]

    pd.testing.assert_frame_equal(df_back_to_wide, df_wide_copy)

    # Assert is in flasc format
    assert df_back_to_wide._in_flasc_format

    # Test in place version
    df_wide.convert_to_flasc_format(inplace=True)

    # Sort columns to match
    df_wide = df_wide[df_wide_copy.columns]

    pd.testing.assert_frame_equal(df_wide, df_wide_copy)

    # Check operation not allowed if no "time" column
    df_wide.drop(columns="time", inplace=True)
    with pytest.raises(ValueError):
        df_wide.convert_to_user_format(inplace=True)


def test_convert_to_wide_format():
    """Placeholder for the wide to wide conversion test; currently asserts nothing."""
    # TODO: Test wide to wide conversion (FLASC column names <-> user column
    # names) against test_wide_dict / test_wide_user_dict.
    pass


def test_pickle():
df = FlascDataFrame(test_data_dict)
df = FlascDataFrame(test_wide_dict)
df.name_map = test_name_map
df.to_pickle("test_pickle.pkl")

Expand All @@ -85,7 +158,7 @@ def test_pickle():


def test_feather():
df = FlascDataFrame(test_data_dict, name_map=test_name_map)
df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
df.to_feather("test_feather.ftr")

df2 = pd.read_feather("test_feather.ftr")
Expand All @@ -98,7 +171,7 @@ def test_feather():


def test_csv():
df = FlascDataFrame(test_data_dict, name_map=test_name_map)
df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
df.to_csv("test_csv.csv")

df2 = pd.read_csv("test_csv.csv")
Expand All @@ -112,15 +185,11 @@ def test_csv():

def test_n_turbines():
    """n_turbines counts turbines in FLASC format and raises once converted away from it."""
    # Currently, n_turbines based only on number of pow columns
    df = FlascDataFrame(test_wide_dict, name_map=test_name_map)
    assert df.n_turbines == 2

    # Check n_turbines not valid if not in flasc format
    df._user_format = "long"
    df.convert_to_user_format(inplace=True)
    with pytest.raises(ValueError):
        df.n_turbines

0 comments on commit 129cfe4

Please sign in to comment.