diff --git a/examples_artificial_data/01_raw_data_processing/02_flasc_data_frame.ipynb b/examples_artificial_data/01_raw_data_processing/02_flasc_data_frame.ipynb new file mode 100644 index 00000000..09f9fe07 --- /dev/null +++ b/examples_artificial_data/01_raw_data_processing/02_flasc_data_frame.ipynb @@ -0,0 +1,443 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FlascDataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "FlascDataFrame...." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from flasc.flasc_dataframe import FlascDataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate synthetic data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Suppose the we have a 3 turbine farm with turbines names 'TB01', 'TB02', 'TB03'\n", + "# For each turbine we have power, wind speed and wind direction data\n", + "# Assume that in the native data collection system,\n", + "# the signal names for each channel are given below\n", + "\n", + "N = 20 # Number of data points\n", + "\n", + "# Wind speeds\n", + "wind_speed_TB01 = np.random.rand(N) + 8.0\n", + "wind_speed_TB02 = np.random.rand(N) + 7.5\n", + "wind_speed_TB03 = np.random.rand(N) + 8.5\n", + "\n", + "# Wind directions\n", + "wind_dir_TB01 = 10 * np.random.rand(N) + 270.0\n", + "wind_dir_TB02 = 10 * np.random.rand(N) + 270.0\n", + "wind_dir_TB03 = 10 * np.random.rand(N) + 270.0\n", + "\n", + "# Power\n", + "power_TB01 = wind_speed_TB01**3\n", + "power_TB02 = wind_speed_TB02**3\n", + "power_TB03 = wind_speed_TB03**3\n", + "\n", + "# Time\n", + "time = np.arange(N)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Add this data to a pandas dataframe\n", + "df = 
pd.DataFrame(\n", + " {\n", + " \"time\": time,\n", + " \"wind_speed_TB01\": wind_speed_TB01,\n", + " \"wind_speed_TB02\": wind_speed_TB02,\n", + " \"wind_speed_TB03\": wind_speed_TB03,\n", + " \"wind_dir_TB01\": wind_dir_TB01,\n", + " \"wind_dir_TB02\": wind_dir_TB02,\n", + " \"wind_dir_TB03\": wind_dir_TB03,\n", + " \"power_TB01\": power_TB01,\n", + " \"power_TB02\": power_TB02,\n", + " \"power_TB03\": power_TB03,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add to FlascDataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Declare a name_map dictionary to map the signal names to the turbine names\n", + "name_map = {\n", + " \"time\": \"time\",\n", + " \"wind_speed_TB01\": \"ws_000\",\n", + " \"wind_speed_TB02\": \"ws_001\",\n", + " \"wind_speed_TB03\": \"ws_002\",\n", + " \"wind_dir_TB01\": \"wd_000\",\n", + " \"wind_dir_TB02\": \"wd_001\",\n", + " \"wind_dir_TB03\": \"wd_002\",\n", + " \"power_TB01\": \"pow_000\",\n", + " \"power_TB02\": \"pow_001\",\n", + " \"power_TB03\": \"pow_002\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timews_000ws_001ws_002wd_000wd_001wd_002pow_000pow_001pow_002
008.7645028.1573099.307117275.810165276.755093272.444566673.258291542.801206806.204968
118.4504108.4527728.535425275.299285274.063929271.070424603.438938603.945117621.835407
228.6682157.6547088.733668275.228593275.422351273.983654651.311896448.524130666.177672
338.0305838.4348888.868566277.248916275.195408278.022614517.894491600.119784697.525638
448.0697367.8154179.014511273.859713279.687027276.033408525.506343477.371464732.531900
\n", + "
" + ], + "text/plain": [ + " time ws_000 ws_001 ws_002 wd_000 wd_001 wd_002 \\\n", + "0 0 8.764502 8.157309 9.307117 275.810165 276.755093 272.444566 \n", + "1 1 8.450410 8.452772 8.535425 275.299285 274.063929 271.070424 \n", + "2 2 8.668215 7.654708 8.733668 275.228593 275.422351 273.983654 \n", + "3 3 8.030583 8.434888 8.868566 277.248916 275.195408 278.022614 \n", + "4 4 8.069736 7.815417 9.014511 273.859713 279.687027 276.033408 \n", + "\n", + " pow_000 pow_001 pow_002 \n", + "0 673.258291 542.801206 806.204968 \n", + "1 603.438938 603.945117 621.835407 \n", + "2 651.311896 448.524130 666.177672 \n", + "3 517.894491 600.119784 697.525638 \n", + "4 525.506343 477.371464 732.531900 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Declare an instance of FlascDataFrame\n", + "fdf = FlascDataFrame(df, name_map=name_map)\n", + "\n", + "fdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timewind_speed_TB01wind_speed_TB02wind_speed_TB03wind_dir_TB01wind_dir_TB02wind_dir_TB03power_TB01power_TB02power_TB03
008.7645028.1573099.307117275.810165276.755093272.444566673.258291542.801206806.204968
118.4504108.4527728.535425275.299285274.063929271.070424603.438938603.945117621.835407
228.6682157.6547088.733668275.228593275.422351273.983654651.311896448.524130666.177672
338.0305838.4348888.868566277.248916275.195408278.022614517.894491600.119784697.525638
448.0697367.8154179.014511273.859713279.687027276.033408525.506343477.371464732.531900
\n", + "
" + ], + "text/plain": [ + " time wind_speed_TB01 wind_speed_TB02 wind_speed_TB03 wind_dir_TB01 \\\n", + "0 0 8.764502 8.157309 9.307117 275.810165 \n", + "1 1 8.450410 8.452772 8.535425 275.299285 \n", + "2 2 8.668215 7.654708 8.733668 275.228593 \n", + "3 3 8.030583 8.434888 8.868566 277.248916 \n", + "4 4 8.069736 7.815417 9.014511 273.859713 \n", + "\n", + " wind_dir_TB02 wind_dir_TB03 power_TB01 power_TB02 power_TB03 \n", + "0 276.755093 272.444566 673.258291 542.801206 806.204968 \n", + "1 274.063929 271.070424 603.438938 603.945117 621.835407 \n", + "2 275.422351 273.983654 651.311896 448.524130 666.177672 \n", + "3 275.195408 278.022614 517.894491 600.119784 697.525638 \n", + "4 279.687027 276.033408 525.506343 477.371464 732.531900 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Illustrate transformation back to user names\n", + "fdf.convert_to_user_format().head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlascDataFrame in FLASC format\n", + " time variable value\n", + "0 0 ws_000 8.764502\n", + "1 1 ws_000 8.450410\n", + "2 2 ws_000 8.668215\n", + "3 3 ws_000 8.030583\n", + "4 4 ws_000 8.069736\n", + "FlascDataFrame in FLASC format\n", + " time variable value\n", + "175 15 pow_002 812.743716\n", + "176 16 pow_002 690.231480\n", + "177 17 pow_002 770.042469\n", + "178 18 pow_002 843.600158\n", + "179 19 pow_002 739.761434\n" + ] + } + ], + "source": [ + "## Illustrate wide to long transformation\n", + "print(fdf._convert_wide_to_long().head())\n", + "print(fdf._convert_wide_to_long().tail())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "flasc", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", 
"""FLASC DataFrame module."""

from pandas import DataFrame


class FlascDataFrame(DataFrame):
    """Subclass of pandas.DataFrame for working with FLASC data.

    Data are stored internally in the format FLASC expects:
    - columns named per the FLASC convention (e.g. ``pow_000``, ``ws_000``)
    - wide layout

    A ``name_map`` translates between the user's native channel names and the
    FLASC names, and helpers convert between the layouts described below.

    Two possible types of user data we should try to handle:

    1. Semiwide:
        - One column for time stamp
        - One column for turbine id
        - Many data channel columns
    2. Long:
        - One column for time stamp
        - One column for variable name
        - One column for value

    FLASC format is wide, i.e.
        - One column for time stamp
        - One column for each channel for each turbine
    """

    # Attributes named here are propagated to the results of pandas
    # operations (and survive pickling) via pandas' __finalize__ machinery.
    _metadata = ["name_map", "_user_format"]

    def __init__(self, *args, name_map=None, **kwargs):
        """Initialize the FlascDataFrame class, a subclass of pandas.DataFrame.

        Args:
            *args: arguments to pass to the DataFrame constructor
            name_map (dict): Dictionary of column names to map from the user
                format to the FLASC format, where the key string is the user
                format and the value string is the FLASC equivalent.
                Defaults to None.
            **kwargs: keyword arguments to pass to the DataFrame constructor
        """
        super().__init__(*args, **kwargs)

        self._user_format = "wide"  # or "long" or "semiwide"

        # Fix: always define these attributes so attribute access never raises
        # AttributeError (originally both were only set when a name_map was
        # supplied, breaking plain FlascDataFrame(...) construction and copies).
        self.name_map = name_map
        self._in_flasc_format = True

        if name_map is not None:
            # Validate that name_map is a str -> str dictionary.
            if not isinstance(name_map, dict):
                raise ValueError("name_map must be a dictionary")
            if not all(isinstance(k, str) and isinstance(v, str) for k, v in name_map.items()):
                raise ValueError("name_map must be a dictionary of strings")
            # Rename the user's columns to the FLASC convention immediately.
            self.convert_to_flasc_format(inplace=True)

    @property
    def _constructor(self):
        # Ensure pandas operations return FlascDataFrame rather than DataFrame.
        return FlascDataFrame

    def __str__(self):
        """Printout when calling print(df)."""
        if self._in_flasc_format:
            return "FlascDataFrame in FLASC format\n" + super().__str__()
        else:
            return "FlascDataFrame in user format\n" + super().__str__()

    @property
    def n_turbines(self):
        """Return the number of turbines in the dataset.

        Counts consecutive ``pow_000``, ``pow_001``, ... columns, so the data
        must be in FLASC format.

        Returns:
            int: the number of turbines.

        Raises:
            ValueError: if the data is not in FLASC format.
        """
        self.check_flasc_format()

        nt = 0
        while f"pow_{nt:03d}" in self.columns:
            nt += 1
        return nt

    def check_flasc_format(self):
        """Raise a ValueError if the data is not in FLASC format."""
        if not self._in_flasc_format:
            # Fix: added the missing space between the two sentences
            # (message previously read "...operation.Call df...").
            raise ValueError(
                "Data must be in FLASC format to perform this operation. "
                "Call df.convert_to_flasc_format() to convert the data to FLASC format."
            )

    def convert_to_user_format(self, inplace=False):
        """Convert the DataFrame to the format that the user expects, given the name_map.

        Args:
            inplace (bool): If True, rename columns in place and return None;
                otherwise return a converted copy. Defaults to False.

        Returns:
            FlascDataFrame or None: the converted frame, or None when inplace=True.
        """
        # Convert the layout first ("wide" requires no change)
        if self._user_format == "long":
            self._convert_wide_to_long()  # NOTE(review): result is discarded — confirm intent
        elif self._user_format == "semiwide":
            self._convert_wide_to_semiwide()  # NOTE(review): result is discarded — confirm intent

        # Mark this frame as no longer being in FLASC format
        self._in_flasc_format = False

        # Map FLASC column names back to the user's names
        if self.name_map is not None:
            return self.rename(columns={v: k for k, v in self.name_map.items()}, inplace=inplace)
        else:
            return None if inplace else self.copy()

    def convert_to_flasc_format(self, inplace=False):
        """Convert the DataFrame to the format that FLASC expects.

        Args:
            inplace (bool): If True, rename columns in place and return None;
                otherwise return a converted copy. Defaults to False.

        Returns:
            FlascDataFrame or None: the converted frame, or None when inplace=True.
        """
        # Convert the layout first ("wide" requires no change)
        if self._user_format == "long":
            self._convert_long_to_wide()  # NOTE(review): result is discarded — confirm intent
        elif self._user_format == "semiwide":
            self._convert_semiwide_to_wide()  # NOTE(review): result is discarded — confirm intent

        # Mark this frame as being in FLASC format
        self._in_flasc_format = True

        # Map the user's column names to the FLASC names
        if self.name_map is not None:
            return self.rename(columns=self.name_map, inplace=inplace)
        else:
            return None if inplace else self.copy()

    def _convert_long_to_wide(self):
        """Convert a long format DataFrame to a wide format DataFrame."""
        # TODO: not yet implemented; intentionally a no-op for now so that
        # convert_to_flasc_format() does not fail for long-format users.
        pass

    def _convert_semiwide_to_wide(self):
        """Convert a semiwide format DataFrame to a wide format DataFrame."""
        raise NotImplementedError("TO DO")

    def _convert_wide_to_long(self):
        """Convert a wide format DataFrame to a long format DataFrame.

        Returns:
            FlascDataFrame: long-format frame with time/variable/value columns.

        Raises:
            ValueError: if the frame has no "time" column to melt around.
        """
        if "time" not in self.columns:
            raise ValueError("Column 'time' must be present in the DataFrame")

        return self.melt(id_vars="time", var_name="variable", value_name="value")

    def _convert_wide_to_semiwide(self):
        """Convert a wide format DataFrame to a semiwide format DataFrame.

        Raises:
            ValueError: if the frame has no "time" column.
            NotImplementedError: always, until implemented.
        """
        if "time" not in self.columns:
            raise ValueError("Column 'time' must be present in the DataFrame")

        raise NotImplementedError("TO DO")
        # Should have columns:
        # time
        # turbine_id (as specified by the user)
        # variable
        # value

    def to_feather(self, path, **kwargs):
        """Raise warning about lost information and save to feather format.

        Args:
            path: file path to write to.
            **kwargs: forwarded to pandas.DataFrame.to_feather.

        Returns:
            None (same as pandas.DataFrame.to_feather).
        """
        print(
            "Dataframe will be saved as a pandas DataFrame. "
            "Extra attributes from FlascDataFrame will be lost. "
            "We recommend using df.to_pickle() and pd.read_pickle() instead, "
            "as this will retain FlascDataFrame attributes."
        )
        return super().to_feather(path, **kwargs)


# --- tests/flasc_dataframe_test.py ---
# NOTE: in the repository these tests live in a separate file and import
# FlascDataFrame from flasc.flasc_dataframe; here the class is defined above.
import os

import pandas as pd
import pytest

test_data_dict = {"time": [0, 10, 20], "a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}

test_name_map = {"a": "AA"}


def test_type():
    # Constructing from a dict with a name_map yields a FlascDataFrame
    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
    assert isinstance(df, FlascDataFrame)

    df2 = df.drop(columns="c")  # Modifies the dataframe, returns a copy
    assert isinstance(df2, FlascDataFrame)

    # A FlascDataFrame is still a pandas DataFrame
    assert isinstance(df, pd.DataFrame)


def test__metadata():
    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
    df._user_format = "long"
    df._in_flasc_format = False
    df2 = df.drop(columns="c")  # Modifies the dataframe, returns a copy
    assert hasattr(df2, "name_map")
    assert df2.name_map == test_name_map
    assert hasattr(df2, "_user_format")
    assert df2._user_format == "long"
    assert hasattr(df2, "_in_flasc_format")
    assert df2._in_flasc_format is True  # Resets, since "_in_flasc_format" not in _metadata.
    # May want to add "_in_flasc_format" to _metadata in future, but this
    # demonstrates functionality


def test_printout():
    df = FlascDataFrame(test_data_dict, name_map=test_name_map)
    df._in_flasc_format = True
    print(df)
    print("\n")
    df._in_flasc_format = False
    print(df)
    print("\n")
    print(df.head())  # In FLASC format, presumably because .head() returns a reinstantiated copy?
def test_check_flasc_format():
    fdf = FlascDataFrame(test_data_dict, name_map=test_name_map)

    # Freshly constructed frames are in FLASC format: no error expected
    fdf.check_flasc_format()

    # After converting to a long user format the check must fail
    fdf._user_format = "long"
    fdf.convert_to_user_format(inplace=True)
    with pytest.raises(ValueError):
        fdf.check_flasc_format()


def test_convert_to_long_format():
    fdf = FlascDataFrame(test_data_dict, name_map=test_name_map)
    fdf._user_format = "long"  # Picked up internally by the converters
    fdf.convert_to_user_format(inplace=True)  # Should succeed

    # Without a "time" column the wide-to-long conversion is not allowed
    fdf.convert_to_flasc_format(inplace=True)
    fdf.drop(columns="time", inplace=True)
    with pytest.raises(ValueError):
        fdf.convert_to_user_format(inplace=True)


def test_pickle():
    fdf = FlascDataFrame(test_data_dict)
    fdf.name_map = test_name_map
    fdf.to_pickle("test_pickle.pkl")

    reloaded = pd.read_pickle("test_pickle.pkl")
    # Pickling round-trips both the subclass and its _metadata attributes
    assert isinstance(reloaded, FlascDataFrame)
    assert reloaded.name_map == test_name_map

    os.remove("test_pickle.pkl")


def test_feather():
    fdf = FlascDataFrame(test_data_dict, name_map=test_name_map)
    fdf.to_feather("test_feather.ftr")

    reloaded = pd.read_feather("test_feather.ftr")
    # Feather round-trips as a plain pandas DataFrame with no FLASC attributes
    assert isinstance(reloaded, pd.DataFrame)
    assert not isinstance(reloaded, FlascDataFrame)
    assert not hasattr(reloaded, "name_map")

    os.remove("test_feather.ftr")


def test_csv():
    fdf = FlascDataFrame(test_data_dict, name_map=test_name_map)
    fdf.to_csv("test_csv.csv")

    reloaded = pd.read_csv("test_csv.csv")
    # CSV round-trips as a plain pandas DataFrame with no FLASC attributes
    assert isinstance(reloaded, pd.DataFrame)
    assert not isinstance(reloaded, FlascDataFrame)
    assert not hasattr(reloaded, "name_map")

    os.remove("test_csv.csv")


def test_n_turbines():
    # n_turbines currently counts only consecutive pow_ columns
    two_turbine_map = {"a": "pow_000", "b": "pow_001", "c": "ws_000"}
    fdf = FlascDataFrame(test_data_dict, name_map=two_turbine_map)
    assert fdf.n_turbines == 2

    one_turbine_map = {"a": "pow_000", "b": "ws_000", "c": "ws_001"}
    fdf = FlascDataFrame(test_data_dict, name_map=one_turbine_map)
    assert fdf.n_turbines == 1

    # n_turbines is only defined while the frame is in FLASC format
    fdf.convert_to_user_format(inplace=True)
    with pytest.raises(ValueError):
        fdf.n_turbines