Skip to content

Commit

Permalink
Reverted deleted code
Browse files Browse the repository at this point in the history
  • Loading branch information
christopherbunn committed Jul 27, 2023
1 parent 3faf04f commit 536e345
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ outputs:
- setuptools ==58.0.4
run:
- numpy >=1.21.0
- pandas >=2.0.2
- pandas >=1.5.0
- dask >=2022.2.0, !=2022.10.1
- scipy >=1.5.0
- scikit-learn >=1.3.0
Expand Down
2 changes: 1 addition & 1 deletion core-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
numpy>=1.21.0
pandas>=2.0.3
pandas>=1.5.0
scipy>=1.5.0
scikit-learn>=1.3.0
scikit-optimize>=0.9.0
Expand Down
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233`
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Changes
Expand Down
106 changes: 106 additions & 0 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import black
import featuretools as ft
import pandas as pd
from woodwork import logical_types

from evalml.data_checks import DataCheckActionCode, DataCheckActionOption
Expand Down Expand Up @@ -1348,3 +1349,108 @@ def rows_of_interest(

preds_value_proba = preds_value_proba[preds_value_proba <= epsilon]
return preds_value_proba.index.tolist()


def unstack_multiseries(
    X,
    y,
    series_id,
    time_index,
    target_name,
    keep_time_in_index=True,
):
    """Converts multiseries data with one series_id column and one target column to one target column per series id.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].
        y (pd.Series): Target data.
        series_id (str): The column which identifies which series each row belongs to.
        time_index (str): Specifies the name of the column in X that provides the datetime objects.
        target_name (str): The name of the target column.
        keep_time_in_index (bool): Whether to maintain the time index as the index of the returned dataframes. Defaults to True.
            If set to false, will discard the time index information entirely.

    Returns:
        pd.DataFrame, pd.DataFrame: The unstacked X and y data.
    """
    # Combine X and y so both can be unstacked in one pass; align y to X's index first.
    full_dataset = pd.concat([X, y.set_axis(X.index)], axis=1)

    # Columns to unstack: everything except the series id and the time index.
    # Hoisted out of the loop since it is invariant across series.
    unstack_columns = full_dataset.columns.drop([time_index, series_id])

    # Build one unstacked column per (original column, series id) pair.
    X_unstacked_cols = []
    y_unstacked_cols = []
    for s_id in full_dataset[series_id].unique():
        single_series = full_dataset[full_dataset[series_id] == s_id]

        # Use this series' own time values so columns from different series align on time.
        new_time_index = single_series[time_index]
        for column_name in unstack_columns:
            # set_axis/rename return new objects, avoiding in-place mutation of a
            # Series view into the sliced frame (SettingWithCopy pitfalls).
            new_column = (
                single_series[column_name]
                .set_axis(new_time_index)
                .rename(f"{column_name}_{s_id}")
            )
            if column_name == target_name:
                y_unstacked_cols.append(new_column)
            else:
                X_unstacked_cols.append(new_column)

    # Concatenate all the single series to reform dataframes
    X_unstacked = pd.concat(X_unstacked_cols, axis=1)
    y_unstacked = pd.concat(y_unstacked_cols, axis=1)

    # Drop the time index if the caller doesn't want it preserved.
    if not keep_time_in_index:
        X_unstacked.reset_index(drop=True, inplace=True)
        y_unstacked.reset_index(drop=True, inplace=True)

    return X_unstacked, y_unstacked


def stack_data(data, include_series_id=False, series_id_name=None):
    """Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True.

    Should only be used for data that is expected to be a single series. To stack multiple unstacked columns,
    call this function multiple times on the desired subsets.

    Args:
        data (pd.DataFrame): The data to stack.
        include_series_id (bool): Whether or not to extract the series id and include it in a separate column.
        series_id_name (str): If include_series_id is True, the series_id name to set for the column. The column
            will be named 'series_id' if this parameter is None.

    Returns:
        pd.Series or pd.DataFrame: The data in stacked series form.
    """
    # Nothing to do when there is no data or it is already in stacked (Series) form.
    if data is None or isinstance(data, pd.Series):
        return data

    stacked_series = data.stack(0)

    # Column labels look like "<name>_<series id>"; recover the original name
    # by dropping the trailing "_<series id>" suffix.
    series_id_with_name = stacked_series.index.droplevel()
    stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1])

    # If the index is the time index, keep it (repeated once per series).
    # is_numeric_dtype replaces the deprecated Index.is_numeric() method.
    if not pd.api.types.is_numeric_dtype(data.index):
        new_time_index = data.index.unique().repeat(len(data.columns))
    # Otherwise, set it to unique integers starting from the original first label.
    else:
        new_time_index = pd.RangeIndex(
            start=data.index[0],
            stop=data.index[0] + len(stacked_series),
        )
    stacked_series = stacked_series.set_axis(new_time_index)

    # Pull out the series id information, if requested
    if include_series_id:
        series_id_col = pd.Series(
            series_id_with_name.map(lambda col_name: col_name.split("_")[-1]),
            name=series_id_name or "series_id",
            index=stacked_series.index,
        )
        stacked_series = pd.concat([series_id_col, stacked_series], axis=1)

    return stacked_series
35 changes: 35 additions & 0 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,41 @@ def ts_data_seasonal_test():
return X, y


@pytest.fixture
def multiseries_ts_data_stacked():
    """Stacked multiseries data: 20 dates x 5 series, one row per (date, series) pair."""
    n_series = 5
    n_dates = 20
    n_rows = n_series * n_dates

    dates = pd.date_range(start="1/1/2018", periods=n_dates).repeat(n_series)
    ids = [i for _ in range(n_dates) for i in range(n_series)]

    X = pd.DataFrame(
        {
            "date": dates,
            "series_id": ids,
            "feature_a": range(n_rows),
            "feature_b": reversed(range(n_rows)),
        },
    )
    y = pd.Series(range(n_rows))
    return X, y


@pytest.fixture
def multiseries_ts_data_unstacked():
    """Unstacked multiseries data: one column per (feature/target, series id) pair."""
    date_index = pd.date_range(start="1/1/2018", periods=20).rename("date")

    cols_a = {f"feature_a_{i}": range(i, 100, 5) for i in range(5)}
    cols_b = {f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)}
    X = pd.concat([pd.DataFrame(cols_a), pd.DataFrame(cols_b)], axis=1)
    X.index = date_index

    y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
    y.index = date_index

    return X, y


@pytest.fixture
def dummy_pipeline_hyperparameters():
return {
Expand Down
89 changes: 89 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
make_pipeline,
make_pipeline_from_actions,
rows_of_interest,
stack_data,
unstack_multiseries,
)
from evalml.problem_types import ProblemTypes, is_time_series

Expand Down Expand Up @@ -1374,3 +1376,90 @@ def test_make_pipeline_features_and_dfs(X_y_binary):
)

assert "DFS Transformer" == pipeline.component_graph.compute_order[0]


@pytest.mark.parametrize("target_name", ["target", "Target_Data"])
@pytest.mark.parametrize("keep_time_in_index", [True, False])
def test_unstack_multiseries(
    target_name,
    keep_time_in_index,
    multiseries_ts_data_stacked,
    multiseries_ts_data_unstacked,
):
    """unstack_multiseries output should match the unstacked fixture data."""
    X, y = multiseries_ts_data_stacked
    X_expected, y_expected = multiseries_ts_data_unstacked

    # Apply the parametrized target name to both the input and the expected output.
    y = y.rename(target_name)
    y_expected.columns = [
        f"{target_name}_{i}" for i in range(len(y_expected.columns))
    ]

    if not keep_time_in_index:
        # Expected frames lose their time index when it isn't kept.
        X_expected = X_expected.reset_index(drop=True)
        y_expected = y_expected.reset_index(drop=True)

    X_actual, y_actual = unstack_multiseries(
        X,
        y,
        "series_id",
        "date",
        target_name=target_name,
        keep_time_in_index=keep_time_in_index,
    )

    pd.testing.assert_frame_equal(
        X_expected.sort_index(axis=1),
        X_actual.sort_index(axis=1),
        check_freq=False,
    )
    pd.testing.assert_frame_equal(
        y_expected,
        y_actual,
        check_freq=False,
    )


@pytest.mark.parametrize("include_series_id", [True, False])
@pytest.mark.parametrize("series_id_name", [None, "SERIES"])
@pytest.mark.parametrize("index_type", ["datetime", "int"])
def test_stack_data(
    include_series_id,
    series_id_name,
    index_type,
    multiseries_ts_data_stacked,
    multiseries_ts_data_unstacked,
):
    """stack_data should invert the unstacked fixture back into stacked form."""
    _, y_unstacked = multiseries_ts_data_unstacked
    _, y_expected = multiseries_ts_data_stacked

    y_expected.name = "target"

    if index_type == "datetime":
        # Expected stacked data carries the repeated datetime index.
        y_expected.index = pd.date_range(start="1/1/2018", periods=20).repeat(5)
        y_expected.index.name = "date"
    else:
        # Integer index: drop the fixture's datetime index from the input.
        y_unstacked = y_unstacked.reset_index(drop=True)

    result = stack_data(
        y_unstacked,
        include_series_id=include_series_id,
        series_id_name=series_id_name,
    )

    if include_series_id:
        expected_id_name = series_id_name or "series_id"
        id_col = pd.Series(
            list(range(5)) * 20,
            dtype="str",
            index=y_expected.index,
        )
        expected_frame = pd.DataFrame(
            {expected_id_name: id_col, "target": y_expected},
        )
        pd.testing.assert_frame_equal(expected_frame, result)

    else:
        pd.testing.assert_series_equal(y_expected, result)


def test_stack_data_noop():
    """stack_data should pass through None and already-stacked Series inputs unchanged."""
    assert stack_data(None) is None

    already_stacked = pd.Series(range(100))
    pd.testing.assert_series_equal(stack_data(already_stacked), already_stacked)

0 comments on commit 536e345

Please sign in to comment.