Skip to content

Commit

Permalink
Reverted deleted code
Browse files Browse the repository at this point in the history
  • Loading branch information
christopherbunn committed Jul 27, 2023
1 parent 3faf04f commit 536e345
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ outputs:
- setuptools ==58.0.4
run:
- numpy >=1.21.0
- pandas >=2.0.2
- pandas >=1.5.0
- dask >=2022.2.0, !=2022.10.1
- scipy >=1.5.0
- scikit-learn >=1.3.0
Expand Down
2 changes: 1 addition & 1 deletion core-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
numpy>=1.21.0
pandas>=2.0.3
pandas>=1.5.0
scipy>=1.5.0
scikit-learn>=1.3.0
scikit-optimize>=0.9.0
Expand Down
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Updated regression metrics to handle multioutput dataframes as well as single output series :pr:`4233`
* Added stacking and unstacking utility functions to work with multiseries data :pr:`4250`
* Fixes
* Added support for pandas 2 :pr:`4216`
* Changes
Expand Down
106 changes: 106 additions & 0 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import black
import featuretools as ft
import pandas as pd
from woodwork import logical_types

from evalml.data_checks import DataCheckActionCode, DataCheckActionOption
Expand Down Expand Up @@ -1348,3 +1349,108 @@ def rows_of_interest(

preds_value_proba = preds_value_proba[preds_value_proba <= epsilon]
return preds_value_proba.index.tolist()


def unstack_multiseries(
    X,
    y,
    series_id,
    time_index,
    target_name,
    keep_time_in_index=True,
):
    """Converts multiseries data with one series_id column and one target column to one target column per series id.

    Args:
        X (pd.DataFrame): Data of shape [n_samples, n_features].
        y (pd.Series): Target data.
        series_id (str): The column which identifies which series each row belongs to.
        time_index (str): Specifies the name of the column in X that provides the datetime objects.
        target_name (str): The name of the target column.
        keep_time_in_index (bool): Whether to maintain the time index as the index of the returned dataframes. Defaults to True.
            If set to false, will discard the time index information entirely.

    Returns:
        pd.DataFrame, pd.DataFrame: The unstacked X and y data.
    """
    # Combine X and y so both can be unstacked in one pass; align y to X's index first.
    full_dataset = pd.concat([X, y.set_axis(X.index)], axis=1)

    # Columns to unstack: everything except the series id and the time index.
    # Hoisted out of the loop since it is invariant across series.
    unstack_columns = full_dataset.columns.drop([time_index, series_id])

    # Build one unstacked column per (original column, series id) pair.
    X_unstacked_cols = []
    y_unstacked_cols = []
    for s_id in full_dataset[series_id].unique():
        single_series = full_dataset[full_dataset[series_id] == s_id]

        # Use this series' own time values so columns from different series align on time.
        new_time_index = single_series[time_index]
        for column_name in unstack_columns:
            # set_axis/rename return new objects, avoiding in-place mutation of a
            # Series view into the sliced frame (SettingWithCopy pitfalls).
            new_column = (
                single_series[column_name]
                .set_axis(new_time_index)
                .rename(f"{column_name}_{s_id}")
            )
            if column_name == target_name:
                y_unstacked_cols.append(new_column)
            else:
                X_unstacked_cols.append(new_column)

    # Concatenate all the single series to reform dataframes
    X_unstacked = pd.concat(X_unstacked_cols, axis=1)
    y_unstacked = pd.concat(y_unstacked_cols, axis=1)

    # Drop the time index if the caller doesn't want it preserved.
    if not keep_time_in_index:
        X_unstacked.reset_index(drop=True, inplace=True)
        y_unstacked.reset_index(drop=True, inplace=True)

    return X_unstacked, y_unstacked


def stack_data(data, include_series_id=False, series_id_name=None):
    """Stacks the given DataFrame back into a single Series, or a DataFrame if include_series_id is True.

    Should only be used for data that is expected to be a single series. To stack multiple unstacked columns,
    call this function multiple times on the desired subsets.

    Args:
        data (pd.DataFrame): The data to stack.
        include_series_id (bool): Whether or not to extract the series id and include it in a separate column.
        series_id_name (str): If include_series_id is True, the series_id name to set for the column. The column
            will be named 'series_id' if this parameter is None.

    Returns:
        pd.Series or pd.DataFrame: The data in stacked series form.
    """
    # Nothing to do when there is no data or it is already in stacked (Series) form.
    if data is None or isinstance(data, pd.Series):
        return data

    stacked_series = data.stack(0)

    # Column labels look like "<name>_<series id>"; recover the original name
    # by dropping the trailing "_<series id>" suffix.
    series_id_with_name = stacked_series.index.droplevel()
    stacked_series.name = "_".join(series_id_with_name[0].split("_")[:-1])

    # If the index is the time index, keep it (repeated once per series).
    # is_numeric_dtype replaces the deprecated Index.is_numeric() method.
    if not pd.api.types.is_numeric_dtype(data.index):
        new_time_index = data.index.unique().repeat(len(data.columns))
    # Otherwise, set it to unique integers starting from the original first label.
    else:
        new_time_index = pd.RangeIndex(
            start=data.index[0],
            stop=data.index[0] + len(stacked_series),
        )
    stacked_series = stacked_series.set_axis(new_time_index)

    # Pull out the series id information, if requested
    if include_series_id:
        series_id_col = pd.Series(
            series_id_with_name.map(lambda col_name: col_name.split("_")[-1]),
            name=series_id_name or "series_id",
            index=stacked_series.index,
        )
        stacked_series = pd.concat([series_id_col, stacked_series], axis=1)

    return stacked_series
35 changes: 35 additions & 0 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,41 @@ def ts_data_seasonal_test():
return X, y


@pytest.fixture
def multiseries_ts_data_stacked():
    """Stacked multiseries data: 20 dates x 5 series, one row per (date, series) pair."""
    n_series = 5
    n_dates = 20
    n_rows = n_series * n_dates

    dates = pd.date_range(start="1/1/2018", periods=n_dates).repeat(n_series)
    ids = [i for _ in range(n_dates) for i in range(n_series)]

    X = pd.DataFrame(
        {
            "date": dates,
            "series_id": ids,
            "feature_a": range(n_rows),
            "feature_b": reversed(range(n_rows)),
        },
    )
    y = pd.Series(range(n_rows))
    return X, y


@pytest.fixture
def multiseries_ts_data_unstacked():
    """Unstacked multiseries data: one column per (feature/target, series id) pair."""
    date_index = pd.date_range(start="1/1/2018", periods=20).rename("date")

    cols_a = {f"feature_a_{i}": range(i, 100, 5) for i in range(5)}
    cols_b = {f"feature_b_{i}": range(99 - i, -1, -5) for i in range(5)}
    X = pd.concat([pd.DataFrame(cols_a), pd.DataFrame(cols_b)], axis=1)
    X.index = date_index

    y = pd.DataFrame({f"target_{i}": range(i, 100, 5) for i in range(5)})
    y.index = date_index

    return X, y


@pytest.fixture
def dummy_pipeline_hyperparameters():
return {
Expand Down
89 changes: 89 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
make_pipeline,
make_pipeline_from_actions,
rows_of_interest,
stack_data,
unstack_multiseries,
)
from evalml.problem_types import ProblemTypes, is_time_series

Expand Down Expand Up @@ -1374,3 +1376,90 @@ def test_make_pipeline_features_and_dfs(X_y_binary):
)

assert "DFS Transformer" == pipeline.component_graph.compute_order[0]


@pytest.mark.parametrize("target_name", ["target", "Target_Data"])
@pytest.mark.parametrize("keep_time_in_index", [True, False])
def test_unstack_multiseries(
    target_name,
    keep_time_in_index,
    multiseries_ts_data_stacked,
    multiseries_ts_data_unstacked,
):
    """unstack_multiseries output should match the unstacked fixture data."""
    X, y = multiseries_ts_data_stacked
    X_expected, y_expected = multiseries_ts_data_unstacked

    # Apply the parametrized target name to both the input and the expected output.
    y = y.rename(target_name)
    y_expected.columns = [
        f"{target_name}_{i}" for i in range(len(y_expected.columns))
    ]

    if not keep_time_in_index:
        # Expected frames lose their time index when it isn't kept.
        X_expected = X_expected.reset_index(drop=True)
        y_expected = y_expected.reset_index(drop=True)

    X_actual, y_actual = unstack_multiseries(
        X,
        y,
        "series_id",
        "date",
        target_name=target_name,
        keep_time_in_index=keep_time_in_index,
    )

    pd.testing.assert_frame_equal(
        X_expected.sort_index(axis=1),
        X_actual.sort_index(axis=1),
        check_freq=False,
    )
    pd.testing.assert_frame_equal(
        y_expected,
        y_actual,
        check_freq=False,
    )


@pytest.mark.parametrize("include_series_id", [True, False])
@pytest.mark.parametrize("series_id_name", [None, "SERIES"])
@pytest.mark.parametrize("index_type", ["datetime", "int"])
def test_stack_data(
    include_series_id,
    series_id_name,
    index_type,
    multiseries_ts_data_stacked,
    multiseries_ts_data_unstacked,
):
    """stack_data should invert the unstacked fixture back into stacked form."""
    _, y_unstacked = multiseries_ts_data_unstacked
    _, y_expected = multiseries_ts_data_stacked

    y_expected.name = "target"

    if index_type == "datetime":
        # Expected stacked data carries the repeated datetime index.
        y_expected.index = pd.date_range(start="1/1/2018", periods=20).repeat(5)
        y_expected.index.name = "date"
    else:
        # Integer index: drop the fixture's datetime index from the input.
        y_unstacked = y_unstacked.reset_index(drop=True)

    result = stack_data(
        y_unstacked,
        include_series_id=include_series_id,
        series_id_name=series_id_name,
    )

    if include_series_id:
        expected_id_name = series_id_name or "series_id"
        id_col = pd.Series(
            list(range(5)) * 20,
            dtype="str",
            index=y_expected.index,
        )
        expected_frame = pd.DataFrame(
            {expected_id_name: id_col, "target": y_expected},
        )
        pd.testing.assert_frame_equal(expected_frame, result)

    else:
        pd.testing.assert_series_equal(y_expected, result)


def test_stack_data_noop():
    """stack_data should pass through None and already-stacked Series inputs unchanged."""
    assert stack_data(None) is None

    already_stacked = pd.Series(range(100))
    pd.testing.assert_series_equal(stack_data(already_stacked), already_stacked)

0 comments on commit 536e345

Please sign in to comment.