Skip to content

Commit

Permalink
Extended time series regularizer to support multiseries (#4303)
Browse files Browse the repository at this point in the history
* extended time series regularizer to support multiseries
  • Loading branch information
MichaelFu512 authored Sep 14, 2023
1 parent cc60b68 commit 79a3200
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 33 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Release Notes
* Added datacheck to check for mismatched series length in multiseries :pr:`4296`
* Added STLDecomposer to multiseries pipelines :pr:`4299`
* Extended DateTimeFormatCheck data check to support multiseries :pr:`4300`
* Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
* Fixes
* Changes
* Documentation Changes
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Transformer that regularizes a dataset with an uninferrable offset frequency for time series problems."""
import pandas as pd
import woodwork as ww
from woodwork.logical_types import Datetime
from woodwork.statistics_utils import infer_frequency

Expand All @@ -25,6 +24,8 @@ class TimeSeriesRegularizer(Transformer):
This Transformer should be used before the `TimeSeriesImputer` in order to impute the missing values that were
added to X and y (if passed).
If used on a multiseries dataset, works specifically on unstacked datasets.
Args:
time_index (string): Name of the column containing the datetime information used to order the data, required. Defaults to None.
frequency_payload (tuple): Payload returned from Woodwork's infer_frequency function where debug is True. Defaults to None.
Expand Down Expand Up @@ -295,7 +296,13 @@ def transform(self, X, y=None):

cleaned_y = None
if y is not None:
y_dates = pd.DataFrame({self.time_index: X[self.time_index], "target": y})
if isinstance(y, pd.Series):
y_dates = pd.DataFrame(
{self.time_index: X[self.time_index], "target": y},
)
else:
y_dates = y
y_dates[self.time_index] = X[self.time_index]
cleaned_y = cleaned_df.merge(y_dates, on=[self.time_index], how="left")
cleaned_y = cleaned_y.groupby(self.time_index).first().reset_index()

Expand All @@ -305,15 +312,19 @@ def transform(self, X, y=None):
cleaned_x.loc[
cleaned_x[self.time_index] == values["correct"]
] = to_replace.values
if y is not None:
if y is not None and isinstance(y, pd.Series):
cleaned_y.loc[cleaned_y[self.time_index] == values["correct"]] = y.iloc[
index
]

if cleaned_y is not None:
cleaned_y = cleaned_y["target"]
cleaned_y = ww.init_series(cleaned_y)
if isinstance(y, pd.Series):
cleaned_y = cleaned_y["target"]
elif isinstance(y, pd.DataFrame):
# remove date time column from unstacked y
cleaned_y = cleaned_y.drop(columns=self.time_index, axis=1)

cleaned_x.ww.init()
cleaned_y.ww.init()

cleaned_x.ww.init()
return cleaned_x, cleaned_y
137 changes: 110 additions & 27 deletions evalml/tests/component_tests/test_time_series_regularizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def get_df(dates):
reg_X = pd.DataFrame()

reg_X["dates"] = dates
reg_X["ints"] = [int(i) for i in range(len(dates))]
reg_X["ints"] = list(range(len(dates)))
reg_X["doubles"] = [i / 0.25 ** (i / 100) for i in range(len(dates))]
reg_X["bools"] = [bool(min(1, i % 3)) for i in range(len(dates))]
reg_X["cats"] = np.random.choice(
Expand All @@ -23,13 +23,29 @@ def get_df(dates):
return reg_X, reg_y


def get_unstacked_df(dates):
    """Build a small unstacked multiseries fixture with one row per entry of ``dates``.

    Args:
        dates: Sized collection of datetimes used as the ``dates`` column.

    Returns:
        tuple: ``(X, y)``. ``X`` holds the ``dates`` column followed by the integer
            columns ``feature_a_0``, ``feature_a_1``, ``feature_b_0`` and
            ``feature_b_1``; ``y`` holds the integer columns ``target_0`` and
            ``target_1``.
    """
    n_rows = len(dates)
    # Two simple ramps reused for both a feature pair and the target pair.
    counting_up = list(range(n_rows))
    counting_down = list(range(n_rows, 0, -1))

    features = pd.DataFrame(
        {
            "dates": dates,
            "feature_a_0": counting_up,
            "feature_a_1": counting_down,
            "feature_b_0": list(range(n_rows * 2, 0, -2)),
            "feature_b_1": list(range(0, n_rows * 2, 2)),
        },
    )
    targets = pd.DataFrame(
        {
            "target_0": list(counting_up),
            "target_1": list(counting_down),
        },
    )
    return features, targets


def assert_features_and_length_equal(
X,
y,
X_output,
y_output,
error_dict,
has_target=True,
check_dtype=True,
):
ww_payload = infer_frequency(X["dates"], debug=True, window_length=4, threshold=0.4)

Expand Down Expand Up @@ -60,16 +76,47 @@ def assert_features_and_length_equal(
X["dates"] == each_date,
list(set(X.columns) - {"dates"}),
].iloc[0]
outout_feat = non_nan_X.loc[
output_feat = non_nan_X.loc[
non_nan_X["dates"] == each_date,
list(set(non_nan_X.columns) - {"dates"}),
].iloc[0]
pd.testing.assert_series_equal(
pd.Series(input_feat.values),
pd.Series(outout_feat.values),
pd.Series(output_feat.values),
check_dtype=check_dtype,
)


def check_x_and_y_output(is_multiseries, X, y, X_output, y_output, error_dict):
    """Run ``assert_features_and_length_equal`` for single-series or multiseries data.

    For single-series data the check is delegated unchanged. For multiseries
    (unstacked) data the targets are DataFrames, so the check is run twice:
    once on the features and once with the targets passed in the feature
    position.

    Args:
        is_multiseries (bool): Whether ``y`` / ``y_output`` are unstacked DataFrames.
        X (pd.DataFrame): Input features, including a ``dates`` column.
        y (pd.Series or pd.DataFrame): Input target(s).
        X_output (pd.DataFrame): Regularized features, including a ``dates`` column.
        y_output (pd.Series or pd.DataFrame): Regularized target(s).
        error_dict (dict): Error dictionary forwarded to the underlying check.
    """
    if not is_multiseries:
        assert_features_and_length_equal(X, y, X_output, y_output, error_dict)
        return

    # Attach the date column to *copies* of the targets for testing purposes.
    # ``assign`` returns a new frame, so the caller's ``y`` and ``y_output``
    # are left untouched (plain column assignment would mutate them in place).
    y_with_dates = y.assign(dates=X["dates"])
    y_output_with_dates = y_output.assign(dates=X_output["dates"])

    assert_features_and_length_equal(
        X,
        y_with_dates,
        X_output,
        y_output_with_dates,
        error_dict,
        has_target=False,
        check_dtype=False,
    )

    # Per the original author's note, the helper only really checks what is
    # passed as "X", so the targets are passed in the feature position here
    # in order to have them checked as well.
    assert_features_and_length_equal(
        y_with_dates,
        X,
        y_output_with_dates,
        X_output,
        error_dict,
        has_target=False,
        check_dtype=False,
    )


def test_ts_regularizer_init():
ts_regularizer = TimeSeriesRegularizer(time_index="dates")

Expand Down Expand Up @@ -130,9 +177,14 @@ def test_ts_regularizer_time_index_is_None(duplicate_beginning):
ts_regularizer.fit(X, y)


def test_ts_regularizer_mismatch_target_length(duplicate_beginning):
X, _ = get_df(duplicate_beginning)
y = pd.Series([i for i in range(25)])
@pytest.mark.parametrize("is_multiseries", [True, False])
def test_ts_regularizer_mismatch_target_length(duplicate_beginning, is_multiseries):
if is_multiseries:
X, y = get_unstacked_df(duplicate_beginning)
y = y.drop(10)
else:
X, _ = get_df(duplicate_beginning)
y = pd.Series([i for i in range(25)])

ts_regularizer = TimeSeriesRegularizer(time_index="dates")
with pytest.raises(
Expand All @@ -142,7 +194,8 @@ def test_ts_regularizer_mismatch_target_length(duplicate_beginning):
ts_regularizer.fit(X, y)


def test_ts_regularizer_no_freq():
@pytest.mark.parametrize("is_multiseries", [True, False])
def test_ts_regularizer_no_freq(is_multiseries):
dates_1 = pd.date_range("2015-01-01", periods=5, freq="D")
dates_2 = pd.date_range("2015-01-08", periods=3, freq="D")
dates_3 = pd.DatetimeIndex(["2015-01-12"])
Expand All @@ -158,7 +211,10 @@ def test_ts_regularizer_no_freq():
.append(dates_6)
)

X, y = get_df(dates)
if is_multiseries:
X, y = get_unstacked_df(dates)
else:
X, y = get_df(dates)

ts_regularizer = TimeSeriesRegularizer(time_index="dates")
with pytest.raises(
Expand All @@ -168,16 +224,28 @@ def test_ts_regularizer_no_freq():
ts_regularizer.fit(X, y)


def test_ts_regularizer_no_issues(ts_data):
X, _, y = ts_data()
@pytest.mark.parametrize("is_multiseries", [True, False])
def test_ts_regularizer_no_issues(
ts_data,
is_multiseries,
multiseries_ts_data_unstacked,
):
if is_multiseries:
X, y = multiseries_ts_data_unstacked
else:
X, _, y = ts_data()

ts_regularizer = TimeSeriesRegularizer(time_index="date")
X_output, y_output = ts_regularizer.fit_transform(X, y)

assert ts_regularizer.inferred_freq is not None
assert len(ts_regularizer.error_dict) == 0
pd.testing.assert_frame_equal(X, X_output)
pd.testing.assert_series_equal(y, y_output)

if is_multiseries:
pd.testing.assert_frame_equal(y, y_output)
else:
pd.testing.assert_series_equal(y, y_output)


@pytest.mark.parametrize("y_passed", [True, False])
Expand Down Expand Up @@ -220,6 +288,7 @@ def test_ts_regularizer_X_only_equal_payload(y_passed, combination_of_faulty_dat
pd.testing.assert_series_equal(y_output, y_output_payload)


@pytest.mark.parametrize("is_multiseries", [True, False])
@pytest.mark.parametrize(
"duplicate_location",
["beginning", "middle", "end", "scattered", "continuous"],
Expand All @@ -231,6 +300,7 @@ def test_ts_regularizer_duplicate(
duplicate_end,
duplicate_scattered,
duplicate_continuous,
is_multiseries,
):
if duplicate_location == "beginning":
dates = duplicate_beginning
Expand All @@ -243,15 +313,19 @@ def test_ts_regularizer_duplicate(
else:
dates = duplicate_continuous

X, y = get_df(dates)
if is_multiseries:
X, y = get_unstacked_df(dates)
else:
X, y = get_df(dates)

ts_regularizer = TimeSeriesRegularizer(time_index="dates")
X_output, y_output = ts_regularizer.fit_transform(X, y)

error_dict = ts_regularizer.error_dict
assert_features_and_length_equal(X, y, X_output, y_output, error_dict)
check_x_and_y_output(is_multiseries, X, y, X_output, y_output, error_dict)


@pytest.mark.parametrize("is_multiseries", [True, False])
@pytest.mark.parametrize(
"missing_location",
["beginning", "middle", "end", "scattered", "continuous"],
Expand All @@ -263,6 +337,7 @@ def test_ts_regularizer_missing(
missing_end,
missing_scattered,
missing_continuous,
is_multiseries,
):
if missing_location == "beginning":
dates = missing_beginning
Expand All @@ -275,15 +350,19 @@ def test_ts_regularizer_missing(
else:
dates = missing_continuous

X, y = get_df(dates)
if is_multiseries:
X, y = get_unstacked_df(dates)
else:
X, y = get_df(dates)

ts_regularizer = TimeSeriesRegularizer(time_index="dates")
X_output, y_output = ts_regularizer.fit_transform(X, y)

error_dict = ts_regularizer.error_dict
assert_features_and_length_equal(X, y, X_output, y_output, error_dict)
check_x_and_y_output(is_multiseries, X, y, X_output, y_output, error_dict)


@pytest.mark.parametrize("is_multiseries", [True, False])
@pytest.mark.parametrize(
"uneven_type",
["beginning", "middle", "end", "scattered", "continuous", "work week"],
Expand All @@ -296,6 +375,7 @@ def test_ts_regularizer_uneven(
uneven_scattered,
uneven_continuous,
uneven_work_week,
is_multiseries,
):
if uneven_type == "beginning":
dates = uneven_beginning
Expand All @@ -310,18 +390,21 @@ def test_ts_regularizer_uneven(
else:
dates = uneven_work_week

X, y = get_df(dates)
if is_multiseries:
X, y = get_unstacked_df(dates)
else:
X, y = get_df(dates)
ts_regularizer = TimeSeriesRegularizer(time_index="dates")
X_output, y_output = ts_regularizer.fit_transform(X, y)

if uneven_type == "beginning":
assert X.iloc[0]["dates"] not in X_output["dates"]
assert X.iloc[1]["dates"] not in X_output["dates"]
assert y.iloc[0] not in y_output.values
assert y.iloc[1] not in y_output.values
elif uneven_type == "end":
assert X.iloc[-1]["dates"] not in X_output["dates"]
assert y.iloc[-1] not in y_output.values

error_dict = ts_regularizer.error_dict
assert_features_and_length_equal(X, y, X_output, y_output, error_dict)

check_x_and_y_output(is_multiseries, X, y, X_output, y_output, error_dict)
if not is_multiseries:
if uneven_type == "beginning":
assert X.iloc[0]["dates"] not in X_output["dates"]
assert X.iloc[1]["dates"] not in X_output["dates"]
assert y.iloc[0] not in y_output.values
assert y.iloc[1] not in y_output.values
elif uneven_type == "end":
assert X.iloc[-1]["dates"] not in X_output["dates"]
assert y.iloc[-1] not in y_output.values

0 comments on commit 79a3200

Please sign in to comment.