From 69344b27fac85bf35aa34932496fccb99458eeec Mon Sep 17 00:00:00 2001 From: remyogasawara <67338690+remyogasawara@users.noreply.github.com> Date: Thu, 31 Aug 2023 14:56:07 -0700 Subject: [PATCH] Extend STLDecomposer to Support Multiseries (#4253) * creates multiple graphs * reset condition for period * take dataframe as y input and fix indexing * update inverse_transform and get_trend_dataframe * update get_trend_prediction_intervals * add ms seasonal data * add multiseries tests * add plot test * add periods parameters * add unstacking and test --------- Co-authored-by: Becca McBrayer Co-authored-by: Christopher Bunn --- docs/source/release_notes.rst | 1 + .../transformers/preprocessing/decomposer.py | 58 +- .../preprocessing/stl_decomposer.py | 515 ++++++++++++------ .../decomposer_tests/test_decomposer.py | 299 ++++++++-- .../test_polynomial_decomposer.py | 23 + .../decomposer_tests/test_stl_decomposer.py | 441 ++++++++++++--- evalml/tests/conftest.py | 68 ++- 7 files changed, 1079 insertions(+), 326 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 1b82cdfb23..881451efa2 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements + * Extended STLDecomposer to Support Multiseries :pr:`4253` * Fixes * Changes * Documentation Changes diff --git a/evalml/pipelines/components/transformers/preprocessing/decomposer.py b/evalml/pipelines/components/transformers/preprocessing/decomposer.py index 3f3d0e0718..10222b3fc7 100644 --- a/evalml/pipelines/components/transformers/preprocessing/decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/decomposer.py @@ -3,6 +3,7 @@ import re from abc import abstractmethod +from typing import Union import matplotlib.pyplot as plt import numpy as np @@ -324,9 +325,9 @@ def _project_seasonal( def plot_decomposition( self, X: pd.DataFrame, - y: pd.Series, + y: Union[pd.Series, 
pd.DataFrame], show: bool = False, - ) -> tuple[plt.Figure, list]: + ) -> Union[tuple[plt.Figure, list], dict[str, tuple[plt.Figure]]]: """Plots the decomposition of the target signal. Args: @@ -336,24 +337,49 @@ def plot_decomposition( show (bool): Whether to display the plot or not. Defaults to False. Returns: - matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions + (Single series) matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes]: The figure and axes that have the decompositions plotted on them + (Multi series) dict[str, (matplotlib.pyplot.Figure, list[matplotlib.pyplot.Axes])]: A dictionary that maps the series id + to the figure and axes that have the decompositions plotted on them """ + if isinstance(y, pd.Series): + y = y.to_frame() + + plot_info = {} + if self.frequency and self.time_index and len(y.columns) > 1: + X.index = pd.DatetimeIndex(X[self.time_index], freq=self.frequency) decomposition_results = self.get_trend_dataframe(X, y) - fig, axs = plt.subplots(4) - fig.set_size_inches(18.5, 14.5) - axs[0].plot(decomposition_results[0]["signal"], "r") - axs[0].set_title("signal") - axs[1].plot(decomposition_results[0]["trend"], "b") - axs[1].set_title("trend") - axs[2].plot(decomposition_results[0]["seasonality"], "g") - axs[2].set_title("seasonality") - axs[3].plot(decomposition_results[0]["residual"], "y") - axs[3].set_title("residual") - if show: # pragma: no cover - plt.show() - return fig, axs + + # Iterate through each series id + for id in y.columns: + fig, axs = plt.subplots(4) + fig.set_size_inches(18.5, 14.5) + + if len(y.columns) > 1: + results = decomposition_results[id][0] + else: + results = decomposition_results[0] + axs[0].plot(results["signal"], "r") + axs[0].set_title("signal") + axs[1].plot(results["trend"], "b") + axs[1].set_title("trend") + axs[2].plot(results["seasonality"], "g") + axs[2].set_title("seasonality") + axs[3].plot(results["residual"], "y") + 
axs[3].set_title("residual") + + # If multiseries, return a dictionary of tuples + if len(y.columns) > 1: + fig.suptitle("Decomposition for Series {}".format(id)) + plot_info[id] = (fig, axs) + else: + plot_info = (fig, axs) + + if show: # pragma: no cover + plt.show() + + return plot_info def _check_target(self, X: pd.DataFrame, y: pd.Series): """Function to ensure target is not None and has a pandas.DatetimeIndex.""" diff --git a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py index 0486b214f8..b4bcfdd029 100644 --- a/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py +++ b/evalml/pipelines/components/transformers/preprocessing/stl_decomposer.py @@ -2,6 +2,7 @@ from __future__ import annotations import logging +from typing import Union import pandas as pd from pandas import RangeIndex @@ -20,6 +21,7 @@ class STLDecomposer(Decomposer): Args: time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None. + series_id (str): Specifies the name of the column in X that provides the series_id objects for multiseries. Defaults to None. degree (int): Not currently used. STL 3x "degree-like" values. None are able to be set at this time. Defaults to 1. period (int): The number of entries in the time series data that corresponds to one period of a @@ -40,14 +42,17 @@ class STLDecomposer(Decomposer): def __init__( self, time_index: str = None, + series_id: str = None, degree: int = 1, # Currently unused. period: int = None, + periods: dict = None, seasonal_smoother: int = 7, random_seed: int = 0, **kwargs, ): self.logger = logging.getLogger(__name__) - + self.series_id = series_id + self.periods = periods # Programmatically adjust seasonal_smoother to fit underlying STL requirements, # that seasonal_smoother must be odd. 
if seasonal_smoother % 2 == 0: @@ -58,18 +63,24 @@ def __init__( seasonal_smoother += 1 self.forecast_summary = None + parameters = { + "degree": degree, + "period": period, + "periods": periods, + "seasonal_smoother": seasonal_smoother, + "time_index": time_index, + "series_id": series_id, + } + parameters.update(kwargs) super().__init__( component_obj=None, random_seed=random_seed, - degree=degree, - period=period, - seasonal_smoother=seasonal_smoother, - time_index=time_index, + **parameters, **kwargs, ) - def _project_trend(self, y): + def _project_trend(self, y, trend, period): """Function to project the in-sample trend into the future.""" self._check_oos_past(y) @@ -81,7 +92,7 @@ def _project_trend(self, y): units_forward = ( len( pd.date_range( - start=self.trend.index[-1], + start=trend.index[-1], end=y.index[-1], freq=self.frequency, ), @@ -93,18 +104,18 @@ def _project_trend(self, y): # Model the trend and project it forward stlf = STLForecast( - self.trend, + trend, ARIMA, model_kwargs=dict(order=(1, 1, 0), trend="t"), - period=self.period, + period=period, ) stlf = stlf.fit() forecast = stlf.forecast(units_forward) # Store forecast summary for use in calculating trend prediction intervals. self.forecast_summary = stlf.get_prediction( - len(self.trend), - len(self.trend) + units_forward - 1, + len(trend), + len(trend) + units_forward - 1, ) # Handle out-of-sample forecasts. 
The forecast will have additional data @@ -121,19 +132,23 @@ def _project_trend(self, y): fore.index = y.index return fore - def _project_trend_and_seasonality(self, y): + def _project_trend_and_seasonality(self, y, trend, seasonality, periodicity): """Function to project both trend and seasonality forward into the future.""" - projected_trend = self._project_trend(y) + projected_trend = self._project_trend(y, trend, periodicity) projected_seasonality = self._project_seasonal( y, - self.seasonality, - self.period, + seasonality, + periodicity, self.frequency, ) return projected_trend, projected_seasonality - def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: + def fit( + self, + X: pd.DataFrame, + y: Union[pd.Series, pd.DataFrame] = None, + ) -> STLDecomposer: """Fits the STLDecomposer and determine the seasonal signal. Instantiates a statsmodels STL decompose object with the component's stored @@ -149,7 +164,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: Args: X (pd.DataFrame, optional): Conditionally used to build datetime index. - y (pd.Series): Target variable to detrend and deseasonalize. + y (pd.Series or pd.DataFrame): Target variable to detrend and deseasonalize. Returns: self @@ -158,9 +173,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: ValueError: If y is None. 
ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ - self.original_index = y.index if y is not None else None - X, y = self._check_target(X, y) - self._map_dt_to_integer(self.original_index, y.index) + from evalml.pipelines.utils import unstack_multiseries # Warn for poor decomposition use with higher seasonal smoothers if self.seasonal_smoother > 14: @@ -168,33 +181,68 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None) -> STLDecomposer: f"STLDecomposer may perform poorly on data with a high seasonal smoother ({self.seasonal_smoother}).", ) + # If y is a stacked pd.Series, unstack it + if self.series_id is not None and isinstance(y, pd.Series): + X, y = unstack_multiseries(X, y, self.series_id, self.time_index, y.name) + + if isinstance(y, pd.Series): + y = y.to_frame() + + self.original_index = y.index if y is not None else None + + X, y = self._check_target(X, y) + + self._map_dt_to_integer(self.original_index, y.index) + # Save the frequency of the fitted series for checking against transform data. 
self.frequency = y.index.freqstr or pd.infer_freq(y.index) + # Iterate through each id group + self.seasonals = {} + self.seasonalities = {} + self.trends = {} + self.residuals = {} + if self.periods is None: + self.periods = {} + + for id in y.columns: + series_y = y[id] + + # Determine the period of the seasonal component + if id not in self.periods: + # If the user provides a period for single series, use that + period = ( + self.period + if len(y.columns) == 1 and self.period is not None + else self.determine_periodicity(X, series_y) + ) + self.periods[id] = period - # Determine the period of the seasonal component - if self.period is None: - self.set_period(X, y) - - stl = STL(y, seasonal=self.seasonal_smoother, period=self.period) - res = stl.fit() - self.seasonal = res.seasonal - self.period = stl.period - dist = len(y) % self.period - self.seasonality = ( - self.seasonal[-(dist + self.period) : -dist] - if dist > 0 - else self.seasonal[-self.period :] - ) - self.trend = res.trend - self.residual = res.resid + stl = STL( + series_y, + seasonal=self.seasonal_smoother, + period=self.periods[id], + ) + res = stl.fit() + self.seasonals[id] = res.seasonal + self.periods[id] = stl.period + dist = len(series_y) % stl.period + seasonality = ( + res.seasonal[-(dist + stl.period) : -dist] + if dist > 0 + else res.seasonal[-stl.period :] + ) + self.seasonalities[id] = seasonality + self.trends[id] = res.trend + self.residuals[id] = res.resid + self.update_parameters({"periods": self.periods}) return self def transform( self, X: pd.DataFrame, - y: pd.Series = None, - ) -> tuple[pd.DataFrame, pd.Series]: + y: Union[pd.Series, pd.DataFrame] = None, + ) -> Union[tuple[pd.DataFrame, pd.Series], tuple[pd.DataFrame, pd.DataFrame]]: """Transforms the target data by removing the STL trend and seasonality. Uses an ARIMA model to project forward the addititve trend and removes it. 
Then, utilizes the first period's @@ -203,64 +251,111 @@ def transform( Args: X (pd.DataFrame, optional): Conditionally used to build datetime index. - y (pd.Series): Target variable to detrend and deseasonalize. + y (pd.Series or pd.DataFrame): Target variable to detrend and deseasonalize. Returns: - tuple of pd.DataFrame, pd.Series: The input features are returned without modification. The target + (Single series) pd.DataFrame, pd.Series: The list of input features are returned without modification. The target + variable y is detrended and deseasonalized. + (Multi series) pd.DataFrame, pd.DataFrame: The list of input features are returned without modification. The target variable y is detrended and deseasonalized. Raises: ValueError: If target data doesn't have DatetimeIndex AND no Datetime features in features data """ + from evalml.pipelines.utils import unstack_multiseries + if y is None: return X, y + + # If y is a stacked pd.Series, unstack it + if self.series_id is not None and isinstance(y, pd.Series): + X, y = unstack_multiseries(X, y, self.series_id, self.time_index, y.name) + + if isinstance(y, pd.Series): + y = y.to_frame() + original_index = y.index X, y = self._check_target(X, y) self._check_oos_past(y) - y_in_sample = pd.Series([]) - y_out_of_sample = pd.Series([]) - - # For partially and wholly in-sample data, retrieve stored results. - if self.trend.index[0] <= y.index[0] <= self.trend.index[-1]: - y_in_sample = self.residual[y.index[0] : y.index[-1]] - - # For out of sample data.... - if y.index[-1] > self.trend.index[-1]: - try: - # ...that is partially out of sample and partially in sample. - truncated_y = y[y.index.get_loc(self.trend.index[-1]) + 1 :] - except KeyError: - # ...that is entirely out of sample. 
- truncated_y = y - - ( - projected_trend, - projected_seasonality, - ) = self._project_trend_and_seasonality(truncated_y) - - y_out_of_sample = infer_feature_types( - pd.Series( - truncated_y - projected_trend - projected_seasonality, - index=truncated_y.index, - ), - ) - y_t = pd.concat([y_in_sample, y_out_of_sample]) - y_t.index = original_index - return X, y_t + detrending_list = [] + # Iterate through each id group + for id in y.columns: + series_y = y[id] + + if len(y.columns) > 1: + seasonality = self.seasonalities[id] + trend = self.trends[id] + residual = self.residuals[id] + period = self.periods[id] + else: + seasonality = list(self.seasonalities.values())[0] + trend = list(self.trends.values())[0] + residual = list(self.residuals.values())[0] + period = list(self.periods.values())[0] + + y_in_sample = pd.Series([]) + y_out_of_sample = pd.Series([]) + + # For partially and wholly in-sample data, retrieve stored results. + if trend.index[0] <= series_y.index[0] <= trend.index[-1]: + y_in_sample = residual[series_y.index[0] : series_y.index[-1]] + + # For out of sample data.... + if series_y.index[-1] > trend.index[-1]: + try: + # ...that is partially out of sample and partially in sample. + truncated_y = series_y[ + series_y.index.get_loc(trend.index[-1]) + 1 : + ] + except KeyError: + # ...that is entirely out of sample. 
+ truncated_y = series_y + + ( + projected_trend, + projected_seasonality, + ) = self._project_trend_and_seasonality( + truncated_y, + trend, + seasonality, + period, + ) + + y_out_of_sample = infer_feature_types( + pd.Series( + truncated_y - projected_trend - projected_seasonality, + index=truncated_y.index, + ), + ) + y_t = pd.concat([y_in_sample, y_out_of_sample]) + y_t.index = original_index + + # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] + if len(y.columns) <= 1: + return X, y_t + + detrending_list.append(y_t) - def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: + # Convert the list to a DataFrame + # For multiseries, return tuple[pd.DataFrame, pd.Dataframe] where each column is a series_id + detrending_df = pd.DataFrame(detrending_list).T + return X, detrending_df + + def inverse_transform( + self, + y_t: Union[pd.Series, pd.DataFrame], + ) -> Union[pd.Series, pd.DataFrame]: """Adds back fitted trend and seasonality to target variable. The STL trend is projected to cover the entire requested target range, then added back into the signal. Then, the seasonality is projected forward to and added back into the signal. Args: - y_t (pd.Series): Target variable. + y_t (pd.Series or pd.DataFrame): Target variable. Returns: - tuple of pd.DataFrame, pd.Series: The first element are the input features returned without modification. - The second element is the target variable y with the trend and seasonality added back in. + pd.Series or pd.DataFrame: The target variable y with the trend and seasonality added back in. Raises: ValueError: If y is None. 
@@ -272,54 +367,82 @@ def inverse_transform(self, y_t: pd.Series) -> tuple[pd.DataFrame, pd.Series]: y_t = infer_feature_types(y_t) self._check_oos_past(y_t) + if isinstance(y_t, pd.Series): + y_t = y_t.to_frame() + index = self._choose_proper_index(y_t) + y = [] + for id in y_t.columns: + y_in_sample = pd.Series([]) + y_out_of_sample = pd.Series([]) + series_y = y_t[id] + + if len(y_t.columns) > 1: + old_trend = self.trends[id] + old_seasonal = self.seasonals[id] + period = self.periods[id] + else: + old_trend = list(self.trends.values())[0] + old_seasonal = list(self.seasonals.values())[0] + period = list(self.periods.values())[0] + # For partially and wholly in-sample data, retrieve stored results. + if index[0] <= series_y.index[0] <= index[-1]: + left_index = series_y.index[0] + right_index = ( + series_y.index[-1] + 1 + if isinstance(series_y.index, pd.RangeIndex) + or series_y.index.is_numeric() + else series_y.index[-1] + 1 * series_y.index.freq + ) + trend = ( + old_trend.reset_index(drop=True)[left_index:right_index] + if isinstance(series_y.index, pd.RangeIndex) + or series_y.index.is_numeric() + else old_trend[left_index:right_index] + ) + seasonal = ( + old_seasonal.reset_index(drop=True)[left_index:right_index] + if isinstance(series_y.index, pd.RangeIndex) + or series_y.index.is_numeric() + else old_seasonal[left_index:right_index] + ) + y_in_sample = series_y + trend + seasonal + y_in_sample = y_in_sample.dropna() + + # For out of sample data.... + if series_y.index[-1] > index[-1]: + try: + # ...that is partially out of sample and partially in sample. + truncated_y_t = series_y[series_y.index.get_loc(index[-1]) + 1 :] + except KeyError: + # ...that is entirely out of sample. 
+ truncated_y_t = series_y + ( + projected_trend, + projected_seasonality, + ) = self._project_trend_and_seasonality( + truncated_y_t, + old_trend, + old_seasonal, + period, + ) - y_in_sample = pd.Series([]) - y_out_of_sample = pd.Series([]) + y_out_of_sample = infer_feature_types( + pd.Series( + truncated_y_t + projected_trend + projected_seasonality, + index=truncated_y_t.index, + ), + ) + y_series = pd.concat([y_in_sample, y_out_of_sample]) + # If it is a single series time series, return tuple[pd.DataFrame, pd.Series] + if len(y_t.columns) <= 1: + y_series.index = original_index + return y_series - # For partially and wholly in-sample data, retrieve stored results. - if index[0] <= y_t.index[0] <= index[-1]: - left_index = y_t.index[0] - right_index = ( - y_t.index[-1] + 1 - if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else y_t.index[-1] + 1 * y_t.index.freq - ) - trend = ( - self.trend.reset_index(drop=True)[left_index:right_index] - if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.trend[left_index:right_index] - ) - seasonal = ( - self.seasonal.reset_index(drop=True)[left_index:right_index] - if isinstance(y_t.index, pd.RangeIndex) or y_t.index.is_numeric() - else self.seasonal[left_index:right_index] - ) - y_in_sample = y_t + trend + seasonal - y_in_sample = y_in_sample.dropna() - - # For out of sample data.... - if y_t.index[-1] > index[-1]: - try: - # ...that is partially out of sample and partially in sample. - truncated_y_t = y_t[y_t.index.get_loc(index[-1]) + 1 :] - except KeyError: - # ...that is entirely out of sample. 
- truncated_y_t = y_t - ( - projected_trend, - projected_seasonality, - ) = self._project_trend_and_seasonality(truncated_y_t) - - y_out_of_sample = infer_feature_types( - pd.Series( - truncated_y_t + projected_trend + projected_seasonality, - index=truncated_y_t.index, - ), - ) - y = pd.concat([y_in_sample, y_out_of_sample]) - y.index = original_index - return y + y.append(y_series) + y_df = pd.DataFrame(y).T + y_df.index = original_index + return y_df def get_trend_dataframe(self, X, y): """Return a list of dataframes with 4 columns: signal, trend, seasonality, residual. @@ -330,9 +453,10 @@ def get_trend_dataframe(self, X, y): a DataFrame for multivariate problems. Returns: - list of pd.DataFrame: Each DataFrame contains the columns "signal", "trend", "seasonality" and "residual," + (Single series) list of pd.DataFrame: Each DataFrame contains the columns "signal", "trend", "seasonality" and "residual," with the latter 3 column values being the decomposed elements of the target data. The "signal" column is simply the input target signal but reindexed with a datetime index to match the input features. + (Multi series) dictionary of lists: Series id maps to a list of pd.DataFrames that each contain the columns "signal", "trend", "seasonality" and "residual" Raises: TypeError: If X does not have time-series data in the index. @@ -342,93 +466,124 @@ def get_trend_dataframe(self, X, y): """ X = infer_feature_types(X) - if not isinstance(X.index, pd.DatetimeIndex): - raise TypeError("Provided X should have datetimes in the index.") - if X.index.freq is None: - raise ValueError( - "Provided DatetimeIndex of X should have an inferred frequency.", - ) + if not isinstance(X.index, pd.DatetimeIndex) and not isinstance( + y.index, + pd.DatetimeIndex, + ): + raise TypeError("Provided X or y should have datetimes in the index.") # Change the y index to a matching datetimeindex or else we get a failure # in ForecastingHorizon during decomposition. 
if not isinstance(y.index, pd.DatetimeIndex): y = self._set_time_index(X, y) - + if not isinstance(X.index, pd.DatetimeIndex): + X.index = y.index self._check_oos_past(y) - result_dfs = [] - def _decompose_target(X, y, fh): """Function to generate a single DataFrame with trend, seasonality and residual components.""" - if len(y.index) == len(self.trend.index) and all( - y.index == self.trend.index, + if isinstance(y, pd.Series): + y = y.to_frame() + if all( + len(y.index) == len(self.trends[id].index) + and all( + y.index == self.trends[id].index, + ) + for id in y.columns ): - trend = self.trend - seasonal = self.seasonal - residual = self.residual - else: # TODO: Do a better job cloning. decomposer = STLDecomposer( seasonal_smoother=self.seasonal_smoother, - period=self.period, + periods=self.periods, ) decomposer.fit(X, y) - trend = decomposer.trend - seasonal = decomposer.seasonal - residual = decomposer.residual - return pd.DataFrame( - { - "signal": y, - "trend": trend, - "seasonality": seasonal, - "residual": residual, - }, - ) + trend = decomposer.trends + seasonal = decomposer.seasonals + residual = decomposer.residuals + else: + trend = self.trends + seasonal = self.seasonals + residual = self.residuals + result_dict = {} + for id in y.columns: + df = pd.DataFrame( + { + "signal": y[id], + "trend": trend[id], + "seasonality": seasonal[id], + "residual": residual[id], + }, + ) + if len(y.columns) == 1: + return [df] + else: + result_dict[id] = [df] + return result_dict - if isinstance(y, pd.Series): - result_dfs.append(_decompose_target(X, y, None)) - elif isinstance(y, pd.DataFrame): - for colname in y.columns: - result_dfs.append(_decompose_target(X, y[colname], None)) - return result_dfs + return _decompose_target(X, y, None) def get_trend_prediction_intervals(self, y, coverage=None): """Calculate the prediction intervals for the trend data. Args: - y (pd.Series): Target data. + y (pd.Series or pd.DataFrame): Target data. 
coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the prediction interval should be calculated for. Returns: - dict of pd.Series: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper. + (Single series) dict of pd.Series: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper. + (Multi series) dict of dict of pd.Series: Each series id maps to a dictionary of prediction intervals """ + if isinstance(y, pd.Series): + y = y.to_frame() + if coverage is None: coverage = [0.95] self._check_oos_past(y) - alphas = [1 - val for val in coverage] + series_results = {} + for id in y.columns: + y_series = y[id] - if not self.forecast_summary or len(y) != len( - self.forecast_summary.predicted_mean, - ): - self._project_trend_and_seasonality(y) - - prediction_interval_result = {} - for i, alpha in enumerate(alphas): - result = self.forecast_summary.summary_frame(alpha=alpha) - overlapping_ind = [ind for ind in y.index if ind in result.index] - intervals = pd.DataFrame( - { - "lower": result["mean_ci_lower"] - result["mean"], - "upper": result["mean_ci_upper"] - result["mean"], - }, - ) - if len(overlapping_ind) > 0: # y.index is datetime - intervals = intervals.loc[overlapping_ind] - else: # y.index is not datetime (e.g. 
int) - intervals = intervals[-len(y) :] - intervals.index = y.index - prediction_interval_result[f"{coverage[i]}_lower"] = intervals["lower"] - prediction_interval_result[f"{coverage[i]}_upper"] = intervals["upper"] - - return prediction_interval_result + alphas = [1 - val for val in coverage] + + if len(y.columns) > 1: + trend = self.trends[id] + seasonality = self.seasonalities[id] + period = self.periods[id] + else: + trend = list(self.trends.values())[0] + seasonality = list(self.seasonalities.values())[0] + period = list(self.periods.values())[0] + if not self.forecast_summary or len(y_series) != len( + self.forecast_summary.predicted_mean, + ): + self._project_trend_and_seasonality( + y_series, + trend, + seasonality, + period, + ) + + prediction_interval_result = {} + for i, alpha in enumerate(alphas): + result = self.forecast_summary.summary_frame(alpha=alpha) + overlapping_ind = [ind for ind in y_series.index if ind in result.index] + intervals = pd.DataFrame( + { + "lower": result["mean_ci_lower"] - result["mean"], + "upper": result["mean_ci_upper"] - result["mean"], + }, + ) + if len(overlapping_ind) > 0: # y.index is datetime + intervals = intervals.loc[overlapping_ind] + else: # y.index is not datetime (e.g. 
int) + intervals = intervals[-len(y_series) :] + intervals.index = y_series.index + prediction_interval_result[f"{coverage[i]}_lower"] = intervals["lower"] + prediction_interval_result[f"{coverage[i]}_upper"] = intervals["upper"] + series_results[id] = prediction_interval_result + + # only return the dictionary if single series + if len(y.columns) <= 1: + return prediction_interval_result + return series_results diff --git a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py index aaf924f626..9979dd9eb4 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_decomposer.py @@ -61,23 +61,52 @@ def test_decomposer_init_raises_error_if_degree_not_int(decomposer_child_class): "y_has_time_index", ["y_has_time_index", "y_doesnt_have_time_index"], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_plot_decomposition( decomposer_child_class, y_has_time_index, generate_seasonal_data, + variateness, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + step = 0.01 period = 9 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")(period, step) + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )(period, step) + if y_has_time_index == "y_has_time_index": y = y.set_axis(X.index) dec = decomposer_child_class(degree=1, period=period) dec.fit_transform(X, y) - fig, axs = dec.plot_decomposition(X, y, show=False) - assert isinstance(fig, matplotlib.pyplot.Figure) - assert isinstance(axs, np.ndarray) - assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + + if variateness == "univariate": + fig, axs = 
dec.plot_decomposition(X, y, show=False) + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + elif variateness == "multivariate": + result_plots = dec.plot_decomposition(X, y, show=False) + for id in y.columns: + fig, axs = result_plots[id] + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) @pytest.mark.parametrize( @@ -101,15 +130,31 @@ def test_decomposer_plot_decomposition( "time_index_is_specified_but_wrong", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_uses_time_index( decomposer_child_class, ts_data, + ts_multiseries_data, + variateness, X_has_time_index, X_num_time_columns, y_has_time_index, time_index_specified, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, _, y = ts_multiseries_data() time_index_col_name = "date" assert isinstance(X.index, pd.DatetimeIndex) @@ -237,7 +282,10 @@ def test_decomposer_prefers_users_time_index( X_t, y_t = dec.fit_transform(X, y) else: X_t, y_t = dec.fit_transform(X, y) - assert all(dec.trend.index.values == expected_values) + if isinstance(dec, STLDecomposer): + assert all(dec.trends[0].index.values == expected_values) + elif isinstance(dec, PolynomialDecomposer): + assert all(dec.trend.index.values == expected_values) @pytest.mark.parametrize( @@ -345,6 +393,7 @@ def test_decomposer_projected_seasonality_integer_and_datetime( }[test_first_index] X, _, y = ts_data() + datetime_index = pd.date_range(start="01-01-2002", periods=len(X), freq="M") if not has_freq: datetime_index.freq = None @@ -389,33 +438,6 
@@ def test_decomposer_projected_seasonality_integer_and_datetime( ) -@pytest.mark.parametrize( - "decomposer_child_class", - decomposer_list, -) -def test_decomposer_get_trend_dataframe_raises_errors( - decomposer_child_class, - ts_data, -): - X, _, y = ts_data() - dec = decomposer_child_class() - dec.fit_transform(X, y) - - with pytest.raises( - TypeError, - match="Provided X should have datetimes in the index.", - ): - X_int_index = X.reset_index() - dec.get_trend_dataframe(X_int_index, y) - - with pytest.raises( - ValueError, - match="Provided DatetimeIndex of X should have an inferred frequency.", - ): - X.index.freq = None - dec.get_trend_dataframe(X, y) - - @pytest.mark.parametrize( "decomposer_child_class", decomposer_list, @@ -546,15 +568,30 @@ def test_decomposer_determine_periodicity_nullable_type_incompatibility( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) @pytest.mark.parametrize("fit_before_decompose", [True, False]) def test_decomposer_get_trend_dataframe_error_not_fit( decomposer_child_class, ts_data, + ts_multiseries_data, + variateness, fit_before_decompose, ): - X, _, y = ts_data() - - dec = decomposer_child_class() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, _, y = ts_multiseries_data() + dec = decomposer_child_class(time_index="date") if fit_before_decompose: dec.fit_transform(X, y) dec.get_trend_dataframe(X, y) @@ -569,11 +606,28 @@ def test_decomposer_get_trend_dataframe_error_not_fit( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_transform_returns_same_when_y_none( decomposer_child_class, ts_data, + ts_multiseries_data, 
+ variateness, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, _, y = ts_multiseries_data() + dec = decomposer_child_class().fit(X, y) X_t, y_t = dec.transform(X, None) pd.testing.assert_frame_equal(X, X_t) @@ -584,11 +638,27 @@ def test_decomposer_transform_returns_same_when_y_none( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_raises_value_error_target_is_none( decomposer_child_class, ts_data, + ts_multiseries_data, + variateness, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, _, y = ts_multiseries_data() with pytest.raises(ValueError, match="cannot be None for Decomposer!"): decomposer_child_class(degree=3).fit_transform(X, None) @@ -606,11 +676,28 @@ def test_decomposer_raises_value_error_target_is_none( "decomposer_child_class", decomposer_list, ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_bad_target_index( decomposer_child_class, ts_data, + ts_multiseries_data, + variateness, ): - X, _, y = ts_data() + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, _, y = ts_multiseries_data() + dec = decomposer_child_class() y.index = pd.CategoricalIndex(["cat_index" for x in range(len(y))]) 
with pytest.raises( @@ -636,29 +723,57 @@ def test_decomposer_bad_target_index( "partially-out-of-sample-in-past", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_fit_transform_out_of_sample( decomposer_child_class, + variateness, generate_seasonal_data, transformer_fit_on_data, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + # Generate 10 periods (the default) of synthetic seasonal data period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=period, freq_str="D", set_time_index=True, seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend ) + + subset_y = y.loc[y.index[2 * period : 7 * period]] subset_X = X[2 * period : 7 * period] - subset_y = y[2 * period : 7 * period] decomposer = decomposer_child_class(period=period) decomposer.fit(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_X, output_y = decomposer.transform(subset_X, subset_y) - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index), + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = y_expected = pd.DataFrame( + [np.zeros(len(output_y)), np.zeros(len(output_y))], + ).T.set_axis(subset_y.index) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(np.zeros(len(output_y))).set_axis(subset_y.index) + assert_function( + y_expected, output_y, check_dtype=False, check_names=False, @@ -682,10 +797,19 @@ def test_decomposer_fit_transform_out_of_sample( ): output_X, output_inverse_y = decomposer.transform(None, y_new) else: - output_X, output_y_t = 
decomposer.transform(None, y[y_new.index]) - - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index), + output_X, output_y_t = decomposer.transform(None, y.loc[y_new.index]) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_new = pd.DataFrame([y_new, y_new]).T + y_expected = pd.DataFrame( + [np.zeros(len(output_y_t)), np.zeros(len(output_y_t))], + ).T.set_axis(y_new.index) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(np.zeros(len(output_y_t))).set_axis(y_new.index) + + assert_function( + y_expected, output_y_t, check_exact=False, atol=0.1, # STLDecomposer is within atol=5.0e-4 @@ -709,31 +833,61 @@ def test_decomposer_fit_transform_out_of_sample( "partially-out-of-sample-in-past", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_decomposer_inverse_transform( decomposer_child_class, index_type, generate_seasonal_data, + variateness, transformer_fit_on_data, ): + if variateness == "multivariate" and isinstance( + decomposer_child_class(), + PolynomialDecomposer, + ): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + # Generate 10 periods (the default) of synthetic seasonal data period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=period, freq_str="D", set_time_index=True, - seasonal_scale=0.05, + seasonal_scale=0.05, # Increasing this value causes the decomposer to miscalculate trend ) if index_type == "integer_index": y = y.reset_index(drop=True) + subset_X = X[: 5 * period] - subset_y = y[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] decomposer = decomposer_child_class(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == 
"in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - pd.testing.assert_series_equal(subset_y, output_inverse_y, check_dtype=False) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(subset_y) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(subset_y) + assert_function( + y_expected, + output_inverse_y, + check_dtype=False, + ) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -742,6 +896,8 @@ def test_decomposer_inverse_transform( transformer_fit_on_data, to_test="inverse_transform", ) + if variateness == "multivariate": + y_t_new = pd.DataFrame([y_t_new, y_t_new]).T if transformer_fit_on_data in [ "out-of-sample-in-past", "partially-out-of-sample-in-past", @@ -755,15 +911,22 @@ def test_decomposer_inverse_transform( output_inverse_y = decomposer.inverse_transform(y_t_new) # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. 
- pd.testing.assert_series_equal( - y[y_t_new.index], + + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(y.loc[y_t_new.index]) + else: + assert_function = pd.testing.assert_series_equal + y_expected = pd.Series(y[y_t_new.index]) + assert_function( + y_expected, output_inverse_y, check_exact=False, - check_index=False, rtol=1.0e-1, ) + pd.testing.assert_index_equal( - y[y_t_new.index].index, + y.loc[y_t_new.index].index, output_inverse_y.index, exact=False, ) @@ -801,8 +964,28 @@ def test_decomposer_doesnt_modify_target_index( "decomposer_child_class", decomposer_list, ) -def test_decomposer_monthly_begin_data(decomposer_child_class, ts_data): - X, _, y = ts_data() +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_decomposer_monthly_begin_data( + decomposer_child_class, + ts_data, + ts_multiseries_data, + variateness, +): + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + if isinstance(decomposer_child_class(), PolynomialDecomposer): + pytest.skip( + "Skipping Decomposer because multiseries is not implemented for Polynomial Decomposer", + ) + X, _, y = ts_multiseries_data() + dts = pd.date_range("01-01-2000", periods=len(X), freq="MS") datetime_index = pd.DatetimeIndex(dts) X.index = datetime_index diff --git a/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py index 2f2f9a049d..a8fc871ed7 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_polynomial_decomposer.py @@ -98,3 +98,26 @@ def test_polynomial_decomposer_needs_monotonic_index(ts_data): decomposer.fit_transform(X, y_shuffled) expected_errors = ["monotonically", "X must be in an sktime compatible format"] assert any([error in str(exec_info.value) for error 
in expected_errors]) + + +def test_polynomial_decomposer_get_trend_dataframe_raises_errors( + ts_data, +): + X, _, y = ts_data() + + dec = PolynomialDecomposer() + dec.fit_transform(X, y) + + with pytest.raises( + TypeError, + match="Provided X should have datetimes in the index.", + ): + X_int_index = X.reset_index() + dec.get_trend_dataframe(X_int_index, y) + + with pytest.raises( + ValueError, + match="Provided DatetimeIndex of X should have an inferred frequency.", + ): + X.index.freq = None + dec.get_trend_dataframe(X, y) diff --git a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py index afedf2c686..6e17067d59 100644 --- a/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py +++ b/evalml/tests/component_tests/decomposer_tests/test_stl_decomposer.py @@ -1,3 +1,4 @@ +import matplotlib import numpy as np import pandas as pd import pytest @@ -16,8 +17,22 @@ def test_stl_decomposer_init(): assert decomp.parameters == { "degree": 3, "period": None, + "periods": None, "seasonal_smoother": 7, "time_index": "dates", + "series_id": None, + } + + +def test_stl_decomposer_multiseries_init(): + decomp = STLDecomposer(degree=3, time_index="dates", series_id="ids") + assert decomp.parameters == { + "degree": 3, + "period": None, + "periods": None, + "seasonal_smoother": 7, + "time_index": "dates", + "series_id": "ids", } @@ -29,8 +44,23 @@ def test_stl_decomposer_auto_sets_seasonal_smoother_to_odd(): assert stl.seasonal_smoother == 5 -def test_stl_raises_warning_high_smoother(caplog, ts_data): - X, _, y = ts_data() +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_raises_warning_high_smoother( + caplog, + ts_data, + ts_multiseries_data, + variateness, +): + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + X, _, y = ts_multiseries_data() stl = 
STLDecomposer(seasonal_smoother=101) stl.fit(X, y) assert "STLDecomposer may perform poorly" in caplog.text @@ -46,20 +76,34 @@ def test_stl_raises_warning_high_smoother(caplog, ts_data): (40, "M"), ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_sets_determined_period( period, freq, generate_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period, freq_str=freq, ) stl = STLDecomposer() stl.fit(X, y) + if isinstance(y, pd.Series): + y = y.to_frame() # Allow for a slight margin of error with detection - assert period * 0.99 <= stl.period <= period * 1.01 + for id in y.columns: + assert period * 0.99 <= stl.periods[id] <= period * 1.01 @pytest.mark.parametrize( @@ -79,50 +123,68 @@ def test_stl_sets_determined_period( ], ) @pytest.mark.parametrize("trend_degree", [1, 2, 3]) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_fit_transform_in_sample( period, freq, trend_degree, generate_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period, freq_str=freq, trend_degree=trend_degree, ) - # Get the expected answer - lin_reg = LinearRegression(fit_intercept=True) - features = PolynomialFeatures(degree=trend_degree).fit_transform( - np.arange(X.shape[0]).reshape(-1, 1), - ) - lin_reg.fit(features, y) - expected_trend = lin_reg.predict(features) - stl = STLDecomposer(period=period) X_t, y_t = stl.fit_transform(X, y) - # Check to make sure STL detrended/deseasoned - pd.testing.assert_series_equal( - pd.Series(np.zeros(len(y_t))), - y_t, - check_exact=False, - check_index=False, - check_names=False, - atol=0.1, - ) - - # Check the trend to make sure 
STL worked properly - pd.testing.assert_series_equal( - pd.Series(expected_trend), - pd.Series(stl.trend), - check_exact=False, - check_index=False, - check_names=False, - atol=0.3, - ) - + # If y_t is a pd.Series, give it columns + if isinstance(y_t, pd.Series): + y_t = y_t.to_frame() + if isinstance(y, pd.Series): + y = y.to_frame() + # Get the expected answer + for id in y_t.columns: + y_t_series = y_t[id] + y_series = y[id] + # Get the expected answer + lin_reg = LinearRegression(fit_intercept=True) + features = PolynomialFeatures(degree=trend_degree).fit_transform( + np.arange(X.shape[0]).reshape(-1, 1), + ) + lin_reg.fit(features, y_series) + expected_trend = lin_reg.predict(features) + + # Check to make sure STL detrended/deseasoned + pd.testing.assert_series_equal( + pd.Series(np.zeros(len(y_t_series))), + y_t_series, + check_exact=False, + check_index=False, + check_names=False, + atol=0.1, + ) + # Check the trend to make sure STL worked properly + pd.testing.assert_series_equal( + pd.Series(expected_trend), + pd.Series(stl.trends[0]), + check_exact=False, + check_index=False, + check_names=False, + atol=0.3, + ) # Verify the X is not changed pd.testing.assert_frame_equal(X, X_t) @@ -140,29 +202,47 @@ def test_stl_fit_transform_in_sample( "partially-out-of-sample-in-past", ], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_inverse_transform( index_type, generate_seasonal_data, + variateness, transformer_fit_on_data, ): # Generate 10 periods (the default) of synthetic seasonal data period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=period, freq_str="D", set_time_index=True, ) if index_type == "integer_index": y = y.reset_index(drop=True) + subset_X = X[: 5 * period] - subset_y = y[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] decomposer = 
STLDecomposer(period=period) output_X, output_y = decomposer.fit_transform(subset_X, subset_y) if transformer_fit_on_data == "in-sample": output_inverse_y = decomposer.inverse_transform(output_y) - pd.testing.assert_series_equal(subset_y, output_inverse_y, check_dtype=False) + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(subset_y) + else: + assert_function = pd.testing.assert_series_equal + y_expected = subset_y + assert_function(y_expected, output_inverse_y, check_dtype=False) if transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -171,6 +251,8 @@ def test_stl_decomposer_inverse_transform( transformer_fit_on_data, to_test="inverse_transform", ) + if variateness == "multivariate": + y_t_new = pd.DataFrame([y_t_new, y_t_new]).T if transformer_fit_on_data in [ "out-of-sample-in-past", "partially-out-of-sample-in-past", @@ -184,14 +266,22 @@ def test_stl_decomposer_inverse_transform( # Because output_inverse_y.index is int32 and y[y_t_new.index].index is int64 in windows, # we need to test the indices equivalence separately. 
output_inverse_y = decomposer.inverse_transform(y_t_new) - pd.testing.assert_series_equal( - y[y_t_new.index], + + if variateness == "multivariate": + assert_function = pd.testing.assert_frame_equal + y_expected = pd.DataFrame(y.loc[y_t_new.index]) + else: + assert_function = pd.testing.assert_series_equal + y_expected = y[y_t_new.index] + assert_function( + y_expected, output_inverse_y, - check_index=False, - rtol=1.0e-2, + check_exact=False, + rtol=1.0e-1, ) + pd.testing.assert_index_equal( - y[y_t_new.index].index, + y.loc[y_t_new.index].index, output_inverse_y.index, exact=False, ) @@ -224,33 +314,41 @@ def test_stl_decomposer_get_trend_dataframe( variateness, ): period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=period, freq_str="D", set_time_index=True, ) subset_X = X[: 5 * period] - subset_y = y[: 5 * period] + subset_y = y.loc[y.index[: 5 * period]] if transformer_fit_on_data == "in-sample": dec = STLDecomposer() dec.fit(subset_X, subset_y) # get_trend_dataframe() is only expected to work with datetime indices - if variateness == "multivariate": - subset_y = pd.concat([subset_y, subset_y], axis=1) result_dfs = dec.get_trend_dataframe(subset_X, subset_y) - assert isinstance(result_dfs, list) - assert all(isinstance(x, pd.DataFrame) for x in result_dfs) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) assert len(result_dfs) == 1 [get_trend_dataframe_format_correct(x) for x in result_dfs] + elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x][0], pd.DataFrame) for x in 
result_dfs) + assert all( + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs + ) assert len(result_dfs) == 2 - [get_trend_dataframe_format_correct(x) for idx, x in enumerate(result_dfs)] + [get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs] elif transformer_fit_on_data != "in-sample": y_t_new = build_test_target( @@ -259,12 +357,12 @@ def test_stl_decomposer_get_trend_dataframe( transformer_fit_on_data, to_test="transform", ) + if variateness == "multivariate": + y_t_new = pd.DataFrame([y_t_new, y_t_new]).T dec = STLDecomposer() dec.fit(subset_X, subset_y) # get_trend_dataframe() is only expected to work with datetime indices - if variateness == "multivariate": - y_t_new = pd.concat([y_t_new, y_t_new], axis=1) if transformer_fit_on_data in [ "out-of-sample-in-past", @@ -279,66 +377,212 @@ def test_stl_decomposer_get_trend_dataframe( else: result_dfs = dec.get_trend_dataframe(X.loc[y_t_new.index], y_t_new) - assert isinstance(result_dfs, list) - assert all(isinstance(x, pd.DataFrame) for x in result_dfs) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) assert len(result_dfs) == 1 [get_trend_dataframe_format_correct(x) for x in result_dfs] elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all( + isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs + ) + assert all( + get_trend_dataframe_format_correct(result_dfs[x][0]) + for x in result_dfs + ) assert len(result_dfs) == 2 [ - get_trend_dataframe_format_correct(x) - for idx, x in enumerate(result_dfs) + get_trend_dataframe_format_correct(result_dfs[x][0]) + for x in result_dfs ] -def 
test_stl_decomposer_get_trend_dataframe_sets_time_index_internally( +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_decomposer_get_trend_dataframe_raises_errors( + variateness, generate_seasonal_data, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=7, set_time_index=False, ) + + stl = STLDecomposer() + stl.fit_transform(X, y) + + with pytest.raises( + TypeError, + match="Provided X or y should have datetimes in the index.", + ): + X_int_index = X.reset_index() + y_int_index = y.reset_index() + stl.get_trend_dataframe(X_int_index, y_int_index) + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_decomposer_get_trend_dataframe_sets_X_index_internally( + variateness, + ts_data, + ts_multiseries_data, +): + X, _, y = ts_data() if variateness == "univariate" else ts_multiseries_data() + assert isinstance(y.index, pd.DatetimeIndex) + X = X.reset_index() + assert not isinstance(X.index, pd.DatetimeIndex) + + stl = STLDecomposer() + stl.fit(X, y) + result_dfs = stl.get_trend_dataframe(X, y) + + if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs) + assert all( + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs + ) + assert len(result_dfs) == 2 + [get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs] + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def 
test_stl_decomposer_get_trend_dataframe_sets_y_index_internally( + generate_seasonal_data, + variateness, +): + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( + period=7, + set_time_index=False, + ) + assert not isinstance(y.index, pd.DatetimeIndex) stl = STLDecomposer() stl.fit(X, y) result_dfs = stl.get_trend_dataframe(X, y) - assert isinstance(result_dfs, list) - assert all(isinstance(x, pd.DataFrame) for x in result_dfs) - assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + if variateness == "univariate": + assert isinstance(result_dfs, list) + assert all(isinstance(x, pd.DataFrame) for x in result_dfs) + assert all(get_trend_dataframe_format_correct(x) for x in result_dfs) + elif variateness == "multivariate": + assert isinstance(result_dfs, dict) + assert all(isinstance(result_dfs[x], list) for x in result_dfs) + assert all(isinstance(result_dfs[x][0], pd.DataFrame) for x in result_dfs) + assert all( + get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs + ) + assert len(result_dfs) == 2 + [get_trend_dataframe_format_correct(result_dfs[x][0]) for x in result_dfs] @pytest.mark.parametrize( "bad_frequency", ["T", "A"], ) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_unsupported_frequencies( bad_frequency, generate_seasonal_data, + variateness, ): """This test exists to highlight that even though the underlying statsmodels STL component won't work for minute or annual frequencies, we can still run these frequencies with automatic period detection. 
""" - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=7, freq_str=bad_frequency, ) stl = STLDecomposer() X_t, y_t = stl.fit_transform(X, y) - assert stl.period is not None + assert stl.periods is not None +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_init_periods( + generate_seasonal_data, + variateness, +): + period = 7 + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )(period) + periods = {id: 8 for id in y.columns} if variateness == "multivariate" else None + stl = STLDecomposer(period=period, periods=periods) + X_t, y_t = stl.fit_transform(X, y) + if variateness == "univariate": + assert stl.period == period + else: + assert stl.periods == periods + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_doesnt_modify_target_index( generate_seasonal_data, + variateness, ): - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + univariate_or_multivariate=variateness, + )( period=7, set_time_index=False, ) + original_X_index = X.index original_y_index = y.index @@ -357,22 +601,32 @@ def test_stl_decomposer_doesnt_modify_target_index( @pytest.mark.parametrize("index_type", ["datetime", "int"]) @pytest.mark.parametrize("set_coverage", [True, False]) +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) def test_stl_decomposer_get_trend_prediction_intervals( set_coverage, index_type, generate_seasonal_data, + variateness, ): coverage = [0.75, 0.85, 0.95] if set_coverage else None period = 7 - X, y = generate_seasonal_data(real_or_synthetic="synthetic")( + X, y = generate_seasonal_data( + real_or_synthetic="synthetic", + 
univariate_or_multivariate=variateness, + )( period=period, freq_str="D", set_time_index=True, ) X_train = X[: 15 * period] - y_train = y[: 15 * period] - - y_validate = y[15 * period :] + y_train = y.loc[y.index[: 15 * period]] + y_validate = y.loc[y.index[15 * period :]] stl = STLDecomposer() stl.fit(X_train, y_train) @@ -394,4 +648,53 @@ def assert_pred_interval_coverage(pred_interval): y_validate, coverage=coverage, ) - assert_pred_interval_coverage(trend_pred_intervals) + + if variateness == "univariate": + assert_pred_interval_coverage(trend_pred_intervals) + elif variateness == "multivariate": + for id in y_validate: + assert_pred_interval_coverage(trend_pred_intervals[id]) + + +@pytest.mark.parametrize( + "variateness", + [ + "univariate", + "multivariate", + ], +) +def test_stl_decomposer_plot_decomposition( + ts_data, + ts_multiseries_data, + variateness, +): + if variateness == "univariate": + X, _, y = ts_data() + elif variateness == "multivariate": + X, _, y = ts_multiseries_data() + + dec = STLDecomposer(time_index="date") + dec.fit_transform(X, y) + + if variateness == "univariate": + fig, axs = dec.plot_decomposition(X, y, show=False) + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + elif variateness == "multivariate": + result_plots = dec.plot_decomposition(X, y, show=False) + for id in y.columns: + fig, axs = result_plots[id] + assert isinstance(fig, matplotlib.pyplot.Figure) + assert isinstance(axs, np.ndarray) + assert all([isinstance(ax, matplotlib.pyplot.Axes) for ax in axs]) + + +def test_stl_decomposer_unstack_series_id( + multiseries_ts_data_stacked, +): + X, y = multiseries_ts_data_stacked + + dec = STLDecomposer(series_id="series_id", time_index="date") + X_output, y_output = dec.fit_transform(X, y) + assert len(y_output.columns) == X["series_id"].nunique() diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 
882209d52c..3440e5ec91 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -2521,11 +2521,73 @@ def generate_synthetic_data( y = y.set_axis(dts) return X, y - def _return_proper_func(real_or_synthetic): - if real_or_synthetic == "synthetic": + def generate_multiseries_synthetic_data( + period, + step=None, + num_periods=20, + scale=1, + seasonal_scale=1, + trend_degree=1, + freq_str="D", + set_time_index=False, + ): + """Function to generate a sinusoidal signal with a polynomial trend. + + Args: + period: The length, in units, of the seasonal signal. + num_periods: How many periods of the seasonal signal to generate. + scale: The relative scale of the trend. Setting it higher increases + the comparative strength of the trend. + seasonal_scale: The relative scale of the sinusoidal seasonality. + Setting it higher increases the comparative strength of the + seasonality. + trend_degree: The degree of the polynomial trend. 1 = linear, 2 = + quadratic, 3 = cubic. Specific functional forms defined + below. + freq_str: The pandas frequency string used to define the unit of + time in the series time index. + set_time_index: Whether to set the time index with a pandas. + DatetimeIndex. + + Returns: + X (pandas.DataFrame): A placeholder feature matrix. + y (pandas.DataFrame): A synthetic, time series target with one column per series.
+ + """ + freq = 2 * np.pi / period + x = np.arange(0, period * num_periods, 1) + dts = pd.date_range(datetime.today(), periods=len(x), freq=freq_str) + X = pd.DataFrame({"x": x}) + X = X.set_index(dts) + + y_ms_list = [] + for i in range(2): + for j in range(5): + if trend_degree == 1: + y_trend = pd.Series(scale * minmax_scale(x + 2)) + elif trend_degree == 2: + y_trend = pd.Series(scale * minmax_scale(x**2)) + elif trend_degree == 3: + y_trend = pd.Series(scale * minmax_scale((x - 5) ** 3 + x**2)) + if period is not None: + y_seasonal = pd.Series(seasonal_scale * np.sin(freq * x)) + y = y_trend + y_seasonal + if set_time_index: + y = y.set_axis(dts) + y_ms_list.append(y) + y_ms = pd.DataFrame(y_ms_list).T + return X, y_ms + + def _return_proper_func(real_or_synthetic, univariate_or_multivariate="univariate"): + if ( + real_or_synthetic == "synthetic" + and univariate_or_multivariate == "univariate" + ): return generate_synthetic_data - elif real_or_synthetic == "real": + elif real_or_synthetic == "real" and univariate_or_multivariate == "univariate": return generate_real_data + if univariate_or_multivariate == "multivariate": + return generate_multiseries_synthetic_data return _return_proper_func