add cross validation fitted values (#164)
jmoralez authored Jul 5, 2023
1 parent 8583b39 commit 8b56955
Showing 3 changed files with 261 additions and 7 deletions.
2 changes: 2 additions & 0 deletions mlforecast/_modidx.py
@@ -107,6 +107,8 @@
'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.cross_validation': ( 'forecast.html#mlforecast.cross_validation',
'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.cross_validation_fitted_values': ( 'forecast.html#mlforecast.cross_validation_fitted_values',
'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.fit': ('forecast.html#mlforecast.fit', 'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.fit_models': ( 'forecast.html#mlforecast.fit_models',
'mlforecast/forecast.py'),
33 changes: 33 additions & 0 deletions mlforecast/forecast.py
@@ -520,6 +520,7 @@ def cross_validation(
prediction_intervals: Optional[PredictionIntervals] = None,
level: Optional[List[Union[int, float]]] = None,
input_size: Optional[int] = None,
fitted: bool = False,
):
"""Perform time series cross validation.
Creates `n_windows` splits where each window has `window_size` test periods,
@@ -566,6 +567,8 @@
Confidence levels between 0 and 100 for prediction intervals.
input_size : int, optional (default=None)
Maximum training samples per serie in each window. If None, will use an expanding window.
fitted : bool (default=False)
Store the in-sample predictions.
Returns
-------
@@ -597,6 +600,7 @@
if static_features is not None:
ex_cols_to_drop.extend(static_features)
has_ex = not data.columns.drop(ex_cols_to_drop).empty
self.cv_fitted_values_ = []
for i_window, (cutoffs, train, valid) in enumerate(splits):
if refit or i_window == 0:
self.fit(
@@ -611,6 +615,28 @@
prediction_intervals=prediction_intervals,
)
self.cv_models_.append(self.models_)
if fitted:
insample_results = train[[id_col, time_col]].copy()
trainX, _ = self.preprocess(
train,
id_col=id_col,
time_col=time_col,
target_col=target_col,
static_features=static_features,
dropna=False,
keep_last_n=keep_last_n,
max_horizon=max_horizon,
return_X_y=True,
)
trainX = trainX[self.ts.features_order_]
for name, model in self.models_.items():
insample_results[name] = model.predict(trainX) # type: ignore[union-attr]
if self.ts.target_transforms is not None:
for tfm in self.ts.target_transforms[::-1]:
insample_results = tfm.inverse_transform(insample_results)
insample_results["fold"] = i_window
insample_results[target_col] = train[target_col].values
self.cv_fitted_values_.append(insample_results)
dynamic_dfs = [valid.drop(columns=[target_col])] if has_ex else None
y_pred = self.predict(
window_size,
@@ -634,3 +660,10 @@
out = pd.concat(results)
cols_order = [id_col, time_col, "cutoff", target_col]
return out[cols_order + out.columns.drop(cols_order).tolist()]

def cross_validation_fitted_values(self):
if not getattr(self, "cv_fitted_values_", []):
raise ValueError("Please run cross_validation with fitted=True first.")
cols_order = [self.ts.id_col, self.ts.time_col, "fold", self.ts.target_col]
out = pd.concat(self.cv_fitted_values_).reset_index(drop=True)
return out[cols_order + out.columns.drop(cols_order).tolist()]
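
Taken together, the forecast.py changes add a `fitted` flag to `cross_validation` and a companion `cross_validation_fitted_values` method that returns the stored in-sample predictions. Below is a minimal sketch of how the new API can be exercised; the synthetic data from `generate_daily_series`, the frequency, lags, and horizon are illustrative choices and not part of this commit (the notebook diff that follows contains the example the commit actually adds):

    import lightgbm as lgb

    from mlforecast import MLForecast
    from mlforecast.target_transforms import Differences
    from mlforecast.utils import generate_daily_series

    # Synthetic long-format data with unique_id, ds and y columns (illustrative only).
    series = generate_daily_series(n_series=5, min_length=200, max_length=300)

    horizon = 14  # test periods per cross validation window (illustrative)
    fcst = MLForecast(
        models=lgb.LGBMRegressor(random_state=0),
        freq='D',
        lags=[1, 7, 14],
        target_transforms=[Differences([7])],
        num_threads=1,
    )
    cv_results = fcst.cross_validation(
        series,
        n_windows=2,
        window_size=horizon,
        step_size=horizon,
        fitted=True,  # store the in-sample predictions of every window
    )
    # One row per training observation and fold: id, timestamp, fold, target value
    # and one column per model with its in-sample prediction.
    fitted_values = fcst.cross_validation_fitted_values()
    print(fitted_values.head())

Calling `cross_validation_fitted_values` without first running `cross_validation(..., fitted=True)` raises the `ValueError` added in this commit.
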
233 changes: 226 additions & 7 deletions nbs/forecast.ipynb
@@ -607,7 +607,8 @@
" after_predict_callback: Optional[Callable] = None,\n",
" prediction_intervals: Optional[PredictionIntervals] = None,\n",
" level: Optional[List[Union[int, float]]] = None,\n",
" input_size: Optional[int] = None, \n",
" input_size: Optional[int] = None,\n",
" fitted: bool = False,\n",
" ):\n",
" \"\"\"Perform time series cross validation.\n",
" Creates `n_windows` splits where each window has `window_size` test periods, \n",
@@ -653,7 +654,9 @@
" level : list of ints or floats, optional (default=None)\n",
" Confidence levels between 0 and 100 for prediction intervals.\n",
" input_size : int, optional (default=None)\n",
" Maximum training samples per serie in each window. If None, will use an expanding window. \n",
" Maximum training samples per serie in each window. If None, will use an expanding window.\n",
" fitted : bool (default=False)\n",
" Store the in-sample predictions.\n",
"\n",
" Returns\n",
" -------\n",
@@ -683,6 +686,7 @@
" if static_features is not None:\n",
" ex_cols_to_drop.extend(static_features)\n",
" has_ex = not data.columns.drop(ex_cols_to_drop).empty\n",
" self.cv_fitted_values_ = []\n",
" for i_window, (cutoffs, train, valid) in enumerate(splits):\n",
" if refit or i_window == 0:\n",
" self.fit(\n",
@@ -697,6 +701,28 @@
" prediction_intervals=prediction_intervals,\n",
" )\n",
" self.cv_models_.append(self.models_)\n",
" if fitted:\n",
" insample_results = train[[id_col, time_col]].copy()\n",
" trainX, _ = self.preprocess(\n",
" train,\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
" static_features=static_features,\n",
" dropna=False,\n",
" keep_last_n=keep_last_n,\n",
" max_horizon=max_horizon,\n",
" return_X_y=True,\n",
" )\n",
" trainX = trainX[self.ts.features_order_]\n",
" for name, model in self.models_.items():\n",
" insample_results[name] = model.predict(trainX) # type: ignore[union-attr]\n",
" if self.ts.target_transforms is not None:\n",
" for tfm in self.ts.target_transforms[::-1]:\n",
" insample_results = tfm.inverse_transform(insample_results)\n",
" insample_results['fold'] = i_window \n",
" insample_results[target_col] = train[target_col].values\n",
" self.cv_fitted_values_.append(insample_results)\n",
" dynamic_dfs = [valid.drop(columns=[target_col])] if has_ex else None\n",
" y_pred = self.predict(\n",
" window_size,\n",
@@ -717,6 +743,13 @@
" results.append(result)\n",
" out = pd.concat(results)\n",
" cols_order = [id_col, time_col, 'cutoff', target_col]\n",
" return out[cols_order + out.columns.drop(cols_order).tolist()]\n",
" \n",
" def cross_validation_fitted_values(self):\n",
" if not getattr(self, 'cv_fitted_values_', []):\n",
" raise ValueError('Please run cross_validation with fitted=True first.')\n",
" cols_order = [self.ts.id_col, self.ts.time_col, 'fold', self.ts.target_col]\n",
" out = pd.concat(self.cv_fitted_values_).reset_index(drop=True)\n",
" return out[cols_order + out.columns.drop(cols_order).tolist()]"
]
},
@@ -2405,7 +2438,8 @@
"> diction_intervals:Optional[mlforecast.utils.\n",
"> PredictionIntervals]=None,\n",
"> level:Optional[List[Union[int,float]]]=None,\n",
"> input_size:Optional[int]=None)\n",
"> input_size:Optional[int]=None,\n",
"> fitted:bool=False)\n",
"\n",
"Perform time series cross validation.\n",
"Creates `n_windows` splits where each window has `window_size` test periods, \n",
@@ -2429,7 +2463,8 @@
"| after_predict_callback | Optional | None | Function to call on the predictions before updating the targets.<br> This function will take a pandas Series with the predictions and should return another one with the same structure.<br> The series identifier is on the index. |\n",
"| prediction_intervals | Optional | None | Configuration to calibrate prediction intervals (Conformal Prediction). |\n",
"| level | Optional | None | Confidence levels between 0 and 100 for prediction intervals. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| fitted | bool | False | Store the in-sample predictions. |\n",
"| **Returns** | **pandas DataFrame** | | **Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.** |"
],
"text/plain": [
@@ -2452,7 +2487,8 @@
"> diction_intervals:Optional[mlforecast.utils.\n",
"> PredictionIntervals]=None,\n",
"> level:Optional[List[Union[int,float]]]=None,\n",
"> input_size:Optional[int]=None)\n",
"> input_size:Optional[int]=None,\n",
"> fitted:bool=False)\n",
"\n",
"Perform time series cross validation.\n",
"Creates `n_windows` splits where each window has `window_size` test periods, \n",
@@ -2476,7 +2512,8 @@
"| after_predict_callback | Optional | None | Function to call on the predictions before updating the targets.<br> This function will take a pandas Series with the predictions and should return another one with the same structure.<br> The series identifier is on the index. |\n",
"| prediction_intervals | Optional | None | Configuration to calibrate prediction intervals (Conformal Prediction). |\n",
"| level | Optional | None | Confidence levels between 0 and 100 for prediction intervals. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| fitted | bool | False | Store the in-sample predictions. |\n",
"| **Returns** | **pandas DataFrame** | | **Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.** |"
]
},
@@ -2492,7 +2529,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "541510fd-3fd5-49ab-baf3-9da4e9956087",
"id": "a9667452-b4b4-4d8c-a952-b7e70a7e92f9",
"metadata": {},
"outputs": [
{
@@ -2640,15 +2677,197 @@
}
],
"source": [
"fcst = MLForecast(\n",
" models=lgb.LGBMRegressor(random_state=0),\n",
" lags=[24 * (i+1) for i in range(7)],\n",
" lag_transforms={\n",
" 1: [(rolling_mean, 24)],\n",
" 24: [(rolling_mean, 24)],\n",
" 48: [(ewm_mean, 0.3)],\n",
" },\n",
" num_threads=1,\n",
" target_transforms=[Differences([24])],\n",
")\n",
"cv_results = fcst.cross_validation(\n",
" train,\n",
" n_windows=4,\n",
" window_size=horizon,\n",
" step_size=horizon,\n",
" fitted=True,\n",
")\n",
"cv_results"
]
},
{
"cell_type": "markdown",
"id": "9a71926a-fbfe-4d2f-99c3-804fe6e2908f",
"metadata": {},
"source": [
"Since we set `fitted=True` we can access the predictions for the training sets as well with the `cross_validation_fitted_values` method."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c295f09-1416-4e55-8701-7a5ef8e2200d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>unique_id</th>\n",
" <th>ds</th>\n",
" <th>fold</th>\n",
" <th>y</th>\n",
" <th>LGBMRegressor</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>H196</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>15.167163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>H196</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>11.4</td>\n",
" <td>14.767163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>H196</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>11.1</td>\n",
" <td>14.467163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>H196</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>10.8</td>\n",
" <td>14.167163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>H196</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>10.6</td>\n",
" <td>13.867163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13435</th>\n",
" <td>H413</td>\n",
" <td>908</td>\n",
" <td>3</td>\n",
" <td>49.0</td>\n",
" <td>40.262691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13436</th>\n",
" <td>H413</td>\n",
" <td>909</td>\n",
" <td>3</td>\n",
" <td>39.0</td>\n",
" <td>26.603123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13437</th>\n",
" <td>H413</td>\n",
" <td>910</td>\n",
" <td>3</td>\n",
" <td>29.0</td>\n",
" <td>42.545732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13438</th>\n",
" <td>H413</td>\n",
" <td>911</td>\n",
" <td>3</td>\n",
" <td>24.0</td>\n",
" <td>30.053714</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13439</th>\n",
" <td>H413</td>\n",
" <td>912</td>\n",
" <td>3</td>\n",
" <td>20.0</td>\n",
" <td>-13.589900</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13440 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" unique_id ds fold y LGBMRegressor\n",
"0 H196 1 0 11.8 15.167163\n",
"1 H196 2 0 11.4 14.767163\n",
"2 H196 3 0 11.1 14.467163\n",
"3 H196 4 0 10.8 14.167163\n",
"4 H196 5 0 10.6 13.867163\n",
"... ... ... ... ... ...\n",
"13435 H413 908 3 49.0 40.262691\n",
"13436 H413 909 3 39.0 26.603123\n",
"13437 H413 910 3 29.0 42.545732\n",
"13438 H413 911 3 24.0 30.053714\n",
"13439 H413 912 3 20.0 -13.589900\n",
"\n",
"[13440 rows x 5 columns]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fcst.cross_validation_fitted_values()"
]
},
{
"cell_type": "markdown",
"id": "f9193261-df9c-40f4-a2ea-27cf39b90cd5",
"metadata": {},
"source": [
"We can also compute prediction intervals by passing a configuration to `prediction_intervals` as well as values for the width through `levels`."
]
},
{
"cell_type": "code",
"execution_count": null,