add cross validation fitted values (#164)
jmoralez authored Jul 5, 2023
1 parent 8583b39 commit 8b56955
Showing 3 changed files with 261 additions and 7 deletions.
2 changes: 2 additions & 0 deletions mlforecast/_modidx.py
@@ -107,6 +107,8 @@
'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.cross_validation': ( 'forecast.html#mlforecast.cross_validation',
'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.cross_validation_fitted_values': ( 'forecast.html#mlforecast.cross_validation_fitted_values',
'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.fit': ('forecast.html#mlforecast.fit', 'mlforecast/forecast.py'),
'mlforecast.forecast.MLForecast.fit_models': ( 'forecast.html#mlforecast.fit_models',
'mlforecast/forecast.py'),
33 changes: 33 additions & 0 deletions mlforecast/forecast.py
@@ -520,6 +520,7 @@ def cross_validation(
prediction_intervals: Optional[PredictionIntervals] = None,
level: Optional[List[Union[int, float]]] = None,
input_size: Optional[int] = None,
fitted: bool = False,
):
"""Perform time series cross validation.
Creates `n_windows` splits where each window has `window_size` test periods,
@@ -566,6 +567,8 @@
Confidence levels between 0 and 100 for prediction intervals.
input_size : int, optional (default=None)
Maximum training samples per serie in each window. If None, will use an expanding window.
fitted : bool (default=False)
Store the in-sample predictions.
Returns
-------
@@ -597,6 +600,7 @@
if static_features is not None:
ex_cols_to_drop.extend(static_features)
has_ex = not data.columns.drop(ex_cols_to_drop).empty
self.cv_fitted_values_ = []
for i_window, (cutoffs, train, valid) in enumerate(splits):
if refit or i_window == 0:
self.fit(
@@ -611,6 +615,28 @@
prediction_intervals=prediction_intervals,
)
self.cv_models_.append(self.models_)
if fitted:
insample_results = train[[id_col, time_col]].copy()
trainX, _ = self.preprocess(
train,
id_col=id_col,
time_col=time_col,
target_col=target_col,
static_features=static_features,
dropna=False,
keep_last_n=keep_last_n,
max_horizon=max_horizon,
return_X_y=True,
)
trainX = trainX[self.ts.features_order_]
for name, model in self.models_.items():
insample_results[name] = model.predict(trainX) # type: ignore[union-attr]
if self.ts.target_transforms is not None:
for tfm in self.ts.target_transforms[::-1]:
insample_results = tfm.inverse_transform(insample_results)
insample_results["fold"] = i_window
insample_results[target_col] = train[target_col].values
self.cv_fitted_values_.append(insample_results)
dynamic_dfs = [valid.drop(columns=[target_col])] if has_ex else None
y_pred = self.predict(
window_size,
@@ -634,3 +660,10 @@
out = pd.concat(results)
cols_order = [id_col, time_col, "cutoff", target_col]
return out[cols_order + out.columns.drop(cols_order).tolist()]

def cross_validation_fitted_values(self):
if not getattr(self, "cv_fitted_values_", []):
raise ValueError("Please run cross_validation with fitted=True first.")
cols_order = [self.ts.id_col, self.ts.time_col, "fold", self.ts.target_col]
out = pd.concat(self.cv_fitted_values_).reset_index(drop=True)
return out[cols_order + out.columns.drop(cols_order).tolist()]
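
Taken together, the forecast.py changes add a `fitted` flag to `cross_validation` and a companion `cross_validation_fitted_values` method that returns the stored in-sample predictions. Below is a minimal sketch of how the new API can be exercised; the synthetic data from `generate_daily_series`, the frequency, lags, and horizon are illustrative choices and not part of this commit (the notebook diff that follows contains the example the commit actually adds):

    import lightgbm as lgb

    from mlforecast import MLForecast
    from mlforecast.target_transforms import Differences
    from mlforecast.utils import generate_daily_series

    # Synthetic long-format data with unique_id, ds and y columns (illustrative only).
    series = generate_daily_series(n_series=5, min_length=200, max_length=300)

    horizon = 14  # test periods per cross validation window (illustrative)
    fcst = MLForecast(
        models=lgb.LGBMRegressor(random_state=0),
        freq='D',
        lags=[1, 7, 14],
        target_transforms=[Differences([7])],
        num_threads=1,
    )
    cv_results = fcst.cross_validation(
        series,
        n_windows=2,
        window_size=horizon,
        step_size=horizon,
        fitted=True,  # store the in-sample predictions of every window
    )
    # One row per training observation and fold: id, timestamp, fold, target value
    # and one column per model with its in-sample prediction.
    fitted_values = fcst.cross_validation_fitted_values()
    print(fitted_values.head())

Calling `cross_validation_fitted_values` without first running `cross_validation(..., fitted=True)` raises the `ValueError` added in this commit.
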
233 changes: 226 additions & 7 deletions nbs/forecast.ipynb
@@ -607,7 +607,8 @@
" after_predict_callback: Optional[Callable] = None,\n",
" prediction_intervals: Optional[PredictionIntervals] = None,\n",
" level: Optional[List[Union[int, float]]] = None,\n",
" input_size: Optional[int] = None, \n",
" input_size: Optional[int] = None,\n",
" fitted: bool = False,\n",
" ):\n",
" \"\"\"Perform time series cross validation.\n",
" Creates `n_windows` splits where each window has `window_size` test periods, \n",
@@ -653,7 +654,9 @@
" level : list of ints or floats, optional (default=None)\n",
" Confidence levels between 0 and 100 for prediction intervals.\n",
" input_size : int, optional (default=None)\n",
" Maximum training samples per serie in each window. If None, will use an expanding window. \n",
" Maximum training samples per serie in each window. If None, will use an expanding window.\n",
" fitted : bool (default=False)\n",
" Store the in-sample predictions.\n",
"\n",
" Returns\n",
" -------\n",
@@ -683,6 +686,7 @@
" if static_features is not None:\n",
" ex_cols_to_drop.extend(static_features)\n",
" has_ex = not data.columns.drop(ex_cols_to_drop).empty\n",
" self.cv_fitted_values_ = []\n",
" for i_window, (cutoffs, train, valid) in enumerate(splits):\n",
" if refit or i_window == 0:\n",
" self.fit(\n",
@@ -697,6 +701,28 @@
" prediction_intervals=prediction_intervals,\n",
" )\n",
" self.cv_models_.append(self.models_)\n",
" if fitted:\n",
" insample_results = train[[id_col, time_col]].copy()\n",
" trainX, _ = self.preprocess(\n",
" train,\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
" static_features=static_features,\n",
" dropna=False,\n",
" keep_last_n=keep_last_n,\n",
" max_horizon=max_horizon,\n",
" return_X_y=True,\n",
" )\n",
" trainX = trainX[self.ts.features_order_]\n",
" for name, model in self.models_.items():\n",
" insample_results[name] = model.predict(trainX) # type: ignore[union-attr]\n",
" if self.ts.target_transforms is not None:\n",
" for tfm in self.ts.target_transforms[::-1]:\n",
" insample_results = tfm.inverse_transform(insample_results)\n",
" insample_results['fold'] = i_window \n",
" insample_results[target_col] = train[target_col].values\n",
" self.cv_fitted_values_.append(insample_results)\n",
" dynamic_dfs = [valid.drop(columns=[target_col])] if has_ex else None\n",
" y_pred = self.predict(\n",
" window_size,\n",
@@ -717,6 +743,13 @@
" results.append(result)\n",
" out = pd.concat(results)\n",
" cols_order = [id_col, time_col, 'cutoff', target_col]\n",
" return out[cols_order + out.columns.drop(cols_order).tolist()]\n",
" \n",
" def cross_validation_fitted_values(self):\n",
" if not getattr(self, 'cv_fitted_values_', []):\n",
" raise ValueError('Please run cross_validation with fitted=True first.')\n",
" cols_order = [self.ts.id_col, self.ts.time_col, 'fold', self.ts.target_col]\n",
" out = pd.concat(self.cv_fitted_values_).reset_index(drop=True)\n",
" return out[cols_order + out.columns.drop(cols_order).tolist()]"
]
},
@@ -2405,7 +2438,8 @@
"> diction_intervals:Optional[mlforecast.utils.\n",
"> PredictionIntervals]=None,\n",
"> level:Optional[List[Union[int,float]]]=None,\n",
"> input_size:Optional[int]=None)\n",
"> input_size:Optional[int]=None,\n",
"> fitted:bool=False)\n",
"\n",
"Perform time series cross validation.\n",
"Creates `n_windows` splits where each window has `window_size` test periods, \n",
@@ -2429,7 +2463,8 @@
"| after_predict_callback | Optional | None | Function to call on the predictions before updating the targets.<br> This function will take a pandas Series with the predictions and should return another one with the same structure.<br> The series identifier is on the index. |\n",
"| prediction_intervals | Optional | None | Configuration to calibrate prediction intervals (Conformal Prediction). |\n",
"| level | Optional | None | Confidence levels between 0 and 100 for prediction intervals. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| fitted | bool | False | Store the in-sample predictions. |\n",
"| **Returns** | **pandas DataFrame** | | **Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.** |"
],
"text/plain": [
@@ -2452,7 +2487,8 @@
"> diction_intervals:Optional[mlforecast.utils.\n",
"> PredictionIntervals]=None,\n",
"> level:Optional[List[Union[int,float]]]=None,\n",
"> input_size:Optional[int]=None)\n",
"> input_size:Optional[int]=None,\n",
"> fitted:bool=False)\n",
"\n",
"Perform time series cross validation.\n",
"Creates `n_windows` splits where each window has `window_size` test periods, \n",
@@ -2476,7 +2512,8 @@
"| after_predict_callback | Optional | None | Function to call on the predictions before updating the targets.<br> This function will take a pandas Series with the predictions and should return another one with the same structure.<br> The series identifier is on the index. |\n",
"| prediction_intervals | Optional | None | Configuration to calibrate prediction intervals (Conformal Prediction). |\n",
"| level | Optional | None | Confidence levels between 0 and 100 for prediction intervals. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n",
"| fitted | bool | False | Store the in-sample predictions. |\n",
"| **Returns** | **pandas DataFrame** | | **Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.** |"
]
},
@@ -2492,7 +2529,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "541510fd-3fd5-49ab-baf3-9da4e9956087",
"id": "a9667452-b4b4-4d8c-a952-b7e70a7e92f9",
"metadata": {},
"outputs": [
{
@@ -2640,15 +2677,197 @@
}
],
"source": [
"fcst = MLForecast(\n",
" models=lgb.LGBMRegressor(random_state=0),\n",
" lags=[24 * (i+1) for i in range(7)],\n",
" lag_transforms={\n",
" 1: [(rolling_mean, 24)],\n",
" 24: [(rolling_mean, 24)],\n",
" 48: [(ewm_mean, 0.3)],\n",
" },\n",
" num_threads=1,\n",
" target_transforms=[Differences([24])],\n",
")\n",
"cv_results = fcst.cross_validation(\n",
" train,\n",
" n_windows=4,\n",
" window_size=horizon,\n",
" step_size=horizon,\n",
" fitted=True,\n",
")\n",
"cv_results"
]
},
{
"cell_type": "markdown",
"id": "9a71926a-fbfe-4d2f-99c3-804fe6e2908f",
"metadata": {},
"source": [
"Since we set `fitted=True` we can access the predictions for the training sets as well with the `cross_validation_fitted_values` method."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c295f09-1416-4e55-8701-7a5ef8e2200d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>unique_id</th>\n",
" <th>ds</th>\n",
" <th>fold</th>\n",
" <th>y</th>\n",
" <th>LGBMRegressor</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>H196</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>11.8</td>\n",
" <td>15.167163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>H196</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>11.4</td>\n",
" <td>14.767163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>H196</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>11.1</td>\n",
" <td>14.467163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>H196</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>10.8</td>\n",
" <td>14.167163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>H196</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>10.6</td>\n",
" <td>13.867163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13435</th>\n",
" <td>H413</td>\n",
" <td>908</td>\n",
" <td>3</td>\n",
" <td>49.0</td>\n",
" <td>40.262691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13436</th>\n",
" <td>H413</td>\n",
" <td>909</td>\n",
" <td>3</td>\n",
" <td>39.0</td>\n",
" <td>26.603123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13437</th>\n",
" <td>H413</td>\n",
" <td>910</td>\n",
" <td>3</td>\n",
" <td>29.0</td>\n",
" <td>42.545732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13438</th>\n",
" <td>H413</td>\n",
" <td>911</td>\n",
" <td>3</td>\n",
" <td>24.0</td>\n",
" <td>30.053714</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13439</th>\n",
" <td>H413</td>\n",
" <td>912</td>\n",
" <td>3</td>\n",
" <td>20.0</td>\n",
" <td>-13.589900</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13440 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" unique_id ds fold y LGBMRegressor\n",
"0 H196 1 0 11.8 15.167163\n",
"1 H196 2 0 11.4 14.767163\n",
"2 H196 3 0 11.1 14.467163\n",
"3 H196 4 0 10.8 14.167163\n",
"4 H196 5 0 10.6 13.867163\n",
"... ... ... ... ... ...\n",
"13435 H413 908 3 49.0 40.262691\n",
"13436 H413 909 3 39.0 26.603123\n",
"13437 H413 910 3 29.0 42.545732\n",
"13438 H413 911 3 24.0 30.053714\n",
"13439 H413 912 3 20.0 -13.589900\n",
"\n",
"[13440 rows x 5 columns]"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fcst.cross_validation_fitted_values()"
]
},
{
"cell_type": "markdown",
"id": "f9193261-df9c-40f4-a2ea-27cf39b90cd5",
"metadata": {},
"source": [
"We can also compute prediction intervals by passing a configuration to `prediction_intervals` as well as values for the width through `levels`."
]
},
{
"cell_type": "code",
"execution_count": null,