Commit

fix keep_last_n with target_transforms and add local standard scaler (#…

jmoralez authored Jul 20, 2023
1 parent 80fb9fb commit 0584e0b
Showing 5 changed files with 168 additions and 11 deletions.
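This commit makes two changes. First, `TimeSeries` no longer trims each series to its last `keep_last_n` observations during `_fit`; the full series is stored and the trim is applied in `_predict_setup` instead, which makes `keep_last_n` compatible with `target_transforms`. Second, it adds a `LocalStandardScaler` target transform backed by two numba-jitted helpers. A minimal round-trip sketch of the new scaler, adapted from the test added in `nbs/target_transforms.ipynb` (it relies only on APIs visible in this diff):

import pandas as pd

from mlforecast.target_transforms import LocalStandardScaler
from mlforecast.utils import generate_daily_series

series = generate_daily_series(10, min_length=50, max_length=50)
sc = LocalStandardScaler()
sc.set_column_names('unique_id', 'ds', 'y')  # wiring normally done by TimeSeries
scaled = sc.fit_transform(series)            # per-series (y - mean) / std
# inverse_transform maps the values back to the original scale
pd.testing.assert_frame_equal(sc.inverse_transform(scaled), series)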
12 changes: 11 additions & 1 deletion mlforecast/_modidx.py
@@ -200,7 +200,17 @@
'mlforecast.target_transforms.Differences.fit_transform': ( 'target_transforms.html#differences.fit_transform',
'mlforecast/target_transforms.py'),
'mlforecast.target_transforms.Differences.inverse_transform': ( 'target_transforms.html#differences.inverse_transform',
'mlforecast/target_transforms.py')},
'mlforecast/target_transforms.py'),
'mlforecast.target_transforms.LocalStandardScaler': ( 'target_transforms.html#localstandardscaler',
'mlforecast/target_transforms.py'),
'mlforecast.target_transforms.LocalStandardScaler.fit_transform': ( 'target_transforms.html#localstandardscaler.fit_transform',
'mlforecast/target_transforms.py'),
'mlforecast.target_transforms.LocalStandardScaler.inverse_transform': ( 'target_transforms.html#localstandardscaler.inverse_transform',
'mlforecast/target_transforms.py'),
'mlforecast.target_transforms._standard_scaler_inverse_transform': ( 'target_transforms.html#_standard_scaler_inverse_transform',
'mlforecast/target_transforms.py'),
'mlforecast.target_transforms._standard_scaler_transform': ( 'target_transforms.html#_standard_scaler_transform',
'mlforecast/target_transforms.py')},
'mlforecast.utils': { 'mlforecast.utils.PredictionIntervals': ('utils.html#predictionintervals', 'mlforecast/utils.py'),
'mlforecast.utils.PredictionIntervals.__init__': ( 'utils.html#predictionintervals.__init__',
'mlforecast/utils.py'),
8 changes: 4 additions & 4 deletions mlforecast/core.py
@@ -222,6 +222,7 @@ def _fit(
self.id_col = id_col
self.target_col = target_col
self.time_col = time_col
self.keep_last_n = keep_last_n
to_drop = [id_col, time_col, target_col]
self.static_features = static_features
if static_features is None:
@@ -247,9 +248,6 @@
sorted_df = sorted_df.set_index([id_col, time_col])
self.uids = sorted_df.index.unique(level=0)
self.ga = GroupedArray.from_sorted_df(sorted_df, id_col, target_col)
self.features_ = self._compute_transforms()
if keep_last_n is not None:
self.ga = self.ga.take_from_groups(slice(-keep_last_n, None))
self._ga = GroupedArray(self.ga.data, self.ga.indptr)
self.last_dates = sorted_df.index.get_level_values(self.time_col)[
self.ga.indptr[1:] - 1
@@ -330,6 +328,7 @@ def _transform(
if `dropna=True` then all the null rows are dropped."""
modifies_target = bool(self.target_transforms)
df = df.copy(deep=modifies_target and not return_X_y)
self.features_ = self._compute_transforms()

# lag transforms
for feat in self.transforms.keys():
@@ -399,7 +398,6 @@ def fit_transform(
If `keep_last_n` is not None then that number of observations is kept across all series for updates.
"""
self.dropna = dropna
self.keep_last_n = keep_last_n
self._fit(data, id_col, time_col, target_col, static_features, keep_last_n)
return self._transform(
data, dropna=dropna, max_horizon=max_horizon, return_X_y=return_X_y
@@ -461,6 +459,8 @@ def _predict_setup(self) -> None:
self.test_dates = []
self.y_pred = []
self.ga = GroupedArray(self._ga.data, self._ga.indptr)
if self.keep_last_n is not None:
self.ga = self.ga.take_from_groups(slice(-self.keep_last_n, None))

def _get_features_for_next_step(self, dynamic_dfs):
new_x = self._update_features()
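Summary of the core.py change: `_fit` previously computed `self.features_` and then cut `self.ga` down to the last `keep_last_n` values before saving `self._ga`, so the array restored at predict time had already lost history that target transforms may depend on. Now `keep_last_n` is stored on the instance, `features_` is computed inside `_transform` (which `fit_transform` calls), and the trim to `slice(-keep_last_n, None)` happens in `_predict_setup`, right after `self._ga` is restored; the regression test added in `nbs/core.ipynb` exercises exactly this combination. A toy illustration (hypothetical values, not library code) of what `take_from_groups(slice(-keep_last_n, None))` does on the flat `data`/`indptr` layout that `GroupedArray` uses:

import numpy as np

# Two concatenated series in GroupedArray's flat layout:
data = np.arange(10.0)
indptr = np.array([0, 6, 10])  # series 0 -> data[0:6], series 1 -> data[6:10]
keep_last_n = 3

# Keep only each series' last `keep_last_n` observations:
trimmed = np.concatenate([
    data[max(indptr[i], indptr[i + 1] - keep_last_n):indptr[i + 1]]
    for i in range(len(indptr) - 1)
])
print(trimmed)  # [3. 4. 5. 7. 8. 9.]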
49 changes: 48 additions & 1 deletion mlforecast/target_transforms.py
@@ -1,7 +1,7 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/target_transforms.ipynb.

# %% auto 0
__all__ = ['BaseTargetTransform', 'Differences']
__all__ = ['BaseTargetTransform', 'Differences', 'LocalStandardScaler']

# %% ../nbs/target_transforms.ipynb 2
import abc
@@ -11,6 +11,7 @@
if TYPE_CHECKING:
import pandas as pd
import numpy as np
from numba import njit

from .grouped_array import GroupedArray, _apply_difference

@@ -67,3 +68,49 @@ def inverse_transform(self, df: "pd.DataFrame") -> "pd.DataFrame":
ga.restore_difference(model_preds, d)
df[model] = model_preds
return df

# %% ../nbs/target_transforms.ipynb 5
@njit
def _standard_scaler_transform(data, indptr, stats, out):
n_series = len(indptr) - 1
for i in range(n_series):
sl = slice(indptr[i], indptr[i + 1])
subs = data[sl]
mean_ = subs.mean()
std_ = subs.std()
stats[i] = mean_, std_
out[sl] = (data[sl] - mean_) / std_


@njit
def _standard_scaler_inverse_transform(preds, stats):
n_series = stats.shape[0]
h = preds.size // n_series
k = 0
for i in range(n_series):
mean_, std_ = stats[i]
for _ in range(h):
preds[k] = preds[k] * std_ + mean_
k += 1

# %% ../nbs/target_transforms.ipynb 6
class LocalStandardScaler(BaseTargetTransform):
"""Standardizes each serie by subtracting its mean and dividing by its standard deviation."""

def fit_transform(self, df: "pd.DataFrame") -> "pd.DataFrame":
ga = GroupedArray.from_sorted_df(df, self.id_col, self.target_col)
self.stats_ = np.empty((len(ga.indptr) - 1, 2))
out = np.empty_like(ga.data)
_standard_scaler_transform(ga.data, ga.indptr, self.stats_, out)
df = df.copy()
df[self.target_col] = out
return df

def inverse_transform(self, df: "pd.DataFrame") -> "pd.DataFrame":
df = df.copy()
model_cols = df.columns.drop([self.id_col, self.time_col])
for model in model_cols:
model_preds = df[model].values
_standard_scaler_inverse_transform(model_preds, self.stats_)
df[model] = model_preds
return df
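The jitted helpers store per-series (mean, std) pairs in `self.stats_` and standardize each series' slice of the flat grouped layout; note that `_standard_scaler_inverse_transform` assumes every series was predicted for the same horizon (`h = preds.size // n_series`) and modifies `preds` in place. A quick numpy cross-check of both helpers under that assumption (illustrative; the values are made up and the series are given equal lengths so the horizon assumption holds):

import numpy as np

from mlforecast.target_transforms import (
    _standard_scaler_inverse_transform,
    _standard_scaler_transform,
)

data = np.random.rand(8)
indptr = np.array([0, 4, 8])  # two series of equal length 4
stats = np.empty((2, 2))
out = np.empty_like(data)
_standard_scaler_transform(data, indptr, stats, out)

# each slice is standardized with its own mean and std
first = data[:4]
np.testing.assert_allclose(out[:4], (first - first.mean()) / first.std())

# the inverse works in place and restores the original values
preds = out.copy()
_standard_scaler_inverse_transform(preds, stats)
np.testing.assert_allclose(preds, data)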
28 changes: 23 additions & 5 deletions nbs/core.ipynb
@@ -63,6 +63,7 @@
"from window_ops.rolling import rolling_mean\n",
"from window_ops.shift import shift_array\n",
"\n",
"from mlforecast.target_transforms import LocalStandardScaler\n",
"from mlforecast.utils import generate_daily_series, generate_prices_for_series"
]
},
@@ -671,6 +672,7 @@
" self.id_col = id_col\n",
" self.target_col = target_col\n",
" self.time_col = time_col\n",
" self.keep_last_n = keep_last_n \n",
" to_drop = [id_col, time_col, target_col]\n",
" self.static_features = static_features\n",
" if static_features is None:\n",
@@ -697,9 +699,6 @@
" sorted_df = sorted_df.set_index([id_col, time_col])\n",
" self.uids = sorted_df.index.unique(level=0)\n",
" self.ga = GroupedArray.from_sorted_df(sorted_df, id_col, target_col)\n",
" self.features_ = self._compute_transforms()\n",
" if keep_last_n is not None:\n",
" self.ga = self.ga.take_from_groups(slice(-keep_last_n, None))\n",
" self._ga = GroupedArray(self.ga.data, self.ga.indptr)\n",
" self.last_dates = sorted_df.index.get_level_values(self.time_col)[self.ga.indptr[1:] - 1]\n",
" self.features_order_ = df.columns.drop(to_drop).tolist() + self.features\n",
@@ -778,6 +777,7 @@
" if `dropna=True` then all the null rows are dropped.\"\"\"\n",
" modifies_target = bool(self.target_transforms)\n",
" df = df.copy(deep=modifies_target and not return_X_y)\n",
" self.features_ = self._compute_transforms()\n",
"\n",
" # lag transforms\n",
" for feat in self.transforms.keys():\n",
@@ -848,7 +848,6 @@
" If `keep_last_n` is not None then that number of observations is kept across all series for updates.\n",
" \"\"\"\n",
" self.dropna = dropna\n",
" self.keep_last_n = keep_last_n\n",
" self._fit(data, id_col, time_col, target_col, static_features, keep_last_n)\n",
" return self._transform(data, dropna=dropna, max_horizon=max_horizon, return_X_y=return_X_y)\n",
"\n",
@@ -907,7 +906,9 @@
" self.test_dates = []\n",
" self.y_pred = []\n",
" self.ga = GroupedArray(self._ga.data, self._ga.indptr)\n",
" \n",
" if self.keep_last_n is not None:\n",
" self.ga = self.ga.take_from_groups(slice(-self.keep_last_n, None))\n",
"\n",
" def _get_features_for_next_step(self, dynamic_dfs):\n",
" new_x = self._update_features()\n",
" if dynamic_dfs:\n",
@@ -1496,6 +1497,7 @@
"\n",
"ts = TimeSeries(**flow_config)\n",
"df = ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y', keep_last_n=keep_last_n)\n",
"ts._predict_setup()\n",
"\n",
"expected_lags = ['lag7', 'lag14']\n",
"expected_transforms = ['rolling_mean_lag2_window_size7', \n",
@@ -1816,6 +1818,22 @@
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# target_transform with keep_last_n\n",
"ts = TimeSeries(freq='D', lags=[1], target_transforms=[LocalStandardScaler()])\n",
"ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y', keep_last_n=10)\n",
"preds = ts.predict({'y': NaiveModel()}, 1)\n",
"expected = series.groupby('unique_id').tail(1)[['unique_id', 'ds', 'y']].reset_index(drop=True)\n",
"expected['ds'] += pd.offsets.Day()\n",
"pd.testing.assert_frame_equal(preds, expected)"
]
}
],
"metadata": {
82 changes: 82 additions & 0 deletions nbs/target_transforms.ipynb
@@ -37,6 +37,7 @@
"if TYPE_CHECKING:\n",
" import pandas as pd\n",
"import numpy as np\n",
"from numba import njit\n",
"\n",
"from mlforecast.grouped_array import GroupedArray, _apply_difference"
]
@@ -106,6 +107,87 @@
" df[model] = model_preds\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "008b5feb-1622-4293-ba3b-e7f9704de6cb",
"metadata": {},
"outputs": [],
"source": [
"#| exporti\n",
"@njit\n",
"def _standard_scaler_transform(data, indptr, stats, out):\n",
" n_series = len(indptr) - 1\n",
" for i in range(n_series):\n",
" sl = slice(indptr[i], indptr[i + 1])\n",
" subs = data[sl]\n",
" mean_ = subs.mean()\n",
" std_ = subs.std()\n",
" stats[i] = mean_, std_\n",
" out[sl] = (data[sl] - mean_) / std_\n",
"\n",
"@njit\n",
"def _standard_scaler_inverse_transform(preds, stats):\n",
" n_series = stats.shape[0]\n",
" h = preds.size // n_series\n",
" k = 0\n",
" for i in range(n_series):\n",
" mean_, std_ = stats[i]\n",
" for _ in range(h):\n",
" preds[k] = preds[k] * std_ + mean_\n",
" k += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67473e24-19f4-4c04-bc4b-13c313cbf52a",
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"class LocalStandardScaler(BaseTargetTransform):\n",
" \"\"\"Standardizes each serie by subtracting its mean and dividing by its standard deviation.\"\"\" \n",
" \n",
" def fit_transform(self, df: 'pd.DataFrame') -> 'pd.DataFrame':\n",
" ga = GroupedArray.from_sorted_df(df, self.id_col, self.target_col)\n",
" self.stats_ = np.empty((len(ga.indptr) - 1, 2)) \n",
" out = np.empty_like(ga.data)\n",
" _standard_scaler_transform(ga.data, ga.indptr, self.stats_, out)\n",
" df = df.copy()\n",
" df[self.target_col] = out\n",
" return df\n",
"\n",
" def inverse_transform(self, df: 'pd.DataFrame') -> 'pd.DataFrame': \n",
" df = df.copy()\n",
" model_cols = df.columns.drop([self.id_col, self.time_col]) \n",
" for model in model_cols:\n",
" model_preds = df[model].values\n",
" _standard_scaler_inverse_transform(model_preds, self.stats_)\n",
" df[model] = model_preds\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab57942f-4b55-4748-858c-26e3be6832ae",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from mlforecast.utils import generate_daily_series\n",
"\n",
"series = generate_daily_series(10, min_length=50, max_length=50)\n",
"sc = LocalStandardScaler()\n",
"sc.set_column_names('unique_id', 'ds', 'y')\n",
"pd.testing.assert_frame_equal(\n",
" sc.inverse_transform(sc.fit_transform(series)),\n",
" series,\n",
")"
]
}
],
"metadata": {
