breaking: raise error for gaps in series #504

Merged: 19 commits, merged on Oct 31, 2024
Changes from 8 commits
README.md (2 changes: 1 addition & 1 deletion)

@@ -70,7 +70,7 @@ nixtla_client.plot(df, fcst_df, level=[80, 90])
nixtla_client = NixtlaClient(api_key = 'YOUR API KEY HERE')

# 2. Read Data # Wikipedia visits of NFL Star (Peyton Manning)
df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/peyton_manning.csv')
df = pd.read_csv('https://datasets-nixtla.s3.amazonaws.com/peyton-manning.csv')


# 3. Detect Anomalies
nbs/docs/capabilities/anomaly-detection/01_quickstart.ipynb (27 changes: 6 additions & 21 deletions)

Large diffs are not rendered by default.

nbs/docs/capabilities/anomaly-detection/04_confidence_levels.ipynb (23 changes: 6 additions & 17 deletions)

Large diffs are not rendered by default.

nbs/docs/capabilities/forecast/11_irregular_timestamps.ipynb (12 changes: 7 additions & 5 deletions)

@@ -130,9 +130,8 @@
"output_type": "stream",
"text": [
"INFO:nixtla.nixtla_client:Validating inputs...\n",
"INFO:nixtla.nixtla_client:Preprocessing dataframes...\n",
"WARNING:nixtla.nixtla_client:You did not provide X_df. Exogenous variables in df are ignored. To surpress this warning, please add X_df with exogenous variables: Open, High, Low, Adj Close, Volume, Dividends, Stock Splits\n",
"WARNING:nixtla.nixtla_client:The specified horizon \"h\" exceeds the model horizon. This may lead to less accurate forecasts. Please consider using a smaller horizon.\n",
"INFO:nixtla.nixtla_client:Preprocessing dataframes...\n",
"INFO:nixtla.nixtla_client:Restricting input...\n",
"INFO:nixtla.nixtla_client:Calling Forecast Endpoint...\n"
]
@@ -141,12 +140,15 @@
"source": [
"# Read data\n",
"# Dates for the weekends are missing\n",
"df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/openbb/pltr.csv')\n",
"df = pd.read_csv(\n",
" 'https://datasets-nixtla.s3.amazonaws.com/pltr.csv',\n",
" usecols=['date', 'Close'],\n",
")\n",
"\n",
"# Forecast\n",
"# We use B for the freq, as only business days are represented in the dataset\n",
"forecast_df = nixtla_client.forecast(\n",
" df=df, \n",
" df=df,\n",
[Review thread on this line]

elephaint (Contributor) commented on Oct 28, 2024:

Maybe we should add a callout at the bottom of this capabilities notebook stating that TimeGPT doesn't allow gaps in the timestamps? E.g.:

"Make sure there are no gaps in your time series data. This means that even if the chosen frequency is irregular, you should still provide a value for every irregular timestamp in the data. For example, if your frequency is 'B' (business day), there can't be a gap (missing data point) between two consecutive business days."

Edit: perhaps also add a similar comment to the beginning or end of the tutorial notebook 12_irregular.

Author (Member) replied:

We already have that in the data requirements notebook:

"When using TimeGPT, the data cannot contain missing values. This means that for every series, there should be no gaps in the timestamps and no missing values in the target variable."

elephaint (Contributor) replied:

Sure, but repetition is the key to education? 😆

Author (Member) replied:

Yeah, that may help, but I'd prefer to add a link to that section instead; otherwise we'll have to remember to change it in every place we state it and will most likely miss some.

[End of review thread]

" h=14, \n",
" freq='B',\n",
" time_col='date', \n",
@@ -185,5 +187,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
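
The review thread above settles on linking to the data-requirements section rather than repeating the callout. For readers who hit the new restriction, here is a minimal sketch of how a gapped series could be repaired before forecasting. It assumes the `fill_gaps` helper from `utilsforecast.preprocessing` (not part of this diff) with the signature shown; how to impute the inserted NaNs is a modeling choice that the client does not make for you.

```python
# Hypothetical repair flow, assuming utilsforecast's fill_gaps helper.
import pandas as pd
from utilsforecast.preprocessing import fill_gaps

# Toy business-day series with one missing date (2023-01-05 is absent).
df = pd.DataFrame({
    'unique_id': 'series_0',
    'date': pd.to_datetime(['2023-01-02', '2023-01-03', '2023-01-04', '2023-01-06']),
    'Close': [10.0, 10.5, 11.2, 11.0],
})

# Insert rows for the missing business days; the target becomes NaN there.
filled = fill_gaps(df, freq='B', id_col='unique_id', time_col='date')

# Imputation is up to the user; forward-fill is just one option.
filled['Close'] = filled.groupby('unique_id')['Close'].ffill()
```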
nbs/docs/tutorials/08_cross_validation.ipynb (323 changes: 159 additions & 164 deletions)

Large diffs are not rendered by default.

nbs/docs/tutorials/12_irregular_timestamps.ipynb (297 changes: 139 additions & 158 deletions)

Large diffs are not rendered by default.

nbs/docs/tutorials/20_anomaly_detection.ipynb (152 changes: 77 additions & 75 deletions)

Large diffs are not rendered by default.

nbs/src/nixtla_client.ipynb (72 changes: 61 additions & 11 deletions)

@@ -70,6 +70,7 @@
")\n",
"from utilsforecast.compat import DFType, DataFrame, pl_DataFrame\n",
"from utilsforecast.feature_engineering import _add_time_features, time_features\n",
"from utilsforecast.preprocessing import id_time_grid\n",
"from utilsforecast.validation import ensure_time_dtype, validate_format\n",
"if TYPE_CHECKING:\n",
" try:\n",
@@ -871,6 +872,7 @@
" target_col: str,\n",
" model: str,\n",
" validate_api_key: bool,\n",
" freq: Optional[str],\n",
" ) -> Tuple[DFType, Optional[DFType], bool]:\n",
" if validate_api_key and not self.validate_api_key(log=False):\n",
" raise Exception('API Key not valid, please email [email protected]')\n",
@@ -896,7 +898,25 @@
" validate_format(df=df, id_col=id_col, time_col=time_col, target_col=target_col)\n",
" if ufp.is_nan_or_none(df[target_col]).any():\n",
" raise ValueError(f'Target column ({target_col}) cannot contain missing values.')\n",
" return df, X_df, drop_id\n",
" freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)\n",
" expected_ids_times = id_time_grid(\n",
" df,\n",
" freq=freq,\n",
" start=\"per_serie\",\n",
" end=\"per_serie\",\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" )\n",
" if len(df) != len(expected_ids_times):\n",
" raise ValueError(\n",
" \"Series contain missing or duplicate timestamps, or the timestamps \"\n",
" \"do not match the provided frequency.\\n\"\n",
" \"Please make sure that all series have a single observation from the first \"\n",
" \"to the last timestamp and that the provided frequency matches the timestamps'.\\n\"\n",
" \"You can refer to https://docs.nixtla.io/docs/tutorials-missing_values \"\n",
" \"for an end to end example.\"\n",
" )\n",
" return df, X_df, drop_id, freq\n",
"\n",
" def validate_api_key(self, log: bool = True) -> bool:\n",
" \"\"\"Returns True if your api_key is valid.\"\"\"\n",
@@ -1045,20 +1065,20 @@
" self.__dict__.pop('feature_contributions', None)\n",
" model = self._maybe_override_model(model)\n",
" logger.info('Validating inputs...')\n",
" df, X_df, drop_id = self._run_validations(\n",
" df, X_df, drop_id, freq = self._run_validations(\n",
" df=df,\n",
" X_df=X_df,\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
" validate_api_key=validate_api_key,\n",
" model=model,\n",
" freq=freq,\n",
" )\n",
" df, X_df = _validate_exog(\n",
" df, X_df, id_col=id_col, time_col=time_col, target_col=target_col\n",
" )\n",
" level, quantiles = _prepare_level_and_quantiles(level, quantiles)\n",
" freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)\n",
" standard_freq = _standardize_freq(freq)\n",
" model_input_size, model_horizon = self._get_model_params(model, standard_freq)\n",
" if finetune_steps > 0 or level is not None or add_history:\n",
@@ -1279,16 +1299,16 @@
" self.__dict__.pop('weights_x', None)\n",
" model = self._maybe_override_model(model)\n",
" logger.info('Validating inputs...')\n",
" df, _, drop_id = self._run_validations(\n",
" df, _, drop_id, freq = self._run_validations(\n",
" df=df,\n",
" X_df=None,\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
" validate_api_key=validate_api_key,\n",
" model=model,\n",
" freq=freq,\n",
" )\n",
" freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)\n",
" standard_freq = _standardize_freq(freq)\n",
" model_input_size, model_horizon = self._get_model_params(model, standard_freq)\n",
"\n",
@@ -1468,16 +1488,16 @@
" )\n",
" model = self._maybe_override_model(model)\n",
" logger.info('Validating inputs...')\n",
" df, _, drop_id = self._run_validations(\n",
" df, _, drop_id, freq = self._run_validations(\n",
" df=df,\n",
" X_df=None,\n",
" id_col=id_col,\n",
" time_col=time_col,\n",
" target_col=target_col,\n",
" validate_api_key=validate_api_key,\n",
" model=model,\n",
" freq=freq,\n",
" )\n",
" freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)\n",
" standard_freq = _standardize_freq(freq)\n",
" level, quantiles = _prepare_level_and_quantiles(level, quantiles)\n",
" model_input_size, model_horizon = self._get_model_params(model, standard_freq)\n",
@@ -1734,6 +1754,34 @@
"nixtla_client.validate_api_key()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# missing times\n",
"series = generate_series(2, min_length=100, freq='5min')\n",
"with_gaps = series.sample(frac=0.5, random_state=0)\n",
"expected_msg = 'missing or duplicate timestamps, or the timestamps do not match'\n",
"# gaps\n",
"test_fail(\n",
" lambda: nixtla_client.forecast(df=with_gaps, h=1, freq='5min'),\n",
" contains=expected_msg,\n",
")\n",
"# duplicates\n",
"test_fail(\n",
" lambda: nixtla_client.forecast(df=pd.concat([series, series]), h=1, freq='5min'),\n",
" contains=expected_msg,\n",
")\n",
"# wrong freq\n",
"test_fail(\n",
" lambda: nixtla_client.forecast(df=series, h=1, freq='1min'),\n",
" contains=expected_msg,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -2394,8 +2442,8 @@
"anom_inferred_df_index = nixtla_client.detect_anomalies(df_ds_index)\n",
"fcst_inferred_df = nixtla_client.forecast(df_[['ds', 'unique_id', 'y']], h=10)\n",
"anom_inferred_df = nixtla_client.detect_anomalies(df_[['ds', 'unique_id', 'y']])\n",
"pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df, atol=1e-3)\n",
"pd.testing.assert_frame_equal(anom_inferred_df_index, anom_inferred_df, atol=1e-3)\n",
"pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df, atol=1e-4, rtol=1e-3)\n",
"pd.testing.assert_frame_equal(anom_inferred_df_index, anom_inferred_df, atol=1e-4, rtol=1e-3)\n",
"df_ds_index = df_ds_index.groupby('unique_id').tail(80)\n",
"for freq in ['Y', 'W-MON', 'Q-DEC', 'H']:\n",
" df_ds_index.index = np.concatenate(\n",
@@ -2405,7 +2453,7 @@
" fcst_inferred_df_index = nixtla_client.forecast(df_ds_index, h=10)\n",
" df_test = df_ds_index.reset_index()\n",
" fcst_inferred_df = nixtla_client.forecast(df_test, h=10)\n",
" pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df, atol=1e-3)"
" pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df, atol=1e-4, rtol=1e-3)"
]
},
{
@@ -2547,7 +2595,9 @@
"\n",
"pd.testing.assert_frame_equal(\n",
" timegpt_anomalies_df_1,\n",
" timegpt_anomalies_df_2 \n",
" timegpt_anomalies_df_2,\n",
" atol=1e-4,\n",
" rtol=1e-3,\n",
")"
]
},
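The core of the new check is a row-count comparison: build the full (id, timestamp) grid each series should span and compare it against the input. Below is a standalone sketch of that logic, reusing the same `id_time_grid` call and the `generate_series` test helper that appear in this diff; the wrapper function name is invented for illustration.

```python
# Standalone sketch of the gap check added above: compare the input against
# the full (id, timestamp) grid each series is expected to cover.
import pandas as pd
from utilsforecast.data import generate_series
from utilsforecast.preprocessing import id_time_grid

def has_gaps_or_duplicates(df, freq, id_col='unique_id', time_col='ds'):
    expected = id_time_grid(
        df, freq=freq, start='per_serie', end='per_serie',
        id_col=id_col, time_col=time_col,
    )
    # Matching lengths means exactly one observation per expected timestamp.
    return len(df) != len(expected)

series = generate_series(2, min_length=100, freq='5min')
assert not has_gaps_or_duplicates(series, freq='5min')
assert has_gaps_or_duplicates(series.sample(frac=0.5, random_state=0), freq='5min')  # gaps
assert has_gaps_or_duplicates(pd.concat([series, series]), freq='5min')  # duplicates
```
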
nixtla/nixtla_client.py (36 changes: 28 additions & 8 deletions)

@@ -40,6 +40,7 @@
)
from utilsforecast.compat import DFType, DataFrame, pl_DataFrame
from utilsforecast.feature_engineering import _add_time_features, time_features
from utilsforecast.preprocessing import id_time_grid
from utilsforecast.validation import ensure_time_dtype, validate_format

if TYPE_CHECKING:
@@ -800,6 +801,7 @@ def _run_validations(
target_col: str,
model: str,
validate_api_key: bool,
freq: Optional[str],
) -> Tuple[DFType, Optional[DFType], bool]:
if validate_api_key and not self.validate_api_key(log=False):
raise Exception("API Key not valid, please email [email protected]")
@@ -827,7 +829,25 @@
raise ValueError(
f"Target column ({target_col}) cannot contain missing values."
)
return df, X_df, drop_id
freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)
expected_ids_times = id_time_grid(
df,
freq=freq,
start="per_serie",
end="per_serie",
id_col=id_col,
time_col=time_col,
)
if len(df) != len(expected_ids_times):
raise ValueError(
"Series contain missing or duplicate timestamps, or the timestamps "
"do not match the provided frequency.\n"
"Please make sure that all series have a single observation from the first "
"to the last timestamp and that the provided frequency matches the timestamps'.\n"
"You can refer to https://docs.nixtla.io/docs/tutorials-missing_values "
"for an end to end example."
)
return df, X_df, drop_id, freq

def validate_api_key(self, log: bool = True) -> bool:
"""Returns True if your api_key is valid."""
@@ -975,20 +995,20 @@ def forecast(
self.__dict__.pop("feature_contributions", None)
model = self._maybe_override_model(model)
logger.info("Validating inputs...")
df, X_df, drop_id = self._run_validations(
df, X_df, drop_id, freq = self._run_validations(
df=df,
X_df=X_df,
id_col=id_col,
time_col=time_col,
target_col=target_col,
validate_api_key=validate_api_key,
model=model,
freq=freq,
)
df, X_df = _validate_exog(
df, X_df, id_col=id_col, time_col=time_col, target_col=target_col
)
level, quantiles = _prepare_level_and_quantiles(level, quantiles)
freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)
standard_freq = _standardize_freq(freq)
model_input_size, model_horizon = self._get_model_params(model, standard_freq)
if finetune_steps > 0 or level is not None or add_history:
@@ -1215,16 +1235,16 @@ def detect_anomalies(
self.__dict__.pop("weights_x", None)
model = self._maybe_override_model(model)
logger.info("Validating inputs...")
df, _, drop_id = self._run_validations(
df, _, drop_id, freq = self._run_validations(
df=df,
X_df=None,
id_col=id_col,
time_col=time_col,
target_col=target_col,
validate_api_key=validate_api_key,
model=model,
freq=freq,
)
freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)
standard_freq = _standardize_freq(freq)
model_input_size, model_horizon = self._get_model_params(model, standard_freq)

@@ -1406,16 +1426,16 @@ def cross_validation(
)
model = self._maybe_override_model(model)
logger.info("Validating inputs...")
df, _, drop_id = self._run_validations(
df, _, drop_id, freq = self._run_validations(
df=df,
X_df=None,
id_col=id_col,
time_col=time_col,
target_col=target_col,
validate_api_key=validate_api_key,
model=model,
freq=freq,
)
freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)
standard_freq = _standardize_freq(freq)
level, quantiles = _prepare_level_and_quantiles(level, quantiles)
model_input_size, model_horizon = self._get_model_params(model, standard_freq)
@@ -1628,7 +1648,7 @@ def plot(
ax=ax,
)

# %% ../nbs/src/nixtla_client.ipynb 50
# %% ../nbs/src/nixtla_client.ipynb 51
def _forecast_wrapper(
df: pd.DataFrame,
client: NixtlaClient,
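From the caller's side, the breaking change means a gapped series now fails fast during input validation instead of producing a silently misaligned forecast. A hedged sketch of that behavior follows: 'YOUR_API_KEY' is a placeholder, and since the check runs client-side during validation (before the endpoint is called, as the diff suggests), no request should be sent before the error is raised.

```python
# Sketch of the new caller-visible behavior: a missing timestamp now raises.
import pandas as pd
from nixtla import NixtlaClient

client = NixtlaClient(api_key='YOUR_API_KEY')  # placeholder key

df = pd.DataFrame({
    'unique_id': 'id_0',
    'ds': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-04']),  # Jan 3 missing
    'y': [1.0, 2.0, 3.0],
})

try:
    client.forecast(df=df, h=7, freq='D')
except ValueError as err:
    # "Series contain missing or duplicate timestamps, ..."
    print(err)
```
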
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"tabulate",
]
distributed = ["fugue[dask,ray,spark]>=0.8.7", "pandas<2.2", "ray<2.6.3"]
plotting = ["utilsforecast[plotting]>=0.2.3"]
plotting = ["utilsforecast[plotting]>=0.2.7"]
date_extras = ["holidays"]

setuptools.setup(