
Commit

clean_up_code
elephaint committed Nov 29, 2024
1 parent e7e25ad commit 046c5be
Showing 6 changed files with 140 additions and 309 deletions.
2 changes: 0 additions & 2 deletions hierarchicalforecast/_modidx.py
@@ -202,8 +202,6 @@
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.aggregate': ( 'src/utils.html#aggregate',
'hierarchicalforecast/utils.py'),
-'hierarchicalforecast.utils.cov2corr': ( 'src/utils.html#cov2corr',
-'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.is_strictly_hierarchical': ( 'src/utils.html#is_strictly_hierarchical',
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.level_to_outputs': ( 'src/utils.html#level_to_outputs',
5 changes: 3 additions & 2 deletions hierarchicalforecast/probabilistic_methods.py
@@ -11,7 +11,7 @@
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder

-from .utils import is_strictly_hierarchical, cov2corr
+from .utils import is_strictly_hierarchical

# %% ../nbs/src/probabilistic_methods.ipynb 6
class Normality:
@@ -64,7 +64,8 @@ def __init__(

# Base Normality Errors assume independence/diagonal covariance
# TODO: replace bilinearity with elementwise row multiplication
-        R1 = cov2corr(self.W)
+        std_ = np.sqrt(np.diag(self.W))
+        R1 = self.W / np.outer(std_, std_)
Wh = [np.diag(sigma) @ R1 @ np.diag(sigma).T for sigma in self.sigmah.T]

# Reconciled covariances across forecast horizon
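The inlined replacement computes the correlation matrix directly from the covariance `W`, which is what the dropped `cov2corr` import used to provide. A minimal standalone sketch of the same conversion (matrix values are illustrative):

import numpy as np

# Illustrative 2x2 covariance matrix standing in for self.W
W = np.array([[4.0, 1.2],
              [1.2, 9.0]])

# Inlined covariance-to-correlation conversion, as in the hunk above:
# each entry W[i, j] is divided by std_i * std_j
std_ = np.sqrt(np.diag(W))
R1 = W / np.outer(std_, std_)

# A correlation matrix has a unit diagonal
assert np.allclose(np.diag(R1), 1.0)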
91 changes: 46 additions & 45 deletions hierarchicalforecast/utils.py
@@ -15,7 +15,7 @@
from narwhals.typing import Frame, FrameT
from numba import njit, prange
from sklearn.preprocessing import OneHotEncoder
-from typing import Dict, List, Optional, Iterable, Union, Sequence
+from typing import Dict, List, Optional, Union, Sequence

# %% ../nbs/src/utils.ipynb 6
# Global variables
@@ -44,7 +44,7 @@ def __exit__(self, exc_type, exc_value, traceback):
)

# %% ../nbs/src/utils.ipynb 8
-def is_strictly_hierarchical(S: np.ndarray, tags: Dict[str, np.ndarray]):
+def is_strictly_hierarchical(S: np.ndarray, tags: Dict[str, np.ndarray]) -> bool:
# main idea:
# if S represents a strictly hierarchical structure
# the number of paths before the bottom level
@@ -60,25 +60,10 @@ def is_strictly_hierarchical(S: np.ndarray, tags: Dict[str, np.ndarray]):
nodes = levels_.popitem()[1].size
return paths == nodes

-# %% ../nbs/src/utils.ipynb 9
-def cov2corr(cov, return_std=False):
-    """convert covariance matrix to correlation matrix
-    **Parameters:**<br>
-    `cov`: array_like, 2d covariance matrix.<br>
-    `return_std`: bool=False, if True returned std.<br>
-    **Returns:**<br>
-    `corr`: ndarray (subclass) correlation matrix
-    """
-    cov = np.asanyarray(cov)
-    std_ = np.sqrt(np.diag(cov))
-    corr = cov / np.outer(std_, std_)
-    if return_std:
-        return corr, std_
-    else:
-        return corr

-# %% ../nbs/src/utils.ipynb 11
-def _to_upper_hierarchy(bottom_split, bottom_values, upper_key):
+# %% ../nbs/src/utils.ipynb 10
+def _to_upper_hierarchy(
+    bottom_split: List[str], bottom_values: str, upper_key: str
+) -> List[str]:
upper_split = upper_key.split("/")
upper_idxs = [bottom_split.index(i) for i in upper_split]

@@ -88,7 +73,7 @@ def join_upper(bottom_value):

return [join_upper(val) for val in bottom_values]

-# %% ../nbs/src/utils.ipynb 14
+# %% ../nbs/src/utils.ipynb 11
def aggregate(
df: Frame,
spec: List[List[str]],
@@ -139,16 +124,18 @@ def aggregate(
raise ValueError("Sparse output is only supported for Pandas DataFrames.")

for col in df_nw.columns:
-        assert (
-            not df_nw[col].is_null().any()
-        ), f"Column {col} contains null values. Make sure no column in the DataFrame contains null values."
+        if df_nw[col].is_null().any():
+            raise ValueError(
+                f"Column {col} contains null values. Make sure no column in the DataFrame contains null values."
+            )

# Check whether all columns in the spec are in the df
aggregation_cols_in_spec = list(
dict.fromkeys([col for cols in spec for col in cols])
)
for col in aggregation_cols_in_spec:
-        assert col in df_nw.columns, f"Column {col} in spec not present in df"
+        if col not in df_nw.columns:
+            raise ValueError(f"Column {col} in spec not present in df")

# Prepare the aggregation dictionary
agg_dict = dict(
@@ -247,7 +234,7 @@ def aggregate(

return Y_df, S_df, tags
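With the asserts gone, invalid input to `aggregate` now surfaces as a `ValueError`. A usage sketch of the new failure mode (the toy frame and spec are illustrative):

import pandas as pd
from hierarchicalforecast.utils import aggregate

# A null anywhere in the input now raises ValueError, not AssertionError
df = pd.DataFrame({
    "country": ["A", "A"],
    "ds": ["2024-01-01", "2024-01-02"],
    "y": [1.0, None],
})
try:
    aggregate(df, spec=[["country"]])
except ValueError as err:
    print(err)  # Column y contains null values. ...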

-# %% ../nbs/src/utils.ipynb 30
+# %% ../nbs/src/utils.ipynb 25
class HierarchicalPlot:
"""Hierarchical Plot
@@ -532,9 +519,9 @@ def plot_hierarchical_predictions_gap(
plt.grid()
plt.show()

-# %% ../nbs/src/utils.ipynb 51
+# %% ../nbs/src/utils.ipynb 46
# convert levels to output quantile names
-def level_to_outputs(level: Iterable[int]):
+def level_to_outputs(level: List[int]) -> tuple[List[float], List[str]]:
"""Converts list of levels into output names matching StatsForecast and NeuralForecast methods.
**Parameters:**<br>
@@ -558,7 +545,7 @@ def level_to_outputs(level: Iterable[int]):


# convert quantiles to output quantile names
-def quantiles_to_outputs(quantiles: Iterable[float]):
+def quantiles_to_outputs(quantiles: List[float]) -> tuple[List[float], List[str]]:
"""Converts list of quantiles into output names matching StatsForecast and NeuralForecast methods.
**Parameters:**<br>
@@ -577,7 +564,7 @@ def quantiles_to_outputs(quantiles: Iterable[float]):
output_names.append("-median")
return quantiles, output_names

-# %% ../nbs/src/utils.ipynb 52
+# %% ../nbs/src/utils.ipynb 47
# given input array of sample forecasts and input quantiles/levels,
# output a Pandas Dataframe with columns of quantile predictions
def samples_to_quantiles_df(
@@ -586,10 +573,11 @@
dates: List[str],
quantiles: Optional[List[float]] = None,
level: Optional[List[int]] = None,
-    model_name: Optional[str] = "model",
+    model_name: str = "model",
id_col: str = "unique_id",
time_col: str = "ds",
-):
+    backend: str = "pandas",
+) -> tuple[List[float], FrameT]:
"""Transform Random Samples into HierarchicalForecast input.
Auxiliary function to create compatible HierarchicalForecast input `Y_hat_df` dataframe.
@@ -602,26 +590,35 @@
`model_name`: string. Name of forecasting model.<br>
`id_col` : str='unique_id', column that identifies each serie.<br>
`time_col` : str='ds', column that identifies each timestep, its values can be timestamps or integers.<br>
+    `backend` : str='pandas', backend to use for the output dataframe, either 'pandas' or 'polars'.<br>
**Returns:**<br>
`quantiles`: float list in [0., 1.]. quantiles to estimate from y distribution .<br>
-    `Y_hat_df`: pd.DataFrame. With base quantile forecasts with columns ds and models to reconcile indexed by unique_id.
+    `Y_hat_df`: DataFrame. With base quantile forecasts with columns ds and models to reconcile indexed by unique_id.
"""

# Get the shape of the array
n_series, n_samples, horizon = samples.shape

-    assert n_series == len(unique_ids)
-    assert horizon == len(dates)
-    assert (quantiles is not None) ^ (
-        level is not None
-    )  # check exactly one of quantiles/levels has been input
+    if n_series != len(unique_ids):
+        raise ValueError(
+            f"Number of unique_ids ({len(unique_ids)}) must match the number of series ({n_series})."
+        )
+    if horizon != len(dates):
+        raise ValueError(
+            f"Number of dates ({len(dates)}) must match third dimension of samples array ({horizon})."
+        )
+    if not ((quantiles is None) ^ (level is None)):
+        raise ValueError("Either quantiles or level must be provided, but not both.")
+
+    namespace = sys.modules.get(backend, None)
+    if namespace is None:
+        raise ValueError(f"DataFrame backend {backend} not installed.")

# create initial dictionary
forecasts_mean = np.mean(samples, axis=1).flatten()
unique_ids = np.repeat(unique_ids, horizon)
ds = np.tile(dates, n_series)
-    data = pd.DataFrame({id_col: unique_ids, time_col: ds, model_name: forecasts_mean})

# create quantiles and quantile names
if level is not None:
@@ -642,11 +639,15 @@
) # [Q,H,N] -> [N,H,Q]
forecasts_quantiles = forecasts_quantiles.reshape(-1, len(_quantiles))

-    df = pd.DataFrame(data=forecasts_quantiles, columns=col_names)
+    df_nw = nw.from_dict(
+        {id_col: unique_ids, time_col: ds, model_name: forecasts_mean},
+        native_namespace=namespace,
+    )
+    df_nw = df_nw.with_columns(**dict(zip(col_names, forecasts_quantiles.T)))

-    return _quantiles, pd.concat([data, df], axis=1).set_index(id_col)
+    return _quantiles, df_nw.to_native()

-# %% ../nbs/src/utils.ipynb 59
+# %% ../nbs/src/utils.ipynb 55
# Masked empirical covariance matrix
@njit(
"Array(float64, 2, 'F')(Array(float64, 2, 'C'), Array(bool, 2, 'C'))",
@@ -685,7 +686,7 @@ def _ma_cov(residuals: np.ndarray, not_nan_mask: np.ndarray):

return W

-# %% ../nbs/src/utils.ipynb 60
+# %% ../nbs/src/utils.ipynb 56
# Shrunk covariance matrix using the Schafer-Strimmer method


@@ -836,7 +837,7 @@ def _shrunk_covariance_schaferstrimmer_with_nans(

return W

-# %% ../nbs/src/utils.ipynb 62
+# %% ../nbs/src/utils.ipynb 58
# Lasso cyclic coordinate descent
@njit(
"Array(float64, 1, 'C')(Array(float64, 2, 'C'), Array(float64, 1, 'C'), float64, int64, float64)",
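`samples_to_quantiles_df` now assembles its output through narwhals and accepts a `backend` argument, so polars output is possible alongside pandas. A usage sketch under the new signature (all input values are illustrative); note the backend module is resolved via `sys.modules`, so it must already be imported:

import numpy as np
from hierarchicalforecast.utils import samples_to_quantiles_df

# 2 series, 100 samples per series, forecast horizon of 3
samples = np.random.default_rng(0).normal(size=(2, 100, 3))
unique_ids = ["series_a", "series_b"]
dates = ["2024-01-31", "2024-02-29", "2024-03-31"]

# Exactly one of `quantiles` / `level` may be passed; violating this
# now raises ValueError instead of tripping a bare assert
quantiles, Y_hat_df = samples_to_quantiles_df(
    samples=samples,
    unique_ids=unique_ids,
    dates=dates,
    level=[80, 90],
    backend="pandas",  # or "polars", if polars is installed and imported
)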
27 changes: 15 additions & 12 deletions nbs/src/core.ipynb
@@ -69,7 +69,7 @@
"import os\n",
"\n",
"from fastcore.test import test_close, test_eq, test_fail\n",
"from nbdev.showdoc import add_docs, show_doc\n",
"from nbdev.showdoc import show_doc\n",
"import pandas as pd"
]
},
@@ -1282,7 +1282,7 @@
"#| hide\n",
"from statsforecast import StatsForecast\n",
"from statsforecast.utils import generate_series\n",
"from statsforecast.models import RandomWalkWithDrift, AutoETS"
"from statsforecast.models import RandomWalkWithDrift"
]
},
{
@@ -1294,15 +1294,15 @@
"#| hide\n",
"# test unbalanced dataset\n",
"max_tenure = 24\n",
"dates = pd.date_range(start='2019-01-31', freq='M', periods=max_tenure)\n",
"dates = pd.date_range(start='2019-01-31', freq='ME', periods=max_tenure)\n",
"cohort_tenure = [24, 23, 22, 21]\n",
"\n",
"ts_list = []\n",
"\n",
"# Create ts for each cohort\n",
"for i in range(len(cohort_tenure)):\n",
" ts_list.append(\n",
" generate_series(n_series=1, freq='M', min_length=cohort_tenure[i], max_length=cohort_tenure[i]).reset_index() \\\n",
" generate_series(n_series=1, freq='ME', min_length=cohort_tenure[i], max_length=cohort_tenure[i]).reset_index() \\\n",
" .assign(ult=i) \\\n",
" .assign(ds=dates[-cohort_tenure[i]:]) \\\n",
" .drop(columns=['unique_id'])\n",
@@ -1328,7 +1328,7 @@
" models=[\n",
" RandomWalkWithDrift(),\n",
" ],\n",
" freq='M',\n",
" freq='ME',\n",
" n_jobs=1,\n",
")\n",
"\n",
@@ -1343,8 +1343,8 @@
"fitted_df = fcst.forecast_fitted_values()\n",
"\n",
"fcst_df = hrec.reconcile(\n",
" Y_hat_df=fcst_df.reset_index(),\n",
" Y_df=fitted_df.reset_index(),\n",
" Y_hat_df=fcst_df,\n",
" Y_df=fitted_df,\n",
" S=S_df,\n",
" tags=tags,\n",
")"
@@ -1380,15 +1380,18 @@
" ]\n",
")\n",
"\n",
"fcst_df = fcst.forecast(df=train_df, h=12, fitted=True)\n",
"fcst_df_pl = fcst.forecast(df=train_df, h=12, fitted=True)\n",
"fitted_df = fcst.forecast_fitted_values()\n",
"\n",
"fcst_df = hrec.reconcile(\n",
" Y_hat_df=fcst_df,\n",
"fcst_df_pl = hrec.reconcile(\n",
" Y_hat_df=fcst_df_pl,\n",
" Y_df=fitted_df,\n",
" S=S_df_pl,\n",
" tags=tags,\n",
")"
")\n",
"\n",
"# Test equivalence\n",
"pd.testing.assert_frame_equal(fcst_df, fcst_df_pl.to_pandas())"
]
},
{
@@ -1744,7 +1747,7 @@
"import pandas as pd\n",
"\n",
"from statsforecast.core import StatsForecast\n",
"from statsforecast.models import AutoETS, Naive\n",
"from statsforecast.models import AutoETS\n",
"\n",
"from hierarchicalforecast.utils import aggregate\n",
"from hierarchicalforecast.core import HierarchicalReconciliation\n",
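The notebook's `freq='M'` → `freq='ME'` edits track pandas 2.2, which renamed the month-end alias and deprecated `'M'`. A quick check of the new alias:

import pandas as pd

# 'ME' is the month-end frequency alias in pandas >= 2.2 ('M' is deprecated)
dates = pd.date_range(start="2019-01-31", freq="ME", periods=3)
print(dates)
# DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31'], dtype='datetime64[ns]', freq='ME')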
5 changes: 3 additions & 2 deletions nbs/src/probabilistic_methods.ipynb
@@ -42,7 +42,7 @@
"from scipy.stats import norm\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"from hierarchicalforecast.utils import is_strictly_hierarchical, cov2corr"
"from hierarchicalforecast.utils import is_strictly_hierarchical"
]
},
{
@@ -120,7 +120,8 @@
"\n",
" # Base Normality Errors assume independence/diagonal covariance\n",
" # TODO: replace bilinearity with elementwise row multiplication\n",
" R1 = cov2corr(self.W)\n",
" std_ = np.sqrt(np.diag(self.W))\n",
" R1 = self.W / np.outer(std_, std_) \n",
" Wh = [np.diag(sigma) @ R1 @ np.diag(sigma).T for sigma in self.sigmah.T]\n",
"\n",
" # Reconciled covariances across forecast horizon\n",
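For context on the unchanged `Wh` line just above: the shared correlation matrix `R1` is rescaled by each horizon step's standard deviations, giving back a per-step covariance with `sigma**2` on the diagonal. A small numeric sketch (values illustrative):

import numpy as np

# Shared correlation matrix across the horizon
R1 = np.array([[1.0, 0.2],
               [0.2, 1.0]])

# Per-step rescaling: diag(sigma) @ R1 @ diag(sigma) is that step's covariance
sigma = np.array([2.0, 3.0])
Wh_step = np.diag(sigma) @ R1 @ np.diag(sigma).T
assert np.allclose(np.diag(Wh_step), sigma**2)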
