
Commit

clean_up_code
elephaint committed Nov 29, 2024
1 parent e7e25ad commit 046c5be
Showing 6 changed files with 140 additions and 309 deletions.
2 changes: 0 additions & 2 deletions hierarchicalforecast/_modidx.py
@@ -202,8 +202,6 @@
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.aggregate': ( 'src/utils.html#aggregate',
'hierarchicalforecast/utils.py'),
-'hierarchicalforecast.utils.cov2corr': ( 'src/utils.html#cov2corr',
-'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.is_strictly_hierarchical': ( 'src/utils.html#is_strictly_hierarchical',
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.level_to_outputs': ( 'src/utils.html#level_to_outputs',
5 changes: 3 additions & 2 deletions hierarchicalforecast/probabilistic_methods.py
@@ -11,7 +11,7 @@
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder

-from .utils import is_strictly_hierarchical, cov2corr
+from .utils import is_strictly_hierarchical

# %% ../nbs/src/probabilistic_methods.ipynb 6
class Normality:
@@ -64,7 +64,8 @@ def __init__(

# Base Normality Errors assume independence/diagonal covariance
# TODO: replace bilinearity with elementwise row multiplication
-        R1 = cov2corr(self.W)
+        std_ = np.sqrt(np.diag(self.W))
+        R1 = self.W / np.outer(std_, std_)
Wh = [np.diag(sigma) @ R1 @ np.diag(sigma).T for sigma in self.sigmah.T]

# Reconciled covariances across forecast horizon
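The inlined replacement computes the correlation matrix directly from the covariance `W`, which is what the dropped `cov2corr` import used to provide. A minimal standalone sketch of the same conversion (matrix values are illustrative):

import numpy as np

# Illustrative 2x2 covariance matrix standing in for self.W
W = np.array([[4.0, 1.2],
              [1.2, 9.0]])

# Inlined covariance-to-correlation conversion, as in the hunk above:
# each entry W[i, j] is divided by std_i * std_j
std_ = np.sqrt(np.diag(W))
R1 = W / np.outer(std_, std_)

# A correlation matrix has a unit diagonal
assert np.allclose(np.diag(R1), 1.0)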
91 changes: 46 additions & 45 deletions hierarchicalforecast/utils.py
@@ -15,7 +15,7 @@
from narwhals.typing import Frame, FrameT
from numba import njit, prange
from sklearn.preprocessing import OneHotEncoder
-from typing import Dict, List, Optional, Iterable, Union, Sequence
+from typing import Dict, List, Optional, Union, Sequence

# %% ../nbs/src/utils.ipynb 6
# Global variables
@@ -44,7 +44,7 @@ def __exit__(self, exc_type, exc_value, traceback):
)

# %% ../nbs/src/utils.ipynb 8
-def is_strictly_hierarchical(S: np.ndarray, tags: Dict[str, np.ndarray]):
+def is_strictly_hierarchical(S: np.ndarray, tags: Dict[str, np.ndarray]) -> bool:
# main idea:
# if S represents a strictly hierarchical structure
# the number of paths before the bottom level
@@ -60,25 +60,10 @@ def is_strictly_hierarchical(S: np.ndarray, tags: Dict[str, np.ndarray]):
nodes = levels_.popitem()[1].size
return paths == nodes

-# %% ../nbs/src/utils.ipynb 9
-def cov2corr(cov, return_std=False):
-    """convert covariance matrix to correlation matrix
-    **Parameters:**<br>
-    `cov`: array_like, 2d covariance matrix.<br>
-    `return_std`: bool=False, if True returned std.<br>
-    **Returns:**<br>
-    `corr`: ndarray (subclass) correlation matrix
-    """
-    cov = np.asanyarray(cov)
-    std_ = np.sqrt(np.diag(cov))
-    corr = cov / np.outer(std_, std_)
-    if return_std:
-        return corr, std_
-    else:
-        return corr

-# %% ../nbs/src/utils.ipynb 11
-def _to_upper_hierarchy(bottom_split, bottom_values, upper_key):
+# %% ../nbs/src/utils.ipynb 10
+def _to_upper_hierarchy(
+    bottom_split: List[str], bottom_values: str, upper_key: str
+) -> List[str]:
upper_split = upper_key.split("/")
upper_idxs = [bottom_split.index(i) for i in upper_split]

@@ -88,7 +73,7 @@ def join_upper(bottom_value):

return [join_upper(val) for val in bottom_values]

-# %% ../nbs/src/utils.ipynb 14
+# %% ../nbs/src/utils.ipynb 11
def aggregate(
df: Frame,
spec: List[List[str]],
@@ -139,16 +124,18 @@ def aggregate(
raise ValueError("Sparse output is only supported for Pandas DataFrames.")

for col in df_nw.columns:
-        assert (
-            not df_nw[col].is_null().any()
-        ), f"Column {col} contains null values. Make sure no column in the DataFrame contains null values."
+        if df_nw[col].is_null().any():
+            raise ValueError(
+                f"Column {col} contains null values. Make sure no column in the DataFrame contains null values."
+            )

# Check whether all columns in the spec are in the df
aggregation_cols_in_spec = list(
dict.fromkeys([col for cols in spec for col in cols])
)
for col in aggregation_cols_in_spec:
-        assert col in df_nw.columns, f"Column {col} in spec not present in df"
+        if col not in df_nw.columns:
+            raise ValueError(f"Column {col} in spec not present in df")

# Prepare the aggregation dictionary
agg_dict = dict(
@@ -247,7 +234,7 @@ def aggregate(

return Y_df, S_df, tags
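With the asserts gone, invalid input to `aggregate` now surfaces as a `ValueError`. A usage sketch of the new failure mode (the toy frame and spec are illustrative):

import pandas as pd
from hierarchicalforecast.utils import aggregate

# A null anywhere in the input now raises ValueError, not AssertionError
df = pd.DataFrame({
    "country": ["A", "A"],
    "ds": ["2024-01-01", "2024-01-02"],
    "y": [1.0, None],
})
try:
    aggregate(df, spec=[["country"]])
except ValueError as err:
    print(err)  # Column y contains null values. ...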

-# %% ../nbs/src/utils.ipynb 30
+# %% ../nbs/src/utils.ipynb 25
class HierarchicalPlot:
"""Hierarchical Plot
@@ -532,9 +519,9 @@ def plot_hierarchical_predictions_gap(
plt.grid()
plt.show()

-# %% ../nbs/src/utils.ipynb 51
+# %% ../nbs/src/utils.ipynb 46
# convert levels to output quantile names
-def level_to_outputs(level: Iterable[int]):
+def level_to_outputs(level: List[int]) -> tuple[List[float], List[str]]:
"""Converts list of levels into output names matching StatsForecast and NeuralForecast methods.
**Parameters:**<br>
@@ -558,7 +545,7 @@ def level_to_outputs(level: Iterable[int]):


# convert quantiles to output quantile names
-def quantiles_to_outputs(quantiles: Iterable[float]):
+def quantiles_to_outputs(quantiles: List[float]) -> tuple[List[float], List[str]]:
"""Converts list of quantiles into output names matching StatsForecast and NeuralForecast methods.
**Parameters:**<br>
@@ -577,7 +564,7 @@ def quantiles_to_outputs(quantiles: Iterable[float]):
output_names.append("-median")
return quantiles, output_names

-# %% ../nbs/src/utils.ipynb 52
+# %% ../nbs/src/utils.ipynb 47
# given input array of sample forecasts and input quantiles/levels,
# output a Pandas Dataframe with columns of quantile predictions
def samples_to_quantiles_df(
@@ -586,10 +573,11 @@
dates: List[str],
quantiles: Optional[List[float]] = None,
level: Optional[List[int]] = None,
-    model_name: Optional[str] = "model",
+    model_name: str = "model",
id_col: str = "unique_id",
time_col: str = "ds",
-):
+    backend: str = "pandas",
+) -> tuple[List[float], FrameT]:
"""Transform Random Samples into HierarchicalForecast input.
Auxiliary function to create compatible HierarchicalForecast input `Y_hat_df` dataframe.
@@ -602,26 +590,35 @@
`model_name`: string. Name of forecasting model.<br>
`id_col` : str='unique_id', column that identifies each serie.<br>
`time_col` : str='ds', column that identifies each timestep, its values can be timestamps or integers.<br>
+    `backend` : str='pandas', backend to use for the output dataframe, either 'pandas' or 'polars'.<br>
**Returns:**<br>
`quantiles`: float list in [0., 1.]. quantiles to estimate from y distribution .<br>
-    `Y_hat_df`: pd.DataFrame. With base quantile forecasts with columns ds and models to reconcile indexed by unique_id.
+    `Y_hat_df`: DataFrame. With base quantile forecasts with columns ds and models to reconcile indexed by unique_id.
"""

# Get the shape of the array
n_series, n_samples, horizon = samples.shape

-    assert n_series == len(unique_ids)
-    assert horizon == len(dates)
-    assert (quantiles is not None) ^ (
-        level is not None
-    )  # check exactly one of quantiles/levels has been input
+    if n_series != len(unique_ids):
+        raise ValueError(
+            f"Number of unique_ids ({len(unique_ids)}) must match the number of series ({n_series})."
+        )
+    if horizon != len(dates):
+        raise ValueError(
+            f"Number of dates ({len(dates)}) must match third dimension of samples array ({horizon})."
+        )
+    if not ((quantiles is None) ^ (level is None)):
+        raise ValueError("Either quantiles or level must be provided, but not both.")
+
+    namespace = sys.modules.get(backend, None)
+    if namespace is None:
+        raise ValueError(f"DataFrame backend {backend} not installed.")

# create initial dictionary
forecasts_mean = np.mean(samples, axis=1).flatten()
unique_ids = np.repeat(unique_ids, horizon)
ds = np.tile(dates, n_series)
-    data = pd.DataFrame({id_col: unique_ids, time_col: ds, model_name: forecasts_mean})

# create quantiles and quantile names
if level is not None:
@@ -642,11 +639,15 @@
) # [Q,H,N] -> [N,H,Q]
forecasts_quantiles = forecasts_quantiles.reshape(-1, len(_quantiles))

-    df = pd.DataFrame(data=forecasts_quantiles, columns=col_names)
+    df_nw = nw.from_dict(
+        {id_col: unique_ids, time_col: ds, model_name: forecasts_mean},
+        native_namespace=namespace,
+    )
+    df_nw = df_nw.with_columns(**dict(zip(col_names, forecasts_quantiles.T)))

-    return _quantiles, pd.concat([data, df], axis=1).set_index(id_col)
+    return _quantiles, df_nw.to_native()

-# %% ../nbs/src/utils.ipynb 59
+# %% ../nbs/src/utils.ipynb 55
# Masked empirical covariance matrix
@njit(
"Array(float64, 2, 'F')(Array(float64, 2, 'C'), Array(bool, 2, 'C'))",
@@ -685,7 +686,7 @@ def _ma_cov(residuals: np.ndarray, not_nan_mask: np.ndarray):

return W

-# %% ../nbs/src/utils.ipynb 60
+# %% ../nbs/src/utils.ipynb 56
# Shrunk covariance matrix using the Schafer-Strimmer method


@@ -836,7 +837,7 @@ def _shrunk_covariance_schaferstrimmer_with_nans(

return W

-# %% ../nbs/src/utils.ipynb 62
+# %% ../nbs/src/utils.ipynb 58
# Lasso cyclic coordinate descent
@njit(
"Array(float64, 1, 'C')(Array(float64, 2, 'C'), Array(float64, 1, 'C'), float64, int64, float64)",
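`samples_to_quantiles_df` now assembles its output through narwhals and accepts a `backend` argument, so polars output is possible alongside pandas. A usage sketch under the new signature (all input values are illustrative); note the backend module is resolved via `sys.modules`, so it must already be imported:

import numpy as np
from hierarchicalforecast.utils import samples_to_quantiles_df

# 2 series, 100 samples per series, forecast horizon of 3
samples = np.random.default_rng(0).normal(size=(2, 100, 3))
unique_ids = ["series_a", "series_b"]
dates = ["2024-01-31", "2024-02-29", "2024-03-31"]

# Exactly one of `quantiles` / `level` may be passed; violating this
# now raises ValueError instead of tripping a bare assert
quantiles, Y_hat_df = samples_to_quantiles_df(
    samples=samples,
    unique_ids=unique_ids,
    dates=dates,
    level=[80, 90],
    backend="pandas",  # or "polars", if polars is installed and imported
)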
27 changes: 15 additions & 12 deletions nbs/src/core.ipynb
@@ -69,7 +69,7 @@
"import os\n",
"\n",
"from fastcore.test import test_close, test_eq, test_fail\n",
"from nbdev.showdoc import add_docs, show_doc\n",
"from nbdev.showdoc import show_doc\n",
"import pandas as pd"
]
},
@@ -1282,7 +1282,7 @@
"#| hide\n",
"from statsforecast import StatsForecast\n",
"from statsforecast.utils import generate_series\n",
"from statsforecast.models import RandomWalkWithDrift, AutoETS"
"from statsforecast.models import RandomWalkWithDrift"
]
},
{
@@ -1294,15 +1294,15 @@
"#| hide\n",
"# test unbalanced dataset\n",
"max_tenure = 24\n",
"dates = pd.date_range(start='2019-01-31', freq='M', periods=max_tenure)\n",
"dates = pd.date_range(start='2019-01-31', freq='ME', periods=max_tenure)\n",
"cohort_tenure = [24, 23, 22, 21]\n",
"\n",
"ts_list = []\n",
"\n",
"# Create ts for each cohort\n",
"for i in range(len(cohort_tenure)):\n",
" ts_list.append(\n",
" generate_series(n_series=1, freq='M', min_length=cohort_tenure[i], max_length=cohort_tenure[i]).reset_index() \\\n",
" generate_series(n_series=1, freq='ME', min_length=cohort_tenure[i], max_length=cohort_tenure[i]).reset_index() \\\n",
" .assign(ult=i) \\\n",
" .assign(ds=dates[-cohort_tenure[i]:]) \\\n",
" .drop(columns=['unique_id'])\n",
@@ -1328,7 +1328,7 @@
" models=[\n",
" RandomWalkWithDrift(),\n",
" ],\n",
" freq='M',\n",
" freq='ME',\n",
" n_jobs=1,\n",
")\n",
"\n",
@@ -1343,8 +1343,8 @@
"fitted_df = fcst.forecast_fitted_values()\n",
"\n",
"fcst_df = hrec.reconcile(\n",
" Y_hat_df=fcst_df.reset_index(),\n",
" Y_df=fitted_df.reset_index(),\n",
" Y_hat_df=fcst_df,\n",
" Y_df=fitted_df,\n",
" S=S_df,\n",
" tags=tags,\n",
")"
@@ -1380,15 +1380,18 @@
" ]\n",
")\n",
"\n",
"fcst_df = fcst.forecast(df=train_df, h=12, fitted=True)\n",
"fcst_df_pl = fcst.forecast(df=train_df, h=12, fitted=True)\n",
"fitted_df = fcst.forecast_fitted_values()\n",
"\n",
"fcst_df = hrec.reconcile(\n",
" Y_hat_df=fcst_df,\n",
"fcst_df_pl = hrec.reconcile(\n",
" Y_hat_df=fcst_df_pl,\n",
" Y_df=fitted_df,\n",
" S=S_df_pl,\n",
" tags=tags,\n",
")"
")\n",
"\n",
"# Test equivalence\n",
"pd.testing.assert_frame_equal(fcst_df, fcst_df_pl.to_pandas())"
]
},
{
@@ -1744,7 +1747,7 @@
"import pandas as pd\n",
"\n",
"from statsforecast.core import StatsForecast\n",
"from statsforecast.models import AutoETS, Naive\n",
"from statsforecast.models import AutoETS\n",
"\n",
"from hierarchicalforecast.utils import aggregate\n",
"from hierarchicalforecast.core import HierarchicalReconciliation\n",
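The notebook's `freq='M'` → `freq='ME'` edits track pandas 2.2, which renamed the month-end alias and deprecated `'M'`. A quick check of the new alias:

import pandas as pd

# 'ME' is the month-end frequency alias in pandas >= 2.2 ('M' is deprecated)
dates = pd.date_range(start="2019-01-31", freq="ME", periods=3)
print(dates)
# DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31'], dtype='datetime64[ns]', freq='ME')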
5 changes: 3 additions & 2 deletions nbs/src/probabilistic_methods.ipynb
@@ -42,7 +42,7 @@
"from scipy.stats import norm\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"from hierarchicalforecast.utils import is_strictly_hierarchical, cov2corr"
"from hierarchicalforecast.utils import is_strictly_hierarchical"
]
},
{
@@ -120,7 +120,8 @@
"\n",
" # Base Normality Errors assume independence/diagonal covariance\n",
" # TODO: replace bilinearity with elementwise row multiplication\n",
" R1 = cov2corr(self.W)\n",
" std_ = np.sqrt(np.diag(self.W))\n",
" R1 = self.W / np.outer(std_, std_) \n",
" Wh = [np.diag(sigma) @ R1 @ np.diag(sigma).T for sigma in self.sigmah.T]\n",
"\n",
" # Reconciled covariances across forecast horizon\n",
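For context on the unchanged `Wh` line just above: the shared correlation matrix `R1` is rescaled by each horizon step's standard deviations, giving back a per-step covariance with `sigma**2` on the diagonal. A small numeric sketch (values illustrative):

import numpy as np

# Shared correlation matrix across the horizon
R1 = np.array([[1.0, 0.2],
               [0.2, 1.0]])

# Per-step rescaling: diag(sigma) @ R1 @ diag(sigma) is that step's covariance
sigma = np.array([2.0, 3.0])
Wh_step = np.diag(sigma) @ R1 @ np.diag(sigma).T
assert np.allclose(np.diag(Wh_step), sigma**2)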
