From 4fa55ae6a540f564c2235d0c51a8c181555435ee Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sun, 25 Aug 2024 10:25:12 +0800
Subject: [PATCH 1/3] refactor: import preprocess_random_walk from benchpots;

---
 pypots/data/__init__.py   |  10 +-
 pypots/data/generating.py | 326 +++-----------------------------------
 2 files changed, 20 insertions(+), 316 deletions(-)

diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
index 26c4ba1d..3a2d3fde 100644
--- a/pypots/data/__init__.py
+++ b/pypots/data/__init__.py
@@ -6,12 +6,7 @@
 # License: BSD-3-Clause

 from .dataset import BaseDataset, SUPPORTED_DATASET_FILE_FORMATS
-from .generating import (
-    gene_complete_random_walk,
-    gene_complete_random_walk_for_anomaly_detection,
-    gene_complete_random_walk_for_classification,
-    gene_random_walk,
-)
+from .generating import gene_random_walk
 from .saving import (
     save_dict_into_h5,
     load_dict_from_h5,
@@ -30,9 +25,6 @@
     "BaseDataset",
     "SUPPORTED_DATASET_FILE_FORMATS",
     # dataset generation functions
-    "gene_complete_random_walk",
-    "gene_complete_random_walk_for_anomaly_detection",
-    "gene_complete_random_walk_for_classification",
     "gene_random_walk",
     "load_specific_dataset",
     # utils
diff --git a/pypots/data/generating.py b/pypots/data/generating.py
index f50b5276..c513f759 100644
--- a/pypots/data/generating.py
+++ b/pypots/data/generating.py
@@ -5,322 +5,34 @@
 # Created by Wenjie Du
 # License: BSD-3-Clause

-import math
-from typing import Optional, Tuple
-
-import numpy as np
-from benchpots.datasets import preprocess_physionet2012
-from pygrinder import mcar
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.utils import check_random_state
+from benchpots.datasets import preprocess_physionet2012, preprocess_random_walk

 from ..utils.logging import logger


-def gene_complete_random_walk(
-    n_samples: int = 1000,
-    n_steps: int = 24,
-    n_features: int = 10,
-    mu: float = 0.0,
-    std: float = 1.0,
-    random_state: Optional[int] = None,
-) -> np.ndarray:
-    """Generate complete random walk time-series data, i.e. having no missing values.
-
-    Parameters
-    ----------
-    n_samples : int, default=1000
-        The number of training time-series samples to generate.
-
-    n_steps: int, default=24
-        The number of time steps (length) of generated time-series samples.
-
-    n_features : int, default=10
-        The number of features (dimensions) of generated time-series samples.
-
-    mu : float, default=0.0
-        Mean of the normal distribution, which random walk steps are sampled from.
-
-    std : float, default=1.0
-        Standard deviation of the normal distribution, which random walk steps are sampled from.
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    ts_samples: array, shape of [n_samples, n_steps, n_features]
-        Generated random walk time series.
-    """
-    seed = check_random_state(random_state)
-    ts_samples = np.zeros([n_samples, n_steps, n_features])
-    random_values = seed.randn(n_samples, n_steps, n_features) * std + mu
-    ts_samples[:, 0, :] = random_values[:, 0, :]
-    for t in range(1, n_steps):
-        ts_samples[:, t, :] = ts_samples[:, t - 1, :] + random_values[:, t, :]
-    ts_samples = np.asarray(ts_samples)
-    return ts_samples
-
-
-def gene_complete_random_walk_for_classification(
-    n_classes: int = 2,
-    n_samples_each_class: int = 500,
-    n_steps: int = 24,
-    n_features: int = 10,
-    shuffle: bool = True,
-    random_state: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Generate complete random walk time-series data for the classification task.
-
-    Parameters
-    ----------
-    n_classes : int, must >=1, default=2
-        Number of classes (types) of the generated data.
-
-    n_samples_each_class : int, default=500
-        Number of samples for each class to generate.
-
-    n_steps : int, default=24
-        Number of time steps in each sample.
-
-    n_features : int, default=10
-        Number of features.
-
-    shuffle : bool, default=True
-        Whether to shuffle generated samples.
-        If not, you can separate samples of each class according to `n_samples_each_class`.
-        For example,
-        X_class0=X[:n_samples_each_class],
-        X_class1=X[n_samples_each_class:n_samples_each_class*2]
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    X : array, shape of [n_samples, n_steps, n_features]
-        Generated time-series data.
-
-    y : array, shape of [n_samples]
-        Labels indicating classes of time-series samples.
-
-    """
-    assert n_classes > 1, f"n_classes should be >1, but got {n_classes}"
-
-    ts_collector = []
-    label_collector = []
-
-    mu = 0
-    std = 1
-
-    for c_ in range(n_classes):
-        ts_samples = gene_complete_random_walk(
-            n_samples_each_class, n_steps, n_features, mu, std, random_state
-        )
-        label_samples = np.asarray([1 for _ in range(n_samples_each_class)]) * c_
-        ts_collector.extend(ts_samples)
-        label_collector.extend(label_samples)
-        mu += 1
-
-    X = np.asarray(ts_collector)
-    y = np.asarray(label_collector)
-
-    # if shuffling, then shuffle the order of samples
-    if shuffle:
-        indices = np.arange(len(X))
-        np.random.shuffle(indices)
-        X = X[indices]
-        y = y[indices]
-
-    return X, y
-
-
-def gene_complete_random_walk_for_anomaly_detection(
-    n_samples: int = 1000,
-    n_steps: int = 24,
-    n_features: int = 10,
-    mu: float = 0.0,
-    std: float = 1.0,
-    anomaly_proportion: float = 0.1,
-    anomaly_fraction: float = 0.02,
-    anomaly_scale_factor: float = 2.0,
-    random_state: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Generate random walk time-series data for the anomaly-detection task.
-
-    Parameters
-    ----------
-    n_samples : int, default=1000
-        The number of training time-series samples to generate.
-
-    n_features : int, default=10
-        The number of features (dimensions) of generated time-series samples.
-
-    n_steps: int, default=24
-        The number of time steps (length) of generated time-series samples.
-
-    mu : float, default=0.0
-        Mean of the normal distribution, which random walk steps are sampled from.
-
-    std : float, default=1.0
-        Standard deviation of the normal distribution, which random walk steps are sampled from.
-
-    anomaly_proportion : float, default=0.1
-        Proportion of anomaly samples in all samples.
-
-    anomaly_fraction : float, default=0.02
-        Fraction of anomaly points in each anomaly sample.
-
-    anomaly_scale_factor : float, default=2.0
-        Scale factor for value scaling to create anomaly points in time series samples.
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    X : array, shape of [n_samples, n_steps, n_features]
-        Generated time-series data.
-
-    y : array, shape of [n_samples]
-        Labels indicating if time-series samples are anomalies.
-    """
-    assert (
-        0 < anomaly_proportion < 1
-    ), f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
-    assert (
-        0 < anomaly_fraction < 1
-    ), f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"
-    seed = check_random_state(random_state)
-    X = seed.randn(n_samples, n_steps, n_features) * std + mu
-    n_anomaly = math.floor(n_samples * anomaly_proportion)
-    anomaly_indices = np.random.choice(n_samples, size=n_anomaly, replace=False)
-    for a_i in anomaly_indices:
-        anomaly_sample = X[a_i]
-        anomaly_sample = anomaly_sample.flatten()
-        min_val = anomaly_sample.min()
-        max_val = anomaly_sample.max()
-        max_difference = min_val - max_val
-        n_points = n_steps * n_features
-        n_anomaly_points = int(n_points * anomaly_fraction)
-        point_indices = np.random.choice(
-            a=n_points, size=n_anomaly_points, replace=False
-        )
-        for p_i in point_indices:
-            anomaly_sample[p_i] = mu + np.random.uniform(
-                low=min_val - anomaly_scale_factor * max_difference,
-                high=max_val + anomaly_scale_factor * max_difference,
-            )
-        X[a_i] = anomaly_sample.reshape(n_steps, n_features)
-
-    # create labels
-    y = np.zeros(n_samples)
-    y[anomaly_indices] = 1
-
-    # shuffling
-    indices = np.arange(n_samples)
-    np.random.shuffle(indices)
-    X = X[indices]
-    y = y[indices]
-
-    return X, y
-
-
 def gene_random_walk(
     n_steps=24,
     n_features=10,
     n_classes=2,
     n_samples_each_class=1000,
     missing_rate=0.1,
-) -> dict:
-    """Generate a random-walk data.
-
-    Parameters
-    ----------
-    n_steps : int, default=24
-        Number of time steps in each sample.
-
-    n_features : int, default=10
-        Number of features.
-
-    n_classes : int, default=2
-        Number of classes (types) of the generated data.
-
-    n_samples_each_class : int, default=1000
-        Number of samples for each class to generate.
-
-    missing_rate : float, default=0.1
-        The rate of randomly missing values to generate, should be in [0,1).
-
-    Returns
-    -------
-    data: dict,
-        A dictionary containing the generated data.
-    """
-    assert 0 <= missing_rate < 1, "missing_rate must be in [0,1)"
-
-    # generate samples
-    X, y = gene_complete_random_walk_for_classification(
-        n_classes=n_classes,
-        n_samples_each_class=n_samples_each_class,
-        n_steps=n_steps,
-        n_features=n_features,
+):
+    dataset_from_benchpots = preprocess_random_walk(
+        n_steps,
+        n_features,
+        n_classes,
+        n_samples_each_class,
+        missing_rate,
     )
-    # split into train/val/test sets
-    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
-    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.2)
-
-    if missing_rate > 0:
-        # create random missing values
-        train_X_ori = train_X
-        train_X = mcar(train_X, missing_rate)
-        # test set is left to mask after normalization
-
-    train_X = train_X.reshape(-1, n_features)
-    val_X = val_X.reshape(-1, n_features)
-    test_X = test_X.reshape(-1, n_features)
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-    # reshape into time series samples
-    train_X = train_X.reshape(-1, n_steps, n_features)
-    val_X = val_X.reshape(-1, n_steps, n_features)
-    test_X = test_X.reshape(-1, n_steps, n_features)
-    data = {
-        "n_classes": n_classes,
-        "n_steps": n_steps,
-        "n_features": n_features,
-        "train_X": train_X,
-        "train_y": train_y,
-        "val_X": val_X,
-        "val_y": val_y,
-        "test_X": test_X,
-        "test_y": test_y,
-        "scaler": scaler,
-    }
-
-    if missing_rate > 0:
-        # mask values in the test set as ground truth
-        train_X_ori = scaler.transform(train_X_ori.reshape(-1, n_features)).reshape(
-            -1, n_steps, n_features
-        )
-        data["train_X_ori"] = train_X_ori
-
-        val_X_ori = val_X
-        val_X = mcar(val_X, missing_rate)
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        test_X_ori = test_X
-        test_X = mcar(test_X, missing_rate)
-        data["test_X"] = test_X
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
+    logger.warning(
+        "🚨 BenchPOTS package is now fully released and includes preprocessing functions for 170+ datasets. "
+        "gene_random_walk() has been deprecated and will be removed in pypots v0.9"
+    )
+    logger.info(
+        "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and "
+        "check out the func benchpots.datasets.preprocess_random_walk()"
+    )
+    return dataset_from_benchpots


 def gene_physionet2012(artificially_missing_rate: float = 0.1):
@@ -328,8 +40,8 @@ def gene_physionet2012(artificially_missing_rate: float = 0.1):
         subset="all", rate=artificially_missing_rate
     )
     logger.warning(
-        "🚨 Due to the full release of BenchPOTS package, "
-        "gene_physionet2012() has been deprecated and will be removed in pypots v0.8"
+        "🚨 BenchPOTS package is now fully released and includes preprocessing functions for 170+ datasets. "
" + "gene_physionet2012() has been deprecated and will be removed in pypots v0.9" ) logger.info( "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and " From 65e059e195f485f9e97248afe2fec62f18e111c5 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 29 Aug 2024 18:57:20 +0800 Subject: [PATCH 2/3] test: adapt for updated gene_random_walk; --- tests/global_test_config.py | 39 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/tests/global_test_config.py b/tests/global_test_config.py index b737f61b..587162ad 100644 --- a/tests/global_test_config.py +++ b/tests/global_test_config.py @@ -26,9 +26,7 @@ MODEL_SAVING_DIR = f"{RESULT_SAVING_DIR}/models" DATA_SAVING_DIR = f"{RESULT_SAVING_DIR}/datasets" RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(MODEL_SAVING_DIR, "imputation") -RESULT_SAVING_DIR_FOR_ANOMALY_DETECTION = os.path.join( - MODEL_SAVING_DIR, "anomaly_detection" -) +RESULT_SAVING_DIR_FOR_ANOMALY_DETECTION = os.path.join(MODEL_SAVING_DIR, "anomaly_detection") RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(MODEL_SAVING_DIR, "classification") RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(MODEL_SAVING_DIR, "clustering") RESULT_SAVING_DIR_FOR_FORECASTING = os.path.join(MODEL_SAVING_DIR, "forecasting") @@ -39,15 +37,9 @@ GENERAL_H5_TEST_SET_PATH = os.path.abspath(f"{GENERAL_DATA_SAVING_DIR}/test_set.h5") # paths to save the generated dataset for testing forecasting models with the lazy-loading strategy FORECASTING_DATA_SAVING_DIR = f"{DATA_SAVING_DIR}/forecasting_h5dataset" -FORECASTING_H5_TRAIN_SET_PATH = os.path.abspath( - f"{FORECASTING_DATA_SAVING_DIR}/train_set.h5" -) -FORECASTING_H5_VAL_SET_PATH = os.path.abspath( - f"{FORECASTING_DATA_SAVING_DIR}/val_set.h5" -) -FORECASTING_H5_TEST_SET_PATH = os.path.abspath( - f"{FORECASTING_DATA_SAVING_DIR}/test_set.h5" -) +FORECASTING_H5_TRAIN_SET_PATH = os.path.abspath(f"{FORECASTING_DATA_SAVING_DIR}/train_set.h5") +FORECASTING_H5_VAL_SET_PATH = os.path.abspath(f"{FORECASTING_DATA_SAVING_DIR}/val_set.h5") +FORECASTING_H5_TEST_SET_PATH = os.path.abspath(f"{FORECASTING_DATA_SAVING_DIR}/test_set.h5") set_random_seed(RANDOM_SEED) @@ -63,6 +55,9 @@ ) # DATA = gene_physionet2012() +DATA["test_X_indicating_mask"] = np.isnan(DATA["test_X"]) ^ np.isnan(DATA["test_X_ori"]) +DATA["test_X_ori"] = np.nan_to_num(DATA["test_X_ori"]) + TRAIN_SET = { "X": DATA["train_X"], "y": DATA["train_y"].astype(float), @@ -78,9 +73,7 @@ "y": DATA["test_y"].astype(float), } -assert ( - N_PRED_STEPS <= DATA["train_X"].shape[1] -), "N_PRED_STEPS should be less than the sequence length." +assert N_PRED_STEPS <= DATA["train_X"].shape[1], "N_PRED_STEPS should be less than the sequence length." FORECASTING_TRAIN_SET = { "X": DATA["train_X"][:, :-N_PRED_STEPS], "X_pred": DATA["train_X_ori"][:, -N_PRED_STEPS:], @@ -99,9 +92,7 @@ cuda_devices = [torch.device(i) for i in range(n_cuda_devices)] if n_cuda_devices > 1: DEVICE = cuda_devices[np.random.randint(n_cuda_devices)] - logger.info( - f"❗️Detected multiple cuda devices, using one of them {DEVICE} to run testing." 
- ) + logger.info(f"❗️Detected multiple cuda devices, using one of them {DEVICE} to run testing.") else: # if having no multiple cuda devices, leave it as None to use the default device DEVICE = None @@ -112,9 +103,7 @@ def check_tb_and_model_checkpoints_existence(model): saved_files = os.listdir(model.saving_path) if ".DS_Store" in saved_files: # for macOS saved_files.remove(".DS_Store") - assert ( - model.saving_path is not None and len(saved_files) > 0 - ), "tensorboard file does not exist" + assert model.saving_path is not None and len(saved_files) > 0, "tensorboard file does not exist" # check the model checkpoints existence saved_model_files = [i for i in saved_files if i.endswith(".pypots")] assert len(saved_model_files) > 0, "No model checkpoint saved." @@ -135,9 +124,5 @@ def check_tb_and_model_checkpoints_existence(model): if not os.path.exists(FORECASTING_H5_TEST_SET_PATH): save_dict_into_h5(FORECASTING_TEST_SET, FORECASTING_H5_TEST_SET_PATH) - logger.info( - f"Files under GENERAL_DATA_SAVING_DIR: {os.listdir(GENERAL_DATA_SAVING_DIR)}" - ) - logger.info( - f"Files under FORECASTING_DATA_SAVING_DIR: {os.listdir(FORECASTING_DATA_SAVING_DIR)}" - ) + logger.info(f"Files under GENERAL_DATA_SAVING_DIR: {os.listdir(GENERAL_DATA_SAVING_DIR)}") + logger.info(f"Files under FORECASTING_DATA_SAVING_DIR: {os.listdir(FORECASTING_DATA_SAVING_DIR)}") From bdcbace904e9d4c9b7d36e34ec5c6326b1617532 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 5 Sep 2024 23:05:18 +0800 Subject: [PATCH 3/3] feat: add ai4ts as a dependency and update requirements files; --- pypots/gungnir/client.py | 7 ++----- requirements/conda_env.yml | 8 ++++++-- requirements/requirements.txt | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pypots/gungnir/client.py b/pypots/gungnir/client.py index 7c18cd51..a81f8e60 100644 --- a/pypots/gungnir/client.py +++ b/pypots/gungnir/client.py @@ -5,12 +5,9 @@ # Created by Wenjie Du # License: BSD-3-Clause -from ..utils.logging import logger - +from ai4ts.client import TimeSeriesAI -class TimeSeriesAI: - def __init__(self): - pass +from ..utils.logging import logger class Gungnir(TimeSeriesAI): diff --git a/requirements/conda_env.yml b/requirements/conda_env.yml index 56b51cc4..39c03e27 100644 --- a/requirements/conda_env.yml +++ b/requirements/conda_env.yml @@ -19,7 +19,11 @@ dependencies: - conda-forge::matplotlib - conda-forge::tensorboard - conda-forge::scikit-learn - - conda-forge::pygrinder >=0.6.2 + - conda-forge::pygrinder >=0.6.4 - conda-forge::tsdb >=0.6.1 - - conda-forge::benchpots >=0.2.1 + - conda-forge::benchpots >=0.3 - pytorch::pytorch >=1.10.0 + + - pip: + # not published on conda-forge yet + - ai4ts \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 68e99e77..03436541 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -10,5 +10,6 @@ tensorboard scikit-learn torch>=1.10.0 tsdb>=0.6.1 -pygrinder>=0.6.2 -benchpots>=0.2.1 +pygrinder>=0.6.4 +benchpots>=0.3 +ai4ts \ No newline at end of file
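
Usage note: below is a minimal sketch of the refactored API after this series, not part of
the patches themselves. It assumes pypots is installed together with the dependencies
pinned above (benchpots>=0.3); the dict keys (train_X, train_y, val_X, test_X, ...) follow
the test config updated in PATCH 2/3, and missing values are NaNs, as the masks computed
there imply.

    from pypots.data import gene_random_walk

    # delegates to benchpots.datasets.preprocess_random_walk() and logs the
    # deprecation warning added in PATCH 1/3
    data = gene_random_walk(
        n_steps=24,
        n_features=10,
        n_classes=2,
        n_samples_each_class=1000,
        missing_rate=0.1,
    )

    # keys consumed by tests/global_test_config.py in PATCH 2/3
    train_X, train_y = data["train_X"], data["train_y"]
    val_X, test_X = data["val_X"], data["test_X"]
    print(train_X.shape)  # expected: (n_samples, n_steps, n_features), NaNs mark missing values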