Merge pull request #510 from WenjieDu/(refactor)random_walk

Import random walk funcs from BenchPOTS and add AI4TS as a dependency
WenjieDu · Sep 12, 2024 · 63666fb · 63666fb
2 parents e20ede8 + 2f4413c
commit 63666fb
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 342 deletions.
diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
@@ -6,12 +6,7 @@
 # License: BSD-3-Clause
 
 from .dataset import BaseDataset, SUPPORTED_DATASET_FILE_FORMATS
-from .generating import (
-    gene_complete_random_walk,
-    gene_complete_random_walk_for_anomaly_detection,
-    gene_complete_random_walk_for_classification,
-    gene_random_walk,
-)
+from .generating import gene_random_walk
 from .saving import (
     save_dict_into_h5,
     load_dict_from_h5,
@@ -30,9 +25,6 @@
     "BaseDataset",
     "SUPPORTED_DATASET_FILE_FORMATS",
     # dataset generation functions
-    "gene_complete_random_walk",
-    "gene_complete_random_walk_for_anomaly_detection",
-    "gene_complete_random_walk_for_classification",
     "gene_random_walk",
     "load_specific_dataset",
     # utils

diff --git a/pypots/data/generating.py b/pypots/data/generating.py
@@ -5,319 +5,41 @@
 # Created by Wenjie Du <[email protected]>
 # License: BSD-3-Clause
 
-import math
-from typing import Optional, Tuple
-
-import numpy as np
-from benchpots.datasets import preprocess_physionet2012
-from pygrinder import mcar
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.utils import check_random_state
+from benchpots.datasets import preprocess_physionet2012, preprocess_random_walk
 
 from ..utils.logging import logger
 
 
-def gene_complete_random_walk(
-    n_samples: int = 1000,
-    n_steps: int = 24,
-    n_features: int = 10,
-    mu: float = 0.0,
-    std: float = 1.0,
-    random_state: Optional[int] = None,
-) -> np.ndarray:
-    """Generate complete random walk time-series data, i.e. having no missing values.
-
-    Parameters
-    ----------
-    n_samples : int, default=1000
-        The number of training time-series samples to generate.
-
-    n_steps: int, default=24
-        The number of time steps (length) of generated time-series samples.
-
-    n_features : int, default=10
-        The number of features (dimensions) of generated time-series samples.
-
-    mu : float, default=0.0
-        Mean of the normal distribution, which random walk steps are sampled from.
-
-    std : float, default=1.0
-        Standard deviation of the normal distribution, which random walk steps are sampled from.
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    ts_samples: array, shape of [n_samples, n_steps, n_features]
-        Generated random walk time series.
-    """
-    seed = check_random_state(random_state)
-    ts_samples = np.zeros([n_samples, n_steps, n_features])
-    random_values = seed.randn(n_samples, n_steps, n_features) * std + mu
-    ts_samples[:, 0, :] = random_values[:, 0, :]
-    for t in range(1, n_steps):
-        ts_samples[:, t, :] = ts_samples[:, t - 1, :] + random_values[:, t, :]
-    ts_samples = np.asarray(ts_samples)
-    return ts_samples
-
-
-def gene_complete_random_walk_for_classification(
-    n_classes: int = 2,
-    n_samples_each_class: int = 500,
-    n_steps: int = 24,
-    n_features: int = 10,
-    shuffle: bool = True,
-    random_state: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Generate complete random walk time-series data for the classification task.
-
-    Parameters
-    ----------
-    n_classes : int, must >=1, default=2
-        Number of classes (types) of the generated data.
-
-    n_samples_each_class : int, default=500
-        Number of samples for each class to generate.
-
-    n_steps : int, default=24
-        Number of time steps in each sample.
-
-    n_features : int, default=10
-        Number of features.
-
-    shuffle : bool, default=True
-        Whether to shuffle generated samples.
-        If not, you can separate samples of each class according to `n_samples_each_class`.
-        For example,
-        X_class0=X[:n_samples_each_class],
-        X_class1=X[n_samples_each_class:n_samples_each_class*2]
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    X : array, shape of [n_samples, n_steps, n_features]
-        Generated time-series data.
-
-    y : array, shape of [n_samples]
-        Labels indicating classes of time-series samples.
-
-    """
-    assert n_classes > 1, f"n_classes should be >1, but got {n_classes}"
-
-    ts_collector = []
-    label_collector = []
-
-    mu = 0
-    std = 1
-
-    for c_ in range(n_classes):
-        ts_samples = gene_complete_random_walk(n_samples_each_class, n_steps, n_features, mu, std, random_state)
-        label_samples = np.asarray([1 for _ in range(n_samples_each_class)]) * c_
-        ts_collector.extend(ts_samples)
-        label_collector.extend(label_samples)
-        mu += 1
-
-    X = np.asarray(ts_collector)
-    y = np.asarray(label_collector)
-
-    # if shuffling, then shuffle the order of samples
-    if shuffle:
-        indices = np.arange(len(X))
-        np.random.shuffle(indices)
-        X = X[indices]
-        y = y[indices]
-
-    return X, y
-
-
-def gene_complete_random_walk_for_anomaly_detection(
-    n_samples: int = 1000,
-    n_steps: int = 24,
-    n_features: int = 10,
-    mu: float = 0.0,
-    std: float = 1.0,
-    anomaly_proportion: float = 0.1,
-    anomaly_fraction: float = 0.02,
-    anomaly_scale_factor: float = 2.0,
-    random_state: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Generate random walk time-series data for the anomaly-detection task.
-
-    Parameters
-    ----------
-    n_samples : int, default=1000
-        The number of training time-series samples to generate.
-
-    n_features : int, default=10
-        The number of features (dimensions) of generated time-series samples.
-
-    n_steps: int, default=24
-        The number of time steps (length) of generated time-series samples.
-
-    mu : float, default=0.0
-        Mean of the normal distribution, which random walk steps are sampled from.
-
-    std : float, default=1.0
-        Standard deviation of the normal distribution, which random walk steps are sampled from.
-
-    anomaly_proportion : float, default=0.1
-        Proportion of anomaly samples in all samples.
-
-    anomaly_fraction : float, default=0.02
-        Fraction of anomaly points in each anomaly sample.
-
-    anomaly_scale_factor : float, default=2.0
-        Scale factor for value scaling to create anomaly points in time series samples.
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    X : array, shape of [n_samples, n_steps, n_features]
-        Generated time-series data.
-
-    y : array, shape of [n_samples]
-        Labels indicating if time-series samples are anomalies.
-    """
-    assert 0 < anomaly_proportion < 1, f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
-    assert 0 < anomaly_fraction < 1, f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"
-    seed = check_random_state(random_state)
-    X = seed.randn(n_samples, n_steps, n_features) * std + mu
-    n_anomaly = math.floor(n_samples * anomaly_proportion)
-    anomaly_indices = np.random.choice(n_samples, size=n_anomaly, replace=False)
-    for a_i in anomaly_indices:
-        anomaly_sample = X[a_i]
-        anomaly_sample = anomaly_sample.flatten()
-        min_val = anomaly_sample.min()
-        max_val = anomaly_sample.max()
-        max_difference = min_val - max_val
-        n_points = n_steps * n_features
-        n_anomaly_points = int(n_points * anomaly_fraction)
-        point_indices = np.random.choice(a=n_points, size=n_anomaly_points, replace=False)
-        for p_i in point_indices:
-            anomaly_sample[p_i] = mu + np.random.uniform(
-                low=min_val - anomaly_scale_factor * max_difference,
-                high=max_val + anomaly_scale_factor * max_difference,
-            )
-        X[a_i] = anomaly_sample.reshape(n_steps, n_features)
-
-    # create labels
-    y = np.zeros(n_samples)
-    y[anomaly_indices] = 1
-
-    # shuffling
-    indices = np.arange(n_samples)
-    np.random.shuffle(indices)
-    X = X[indices]
-    y = y[indices]
-
-    return X, y
-
-
 def gene_random_walk(
     n_steps=24,
     n_features=10,
     n_classes=2,
     n_samples_each_class=1000,
     missing_rate=0.1,
-) -> dict:
-    """Generate a random-walk data.
-
-    Parameters
-    ----------
-    n_steps : int, default=24
-        Number of time steps in each sample.
-
-    n_features : int, default=10
-        Number of features.
-
-    n_classes : int, default=2
-        Number of classes (types) of the generated data.
-
-    n_samples_each_class : int, default=1000
-        Number of samples for each class to generate.
-
-    missing_rate : float, default=0.1
-        The rate of randomly missing values to generate, should be in [0,1).
-
-    Returns
-    -------
-    data: dict,
-        A dictionary containing the generated data.
-    """
-    assert 0 <= missing_rate < 1, "missing_rate must be in [0,1)"
-
-    # generate samples
-    X, y = gene_complete_random_walk_for_classification(
-        n_classes=n_classes,
-        n_samples_each_class=n_samples_each_class,
-        n_steps=n_steps,
-        n_features=n_features,
+):
+    dataset_from_benchpots = preprocess_random_walk(
+        n_steps,
+        n_features,
+        n_classes,
+        n_samples_each_class,
+        missing_rate,
     )
-    # split into train/val/test sets
-    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
-    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.2)
-
-    if missing_rate > 0:
-        # create random missing values
-        train_X_ori = train_X
-        train_X = mcar(train_X, missing_rate)
-        # test set is left to mask after normalization
-
-    train_X = train_X.reshape(-1, n_features)
-    val_X = val_X.reshape(-1, n_features)
-    test_X = test_X.reshape(-1, n_features)
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-    # reshape into time series samples
-    train_X = train_X.reshape(-1, n_steps, n_features)
-    val_X = val_X.reshape(-1, n_steps, n_features)
-    test_X = test_X.reshape(-1, n_steps, n_features)
-    data = {
-        "n_classes": n_classes,
-        "n_steps": n_steps,
-        "n_features": n_features,
-        "train_X": train_X,
-        "train_y": train_y,
-        "val_X": val_X,
-        "val_y": val_y,
-        "test_X": test_X,
-        "test_y": test_y,
-        "scaler": scaler,
-    }
-
-    if missing_rate > 0:
-        # mask values in the test set as ground truth
-        train_X_ori = scaler.transform(train_X_ori.reshape(-1, n_features)).reshape(-1, n_steps, n_features)
-        data["train_X_ori"] = train_X_ori
-
-        val_X_ori = val_X
-        val_X = mcar(val_X, missing_rate)
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        test_X_ori = test_X
-        test_X = mcar(test_X, missing_rate)
-        data["test_X"] = test_X
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
+    logger.warning(
+        "🚨 BenchPOTS package now is fully released and includes preprocessing functions for 170+ datasets. "
+        "gene_random_walk() has been deprecated and will be removed in pypots v0.9"
+    )
+    logger.info(
+        "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and "
+        "check out the func benchpots.datasets.preprocess_random_walk()"
+    )
+    return dataset_from_benchpots
 
 
 def gene_physionet2012(artificially_missing_rate: float = 0.1):
     dataset_from_benchpots = preprocess_physionet2012(subset="all", rate=artificially_missing_rate)
     logger.warning(
-        "🚨 Due to the full release of BenchPOTS package, "
-        "gene_physionet2012() has been deprecated and will be removed in pypots v0.8"
+        "🚨 BenchPOTS package now is fully released and includes preprocessing functions for 170+ datasets. "
+        "gene_physionet2012() has been deprecated and will be removed in pypots v0.9"
     )
     logger.info(
         "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and "

diff --git a/pypots/gungnir/client.py b/pypots/gungnir/client.py
@@ -5,12 +5,9 @@
 # Created by Wenjie Du <[email protected]>
 # License: BSD-3-Clause
 
-from ..utils.logging import logger
-
+from ai4ts.client import TimeSeriesAI
 
-class TimeSeriesAI:
-    def __init__(self):
-        pass
+from ..utils.logging import logger
 
 
 class Gungnir(TimeSeriesAI):

diff --git a/requirements/conda_env.yml b/requirements/conda_env.yml
@@ -19,7 +19,11 @@ dependencies:
     - conda-forge::matplotlib
     - conda-forge::tensorboard
     - conda-forge::scikit-learn
-    - conda-forge::pygrinder >=0.6.2
+    - conda-forge::pygrinder >=0.6.4
     - conda-forge::tsdb >=0.6.1
-    - conda-forge::benchpots >=0.2.1
+    - conda-forge::benchpots >=0.3
     - pytorch::pytorch >=1.10.0
+
+    - pip:
+        # not published on conda-forge yet
+        - ai4ts