From 4fa55ae6a540f564c2235d0c51a8c181555435ee Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sun, 25 Aug 2024 10:25:12 +0800
Subject: [PATCH 1/3] refactor: import preprocess_random_walk from benchpots;

---
 pypots/data/__init__.py   |  10 +-
 pypots/data/generating.py | 326 +++-----------------------------------
 2 files changed, 20 insertions(+), 316 deletions(-)

diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
index 26c4ba1d..3a2d3fde 100644
--- a/pypots/data/__init__.py
+++ b/pypots/data/__init__.py
@@ -6,12 +6,7 @@
 # License: BSD-3-Clause

 from .dataset import BaseDataset, SUPPORTED_DATASET_FILE_FORMATS
-from .generating import (
-    gene_complete_random_walk,
-    gene_complete_random_walk_for_anomaly_detection,
-    gene_complete_random_walk_for_classification,
-    gene_random_walk,
-)
+from .generating import gene_random_walk
 from .saving import (
     save_dict_into_h5,
     load_dict_from_h5,
@@ -30,9 +25,6 @@
     "BaseDataset",
     "SUPPORTED_DATASET_FILE_FORMATS",
     # dataset generation functions
-    "gene_complete_random_walk",
-    "gene_complete_random_walk_for_anomaly_detection",
-    "gene_complete_random_walk_for_classification",
     "gene_random_walk",
     "load_specific_dataset",
     # utils
diff --git a/pypots/data/generating.py b/pypots/data/generating.py
index f50b5276..c513f759 100644
--- a/pypots/data/generating.py
+++ b/pypots/data/generating.py
@@ -5,322 +5,34 @@
 # Created by Wenjie Du
 # License: BSD-3-Clause

-import math
-from typing import Optional, Tuple
-
-import numpy as np
-from benchpots.datasets import preprocess_physionet2012
-from pygrinder import mcar
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.utils import check_random_state
+from benchpots.datasets import preprocess_physionet2012, preprocess_random_walk

 from ..utils.logging import logger


-def gene_complete_random_walk(
-    n_samples: int = 1000,
-    n_steps: int = 24,
-    n_features: int = 10,
-    mu: float = 0.0,
-    std: float = 1.0,
-    random_state: Optional[int] = None,
-) -> np.ndarray:
-    """Generate complete random walk time-series data, i.e. having no missing values.
-
-    Parameters
-    ----------
-    n_samples : int, default=1000
-        The number of training time-series samples to generate.
-
-    n_steps: int, default=24
-        The number of time steps (length) of generated time-series samples.
-
-    n_features : int, default=10
-        The number of features (dimensions) of generated time-series samples.
-
-    mu : float, default=0.0
-        Mean of the normal distribution, which random walk steps are sampled from.
-
-    std : float, default=1.0
-        Standard deviation of the normal distribution, which random walk steps are sampled from.
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    ts_samples: array, shape of [n_samples, n_steps, n_features]
-        Generated random walk time series.
-    """
-    seed = check_random_state(random_state)
-    ts_samples = np.zeros([n_samples, n_steps, n_features])
-    random_values = seed.randn(n_samples, n_steps, n_features) * std + mu
-    ts_samples[:, 0, :] = random_values[:, 0, :]
-    for t in range(1, n_steps):
-        ts_samples[:, t, :] = ts_samples[:, t - 1, :] + random_values[:, t, :]
-    ts_samples = np.asarray(ts_samples)
-    return ts_samples
-
-
-def gene_complete_random_walk_for_classification(
-    n_classes: int = 2,
-    n_samples_each_class: int = 500,
-    n_steps: int = 24,
-    n_features: int = 10,
-    shuffle: bool = True,
-    random_state: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Generate complete random walk time-series data for the classification task.
-
-    Parameters
-    ----------
-    n_classes : int, must >=1, default=2
-        Number of classes (types) of the generated data.
-
-    n_samples_each_class : int, default=500
-        Number of samples for each class to generate.
-
-    n_steps : int, default=24
-        Number of time steps in each sample.
-
-    n_features : int, default=10
-        Number of features.
-
-    shuffle : bool, default=True
-        Whether to shuffle generated samples.
-        If not, you can separate samples of each class according to `n_samples_each_class`.
-        For example,
-        X_class0=X[:n_samples_each_class],
-        X_class1=X[n_samples_each_class:n_samples_each_class*2]
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    X : array, shape of [n_samples, n_steps, n_features]
-        Generated time-series data.
-
-    y : array, shape of [n_samples]
-        Labels indicating classes of time-series samples.
-
-    """
-    assert n_classes > 1, f"n_classes should be >1, but got {n_classes}"
-
-    ts_collector = []
-    label_collector = []
-
-    mu = 0
-    std = 1
-
-    for c_ in range(n_classes):
-        ts_samples = gene_complete_random_walk(
-            n_samples_each_class, n_steps, n_features, mu, std, random_state
-        )
-        label_samples = np.asarray([1 for _ in range(n_samples_each_class)]) * c_
-        ts_collector.extend(ts_samples)
-        label_collector.extend(label_samples)
-        mu += 1
-
-    X = np.asarray(ts_collector)
-    y = np.asarray(label_collector)
-
-    # if shuffling, then shuffle the order of samples
-    if shuffle:
-        indices = np.arange(len(X))
-        np.random.shuffle(indices)
-        X = X[indices]
-        y = y[indices]
-
-    return X, y
-
-
-def gene_complete_random_walk_for_anomaly_detection(
-    n_samples: int = 1000,
-    n_steps: int = 24,
-    n_features: int = 10,
-    mu: float = 0.0,
-    std: float = 1.0,
-    anomaly_proportion: float = 0.1,
-    anomaly_fraction: float = 0.02,
-    anomaly_scale_factor: float = 2.0,
-    random_state: Optional[int] = None,
-) -> Tuple[np.ndarray, np.ndarray]:
-    """Generate random walk time-series data for the anomaly-detection task.
-
-    Parameters
-    ----------
-    n_samples : int, default=1000
-        The number of training time-series samples to generate.
-
-    n_features : int, default=10
-        The number of features (dimensions) of generated time-series samples.
-
-    n_steps: int, default=24
-        The number of time steps (length) of generated time-series samples.
-
-    mu : float, default=0.0
-        Mean of the normal distribution, which random walk steps are sampled from.
-
-    std : float, default=1.0
-        Standard deviation of the normal distribution, which random walk steps are sampled from.
-
-    anomaly_proportion : float, default=0.1
-        Proportion of anomaly samples in all samples.
-
-    anomaly_fraction : float, default=0.02
-        Fraction of anomaly points in each anomaly sample.
-
-    anomaly_scale_factor : float, default=2.0
-        Scale factor for value scaling to create anomaly points in time series samples.
-
-    random_state : int, default=None
-        Random seed for data generation.
-
-    Returns
-    -------
-    X : array, shape of [n_samples, n_steps, n_features]
-        Generated time-series data.
-
-    y : array, shape of [n_samples]
-        Labels indicating if time-series samples are anomalies.
-    """
-    assert (
-        0 < anomaly_proportion < 1
-    ), f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
-    assert (
-        0 < anomaly_fraction < 1
-    ), f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"
-    seed = check_random_state(random_state)
-    X = seed.randn(n_samples, n_steps, n_features) * std + mu
-    n_anomaly = math.floor(n_samples * anomaly_proportion)
-    anomaly_indices = np.random.choice(n_samples, size=n_anomaly, replace=False)
-    for a_i in anomaly_indices:
-        anomaly_sample = X[a_i]
-        anomaly_sample = anomaly_sample.flatten()
-        min_val = anomaly_sample.min()
-        max_val = anomaly_sample.max()
-        max_difference = min_val - max_val
-        n_points = n_steps * n_features
-        n_anomaly_points = int(n_points * anomaly_fraction)
-        point_indices = np.random.choice(
-            a=n_points, size=n_anomaly_points, replace=False
-        )
-        for p_i in point_indices:
-            anomaly_sample[p_i] = mu + np.random.uniform(
-                low=min_val - anomaly_scale_factor * max_difference,
-                high=max_val + anomaly_scale_factor * max_difference,
-            )
-        X[a_i] = anomaly_sample.reshape(n_steps, n_features)
-
-    # create labels
-    y = np.zeros(n_samples)
-    y[anomaly_indices] = 1
-
-    # shuffling
-    indices = np.arange(n_samples)
-    np.random.shuffle(indices)
-    X = X[indices]
-    y = y[indices]
-
-    return X, y
-
-
 def gene_random_walk(
     n_steps=24,
     n_features=10,
     n_classes=2,
     n_samples_each_class=1000,
     missing_rate=0.1,
-) -> dict:
-    """Generate a random-walk data.
-
-    Parameters
-    ----------
-    n_steps : int, default=24
-        Number of time steps in each sample.
-
-    n_features : int, default=10
-        Number of features.
-
-    n_classes : int, default=2
-        Number of classes (types) of the generated data.
-
-    n_samples_each_class : int, default=1000
-        Number of samples for each class to generate.
-
-    missing_rate : float, default=0.1
-        The rate of randomly missing values to generate, should be in [0,1).
-
-    Returns
-    -------
-    data: dict,
-        A dictionary containing the generated data.
-    """
-    assert 0 <= missing_rate < 1, "missing_rate must be in [0,1)"
-
-    # generate samples
-    X, y = gene_complete_random_walk_for_classification(
-        n_classes=n_classes,
-        n_samples_each_class=n_samples_each_class,
-        n_steps=n_steps,
-        n_features=n_features,
+):
+    dataset_from_benchpots = preprocess_random_walk(
+        n_steps,
+        n_features,
+        n_classes,
+        n_samples_each_class,
+        missing_rate,
     )
-    # split into train/val/test sets
-    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
-    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.2)
-
-    if missing_rate > 0:
-        # create random missing values
-        train_X_ori = train_X
-        train_X = mcar(train_X, missing_rate)
-        # test set is left to mask after normalization
-
-    train_X = train_X.reshape(-1, n_features)
-    val_X = val_X.reshape(-1, n_features)
-    test_X = test_X.reshape(-1, n_features)
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-    # reshape into time series samples
-    train_X = train_X.reshape(-1, n_steps, n_features)
-    val_X = val_X.reshape(-1, n_steps, n_features)
-    test_X = test_X.reshape(-1, n_steps, n_features)
-    data = {
-        "n_classes": n_classes,
-        "n_steps": n_steps,
-        "n_features": n_features,
-        "train_X": train_X,
-        "train_y": train_y,
-        "val_X": val_X,
-        "val_y": val_y,
-        "test_X": test_X,
-        "test_y": test_y,
-        "scaler": scaler,
-    }
-
-    if missing_rate > 0:
-        # mask values in the test set as ground truth
-        train_X_ori = scaler.transform(train_X_ori.reshape(-1, n_features)).reshape(
-            -1, n_steps, n_features
-        )
-        data["train_X_ori"] = train_X_ori
-
-        val_X_ori = val_X
-        val_X = mcar(val_X, missing_rate)
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        test_X_ori = test_X
-        test_X = mcar(test_X, missing_rate)
-        data["test_X"] = test_X
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
+    logger.warning(
+        "🚨 BenchPOTS package is now fully released and includes preprocessing functions for 170+ datasets. "
+        "gene_random_walk() has been deprecated and will be removed in pypots v0.9"
+    )
+    logger.info(
+        "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and "
+        "check out the func benchpots.datasets.preprocess_random_walk()"
+    )
+    return dataset_from_benchpots


 def gene_physionet2012(artificially_missing_rate: float = 0.1):
@@ -328,8 +40,8 @@ def gene_physionet2012(artificially_missing_rate: float = 0.1):
         subset="all", rate=artificially_missing_rate
     )
     logger.warning(
-        "🚨 Due to the full release of BenchPOTS package, "
-        "gene_physionet2012() has been deprecated and will be removed in pypots v0.8"
+        "🚨 BenchPOTS package is now fully released and includes preprocessing functions for 170+ datasets. "
" + "gene_physionet2012() has been deprecated and will be removed in pypots v0.9" ) logger.info( "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and " From 65e059e195f485f9e97248afe2fec62f18e111c5 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 29 Aug 2024 18:57:20 +0800 Subject: [PATCH 2/3] test: adapt for updated gene_random_walk; --- tests/global_test_config.py | 39 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/tests/global_test_config.py b/tests/global_test_config.py index b737f61b..587162ad 100644 --- a/tests/global_test_config.py +++ b/tests/global_test_config.py @@ -26,9 +26,7 @@ MODEL_SAVING_DIR = f"{RESULT_SAVING_DIR}/models" DATA_SAVING_DIR = f"{RESULT_SAVING_DIR}/datasets" RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(MODEL_SAVING_DIR, "imputation") -RESULT_SAVING_DIR_FOR_ANOMALY_DETECTION = os.path.join( - MODEL_SAVING_DIR, "anomaly_detection" -) +RESULT_SAVING_DIR_FOR_ANOMALY_DETECTION = os.path.join(MODEL_SAVING_DIR, "anomaly_detection") RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(MODEL_SAVING_DIR, "classification") RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(MODEL_SAVING_DIR, "clustering") RESULT_SAVING_DIR_FOR_FORECASTING = os.path.join(MODEL_SAVING_DIR, "forecasting") @@ -39,15 +37,9 @@ GENERAL_H5_TEST_SET_PATH = os.path.abspath(f"{GENERAL_DATA_SAVING_DIR}/test_set.h5") # paths to save the generated dataset for testing forecasting models with the lazy-loading strategy FORECASTING_DATA_SAVING_DIR = f"{DATA_SAVING_DIR}/forecasting_h5dataset" -FORECASTING_H5_TRAIN_SET_PATH = os.path.abspath( - f"{FORECASTING_DATA_SAVING_DIR}/train_set.h5" -) -FORECASTING_H5_VAL_SET_PATH = os.path.abspath( - f"{FORECASTING_DATA_SAVING_DIR}/val_set.h5" -) -FORECASTING_H5_TEST_SET_PATH = os.path.abspath( - f"{FORECASTING_DATA_SAVING_DIR}/test_set.h5" -) +FORECASTING_H5_TRAIN_SET_PATH = os.path.abspath(f"{FORECASTING_DATA_SAVING_DIR}/train_set.h5") +FORECASTING_H5_VAL_SET_PATH = os.path.abspath(f"{FORECASTING_DATA_SAVING_DIR}/val_set.h5") +FORECASTING_H5_TEST_SET_PATH = os.path.abspath(f"{FORECASTING_DATA_SAVING_DIR}/test_set.h5") set_random_seed(RANDOM_SEED) @@ -63,6 +55,9 @@ ) # DATA = gene_physionet2012() +DATA["test_X_indicating_mask"] = np.isnan(DATA["test_X"]) ^ np.isnan(DATA["test_X_ori"]) +DATA["test_X_ori"] = np.nan_to_num(DATA["test_X_ori"]) + TRAIN_SET = { "X": DATA["train_X"], "y": DATA["train_y"].astype(float), @@ -78,9 +73,7 @@ "y": DATA["test_y"].astype(float), } -assert ( - N_PRED_STEPS <= DATA["train_X"].shape[1] -), "N_PRED_STEPS should be less than the sequence length." +assert N_PRED_STEPS <= DATA["train_X"].shape[1], "N_PRED_STEPS should be less than the sequence length." FORECASTING_TRAIN_SET = { "X": DATA["train_X"][:, :-N_PRED_STEPS], "X_pred": DATA["train_X_ori"][:, -N_PRED_STEPS:], @@ -99,9 +92,7 @@ cuda_devices = [torch.device(i) for i in range(n_cuda_devices)] if n_cuda_devices > 1: DEVICE = cuda_devices[np.random.randint(n_cuda_devices)] - logger.info( - f"❗️Detected multiple cuda devices, using one of them {DEVICE} to run testing." 
- ) + logger.info(f"❗️Detected multiple cuda devices, using one of them {DEVICE} to run testing.") else: # if having no multiple cuda devices, leave it as None to use the default device DEVICE = None @@ -112,9 +103,7 @@ def check_tb_and_model_checkpoints_existence(model): saved_files = os.listdir(model.saving_path) if ".DS_Store" in saved_files: # for macOS saved_files.remove(".DS_Store") - assert ( - model.saving_path is not None and len(saved_files) > 0 - ), "tensorboard file does not exist" + assert model.saving_path is not None and len(saved_files) > 0, "tensorboard file does not exist" # check the model checkpoints existence saved_model_files = [i for i in saved_files if i.endswith(".pypots")] assert len(saved_model_files) > 0, "No model checkpoint saved." @@ -135,9 +124,5 @@ def check_tb_and_model_checkpoints_existence(model): if not os.path.exists(FORECASTING_H5_TEST_SET_PATH): save_dict_into_h5(FORECASTING_TEST_SET, FORECASTING_H5_TEST_SET_PATH) - logger.info( - f"Files under GENERAL_DATA_SAVING_DIR: {os.listdir(GENERAL_DATA_SAVING_DIR)}" - ) - logger.info( - f"Files under FORECASTING_DATA_SAVING_DIR: {os.listdir(FORECASTING_DATA_SAVING_DIR)}" - ) + logger.info(f"Files under GENERAL_DATA_SAVING_DIR: {os.listdir(GENERAL_DATA_SAVING_DIR)}") + logger.info(f"Files under FORECASTING_DATA_SAVING_DIR: {os.listdir(FORECASTING_DATA_SAVING_DIR)}") From bdcbace904e9d4c9b7d36e34ec5c6326b1617532 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 5 Sep 2024 23:05:18 +0800 Subject: [PATCH 3/3] feat: add ai4ts as a dependency and update requirements files; --- pypots/gungnir/client.py | 7 ++----- requirements/conda_env.yml | 8 ++++++-- requirements/requirements.txt | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pypots/gungnir/client.py b/pypots/gungnir/client.py index 7c18cd51..a81f8e60 100644 --- a/pypots/gungnir/client.py +++ b/pypots/gungnir/client.py @@ -5,12 +5,9 @@ # Created by Wenjie Du # License: BSD-3-Clause -from ..utils.logging import logger - +from ai4ts.client import TimeSeriesAI -class TimeSeriesAI: - def __init__(self): - pass +from ..utils.logging import logger class Gungnir(TimeSeriesAI): diff --git a/requirements/conda_env.yml b/requirements/conda_env.yml index 56b51cc4..39c03e27 100644 --- a/requirements/conda_env.yml +++ b/requirements/conda_env.yml @@ -19,7 +19,11 @@ dependencies: - conda-forge::matplotlib - conda-forge::tensorboard - conda-forge::scikit-learn - - conda-forge::pygrinder >=0.6.2 + - conda-forge::pygrinder >=0.6.4 - conda-forge::tsdb >=0.6.1 - - conda-forge::benchpots >=0.2.1 + - conda-forge::benchpots >=0.3 - pytorch::pytorch >=1.10.0 + + - pip: + # not published on conda-forge yet + - ai4ts \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 68e99e77..03436541 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -10,5 +10,6 @@ tensorboard scikit-learn torch>=1.10.0 tsdb>=0.6.1 -pygrinder>=0.6.2 -benchpots>=0.2.1 +pygrinder>=0.6.4 +benchpots>=0.3 +ai4ts \ No newline at end of file
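
Usage note: below is a minimal sketch of the refactored API after this series, not part of
the patches themselves. It assumes pypots is installed together with the dependencies
pinned above (benchpots>=0.3); the dict keys (train_X, train_y, val_X, test_X, ...) follow
the test config updated in PATCH 2/3, and missing values are NaNs, as the masks computed
there imply.

    from pypots.data import gene_random_walk

    # delegates to benchpots.datasets.preprocess_random_walk() and logs the
    # deprecation warning added in PATCH 1/3
    data = gene_random_walk(
        n_steps=24,
        n_features=10,
        n_classes=2,
        n_samples_each_class=1000,
        missing_rate=0.1,
    )

    # keys consumed by tests/global_test_config.py in PATCH 2/3
    train_X, train_y = data["train_X"], data["train_y"]
    val_X, test_X = data["val_X"], data["test_X"]
    print(train_X.shape)  # expected: (n_samples, n_steps, n_features), NaNs mark missing values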