Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import random walk funcs from BenchPOTS and add AI4TS as a dependency #510

Merged
merged 4 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions pypots/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,7 @@
# License: BSD-3-Clause

from .dataset import BaseDataset, SUPPORTED_DATASET_FILE_FORMATS
from .generating import (
gene_complete_random_walk,
gene_complete_random_walk_for_anomaly_detection,
gene_complete_random_walk_for_classification,
gene_random_walk,
)
from .generating import gene_random_walk
from .saving import (
save_dict_into_h5,
load_dict_from_h5,
Expand All @@ -30,9 +25,6 @@
"BaseDataset",
"SUPPORTED_DATASET_FILE_FORMATS",
# dataset generation functions
"gene_complete_random_walk",
"gene_complete_random_walk_for_anomaly_detection",
"gene_complete_random_walk_for_classification",
"gene_random_walk",
"load_specific_dataset",
# utils
Expand Down
316 changes: 19 additions & 297 deletions pypots/data/generating.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,319 +5,41 @@
# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

import math
from typing import Optional, Tuple

import numpy as np
from benchpots.datasets import preprocess_physionet2012
from pygrinder import mcar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from benchpots.datasets import preprocess_physionet2012, preprocess_random_walk

from ..utils.logging import logger


def gene_complete_random_walk(
    n_samples: int = 1000,
    n_steps: int = 24,
    n_features: int = 10,
    mu: float = 0.0,
    std: float = 1.0,
    random_state: Optional[int] = None,
) -> np.ndarray:
    """Generate complete random walk time-series data, i.e. having no missing values.

    Each step is drawn from a normal distribution with mean ``mu`` and standard
    deviation ``std``; the value at time ``t`` is the running sum of all steps up
    to and including ``t``, computed independently for every sample and feature.

    Parameters
    ----------
    n_samples : int, default=1000
        The number of training time-series samples to generate.

    n_steps : int, default=24
        The number of time steps (length) of generated time-series samples.

    n_features : int, default=10
        The number of features (dimensions) of generated time-series samples.

    mu : float, default=0.0
        Mean of the normal distribution, which random walk steps are sampled from.

    std : float, default=1.0
        Standard deviation of the normal distribution, which random walk steps are sampled from.

    random_state : int, default=None
        Random seed for data generation. It is handed to
        ``sklearn.utils.check_random_state``.

    Returns
    -------
    ts_samples : array, shape of [n_samples, n_steps, n_features]
        Generated random walk time series.
    """
    rng = check_random_state(random_state)
    steps = rng.randn(n_samples, n_steps, n_features) * std + mu
    # A random walk is the cumulative sum of its steps along the time axis (axis 1).
    # Unlike an explicit loop seeded from index 0, this also copes with n_steps == 0.
    return np.cumsum(steps, axis=1)


def gene_complete_random_walk_for_classification(
    n_classes: int = 2,
    n_samples_each_class: int = 500,
    n_steps: int = 24,
    n_features: int = 10,
    shuffle: bool = True,
    random_state: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Generate complete random walk time-series data for the classification task.

    Class ``c`` consists of random walks whose steps are drawn from a normal
    distribution with mean ``c`` and standard deviation 1, so classes differ by
    their drift.

    Parameters
    ----------
    n_classes : int, must >1, default=2
        Number of classes (types) of the generated data.

    n_samples_each_class : int, default=500
        Number of samples for each class to generate.

    n_steps : int, default=24
        Number of time steps in each sample.

    n_features : int, default=10
        Number of features.

    shuffle : bool, default=True
        Whether to shuffle generated samples.
        If not, you can separate samples of each class according to `n_samples_each_class`.
        For example,
        X_class0=X[:n_samples_each_class],
        X_class1=X[n_samples_each_class:n_samples_each_class*2]

    random_state : int, default=None
        Random seed for data generation. It controls both the generated values
        and the shuffling order.

    Returns
    -------
    X : array, shape of [n_samples, n_steps, n_features]
        Generated time-series data, where n_samples = n_classes * n_samples_each_class.

    y : array, shape of [n_samples]
        Labels indicating classes of time-series samples.

    Raises
    ------
    AssertionError
        If ``n_classes`` is not greater than 1.
    """
    assert n_classes > 1, f"n_classes should be >1, but got {n_classes}"

    # Use ONE generator for the whole call. Re-seeding every class with the same
    # integer would give all classes identical noise (class c would just be class 0
    # plus a deterministic drift), and shuffling with the global numpy RNG would
    # make the sample order irreproducible even when a seed is given.
    rng = check_random_state(random_state)

    std = 1
    # The step mean grows by 1 per class, i.e. it equals the class index.
    # check_random_state() inside gene_complete_random_walk accepts the generator
    # instance as-is, so successive classes consume successive draws from `rng`.
    X = np.concatenate(
        [
            gene_complete_random_walk(n_samples_each_class, n_steps, n_features, class_idx, std, rng)
            for class_idx in range(n_classes)
        ]
    )
    y = np.repeat(np.arange(n_classes), n_samples_each_class)

    # if shuffling, then shuffle the order of samples (X and y with the same permutation)
    if shuffle:
        indices = rng.permutation(len(X))
        X = X[indices]
        y = y[indices]

    return X, y


def gene_complete_random_walk_for_anomaly_detection(
    n_samples: int = 1000,
    n_steps: int = 24,
    n_features: int = 10,
    mu: float = 0.0,
    std: float = 1.0,
    anomaly_proportion: float = 0.1,
    anomaly_fraction: float = 0.02,
    anomaly_scale_factor: float = 2.0,
    random_state: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Generate random walk time-series data for the anomaly-detection task.

    A subset of the samples is turned into anomalies by overwriting a fraction of
    their points with values drawn uniformly from the sample's own value range,
    widened on both sides by ``anomaly_scale_factor`` times that range and shifted
    by ``mu``.

    NOTE: despite the function name, the base samples here are independent normal
    draws; no cumulative sum over time is applied in this function.

    Parameters
    ----------
    n_samples : int, default=1000
        The number of training time-series samples to generate.

    n_steps : int, default=24
        The number of time steps (length) of generated time-series samples.

    n_features : int, default=10
        The number of features (dimensions) of generated time-series samples.

    mu : float, default=0.0
        Mean of the normal distribution, which the sample values are drawn from.

    std : float, default=1.0
        Standard deviation of the normal distribution, which the sample values are drawn from.

    anomaly_proportion : float, default=0.1
        Proportion of anomaly samples in all samples, must be in (0, 1).

    anomaly_fraction : float, default=0.02
        Fraction of anomaly points in each anomaly sample, must be in (0, 1).

    anomaly_scale_factor : float, default=2.0
        Scale factor for value scaling to create anomaly points in time series samples.

    random_state : int, default=None
        Random seed for data generation. It controls the generated values, the
        choice of anomaly samples/points, and the final shuffling.

    Returns
    -------
    X : array, shape of [n_samples, n_steps, n_features]
        Generated time-series data.

    y : array, shape of [n_samples]
        Labels indicating if time-series samples are anomalies (1) or not (0).

    Raises
    ------
    AssertionError
        If ``anomaly_proportion`` or ``anomaly_fraction`` is not in (0, 1).
    """
    assert 0 < anomaly_proportion < 1, f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
    assert 0 < anomaly_fraction < 1, f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"

    # Every random draw below goes through this generator so that a given
    # random_state reproduces the whole dataset, not only the base values.
    rng = check_random_state(random_state)
    X = rng.randn(n_samples, n_steps, n_features) * std + mu

    n_anomaly = math.floor(n_samples * anomaly_proportion)
    anomaly_indices = rng.choice(n_samples, size=n_anomaly, replace=False)

    # These depend only on the arguments, so compute them once outside the loop.
    n_points = n_steps * n_features
    n_anomaly_points = int(n_points * anomaly_fraction)

    for a_i in anomaly_indices:
        anomaly_sample = X[a_i].flatten()
        min_val = anomaly_sample.min()
        max_val = anomaly_sample.max()
        # Must be max - min (non-negative). With the sign flipped, `low` ends up
        # above `high`, which numpy documents as undefined behaviour for uniform()
        # and which in practice narrows the interval instead of widening it.
        value_range = max_val - min_val
        point_indices = rng.choice(n_points, size=n_anomaly_points, replace=False)
        anomaly_sample[point_indices] = mu + rng.uniform(
            low=min_val - anomaly_scale_factor * value_range,
            high=max_val + anomaly_scale_factor * value_range,
            size=n_anomaly_points,
        )
        X[a_i] = anomaly_sample.reshape(n_steps, n_features)

    # create labels
    y = np.zeros(n_samples)
    y[anomaly_indices] = 1

    # shuffling (same permutation for X and y)
    indices = rng.permutation(n_samples)
    return X[indices], y[indices]


def gene_random_walk(
    n_steps=24,
    n_features=10,
    n_classes=2,
    n_samples_each_class=1000,
    missing_rate=0.1,
) -> dict:
    """Generate a random-walk dataset (deprecated thin wrapper around BenchPOTS).

    The dataset is produced by ``benchpots.datasets.preprocess_random_walk``; this
    function only validates ``missing_rate``, forwards its arguments positionally,
    and logs a deprecation notice. Per that notice it will be removed in
    pypots v0.9.

    Parameters
    ----------
    n_steps : int, default=24
        Number of time steps in each sample.

    n_features : int, default=10
        Number of features.

    n_classes : int, default=2
        Number of classes (types) of the generated data.

    n_samples_each_class : int, default=1000
        Number of samples for each class to generate.

    missing_rate : float, default=0.1
        The rate of randomly missing values to generate, should be in [0,1).

    Returns
    -------
    data : dict,
        Whatever ``preprocess_random_walk`` returns, passed through unchanged.
        NOTE(review): the ``dict`` annotation is inherited from the previous
        in-house implementation; confirm against the BenchPOTS return type.

    Raises
    ------
    AssertionError
        If ``missing_rate`` is not in [0,1).
    """
    assert 0 <= missing_rate < 1, "missing_rate must be in [0,1)"

    dataset_from_benchpots = preprocess_random_walk(
        n_steps,
        n_features,
        n_classes,
        n_samples_each_class,
        missing_rate,
    )
    logger.warning(
        "🚨 BenchPOTS package now is fully released and includes preprocessing functions for 170+ datasets. "
        "gene_random_walk() has been deprecated and will be removed in pypots v0.9"
    )
    # Point users at the random-walk replacement (the hint previously named the
    # PhysioNet-2012 function, copied from gene_physionet2012()).
    logger.info(
        "🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and "
        "check out the func benchpots.datasets.preprocess_random_walk()"
    )
    return dataset_from_benchpots


def gene_physionet2012(artificially_missing_rate: float = 0.1):
dataset_from_benchpots = preprocess_physionet2012(subset="all", rate=artificially_missing_rate)
logger.warning(
"🚨 Due to the full release of BenchPOTS package, "
"gene_physionet2012() has been deprecated and will be removed in pypots v0.8"
"🚨 BenchPOTS package now is fully released and includes preprocessing functions for 170+ datasets. "
"gene_physionet2012() has been deprecated and will be removed in pypots v0.9"
)
logger.info(
"🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and "
Expand Down
7 changes: 2 additions & 5 deletions pypots/gungnir/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,9 @@
# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

from ..utils.logging import logger

from ai4ts.client import TimeSeriesAI

class TimeSeriesAI:
    """Minimal placeholder base class with no state and no behavior.

    NOTE(review): a class of the same name is imported from ``ai4ts.client``
    immediately above this definition; defining it again here rebinds the name
    to this empty placeholder. Confirm which of the two is meant to be used.
    """

    def __init__(self):
        # Intentionally empty: the placeholder holds no attributes.
        pass
from ..utils.logging import logger


class Gungnir(TimeSeriesAI):
Expand Down
8 changes: 6 additions & 2 deletions requirements/conda_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ dependencies:
- conda-forge::matplotlib
- conda-forge::tensorboard
- conda-forge::scikit-learn
- conda-forge::pygrinder >=0.6.2
- conda-forge::pygrinder >=0.6.4
- conda-forge::tsdb >=0.6.1
- conda-forge::benchpots >=0.2.1
- conda-forge::benchpots >=0.3
- pytorch::pytorch >=1.10.0

- pip:
# not published on conda-forge yet
- ai4ts
Loading
Loading