Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add models GP-VAE/USGAN #176

Merged
merged 17 commits into from
Sep 21, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pypots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.1.2"
__version__ = "0.1.1"


__all__ = [
Expand Down
7 changes: 4 additions & 3 deletions pypots/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import torch
from torch.utils.tensorboard import SummaryWriter

from .utils.file import create_dir_if_not_exist
from .utils.files import create_dir_if_not_exist
from .utils.logging import logger


Expand Down Expand Up @@ -96,7 +96,9 @@ def _setup_device(self, device: Union[None, str, torch.device, list]):
self.device = device
elif isinstance(device, list):
if len(device) == 0:
raise ValueError("The list of devices should have at least 1 device, but got 0.")
raise ValueError(
"The list of devices should have at least 1 device, but got 0."
)
elif len(device) == 1:
return self._setup_device(device[0])
# parallely training on multiple CUDA devices
Expand Down Expand Up @@ -176,7 +178,6 @@ def _send_data_to_given_device(self, data):
if isinstance(self.device, torch.device): # single device
data = map(lambda x: x.to(self.device), data)
else: # parallely training on multiple devices

# randomly choose one device to balance the workload
# device = np.random.choice(self.device)

Expand Down
1 change: 0 additions & 1 deletion pypots/classification/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,6 @@ def _train_model(
training_loader: DataLoader,
val_loader: DataLoader = None,
) -> None:

# each training starts from the very beginning, so reset the loss and model dict here
self.best_loss = float("inf")
self.best_model_dict = None
Expand Down
1 change: 0 additions & 1 deletion pypots/classification/raindrop/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ def forward(
edge_attr: OptTensor = None,
return_attention_weights=None,
) -> Tuple[torch.Tensor, Any]:

r"""
Args:
return_attention_weights (bool, optional): If set to :obj:`True`,
Expand Down
2 changes: 1 addition & 1 deletion pypots/cli/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import shutil
from argparse import Namespace

from tsdb.utils.downloading import _download_and_extract
from tsdb.data_processing import _download_and_extract

from ..cli.base import BaseCommand
from ..utils.logging import logger
Expand Down
1 change: 0 additions & 1 deletion pypots/clustering/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,6 @@ def _train_model(
training_loader: DataLoader,
val_loader: DataLoader = None,
) -> None:

"""

Parameters
Expand Down
1 change: 0 additions & 1 deletion pypots/clustering/crli/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ def __init__(
saving_path: Optional[str] = None,
model_saving_strategy: Optional[str] = "best",
):

super().__init__(
n_clusters,
batch_size,
Expand Down
2 changes: 0 additions & 2 deletions pypots/clustering/vader/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ def forward(
) = self.get_results(X, missing_mask)

if not training and not pretrain:

results = {
"mu_tilde": mu_tilde,
"mu": mu_c,
Expand Down Expand Up @@ -403,7 +402,6 @@ def _train_model(
training_loader: DataLoader,
val_loader: DataLoader = None,
) -> None:

# each training starts from the very beginning, so reset the loss and model dict here
self.best_loss = float("inf")
self.best_model_dict = None
Expand Down
1 change: 0 additions & 1 deletion pypots/forecasting/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,6 @@ def _train_model(
training_loader: DataLoader,
val_loader: DataLoader = None,
) -> None:

# each training starts from the very beginning, so reset the loss and model dict here
self.best_loss = float("inf")
self.best_model_dict = None
Expand Down
10 changes: 3 additions & 7 deletions pypots/imputation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@
from .saits import SAITS
from .transformer import Transformer
from .mrnn import MRNN
from .gpvae import GPVAE
from .usgan import USGAN

__all__ = [
"SAITS",
"Transformer",
"BRITS",
"MRNN",
"LOCF",
]
__all__ = ["SAITS", "Transformer", "BRITS", "MRNN", "LOCF", "GPVAE" "USGAN"]
12 changes: 12 additions & 0 deletions pypots/imputation/gpvae/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
The package of the partially-observed time-series imputation method GP-VAE.
"""

# Created by Jun Wang <[email protected]>
# License: GLP-v3

from .model import GPVAE

__all__ = [
"GPVAE",
]
144 changes: 144 additions & 0 deletions pypots/imputation/gpvae/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""
Dataset class for model GP-VAE.
"""

# Created by Jun Wang <[email protected]>
# License: GLP-v3

from typing import Union, Iterable

import torch

from ...data.base import BaseDataset
from ...data.utils import torch_parse_delta


class DatasetForGPVAE(BaseDataset):
"""Dataset class for BRITS.

Parameters
----------
data : dict or str,
The dataset for model input, should be a dictionary including keys as 'X' and 'y',
or a path string locating a data file.
If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features],
which is time-series data for input, can contain missing values, and y should be array-like of shape
[n_samples], which is classification labels of X.
If it is a path string, the path should point to a data file, e.g. a h5 file, which contains
key-value pairs like a dict, and it has to include keys as 'X' and 'y'.

return_labels : bool, default = True,
Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example,
during training of classification models, the Dataset class will return labels in __getitem__() for model input.
Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we
need the defined Dataset class for all training/validating/testing stages. For those big datasets stored in h5
files, they already have both X and y saved. But we don't read labels from the file for validating and testing
with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for
distinction.

file_type : str, default = "h5py"
The type of the given file if train_set and val_set are path strings.
"""

def __init__(
self,
data: Union[dict, str],
return_labels: bool = True,
file_type: str = "h5py",
):
super().__init__(data, return_labels, file_type)

if not isinstance(self.data, str):
# calculate all delta here.
missing_mask = (~torch.isnan(self.X)).type(torch.float32)
X = torch.nan_to_num(self.X)
delta = torch_parse_delta(missing_mask)

self.processed_data = {
"X": X,
"missing_mask": missing_mask,
"delta": delta,
}

def _fetch_data_from_array(self, idx: int) -> Iterable:
"""Fetch data from self.X if it is given.

Parameters
----------
idx : int,
The index of the sample to be return.

Returns
-------
sample : list,
A list contains

index : int tensor,
The index of the sample.

X : tensor,
The feature vector for model input.

missing_mask : tensor,
The mask indicates all missing values in X.

delta : tensor,
The delta matrix contains time gaps of missing values.

label (optional) : tensor,
The target label of the time-series sample.
"""
sample = [
torch.tensor(idx),
# for forward
self.processed_data["X"][idx].to(torch.float32),
self.processed_data["missing_mask"][idx].to(torch.float32),
self.processed_data["delta"][idx].to(torch.float32),
]

if self.y is not None and self.return_labels:
sample.append(self.y[idx].to(torch.long))

return sample

def _fetch_data_from_file(self, idx: int) -> Iterable:
"""Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples.
Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice.

Parameters
----------
idx : int,
The index of the sample to be return.

Returns
-------
sample : list,
The collated data sample, a list including all necessary sample info.
"""

if self.file_handle is None:
self.file_handle = self._open_file_handle()

X = torch.from_numpy(self.file_handle["X"][idx])
missing_mask = (~torch.isnan(X)).to(torch.float32)
X = torch.nan_to_num(X)

forward = {
"X": X,
"missing_mask": missing_mask,
"deltas": torch_parse_delta(missing_mask),
}

sample = [
torch.tensor(idx),
# for forward
forward["X"],
forward["missing_mask"],
forward["deltas"],
]

# if the dataset has labels and is for training, then fetch it from the file
if "y" in self.file_handle.keys() and self.return_labels:
sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long))

return sample
Loading