diff --git a/.github/workflows/testing_ci.yml b/.github/workflows/testing_ci.yml index d339afe5..4cdfe5bc 100644 --- a/.github/workflows/testing_ci.yml +++ b/.github/workflows/testing_ci.yml @@ -15,43 +15,61 @@ jobs: runs-on: ${{ matrix.os }} defaults: run: - shell: bash -l {0} + shell: bash {0} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macOS-latest] - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.10"] + torch-version: ["1.13.1"] steps: - name: Check out the repo code uses: actions/checkout@v3 - - name: Set up Conda - uses: conda-incubator/setup-miniconda@v2 + - name: Determine the Python version + uses: haya14busa/action-cond@v1 + id: condval with: - activate-environment: pypots-test - python-version: ${{ matrix.python-version }} - environment-file: tests/environment_for_conda_test.yml - auto-activate-base: false + cond: ${{ matrix.python-version == 3.7 && matrix.os == 'macOS-latest' }} + # Note: the latest 3.7 subversion 3.7.17 for MacOS has "ModuleNotFoundError: No module named '_bz2'" + if_true: "3.7.16" + if_false: ${{ matrix.python-version }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ steps.condval.outputs.value }} + check-latest: true + cache: pip + cache-dependency-path: | + setup.cfg + + - name: Install PyTorch ${{ matrix.torch-version }}+cpu + # we have to install torch in advance because torch_sparse needs it for compilation, + # refer to https://github.com/rusty1s/pytorch_sparse/issues/156#issuecomment-1304869772 for details + run: | + which python + which pip + python -m pip install --upgrade pip + pip install torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cpu + python -c "import torch; print('PyTorch:', torch.__version__)" + + - name: Install other dependencies + run: | + pip install pypots + pip install torch-geometric torch-scatter torch-sparse -f "https://data.pyg.org/whl/torch-${{ matrix.torch-version }}+cpu.html" + pip install -e ".[dev]" - name: Fetch the test environment details run: | which python - conda info - conda list + pip list - name: Test with pytest run: | - # run tests separately here due to Segmentation Fault in test_clustering when run all in - # one command with `pytest` on MacOS. Bugs not caught, so this is a trade-off to avoid SF. 
- python -m pytest -rA tests/test_classification.py -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_optim.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_data.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_utils.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - python -m pytest -rA tests/test_cli.py -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + rm -rf tests/__pycache__ + python -m pytest -rA tests/*/* -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc - name: Generate the LCOV report run: | @@ -61,4 +79,4 @@ jobs: uses: coverallsapp/github-action@master with: github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: 'coverage.lcov' + path-to-lcov: "coverage.lcov" diff --git a/.github/workflows/testing_daily.yml b/.github/workflows/testing_daily.yml index f0b3ba61..5e41630f 100644 --- a/.github/workflows/testing_daily.yml +++ b/.github/workflows/testing_daily.yml @@ -10,61 +10,43 @@ jobs: runs-on: ${{ matrix.os }} defaults: run: - shell: bash {0} + shell: bash -l {0} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macOS-latest] - python-version: ["3.7", "3.8", "3.9", "3.10"] - torch-version: ["1.13.1"] + python-version: ["3.7", "3.10"] steps: - name: Check out the repo code uses: actions/checkout@v3 - - name: Determine the Python version - uses: haya14busa/action-cond@v1 - id: condval + - name: Set up Conda + uses: conda-incubator/setup-miniconda@v2 with: - cond: ${{ matrix.python-version == 3.7 && matrix.os == 'macOS-latest' }} - # Note: the latest 3.7 subversion 3.7.17 for MacOS has "ModuleNotFoundError: No module named '_bz2'" - if_true: "3.7.16" - if_false: ${{ matrix.python-version }} - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ steps.condval.outputs.value }} - check-latest: true - cache: pip - cache-dependency-path: | - setup.cfg - - - name: Install PyTorch ${{ matrix.torch-version }}+cpu - # we have to install torch in advance because torch_sparse needs it for compilation, - # refer to https://github.com/rusty1s/pytorch_sparse/issues/156#issuecomment-1304869772 for details - run: | - which python - which pip - python -m pip install --upgrade pip - pip install torch==${{ matrix.torch-version }} -f https://download.pytorch.org/whl/cpu - python -c "import torch; print('PyTorch:', torch.__version__)" - - - name: Install other dependencies - run: | - pip install pypots - pip install torch-geometric torch-scatter torch-sparse -f "https://data.pyg.org/whl/torch-${{ matrix.torch-version }}+cpu.html" - pip install -e ".[dev]" + activate-environment: pypots-test + python-version: ${{ matrix.python-version }} + environment-file: tests/environment_for_conda_test.yml + auto-activate-base: false - name: Fetch the test environment details run: | which python - pip list + conda info + conda list - name: Test with pytest run: | - coverage run --source=pypots -m pytest --ignore tests/test_training_on_multi_gpus.py - # 
ignore the test_training_on_multi_gpus.py because it requires multiple GPUs which are not available on GitHub Actions + # run tests separately here due to Segmentation Fault in test_clustering when run all in + # one command with `pytest` on MacOS. Bugs not caught, so this is a trade-off to avoid SF. + python -m pytest -rA tests/classification/* -n auto --cov=pypots --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/imputation/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/clustering/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/forecasting/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/optim/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/data/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/utils/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc + python -m pytest -rA tests/cli/* -n auto --cov=pypots --cov-append --dist=loadgroup --cov-config=.coveragerc - name: Generate the LCOV report run: | @@ -74,4 +56,4 @@ jobs: uses: coverallsapp/github-action@master with: github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: "coverage.lcov" + path-to-lcov: 'coverage.lcov' diff --git a/.gitignore b/.gitignore index 0841fdef..51294f38 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,8 @@ docs/_build .coverage .pytest_cache *__pycache__* -*testing_results* +*test* # ignore specific kinds of files like all PDFs *.pdf +*.ipynb diff --git a/README.md b/README.md index 7b591634..9c86f08a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ -##

[old heading line: "Welcome to PyPOTS" banner image; HTML markup stripped in extraction]
+[new heading line: "Welcome to PyPOTS" banner image; markup stripped]
+**[added tagline: "A Python Toolbox for Data Mining on Partially-Observed Time Series"]**

@@ -161,6 +162,8 @@ PyPOTS supports imputation, classification, clustering, and forecasting tasks on
| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** |
| Neural Net | SAITS | Self-Attention-based Imputation for Time Series [^1] | 2023 |
| Neural Net | Transformer | Attention is All you Need [^2];<br>Self-Attention-based Imputation for Time Series [^1];<br>Note: proposed in [^2], and re-implemented as an imputation model in [^1]. | 2017 |
+| Neural Net | US-GAN | Generative Semi-supervised Learning for Multivariate Time Series Imputation [^10] | 2021 |
+| Neural Net | GP-VAE | GP-VAE: Deep Probabilistic Time Series Imputation [^11] | 2020 |
| Neural Net | BRITS | Bidirectional Recurrent Imputation for Time Series [^3] | 2018 |
| Neural Net | M-RNN | Multi-directional Recurrent Neural Network [^9] | 2019 |
| Naive | LOCF | Last Observation Carried Forward | - |
@@ -253,7 +256,7 @@ We care about the feedback from our users, so we're building PyPOTS community on
If you have any suggestions or want to contribute ideas or share time-series related papers, join us and tell.
PyPOTS community is open, transparent, and surely friendly. Let's work together to build and improve PyPOTS!
-
+[//]: # (Use APA reference style below)
[^1]: Du, W., Cote, D., & Liu, Y. (2023). [SAITS: Self-Attention-based Imputation for Time Series](https://doi.org/10.1016/j.eswa.2023.119619). *Expert systems with applications*.
[^2]: Vaswani, A., Shazeer, N.M., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., & Polosukhin, I. (2017). [Attention is All you Need](https://papers.nips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html). *NeurIPS 2017*.
[^3]: Cao, W., Wang, D., Li, J., Zhou, H., Li, L., & Li, Y. (2018). [BRITS: Bidirectional Recurrent Imputation for Time Series](https://papers.nips.cc/paper/2018/hash/734e6bfcd358e25ac1db0a4241b95651-Abstract.html). *NeurIPS 2018*.
@@ -263,7 +266,8 @@ PyPOTS community is open, transparent, and surely friendly. Let's work together
[^7]: Jong, J.D., Emon, M.A., Wu, P., Karki, R., Sood, M., Godard, P., Ahmad, A., Vrooman, H.A., Hofmann-Apitius, M., & Fröhlich, H. (2019). [Deep learning for clustering of multivariate clinical patient trajectories with missing values](https://academic.oup.com/gigascience/article/8/11/giz134/5626377). *GigaScience*.
[^8]: Chen, X., & Sun, L. (2021). [Bayesian Temporal Factorization for Multidimensional Time Series Prediction](https://arxiv.org/abs/1910.06366). *IEEE transactions on pattern analysis and machine intelligence*.
[^9]: Yoon, J., Zame, W. R., & van der Schaar, M. (2019). [Estimating Missing Data in Temporal Data Streams Using Multi-Directional Recurrent Neural Networks](https://ieeexplore.ieee.org/document/8485748). *IEEE Transactions on Biomedical Engineering*.
-
+[^10]: Miao, X., Wu, Y., Wang, J., Gao, Y., Mao, X., & Yin, J. (2021). [Generative Semi-supervised Learning for Multivariate Time Series Imputation](https://ojs.aaai.org/index.php/AAAI/article/view/17086). *AAAI 2021*.
+[^11]: Fortuin, V., Baranchuk, D., Raetsch, G., & Mandt, S. (2020). [GP-VAE: Deep Probabilistic Time Series Imputation](https://proceedings.mlr.press/v108/fortuin20a.html). *AISTATS 2020*.

🏠 Visits @@ -271,4 +275,4 @@ PyPOTS community is open, transparent, and surely friendly. Let's work together PyPOTS visits
-
\ No newline at end of file +
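Reviewer note: to make the README additions above concrete, the two new imputers follow the same high-level API as the existing ones. The snippet below is a minimal, hypothetical usage sketch inferred from the constructor signatures and the `fit`/`impute` methods that appear later in this patch; the data shapes and hyperparameter values are illustrative assumptions, and `USGAN.fit` is assumed to mirror `GPVAE.fit` (its body is not shown in this excerpt).

```python
import numpy as np

from pypots.imputation import GPVAE, USGAN  # both exported by the updated pypots/imputation/__init__.py

# Toy partially-observed data: 100 samples, 48 time steps, 10 features (illustrative shapes).
X = np.random.randn(100, 48, 10).astype(np.float32)
X[X < -1.5] = np.nan  # knock out some values to simulate missingness
train_set = {"X": X}  # the dict form accepted by the Dataset classes in this diff

# GP-VAE: n_steps/n_features must match the data; latent_size and epochs are illustrative choices.
gpvae = GPVAE(n_steps=48, n_features=10, latent_size=16, epochs=5)
gpvae.fit(train_set)
gpvae_imputation = gpvae.impute(train_set)  # ndarray of shape (100, 48, 10)

# US-GAN: built on top of BRITS, so it takes an RNN hidden size instead of a latent size.
usgan = USGAN(n_steps=48, n_features=10, rnn_hidden_size=32, epochs=5)
usgan.fit(train_set)
usgan_imputation = usgan.impute(train_set)
```

Validation during `fit` works as shown in `GPVAE.fit` below: a `val_set` is expected to carry `X`, `X_intact`, and `indicating_mask`.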
diff --git a/docs/about_us.rst b/docs/about_us.rst index aaaab944..370a3e0d 100644 --- a/docs/about_us.rst +++ b/docs/about_us.rst @@ -33,5 +33,5 @@ PyPOTS exists thanks to all the nice people (sorted by contribution time) who co .. raw:: html - + diff --git a/docs/pypots.data.rst b/docs/pypots.data.rst index d792d6aa..fe7c4678 100644 --- a/docs/pypots.data.rst +++ b/docs/pypots.data.rst @@ -10,6 +10,15 @@ pypots.data.base module :show-inheritance: :inherited-members: +pypots.data.saving module +----------------------------- + +.. automodule:: pypots.data.saving + :members: + :undoc-members: + :show-inheritance: + :inherited-members: + pypots.data.generating module ----------------------------- diff --git a/docs/pypots.forecasting.rst b/docs/pypots.forecasting.rst index 2ae67b85..c4ac76b7 100644 --- a/docs/pypots.forecasting.rst +++ b/docs/pypots.forecasting.rst @@ -1,11 +1,31 @@ pypots.forecasting package ========================== +Subpackages +----------- -pypots.forecasting.bttf module +.. toctree:: + :maxdepth: 4 + + pypots.forecasting.bttf + pypots.forecasting.template + +Submodules +---------- + +pypots.forecasting.base module ------------------------------ -.. automodule:: pypots.forecasting.bttf +.. automodule:: pypots.forecasting.base + :members: + :undoc-members: + :show-inheritance: + :inherited-members: + +Module contents +--------------- + +.. automodule:: pypots.forecasting :members: :undoc-members: :show-inheritance: diff --git a/docs/pypots.imputation.rst b/docs/pypots.imputation.rst index 0e31f8c8..a33e0fdf 100644 --- a/docs/pypots.imputation.rst +++ b/docs/pypots.imputation.rst @@ -19,6 +19,24 @@ pypots.imputation.transformer module :show-inheritance: :inherited-members: +pypots.imputation.usgan module +------------------------------ + +.. automodule:: pypots.imputation.usgan + :members: + :undoc-members: + :show-inheritance: + :inherited-members: + +pypots.imputation.gpvae module +------------------------------ + +.. automodule:: pypots.imputation.gpvae + :members: + :undoc-members: + :show-inheritance: + :inherited-members: + pypots.imputation.brits module ------------------------------ diff --git a/pypots/base.py b/pypots/base.py index f55033e3..7a12fe94 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -96,7 +96,9 @@ def _setup_device(self, device: Union[None, str, torch.device, list]): self.device = device elif isinstance(device, list): if len(device) == 0: - raise ValueError("The list of devices should have at least 1 device, but got 0.") + raise ValueError( + "The list of devices should have at least 1 device, but got 0." 
+ ) elif len(device) == 1: return self._setup_device(device[0]) # parallely training on multiple CUDA devices @@ -176,7 +178,6 @@ def _send_data_to_given_device(self, data): if isinstance(self.device, torch.device): # single device data = map(lambda x: x.to(self.device), data) else: # parallely training on multiple devices - # randomly choose one device to balance the workload # device = np.random.choice(self.device) diff --git a/pypots/classification/base.py b/pypots/classification/base.py index a30fd698..a16dbc01 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -256,7 +256,6 @@ def _train_model( training_loader: DataLoader, val_loader: DataLoader = None, ) -> None: - # each training starts from the very beginning, so reset the loss and model dict here self.best_loss = float("inf") self.best_model_dict = None diff --git a/pypots/classification/grud/data.py b/pypots/classification/grud/data.py index 52186017..edf1d4d0 100644 --- a/pypots/classification/grud/data.py +++ b/pypots/classification/grud/data.py @@ -123,7 +123,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: if self.file_handle is None: self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handle["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) missing_mask = (~torch.isnan(X)).to(torch.float32) X_filledLOCF = self.locf._locf_torch(X.unsqueeze(dim=0)).squeeze() X = torch.nan_to_num(X) diff --git a/pypots/classification/raindrop/modules.py b/pypots/classification/raindrop/modules.py index 76a992ef..191ff9c7 100644 --- a/pypots/classification/raindrop/modules.py +++ b/pypots/classification/raindrop/modules.py @@ -174,7 +174,6 @@ def forward( edge_attr: OptTensor = None, return_attention_weights=None, ) -> Tuple[torch.Tensor, Any]: - r""" Args: return_attention_weights (bool, optional): If set to :obj:`True`, diff --git a/pypots/classification/template/dataset.py b/pypots/classification/template/data.py similarity index 100% rename from pypots/classification/template/dataset.py rename to pypots/classification/template/data.py diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index 324e6718..fd9b7f0d 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -244,7 +244,6 @@ def _train_model( training_loader: DataLoader, val_loader: DataLoader = None, ) -> None: - """ Parameters diff --git a/pypots/clustering/crli/model.py b/pypots/clustering/crli/model.py index b5e2e14a..8b7a63a1 100644 --- a/pypots/clustering/crli/model.py +++ b/pypots/clustering/crli/model.py @@ -226,7 +226,6 @@ def __init__( saving_path: Optional[str] = None, model_saving_strategy: Optional[str] = "best", ): - super().__init__( n_clusters, batch_size, diff --git a/pypots/clustering/template/dataset.py b/pypots/clustering/template/data.py similarity index 100% rename from pypots/clustering/template/dataset.py rename to pypots/clustering/template/data.py diff --git a/pypots/clustering/vader/data.py b/pypots/clustering/vader/data.py index a3b2f91d..a8910b44 100644 --- a/pypots/clustering/vader/data.py +++ b/pypots/clustering/vader/data.py @@ -6,12 +6,12 @@ # License: GLP-v3 -from typing import Union +from typing import Union, Iterable -from ..crli.data import DatasetForCRLI +from ...data.base import BaseDataset -class DatasetForVaDER(DatasetForCRLI): +class DatasetForVaDER(BaseDataset): """Dataset class for model VaDER. 
Parameters @@ -45,3 +45,9 @@ def __init__( file_type: str = "h5py", ): super().__init__(data, return_labels, file_type) + + def _fetch_data_from_array(self, idx: int) -> Iterable: + return super()._fetch_data_from_array(idx) + + def _fetch_data_from_file(self, idx: int) -> Iterable: + return super()._fetch_data_from_file(idx) diff --git a/pypots/clustering/vader/model.py b/pypots/clustering/vader/model.py index f2912cce..5a44da85 100644 --- a/pypots/clustering/vader/model.py +++ b/pypots/clustering/vader/model.py @@ -184,7 +184,6 @@ def forward( ) = self.get_results(X, missing_mask) if not training and not pretrain: - results = { "mu_tilde": mu_tilde, "mu": mu_c, @@ -403,7 +402,6 @@ def _train_model( training_loader: DataLoader, val_loader: DataLoader = None, ) -> None: - # each training starts from the very beginning, so reset the loss and model dict here self.best_loss = float("inf") self.best_model_dict = None diff --git a/pypots/data/base.py b/pypots/data/base.py index 86b15fc2..1bef9f9c 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -204,13 +204,13 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: The collated data sample, a list including all necessary sample info. """ - X = self.X[idx] - missing_mask = ~torch.isnan(X) + X = self.X[idx].to(torch.float32) + missing_mask = (~torch.isnan(X)).to(torch.float32) X = torch.nan_to_num(X) sample = [ torch.tensor(idx), - X.to(torch.float32), - missing_mask.to(torch.float32), + X, + missing_mask, ] if self.y is not None and self.return_labels: @@ -279,13 +279,13 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: if self.file_handle is None: self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handle["X"][idx]) - missing_mask = ~torch.isnan(X) + X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) + missing_mask = (~torch.isnan(X)).to(torch.float32) X = torch.nan_to_num(X) sample = [ torch.tensor(idx), - X.to(torch.float32), - missing_mask.to(torch.float32), + X, + missing_mask, ] # if the dataset has labels and is for training, then fetch it from the file diff --git a/pypots/data/saving.py b/pypots/data/saving.py index 8581ad50..61138df2 100644 --- a/pypots/data/saving.py +++ b/pypots/data/saving.py @@ -14,7 +14,11 @@ from pypots.utils.logging import logger -def save_dict_into_h5(data_dict: dict, saving_dir: str) -> None: +def save_dict_into_h5( + data_dict: dict, + saving_dir: str, + saving_name: str = "datasets.h5", +) -> None: """Save the given data (in a dictionary) into the given h5 file. Parameters @@ -25,6 +29,9 @@ def save_dict_into_h5(data_dict: dict, saving_dir: str) -> None: saving_dir : str, The h5 file to save the data. + saving_name : str, optional (default="datasets.h5") + The final name of the saved h5 file. 
+ """ def save_set(handle, name, data): @@ -36,7 +43,7 @@ def save_set(handle, name, data): handle.create_dataset(name, data=data) create_dir_if_not_exist(saving_dir) - saving_path = os.path.join(saving_dir, "datasets.h5") + saving_path = os.path.join(saving_dir, saving_name) with h5py.File(saving_path, "w") as hf: for k, v in data_dict.items(): save_set(hf, k, v) diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index 5188999b..079f5925 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -242,7 +242,6 @@ def _train_model( training_loader: DataLoader, val_loader: DataLoader = None, ) -> None: - # each training starts from the very beginning, so reset the loss and model dict here self.best_loss = float("inf") self.best_model_dict = None diff --git a/pypots/forecasting/template/dataset.py b/pypots/forecasting/template/data.py similarity index 100% rename from pypots/forecasting/template/dataset.py rename to pypots/forecasting/template/data.py diff --git a/pypots/imputation/__init__.py b/pypots/imputation/__init__.py index 9de8d0bc..a6c4dcd8 100644 --- a/pypots/imputation/__init__.py +++ b/pypots/imputation/__init__.py @@ -6,10 +6,12 @@ # License: GPL-v3 from .brits import BRITS +from .gpvae import GPVAE from .locf import LOCF +from .mrnn import MRNN from .saits import SAITS from .transformer import Transformer -from .mrnn import MRNN +from .usgan import USGAN __all__ = [ "SAITS", @@ -17,4 +19,6 @@ "BRITS", "MRNN", "LOCF", + "GPVAE", + "USGAN", ] diff --git a/pypots/imputation/brits/data.py b/pypots/imputation/brits/data.py index f39e411c..342ede98 100644 --- a/pypots/imputation/brits/data.py +++ b/pypots/imputation/brits/data.py @@ -59,14 +59,14 @@ def __init__( self.processed_data = { "forward": { - "X": forward_X, - "missing_mask": forward_missing_mask, - "delta": forward_delta, + "X": forward_X.to(torch.float32), + "missing_mask": forward_missing_mask.to(torch.float32), + "delta": forward_delta.to(torch.float32), }, "backward": { - "X": backward_X, - "missing_mask": backward_missing_mask, - "delta": backward_delta, + "X": backward_X.to(torch.float32), + "missing_mask": backward_missing_mask.to(torch.float32), + "delta": backward_delta.to(torch.float32), }, } @@ -101,13 +101,13 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: sample = [ torch.tensor(idx), # for forward - self.processed_data["forward"]["X"][idx].to(torch.float32), - self.processed_data["forward"]["missing_mask"][idx].to(torch.float32), - self.processed_data["forward"]["delta"][idx].to(torch.float32), + self.processed_data["forward"]["X"][idx], + self.processed_data["forward"]["missing_mask"][idx], + self.processed_data["forward"]["delta"][idx], # for backward - self.processed_data["backward"]["X"][idx].to(torch.float32), - self.processed_data["backward"]["missing_mask"][idx].to(torch.float32), - self.processed_data["backward"]["delta"][idx].to(torch.float32), + self.processed_data["backward"]["X"][idx], + self.processed_data["backward"]["missing_mask"][idx], + self.processed_data["backward"]["delta"][idx], ] if self.y is not None and self.return_labels: @@ -133,7 +133,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: if self.file_handle is None: self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handle["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) missing_mask = (~torch.isnan(X)).to(torch.float32) X = torch.nan_to_num(X) diff --git a/pypots/imputation/gpvae/__init__.py 
b/pypots/imputation/gpvae/__init__.py new file mode 100644 index 00000000..f5ffb05e --- /dev/null +++ b/pypots/imputation/gpvae/__init__.py @@ -0,0 +1,12 @@ +""" +The package of the partially-observed time-series imputation method GP-VAE. +""" + +# Created by Jun Wang +# License: GLP-v3 + +from .model import GPVAE + +__all__ = [ + "GPVAE", +] diff --git a/pypots/imputation/gpvae/data.py b/pypots/imputation/gpvae/data.py new file mode 100644 index 00000000..8bb9be8c --- /dev/null +++ b/pypots/imputation/gpvae/data.py @@ -0,0 +1,132 @@ +""" +Dataset class for model GP-VAE. +""" + +# Created by Jun Wang and Wenjie Du +# License: GLP-v3 + +from typing import Union, Iterable + +import torch + +from ...data.base import BaseDataset + + +class DatasetForGPVAE(BaseDataset): + """Dataset class for GP-VAE. + + Parameters + ---------- + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + return_labels : bool, default = True, + Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, + during training of classification models, the Dataset class will return labels in __getitem__() for model input. + Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we + need the defined Dataset class for all training/validating/testing stages. For those big datasets stored in h5 + files, they already have both X and y saved. But we don't read labels from the file for validating and testing + with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for + distinction. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. + """ + + def __init__( + self, + data: Union[dict, str], + return_labels: bool = True, + file_type: str = "h5py", + ): + super().__init__(data, return_labels, file_type) + + if not isinstance(self.data, str): + # calculate all delta here. + missing_mask = (~torch.isnan(self.X)).type(torch.float32) + X = torch.nan_to_num(self.X).to(torch.float32) + + self.processed_data = { + "X": X, + "missing_mask": missing_mask, + } + + def _fetch_data_from_array(self, idx: int) -> Iterable: + """Fetch data from self.X if it is given. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + A list contains + + index : int tensor, + The index of the sample. + + X : tensor, + The feature vector for model input. + + missing_mask : tensor, + The mask indicates all missing values in X. + + delta : tensor, + The delta matrix contains time gaps of missing values. + + label (optional) : tensor, + The target label of the time-series sample. 
+ """ + sample = [ + torch.tensor(idx), + # for forward + self.processed_data["X"][idx], + self.processed_data["missing_mask"][idx], + ] + + if self.y is not None and self.return_labels: + sample.append(self.y[idx].to(torch.long)) + + return sample + + def _fetch_data_from_file(self, idx: int) -> Iterable: + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handle is None: + self.file_handle = self._open_file_handle() + + X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) + missing_mask = (~torch.isnan(X)).to(torch.float32) + X = torch.nan_to_num(X) + + sample = [ + torch.tensor(idx), + X, + missing_mask, + ] + + # if the dataset has labels and is for training, then fetch it from the file + if "y" in self.file_handle.keys() and self.return_labels: + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) + + return sample diff --git a/pypots/imputation/gpvae/model.py b/pypots/imputation/gpvae/model.py new file mode 100644 index 00000000..6b613d4d --- /dev/null +++ b/pypots/imputation/gpvae/model.py @@ -0,0 +1,446 @@ +""" +The implementation of GP-VAE for the partially-observed time-series imputation task. + +Refer to the paper Fortuin V, Baranchuk D, Rätsch G, et al. +GP-VAE: Deep probabilistic time series imputation. AISTATS. PMLR, 2020: 1651-1661. + +""" + +# Created by Jun Wang and Wenjie Du +# License: GPL-v3 + + +from typing import Union, Optional + +import h5py +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader + +from .data import DatasetForGPVAE +from .modules import ( + Encoder, + rbf_kernel, + diffusion_kernel, + matern_kernel, + cauchy_kernel, + Decoder, +) +from ..base import BaseNNImputer +from ...optim.adam import Adam +from ...optim.base import Optimizer + + +class _GPVAE(nn.Module): + """model GPVAE with Gaussian Process prior + + Parameters + ---------- + input_dim : int, + the feature dimension of the input + + time_length : int, + the length of each time series + + latent_dim : int, + the feature dimension of the latent embedding + + encoder_sizes : tuple, + the tuple of the network size in encoder + + decoder_sizes : tuple, + the tuple of the network size in decoder + + beta : float, + the weight of the KL divergence + + M : int, + the number of Monte Carlo samples for ELBO estimation + + K : int, + the number of importance weights for IWAE model + + kernel : str, + the Gaussian Process kernel ["cauchy", "diffusion", "rbf", "matern"] + + sigma : float, + the scale parameter for a kernel function + + length_scale : float, + the length scale parameter for a kernel function + + kernel_scales : int, + the number of different length scales over latent space dimensions + """ + + def __init__( + self, + input_dim, + time_length, + latent_dim, + encoder_sizes=(64, 64), + decoder_sizes=(64, 64), + beta=1, + M=1, + K=1, + kernel="cauchy", + sigma=1.0, + length_scale=7.0, + kernel_scales=1, + window_size=24, + ): + super().__init__() + self.kernel = kernel + self.sigma = sigma + self.length_scale = length_scale + self.kernel_scales = kernel_scales + + self.input_dim = input_dim + self.time_length = time_length + 
self.latent_dim = latent_dim + self.beta = beta + self.encoder = Encoder(input_dim, latent_dim, encoder_sizes, window_size) + self.decoder = Decoder(latent_dim, input_dim, decoder_sizes) + self.M = M + self.K = K + + # Precomputed KL components for efficiency + self.prior = self._init_prior() + # self.pz_scale_inv = None + # self.pz_scale_log_abs_determinant = None + + def encode(self, x): + return self.encoder(x) + + def decode(self, z): + if not torch.is_tensor(z): + z = torch.tensor(z).float() + num_dim = len(z.shape) + assert num_dim > 2 + return self.decoder(torch.transpose(z, num_dim - 1, num_dim - 2)) + + def forward(self, inputs, training=True): + x = inputs["X"] + m_mask = inputs["missing_mask"] + x = x.repeat(self.M * self.K, 1, 1) + if m_mask is not None: + m_mask = m_mask.repeat(self.M * self.K, 1, 1) + m_mask = m_mask.type(torch.bool) + + # pz = self.prior() + qz_x = self.encode(x) + z = qz_x.rsample() + px_z = self.decode(z) + + nll = -px_z.log_prob(x) + nll = torch.where(torch.isfinite(nll), nll, torch.zeros_like(nll)) + if m_mask is not None: + nll = torch.where(m_mask, nll, torch.zeros_like(nll)) + nll = nll.sum(dim=(1, 2)) + + if self.K > 1: + kl = qz_x.log_prob(z) - self.prior.log_prob(z) + kl = torch.where(torch.isfinite(kl), kl, torch.zeros_like(kl)) + kl = kl.sum(1) + + weights = -nll - kl + weights = torch.reshape(weights, [self.M, self.K, -1]) + + elbo = torch.logsumexp(weights, dim=1) + elbo = elbo.mean() + else: + kl = self.kl_divergence(qz_x, self.prior) + kl = torch.where(torch.isfinite(kl), kl, torch.zeros_like(kl)) + kl = kl.sum(1) + + elbo = -nll - self.beta * kl + elbo = elbo.mean() + + imputed_data = self.decode(self.encode(x).mean).mean * ~m_mask + x * m_mask + + if not training: + # if not in training mode, return the classification result only + return { + "imputed_data": imputed_data, + } + + results = { + "loss": -elbo.mean(), + "imputed_data": imputed_data, + } + return results + + @staticmethod + def kl_divergence(a, b): + # TODO: different from the author's implementation + return torch.distributions.kl.kl_divergence(a, b) + + def _init_prior(self): + # Compute kernel matrices for each latent dimension + kernel_matrices = [] + for i in range(self.kernel_scales): + if self.kernel == "rbf": + kernel_matrices.append( + rbf_kernel(self.time_length, self.length_scale / 2**i) + ) + elif self.kernel == "diffusion": + kernel_matrices.append( + diffusion_kernel(self.time_length, self.length_scale / 2**i) + ) + elif self.kernel == "matern": + kernel_matrices.append( + matern_kernel(self.time_length, self.length_scale / 2**i) + ) + elif self.kernel == "cauchy": + kernel_matrices.append( + cauchy_kernel( + self.time_length, self.sigma, self.length_scale / 2**i + ) + ) + + # Combine kernel matrices for each latent dimension + tiled_matrices = [] + total = 0 + for i in range(self.kernel_scales): + if i == self.kernel_scales - 1: + multiplier = self.latent_dim - total + else: + multiplier = int(np.ceil(self.latent_dim / self.kernel_scales)) + total += multiplier + tiled_matrices.append( + torch.unsqueeze(kernel_matrices[i], 0).repeat(multiplier, 1, 1) + ) + kernel_matrix_tiled = torch.cat(tiled_matrices) + assert len(kernel_matrix_tiled) == self.latent_dim + prior = torch.distributions.MultivariateNormal( + loc=torch.zeros(self.latent_dim, self.time_length), + covariance_matrix=kernel_matrix_tiled, + ) + + return prior + + +class GPVAE(BaseNNImputer): + """The PyTorch implementation of the GPVAE model :cite:``. 
+
+    Parameters
+    ----------
+    beta:
+        The weight of the KL divergence in the ELBO.
+
+    kernel:
+        The type of kernel function chosen in the Gaussian Process prior. ["cauchy", "diffusion", "rbf", "matern"]
+
+    batch_size :
+        The batch size for training and evaluating the model.
+
+    epochs :
+        The number of epochs for training the model.
+
+    patience :
+        The patience for the early-stopping mechanism. Given a positive integer, the training process will be
+        stopped when the model does not perform better after that number of epochs.
+        Leaving it default as None will disable the early-stopping.
+
+    optimizer :
+        The optimizer for model training.
+        If not given, will use a default Adam optimizer.
+
+    num_workers :
+        The number of subprocesses to use for data loading.
+        `0` means data loading will be in the main process, i.e. there won't be subprocesses.
+
+    device :
+        The device for the model to run on. It can be a string, a :class:`torch.device` object, or a list of them.
+        If not given, will try to use CUDA devices first (will use the default CUDA device if there are multiple),
+        then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
+        If given a list of devices, e.g. ['cuda:0', 'cuda:1'], or [torch.device('cuda:0'), torch.device('cuda:1')], the
+        model will be trained in parallel on the multiple devices (so far only parallel training on CUDA devices is supported).
+        Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
+
+    saving_path :
+        The path for automatically saving model checkpoints and tensorboard files (i.e. loss values recorded during
+        training into a tensorboard file). Will not save if not given.
+
+    model_saving_strategy :
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        No model will be saved when it is set as None.
+        The "best" strategy will only automatically save the best model after the training is finished.
+        The "better" strategy will automatically save the model during training whenever the model performs
+        better than in previous epochs.
+
+    Attributes
+    ----------
+    model : :class:`torch.nn.Module`
+        The underlying GPVAE model.
+
+    optimizer : :class:`pypots.optim.Optimizer`
+        The optimizer for model training.
+ + """ + + def __init__( + self, + n_steps: int, + n_features: int, + latent_size: int, + encoder_sizes: tuple = (64, 64), + decoder_sizes: tuple = (64, 64), + kernel: str = "cauchy", + beta: float = 0.2, + M: int = 1, + K: int = 1, + sigma: float = 1.0, + length_scale: float = 7.0, + kernel_scales: int = 1, + window_size: int = 3, + batch_size: int = 32, + epochs: int = 100, + patience: int = None, + optimizer: Optional[Optimizer] = Adam(), + num_workers: int = 0, + device: Optional[Union[str, torch.device, list]] = None, + saving_path: str = None, + model_saving_strategy: Optional[str] = "best", + ): + super().__init__( + batch_size, + epochs, + patience, + num_workers, + device, + saving_path, + model_saving_strategy, + ) + + self.n_steps = n_steps + self.n_features = n_features + self.latent_size = latent_size + self.kernel = kernel + self.encoder_sizes = encoder_sizes + self.decoder_sizes = decoder_sizes + self.beta = beta + self.M = M + self.K = K + self.sigma = sigma + self.length_scale = length_scale + self.kernel_scales = kernel_scales + + # set up the model + self.model = _GPVAE( + input_dim=self.n_features, + time_length=self.n_steps, + latent_dim=self.latent_size, + kernel=self.kernel, + encoder_sizes=self.encoder_sizes, + decoder_sizes=self.decoder_sizes, + beta=self.beta, + M=self.M, + K=self.K, + sigma=self.sigma, + length_scale=self.length_scale, + kernel_scales=self.kernel_scales, + window_size=window_size, + ) + self._send_model_to_given_device() + self._print_model_size() + + # set up the optimizer + self.optimizer = optimizer + self.optimizer.init_optimizer(self.model.parameters()) + + def _assemble_input_for_training(self, data: list) -> dict: + # fetch data + ( + indices, + X, + missing_mask, + ) = self._send_data_to_given_device(data) + + # assemble input data + inputs = { + "indices": indices, + "X": X, + "missing_mask": missing_mask, + } + + return inputs + + def _assemble_input_for_validating(self, data: list) -> dict: + return self._assemble_input_for_training(data) + + def _assemble_input_for_testing(self, data: list) -> dict: + return self._assemble_input_for_validating(data) + + def fit( + self, + train_set: Union[dict, str], + val_set: Optional[Union[dict, str]] = None, + file_type: str = "h5py", + ) -> None: + # Step 1: wrap the input data with classes Dataset and DataLoader + training_set = DatasetForGPVAE( + train_set, return_labels=False, file_type=file_type + ) + training_loader = DataLoader( + training_set, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, + ) + val_loader = None + if val_set is not None: + if isinstance(val_set, str): + with h5py.File(val_set, "r") as hf: + # Here we read the whole validation set from the file to mask a portion for validation. + # In PyPOTS, using a file usually because the data is too big. However, the validation set is + # generally shouldn't be too large. For example, we have 1 billion samples for model training. + # We won't take 20% of them as the validation set because we want as much as possible data for the + # training stage to enhance the model's generalization ability. Therefore, 100,000 representative + # samples will be enough to validate the model. 
+ val_set = { + "X": hf["X"][:], + "X_intact": hf["X_intact"][:], + "indicating_mask": hf["indicating_mask"][:], + } + val_set = DatasetForGPVAE(val_set, return_labels=False, file_type=file_type) + val_loader = DataLoader( + val_set, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + ) + + # Step 2: train the model and freeze it + self._train_model(training_loader, val_loader) + self.model.load_state_dict(self.best_model_dict) + self.model.eval() # set the model as eval status to freeze it. + + # Step 3: save the model if necessary + self._auto_save_model_if_necessary(training_finished=True) + + def impute( + self, + X: Union[dict, str], + file_type="h5py", + ) -> np.ndarray: + self.model.eval() # set the model as eval status to freeze it. + test_set = DatasetForGPVAE(X, return_labels=False, file_type=file_type) + test_loader = DataLoader( + test_set, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, + ) + imputation_collector = [] + + with torch.no_grad(): + for idx, data in enumerate(test_loader): + inputs = self._assemble_input_for_testing(data) + results = self.model.forward(inputs, training=False) + imputed_data = results["imputed_data"] + imputation_collector.append(imputed_data) + + imputation_collector = torch.cat(imputation_collector) + return imputation_collector.cpu().detach().numpy() diff --git a/pypots/imputation/gpvae/modules.py b/pypots/imputation/gpvae/modules.py new file mode 100644 index 00000000..5ad81e09 --- /dev/null +++ b/pypots/imputation/gpvae/modules.py @@ -0,0 +1,261 @@ +""" +The implementation of GP-VAE for the partially-observed time-series imputation task. + +Refer to the paper Fortuin V, Baranchuk D, Rätsch G, et al. +GP-VAE: Deep probabilistic time series imputation. AISTATS. PMLR, 2020: 1651-1661. 
+ + +""" + +# Created by Jun Wang and Wenjie Du +# License: GPL-v3 + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def rbf_kernel(T, length_scale): + xs = torch.arange(T).float() + xs_in = torch.unsqueeze(xs, 0) + xs_out = torch.unsqueeze(xs, 1) + distance_matrix = (xs_in - xs_out) ** 2 + distance_matrix_scaled = distance_matrix / length_scale**2 + kernel_matrix = torch.exp(-distance_matrix_scaled) + return kernel_matrix + + +def diffusion_kernel(T, length_scale): + assert length_scale < 0.5, ( + "length_scale has to be smaller than 0.5 for the " + "kernel matrix to be diagonally dominant" + ) + sigmas = torch.ones(T, T) * length_scale + sigmas_tridiag = torch.diagonal(sigmas, offset=0, dim1=-2, dim2=-1) + sigmas_tridiag += torch.diagonal(sigmas, offset=1, dim1=-2, dim2=-1) + sigmas_tridiag += torch.diagonal(sigmas, offset=-1, dim1=-2, dim2=-1) + kernel_matrix = sigmas_tridiag + torch.eye(T) * (1.0 - length_scale) + return kernel_matrix + + +def matern_kernel(T, length_scale): + xs = torch.arange(T).float() + xs_in = torch.unsqueeze(xs, 0) + xs_out = torch.unsqueeze(xs, 1) + distance_matrix = torch.abs(xs_in - xs_out) + distance_matrix_scaled = distance_matrix / torch.sqrt(length_scale).type( + torch.float32 + ) + kernel_matrix = torch.exp(-distance_matrix_scaled) + return kernel_matrix + + +def cauchy_kernel(T, sigma, length_scale): + xs = torch.arange(T).float() + xs_in = torch.unsqueeze(xs, 0) + xs_out = torch.unsqueeze(xs, 1) + distance_matrix = (xs_in - xs_out) ** 2 + distance_matrix_scaled = distance_matrix / length_scale**2 + kernel_matrix = sigma / (distance_matrix_scaled + 1.0) + + alpha = 0.001 + eye = torch.eye(kernel_matrix.shape[-1]) + return kernel_matrix + alpha * eye + + +def make_nn(input_size, output_size, hidden_sizes): + """This function used to creates fully connected neural network. 
+ + Parameters + ---------- + input_size : int, + the dimension of input embeddings + + output_size : int, + the dimension of out embeddings + + hidden_sizes : tuple, + the tuple of hidden layer sizes, and the tuple length sets the number of hidden layers + + Returns + ------- + output: tensor + the processing embeddings + """ + layers = [] + for i in range(len(hidden_sizes)): + if i == 0: + layers.append( + nn.Linear(in_features=input_size, out_features=hidden_sizes[i]) + ) + else: + layers.append( + nn.Linear(in_features=hidden_sizes[i - 1], out_features=hidden_sizes[i]) + ) + layers.append(nn.ReLU()) + layers.append(nn.Linear(in_features=hidden_sizes[-1], out_features=output_size)) + return nn.Sequential(*layers) + + +class CustomConv1d(torch.nn.Conv1d): + def __init(self, in_channels, out_channels, kernel_size, padding): + super().__init__(in_channels, out_channels, kernel_size, padding) + + def forward(self, x): + if len(x.shape) > 2: + shape = list(np.arange(len(x.shape))) + new_shape = [0, shape[-1]] + shape[1:-1] + out = super(CustomConv1d, self).forward(x.permute(*new_shape)) + shape = list(np.arange(len(out.shape))) + new_shape = [0, shape[-1]] + shape[1:-1] + if self.kernel_size[0] % 2 == 0: + out = F.pad(out, (0, -1), "constant", 0) + return out.permute(new_shape) + + return super(CustomConv1d, self).forward(x) + + +def make_cnn(input_size, output_size, hidden_sizes, kernel_size=3): + """This function used to construct neural network consisting of + one 1d-convolutional layer that utilizes temporal dependencies, + fully connected network + + Parameters + ---------- + input_size : int, + the dimension of input embeddings + + output_size : int, + the dimension of out embeddings + + hidden_sizes : tuple, + the tuple of hidden layer sizes, and the tuple length sets the number of hidden layers, + + kernel_size : int + kernel size for convolutional layer + + Returns + ------- + output: tensor + the processing embeddings + """ + padding = kernel_size // 2 + + cnn_layer = CustomConv1d( + input_size, hidden_sizes[0], kernel_size=kernel_size, padding=padding + ) + layers = [cnn_layer] + + for i, h in zip(hidden_sizes, hidden_sizes[1:]): + layers.extend([nn.Linear(i, h), nn.ReLU()]) + if isinstance(output_size, tuple): + net = nn.Sequential(*layers) + return [net] + [nn.Linear(hidden_sizes[-1], o) for o in output_size] + + layers.append(nn.Linear(hidden_sizes[-1], output_size)) + return nn.Sequential(*layers) + + +class Encoder(nn.Module): + def __init__(self, input_size, z_size, hidden_sizes=(128, 128), window_size=24): + """This module is an encoder with 1d-convolutional network and multivariate Normal posterior used by GP-VAE with + proposed banded covariance matrix + + Parameters + ---------- + input_size : int, + the feature dimension of the input + + z_size : int, + the feature dimension of the output latent embedding + + hidden_sizes : tuple, + the tuple of the hidden layer sizes, and the tuple length sets the number of hidden layers + + window_size : int + the kernel size for the Conv1D layer + """ + super().__init__() + self.z_size = int(z_size) + self.input_size = input_size + self.net, self.mu_layer, self.logvar_layer = make_cnn( + input_size, (z_size, z_size * 2), hidden_sizes, window_size + ) + + def forward(self, x): + mapped = self.net(x) + batch_size = mapped.size(0) + time_length = mapped.size(1) + + num_dim = len(mapped.shape) + mu = self.mu_layer(mapped) + logvar = self.logvar_layer(mapped) + mapped_mean = torch.transpose(mu, num_dim - 1, num_dim - 2) + mapped_covar = 
torch.transpose(logvar, num_dim - 1, num_dim - 2) + mapped_covar = torch.sigmoid(mapped_covar) + mapped_reshaped = mapped_covar.reshape(batch_size, self.z_size, 2 * time_length) + + dense_shape = [batch_size, self.z_size, time_length, time_length] + idxs_1 = np.repeat(np.arange(batch_size), self.z_size * (2 * time_length - 1)) + idxs_2 = np.tile( + np.repeat(np.arange(self.z_size), (2 * time_length - 1)), batch_size + ) + idxs_3 = np.tile( + np.concatenate([np.arange(time_length), np.arange(time_length - 1)]), + batch_size * self.z_size, + ) + idxs_4 = np.tile( + np.concatenate([np.arange(time_length), np.arange(1, time_length)]), + batch_size * self.z_size, + ) + idxs_all = np.stack([idxs_1, idxs_2, idxs_3, idxs_4], axis=1) + + mapped_values = mapped_reshaped[:, :, :-1].reshape(-1) + prec_sparse = torch.sparse_coo_tensor( + torch.LongTensor(idxs_all).t().to(mapped.device), + (mapped_values).to(mapped.device), + (dense_shape), + ) + prec_sparse = prec_sparse.coalesce() + prec_tril = prec_sparse.to_dense() + eye = ( + torch.eye(prec_tril.shape[-1]) + .unsqueeze(0) + .repeat(prec_tril.shape[0], prec_tril.shape[1], 1, 1) + .to(mapped.device) + ) + prec_tril = prec_tril + eye + cov_tril = torch.linalg.solve_triangular(prec_tril, eye, upper=True) + cov_tril = torch.where( + torch.isfinite(cov_tril), cov_tril, torch.zeros_like(cov_tril) + ).to(mapped.device) + + num_dim = len(cov_tril.shape) + cov_tril_lower = torch.transpose(cov_tril, num_dim - 1, num_dim - 2) + + z_dist = torch.distributions.MultivariateNormal( + loc=mapped_mean, scale_tril=cov_tril_lower + ) + return z_dist + + +class Decoder(nn.Module): + def __init__(self, input_size, output_size, hidden_sizes=(256, 256)): + """This module is a decoder with Gaussian output distribution. + + Parameters + ---------- + output_size : int, + the feature dimension of the output + + hidden_sizes: tuple + the tuple of hidden layer sizes, and the tuple length sets the number of hidden layers. + """ + super().__init__() + self.net = make_nn(input_size, output_size, hidden_sizes) + + def forward(self, x): + mu = self.net(x) + var = torch.ones_like(mu) + return torch.distributions.Normal(mu, var) diff --git a/pypots/imputation/mrnn/module.py b/pypots/imputation/mrnn/module.py index 873d2d73..a143d121 100644 --- a/pypots/imputation/mrnn/module.py +++ b/pypots/imputation/mrnn/module.py @@ -18,7 +18,7 @@ class FCN_Regression(nn.Module): def __init__(self, feature_num, rnn_hid_size): - super(FCN_Regression, self).__init__() + super().__init__() self.feat_reg = FeatureRegression(rnn_hid_size * 2) self.U = Parameter(torch.Tensor(feature_num, feature_num)) self.V1 = Parameter(torch.Tensor(feature_num, feature_num)) diff --git a/pypots/imputation/saits/data.py b/pypots/imputation/saits/data.py index 2fb80bc3..5ff679a5 100644 --- a/pypots/imputation/saits/data.py +++ b/pypots/imputation/saits/data.py @@ -88,15 +88,15 @@ def _fetch_data_from_array(self, idx: int) -> Iterable: indicating_mask : tensor. The mask indicates artificially missing values in X. 
""" - X = self.X[idx] + X = self.X[idx].to(torch.float32) X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate) sample = [ torch.tensor(idx), - X_intact.to(torch.float32), - X.to(torch.float32), - missing_mask.to(torch.float32), - indicating_mask.to(torch.float32), + X_intact, + X, + missing_mask, + indicating_mask, ] if self.y is not None and self.return_labels: @@ -122,15 +122,15 @@ def _fetch_data_from_file(self, idx: int) -> Iterable: if self.file_handle is None: self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handle["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32) X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate) sample = [ torch.tensor(idx), - X_intact.to(torch.float32), - X.to(torch.float32), - missing_mask.to(torch.float32), - indicating_mask.to(torch.float32), + X_intact, + X, + missing_mask, + indicating_mask, ] # if the dataset has labels and is for training, then fetch it from the file diff --git a/pypots/imputation/template/dataset.py b/pypots/imputation/template/data.py similarity index 100% rename from pypots/imputation/template/dataset.py rename to pypots/imputation/template/data.py diff --git a/pypots/imputation/usgan/__init__.py b/pypots/imputation/usgan/__init__.py new file mode 100644 index 00000000..fb388d94 --- /dev/null +++ b/pypots/imputation/usgan/__init__.py @@ -0,0 +1,12 @@ +""" +The package of the partially-observed time-series imputation method USGAN. +""" + +# Created by Jun Wang +# License: GLP-v3 + +from .model import USGAN + +__all__ = [ + "USGAN", +] diff --git a/pypots/imputation/usgan/data.py b/pypots/imputation/usgan/data.py new file mode 100644 index 00000000..bd012c30 --- /dev/null +++ b/pypots/imputation/usgan/data.py @@ -0,0 +1,46 @@ +""" +Dataset class for model USGAN. +""" + +# Created by Jun Wang and Wenjie Du +# License: GLP-v3 + +from typing import Union + +from ..brits.data import DatasetForBRITS + + +class DatasetForUSGAN(DatasetForBRITS): + """Dataset class for USGAN, the same with the one for BRITS. + + Parameters + ---------- + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + return_labels : bool, default = True, + Whether to return labels in function __getitem__() if they exist in the given data. If `True`, for example, + during training of classification models, the Dataset class will return labels in __getitem__() for model input. + Otherwise, labels won't be included in the data returned by __getitem__(). This parameter exists because we + need the defined Dataset class for all training/validating/testing stages. For those big datasets stored in h5 + files, they already have both X and y saved. But we don't read labels from the file for validating and testing + with function _fetch_data_from_file(), which works for all three stages. Therefore, we need this parameter for + distinction. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. 
+ """ + + def __init__( + self, + data: Union[dict, str], + return_labels: bool = True, + file_type: str = "h5py", + ): + super().__init__(data, return_labels, file_type) diff --git a/pypots/imputation/usgan/model.py b/pypots/imputation/usgan/model.py new file mode 100644 index 00000000..c171d810 --- /dev/null +++ b/pypots/imputation/usgan/model.py @@ -0,0 +1,539 @@ +""" +The implementation of USGAN for the partially-observed time-series imputation task. + +Refer to the paper "Miao, X., Wu, Y., Wang, J., Gao, Y., Mao, X., & Yin, J. (2021). +Generative Semi-supervised Learning for Multivariate Time Series Imputation. AAAI 2021." + +""" + +# Created by Jun Wang and Wenjie Du +# License: GPL-v3 + +from typing import Union, Optional + +import h5py +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from .data import DatasetForUSGAN +from ..base import BaseNNImputer +from ..brits.model import _BRITS +from ...optim.adam import Adam +from ...optim.base import Optimizer +from ...utils.logging import logger + + +class Discriminator(nn.Module): + """model Discriminator: built on BiRNN + + Parameters + ---------- + n_features : + the feature dimension of the input + + rnn_hidden_size : + the hidden size of the RNN cell + + hint_rate : + the hint rate for the input imputed_data + + dropout_rate : + the dropout rate for the output layer + + device : + specify running the model on which device, CPU/GPU + + """ + + def __init__( + self, + n_features: int, + rnn_hidden_size: int, + hint_rate: float = 0.7, + dropout_rate: float = 0.0, + device: Union[str, torch.device] = "cpu", + ): + super().__init__() + self.hint_rate = hint_rate + self.device = device + self.biRNN = nn.GRU( + n_features * 2, rnn_hidden_size, bidirectional=True, batch_first=True + ).to(device) + self.dropout = nn.Dropout(dropout_rate).to(device) + self.read_out = nn.Linear(rnn_hidden_size * 2, n_features).to(device) + + def forward( + self, + imputed_X: torch.Tensor, + missing_mask: torch.Tensor, + ) -> torch.Tensor: + """Forward processing of USGAN Discriminator. + + Parameters + ---------- + imputed_X : torch.Tensor, + The original X with missing parts already imputed. + + missing_mask : torch.Tensor, + The missing mask of X. + + Returns + ------- + logits : torch.Tensor, + the logits of the probability of being the true value. + + """ + + hint = ( + torch.rand_like(missing_mask, dtype=torch.float, device=self.device) + < self.hint_rate + ) + hint = hint.int() + h = hint * missing_mask + (1 - hint) * 0.5 + x_in = torch.cat([imputed_X, h], dim=-1) + + out, _ = self.biRNN(x_in) + logits = self.read_out(self.dropout(out)) + return logits + + +class _USGAN(nn.Module): + """model USGAN: + USGAN consists of a generator, a discriminator, which are all built on bidirectional recurrent neural networks. 
+ + Attributes + ---------- + n_steps : + sequence length (number of time steps) + + n_features : + number of features (input dimensions) + + rnn_hidden_size : + the hidden size of the RNN cell + + lambda_mse : + the weigth of the reconstruction loss + + hint_rate : + the hint rate for the discriminator + + dropout_rate : + the dropout rate for the last layer in Discriminator + + device : + specify running the model on which device, CPU/GPU + + """ + + def __init__( + self, + n_steps: int, + n_features: int, + rnn_hidden_size: int, + lambda_mse: float, + hint_rate: float = 0.7, + dropout_rate: float = 0.0, + device: Union[str, torch.device] = "cpu", + ): + super().__init__() + self.generator = _BRITS(n_steps, n_features, rnn_hidden_size, device) + self.discriminator = Discriminator( + n_features, + rnn_hidden_size, + hint_rate=hint_rate, + dropout_rate=dropout_rate, + device=device, + ) + + self.lambda_mse = lambda_mse + self.device = device + + def forward( + self, + inputs: dict, + training_object: str = "generator", + training: bool = True, + ) -> dict: + assert training_object in [ + "generator", + "discriminator", + ], 'training_object should be "generator" or "discriminator"' + + forward_X = inputs["forward"]["X"] + forward_missing_mask = inputs["forward"]["missing_mask"] + losses = {} + results = self.generator(inputs, training=training) + inputs["discrimination"] = self.discriminator(forward_X, forward_missing_mask) + if not training: + # if only run imputation operation, then no need to calculate loss + return results + + if training_object == "discriminator": + l_D = F.binary_cross_entropy_with_logits( + inputs["discrimination"], forward_missing_mask + ) + losses["discrimination_loss"] = l_D + else: + inputs["discrimination"] = inputs["discrimination"].detach() + l_G = F.binary_cross_entropy_with_logits( + inputs["discrimination"], + 1 - forward_missing_mask, + weight=1 - forward_missing_mask, + ) + loss_gene = l_G + self.lambda_mse * results["loss"] + losses["generation_loss"] = loss_gene + + losses["imputed_data"] = results["imputed_data"] + return losses + + +class USGAN(BaseNNImputer): + """The PyTorch implementation of the CRLI model :cite:`ma2021CRLI`. + + Parameters + ---------- + n_steps : + The number of time steps in the time-series data sample. + + n_features : + The number of features in the time-series data sample. + + rnn_hidden_size : + the hidden size of the RNN cell + + lambda_mse : + the weight of the reconstruction loss + + hint_rate : + the hint rate for the discriminator + + dropout_rate : + the dropout rate for the last layer in Discriminator + + G_steps : + The number of steps to train the generator in each iteration. + + D_steps : + The number of steps to train the discriminator in each iteration. + + batch_size : + The batch size for training and evaluating the model. + + epochs : + The number of epochs for training the model. + + patience : + The patience for the early-stopping mechanism. Given a positive integer, the training process will be + stopped when the model does not perform better after that number of epochs. + Leaving it default as None will disable the early-stopping. + + G_optimizer : + The optimizer for the generator training. + If not given, will use a default Adam optimizer. + + D_optimizer : + The optimizer for the discriminator training. + If not given, will use a default Adam optimizer. + + num_workers : + The number of subprocesses to use for data loading. + `0` means data loading will be in the main process, i.e. 
+
+
+class USGAN(BaseNNImputer):
+    """The PyTorch implementation of the USGAN model (Miao et al., 2021, AAAI).
+
+    Parameters
+    ----------
+    n_steps :
+        The number of time steps in the time-series data sample.
+
+    n_features :
+        The number of features in the time-series data sample.
+
+    rnn_hidden_size :
+        the hidden size of the RNN cell
+
+    lambda_mse :
+        the weight of the reconstruction loss
+
+    hint_rate :
+        the hint rate for the discriminator
+
+    dropout_rate :
+        the dropout rate for the last layer in Discriminator
+
+    G_steps :
+        The period of the generator updates, i.e. the generator is trained once every `G_steps` batches.
+
+    D_steps :
+        The period of the discriminator updates, i.e. the discriminator is trained once every `D_steps` batches.
+
+    batch_size :
+        The batch size for training and evaluating the model.
+
+    epochs :
+        The number of epochs for training the model.
+
+    patience :
+        The patience for the early-stopping mechanism. Given a positive integer, the training process will be
+        stopped when the model does not perform better after that number of epochs.
+        Leaving it default as None will disable the early-stopping.
+
+    G_optimizer :
+        The optimizer for the generator training.
+        If not given, will use a default Adam optimizer.
+
+    D_optimizer :
+        The optimizer for the discriminator training.
+        If not given, will use a default Adam optimizer.
+
+    num_workers :
+        The number of subprocesses to use for data loading.
+        `0` means data loading will be in the main process, i.e. there won't be subprocesses.
+
+    device :
+        The device for the model to run on. It can be a string, a :class:`torch.device` object, or a list of them.
+        If not given, will try to use CUDA devices first (will use the default CUDA device if there are multiple),
+        then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
+        If given a list of devices, e.g. ['cuda:0', 'cuda:1'] or [torch.device('cuda:0'), torch.device('cuda:1')],
+        the model will be trained in parallel on the multiple devices (so far parallel training is only supported
+        on CUDA devices). Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
+
+    saving_path :
+        The path for automatically saving model checkpoints and tensorboard files (i.e. loss values recorded during
+        training are saved into a tensorboard file). Will not save if not given.
+
+    model_saving_strategy :
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better"].
+        No model will be saved when it is set as None.
+        The "best" strategy will only automatically save the best model after training is finished.
+        The "better" strategy will automatically save the model during training whenever the model performs
+        better than in previous epochs.
+
+    Attributes
+    ----------
+    model : :class:`torch.nn.Module`
+        The underlying USGAN model.
+
+    G_optimizer : :class:`pypots.optim.Optimizer`
+        The optimizer for the generator training.
+
+    D_optimizer : :class:`pypots.optim.Optimizer`
+        The optimizer for the discriminator training.
+
+    """
+
+    def __init__(
+        self,
+        n_steps: int,
+        n_features: int,
+        rnn_hidden_size: int,
+        lambda_mse: float = 1,
+        hint_rate: float = 0.7,
+        dropout_rate: float = 0.0,
+        G_steps: int = 1,
+        D_steps: int = 1,
+        batch_size: int = 32,
+        epochs: int = 100,
+        patience: Optional[int] = None,
+        G_optimizer: Optional[Optimizer] = Adam(),
+        D_optimizer: Optional[Optimizer] = Adam(),
+        num_workers: int = 0,
+        device: Optional[Union[str, torch.device, list]] = None,
+        saving_path: Optional[str] = None,
+        model_saving_strategy: Optional[str] = "best",
+    ):
+        super().__init__(
+            batch_size,
+            epochs,
+            patience,
+            num_workers,
+            device,
+            saving_path,
+            model_saving_strategy,
+        )
+        assert (
+            G_steps > 0 and D_steps > 0
+        ), "G_steps and D_steps should both be greater than 0"
+
+        self.n_steps = n_steps
+        self.n_features = n_features
+        self.G_steps = G_steps
+        self.D_steps = D_steps
+
+        # set up the model
+        self.model = _USGAN(
+            n_steps,
+            n_features,
+            rnn_hidden_size,
+            lambda_mse,
+            hint_rate,
+            dropout_rate,
+            self.device,
+        )
+        self._send_model_to_given_device()
+        self._print_model_size()
+
+        # set up the optimizer
+        self.G_optimizer = G_optimizer
+        self.G_optimizer.init_optimizer(self.model.generator.parameters())
+        self.D_optimizer = D_optimizer
+        self.D_optimizer.init_optimizer(self.model.discriminator.parameters())
+
+    def _assemble_input_for_training(self, data: list) -> dict:
+        # fetch data
+        (
+            indices,
+            X,
+            missing_mask,
+            deltas,
+            back_X,
+            back_missing_mask,
+            back_deltas,
+        ) = self._send_data_to_given_device(data)
+
+        # assemble input data
+        inputs = {
+            "indices": indices,
+            "forward": {
+                "X": X,
+                "missing_mask": missing_mask,
+                "deltas": deltas,
+            },
+            "backward": {
+                "X": back_X,
+                "missing_mask": back_missing_mask,
+                "deltas": back_deltas,
+            },
+        }
+
+        return inputs
+
+    def _assemble_input_for_validating(self, data: list) -> dict:
+        return self._assemble_input_for_training(data)
+
+    def _assemble_input_for_testing(self, data: list) -> dict:
+        return self._assemble_input_for_validating(data)
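For context, a hedged instantiation sketch following the docstring above (all hyperparameter values and paths here are illustrative, not recommendations):

```python
from pypots.imputation import USGAN

model = USGAN(
    n_steps=48,
    n_features=37,
    rnn_hidden_size=256,
    lambda_mse=1,
    device=["cuda:0", "cuda:1"],  # a list enables parallel training on multiple CUDA devices
    saving_path="runs/usgan",     # hypothetical path; checkpoints and tensorboard files land here
    model_saving_strategy="better",
)
```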
+
+    def _train_model(
+        self,
+        training_loader: DataLoader,
+        val_loader: Optional[DataLoader] = None,
+    ) -> None:
+        # each training starts from the very beginning, so reset the loss and model dict here
+        self.best_loss = float("inf")
+        self.best_model_dict = None
+
+        try:
+            training_step = 0
+            for epoch in range(self.epochs):
+                self.model.train()
+                epoch_train_loss_G_collector = []
+                epoch_train_loss_D_collector = []
+                for idx, data in enumerate(training_loader):
+                    training_step += 1
+                    inputs = self._assemble_input_for_training(data)
+
+                    step_train_loss_G, step_train_loss_D = None, None
+
+                    if idx % self.G_steps == 0:
+                        self.G_optimizer.zero_grad()
+                        results = self.model.forward(
+                            inputs, training_object="generator"
+                        )
+                        results["generation_loss"].backward()
+                        self.G_optimizer.step()
+                        step_train_loss_G = results["generation_loss"].item()
+                        epoch_train_loss_G_collector.append(step_train_loss_G)
+
+                    if idx % self.D_steps == 0:
+                        self.D_optimizer.zero_grad()
+                        results = self.model.forward(
+                            inputs, training_object="discriminator"
+                        )
+                        results["discrimination_loss"].backward(retain_graph=True)
+                        self.D_optimizer.step()
+                        step_train_loss_D = results["discrimination_loss"].item()
+                        epoch_train_loss_D_collector.append(step_train_loss_D)
+
+                    # save training loss logs into the tensorboard file for every step if needed
+                    # Note: `training_step` is not the actual number of times the discriminator and the generator
+                    # get trained; the actual numbers are training_step/D_steps and training_step/G_steps
+                    # respectively
+                    if self.summary_writer is not None:
+                        loss_results = {}
+                        if step_train_loss_G is not None:
+                            loss_results["generation_loss"] = step_train_loss_G
+                        if step_train_loss_D is not None:
+                            loss_results["discrimination_loss"] = step_train_loss_D
+                        self._save_log_into_tb_file(
+                            training_step, "training", loss_results
+                        )
+                mean_epoch_train_D_loss = np.mean(epoch_train_loss_D_collector)
+                mean_epoch_train_G_loss = np.mean(epoch_train_loss_G_collector)
+                logger.info(
+                    f"epoch {epoch}: "
+                    f"training loss_generator {mean_epoch_train_G_loss:.4f}, "
+                    f"training loss_discriminator {mean_epoch_train_D_loss:.4f}"
+                )
+                mean_loss = mean_epoch_train_G_loss
+
+                if mean_loss < self.best_loss:
+                    self.best_loss = mean_loss
+                    self.best_model_dict = self.model.state_dict()
+                    self.patience = self.original_patience
+                    # save the model if necessary
+                    self._auto_save_model_if_necessary(
+                        training_finished=False,
+                        saving_name=f"{self.__class__.__name__}_epoch{epoch}_loss{mean_loss}",
+                    )
+                else:
+                    self.patience -= 1
+                    if self.patience == 0:
+                        logger.info(
+                            "Exceeded the training patience. Terminating the training procedure..."
+                        )
+                        break
+        except Exception as e:
+            logger.error(f"Exception: {e}")
+            if self.best_model_dict is None:
+                raise RuntimeError(
+                    "Training got interrupted. Model was not trained. Please investigate the error printed above."
+                )
+            else:
+                logger.warning(
+                    "Training got interrupted. Please investigate the error printed above.\n"
+                    "Model got trained and will load the best checkpoint so far for testing.\n"
+                    "If you don't want it, please try fit() again."
+                )
+
+        if np.equal(self.best_loss, float("inf")):
+            raise ValueError("Something is wrong. best_loss is still inf after training.")
+
+        logger.info("Finished training.")
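The `idx % self.G_steps == 0` / `idx % self.D_steps == 0` checks above implement a simple alternating schedule: a larger `G_steps` (or `D_steps`) means the generator (or discriminator) is updated on fewer batches. A toy trace, assuming illustrative values `G_steps=1, D_steps=2`:

```python
G_steps, D_steps = 1, 2  # illustrative values
for idx in range(6):
    updates = []
    if idx % G_steps == 0:
        updates.append("G")
    if idx % D_steps == 0:
        updates.append("D")
    print(idx, "+".join(updates))
# 0 G+D, 1 G, 2 G+D, 3 G, 4 G+D, 5 G  -> the discriminator is updated every other batch
```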
+
+    def fit(
+        self,
+        train_set: Union[dict, str],
+        val_set: Optional[Union[dict, str]] = None,
+        file_type: str = "h5py",
+    ) -> None:
+        # Step 1: wrap the input data with classes Dataset and DataLoader
+        training_set = DatasetForUSGAN(
+            train_set, return_labels=False, file_type=file_type
+        )
+        training_loader = DataLoader(
+            training_set,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
+        )
+        val_loader = None
+        if val_set is not None:
+            if isinstance(val_set, str):
+                with h5py.File(val_set, "r") as hf:
+                    # Here we read the whole validation set from the file to mask a portion for validation.
+                    # In PyPOTS, a file is usually used because the data is too big. However, the validation set
+                    # generally shouldn't be too large. For example, say we have 1 billion samples for model
+                    # training. We won't take 20% of them as the validation set, because we want as much data as
+                    # possible for the training stage to enhance the model's generalization ability. Therefore,
+                    # 100,000 representative samples will be enough to validate the model.
+                    val_set = {
+                        "X": hf["X"][:],
+                        "X_intact": hf["X_intact"][:],
+                        "indicating_mask": hf["indicating_mask"][:],
+                    }
+            val_set = DatasetForUSGAN(val_set, return_labels=False, file_type=file_type)
+            val_loader = DataLoader(
+                val_set,
+                batch_size=self.batch_size,
+                shuffle=False,
+                num_workers=self.num_workers,
+            )
+
+        # Step 2: train the model and freeze it
+        self._train_model(training_loader, val_loader)
+        self.model.load_state_dict(self.best_model_dict)
+        self.model.eval()  # set the model as eval status to freeze it.
+
+        # Step 3: save the model if necessary
+        self._auto_save_model_if_necessary(training_finished=True)
+
+    def impute(
+        self,
+        X: Union[dict, str],
+        file_type="h5py",
+    ) -> np.ndarray:
+        self.model.eval()  # set the model as eval status to freeze it.
+        test_set = DatasetForUSGAN(X, return_labels=False, file_type=file_type)
+        test_loader = DataLoader(
+            test_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+        )
+        imputation_collector = []
+
+        with torch.no_grad():
+            for idx, data in enumerate(test_loader):
+                inputs = self._assemble_input_for_testing(data)
+                results = self.model.forward(inputs, training=False)
+                imputed_data = results["imputed_data"]
+                imputation_collector.append(imputed_data)
+
+        imputation_collector = torch.cat(imputation_collector)
+        return imputation_collector.cpu().detach().numpy()
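Putting the two public methods together, a minimal end-to-end sketch of the new class (toy random data; shapes and hyperparameters are placeholders):

```python
import numpy as np
from pypots.imputation import USGAN

# toy dataset: 100 samples, 48 time steps, 37 features, with NaNs marking missing values
X = np.random.randn(100, 48, 37)
X[X < -1.5] = np.nan

usgan = USGAN(n_steps=48, n_features=37, rnn_hidden_size=256, epochs=5)
usgan.fit(train_set={"X": X})     # val_set is optional
imputed = usgan.impute({"X": X})  # ndarray of shape (100, 48, 37) with NaNs filled in
```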
""" - silhouette = metrics.silhouette_score(latent_rep, class_predictions) - return silhouette + silhouette_score = metrics.silhouette_score(X, predicted_labels) + return silhouette_score -def cal_chs( - latent_rep: np.ndarray, - class_predictions: np.ndarray -) -> float: +def cal_chs(X: np.ndarray, predicted_labels: np.ndarray) -> float: """Compute the Calinski and Harabasz score (also known as the Variance Ratio Criterion). - Parameters - ---------- - latent_rep : - Latent representation learned by a clusterer. - - class_predictions : - Clustering results returned by a clusterer. + X : array-like of shape (n_samples_a, n_features) + A feature array, or learned latent representation, that can be used for clustering. + predicted_labels : array-like of shape (n_samples) + Predicted labels for each sample. Returns ------- - chs : + calinski_harabasz_score : float The resulting Calinski-Harabasz score. """ - chs = metrics.calinski_harabasz_score(latent_rep, class_predictions) - return chs + calinski_harabasz_score = metrics.calinski_harabasz_score(X, predicted_labels) + return calinski_harabasz_score -def cal_dbs( - latent_rep: np.ndarray, - class_predictions: np.ndarray -) -> float: +def cal_dbs(X: np.ndarray, predicted_labels: np.ndarray) -> float: """Compute the Davies-Bouldin score. Parameters ---------- - latent_rep : - Latent representation learned by a clusterer. + X : array-like of shape (n_samples_a, n_features) + A feature array, or learned latent representation, that can be used for clustering. - class_predictions : - Clustering results returned by a clusterer. + predicted_labels : array-like of shape (n_samples) + Predicted labels for each sample. Returns ------- - dbs : + davies_bouldin_score : float The resulting Davies-Bouldin score. """ - dbs = metrics.davies_bouldin_score(latent_rep, class_predictions) - return dbs + davies_bouldin_score = metrics.davies_bouldin_score(X, predicted_labels) + return davies_bouldin_score + + +def cal_internal_cluster_validation_metrics(X, predicted_labels): + """Computer all internal cluster validation metrics available in PyPOTS and return as a dictionary. + + Parameters + ---------- + X : array-like of shape (n_samples_a, n_features) + A feature array, or learned latent representation, that can be used for clustering. + + predicted_labels : array-like of shape (n_samples) + Predicted labels for each sample. + + Returns + ------- + internal_cluster_validation_metrics : dict + A dictionary contains all internal cluster validation metrics available in PyPOTS. + """ + + silhouette_score = cal_silhouette(X, predicted_labels) + calinski_harabasz_score = cal_chs(X, predicted_labels) + davies_bouldin_score = cal_dbs(X, predicted_labels) + + internal_cluster_validation_metrics = { + "silhouette_score": silhouette_score, + "calinski_harabasz_score": calinski_harabasz_score, + "davies_bouldin_score": davies_bouldin_score, + } + return internal_cluster_validation_metrics diff --git a/tests/classification/__init__.py b/tests/classification/__init__.py new file mode 100644 index 00000000..f0b4685e --- /dev/null +++ b/tests/classification/__init__.py @@ -0,0 +1,6 @@ +""" + +""" + +# Created by Wenjie Du +# License: GLP-v3 diff --git a/tests/classification/brits.py b/tests/classification/brits.py new file mode 100644 index 00000000..b1905c39 --- /dev/null +++ b/tests/classification/brits.py @@ -0,0 +1,106 @@ +""" +Test cases for BRITS classification model. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os +import unittest + +import pytest + +from pypots.classification import BRITS +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_binary_classification_metrics +from tests.classification.config import ( + EPOCHS, + TRAIN_SET, + VAL_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_CLASSIFICATION, +) +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) + + +class TestBRITS(unittest.TestCase): + logger.info("Running tests for a classification model BRITS...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "BRITS") + model_save_name = "saved_BRITS_model.pypots" + + # initialize an Adam optimizer + optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a BRITS model + brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + n_classes=DATA["n_classes"], + rnn_hidden_size=256, + epochs=EPOCHS, + saving_path=saving_path, + model_saving_strategy="better", + optimizer=optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="classification-brits") + def test_0_fit(self): + self.brits.fit(TRAIN_SET, VAL_SET) + + @pytest.mark.xdist_group(name="classification-brits") + def test_1_classify(self): + predictions = self.brits.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) + logger.info( + f'ROC_AUC: {metrics["roc_auc"]}, \n' + f'PR_AUC: {metrics["pr_auc"]},\n' + f'F1: {metrics["f1"]},\n' + f'Precision: {metrics["precision"]},\n' + f'Recall: {metrics["recall"]},\n' + ) + assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + + @pytest.mark.xdist_group(name="classification-brits") + def test_2_parameters(self): + assert hasattr(self.brits, "model") and self.brits.model is not None + + assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None + + assert hasattr(self.brits, "best_loss") + self.assertNotEqual(self.brits.best_loss, float("inf")) + + assert ( + hasattr(self.brits, "best_model_dict") + and self.brits.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="classification-brits") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.brits) + + # save the trained model into file, and check if the path exists + self.brits.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.brits.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/classification/config.py b/tests/classification/config.py new file mode 100644 index 00000000..35b17029 --- /dev/null +++ b/tests/classification/config.py @@ -0,0 +1,21 @@ +""" +Test configs for classification models. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os + +from tests.global_test_config import ( + DATA, + RESULT_SAVING_DIR, +) + +EPOCHS = 5 + +TRAIN_SET = {"X": DATA["train_X"], "y": DATA["train_y"]} +VAL_SET = {"X": DATA["val_X"], "y": DATA["val_y"]} +TEST_SET = {"X": DATA["test_X"]} + +RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(RESULT_SAVING_DIR, "classification") diff --git a/tests/classification/grud.py b/tests/classification/grud.py new file mode 100644 index 00000000..a662cb70 --- /dev/null +++ b/tests/classification/grud.py @@ -0,0 +1,105 @@ +""" +Test cases for GRUD classification model. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os +import unittest + +import pytest + +from pypots.classification import GRUD +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_binary_classification_metrics +from tests.classification.config import ( + EPOCHS, + TRAIN_SET, + VAL_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_CLASSIFICATION, +) +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) + + +class TestGRUD(unittest.TestCase): + logger.info("Running tests for a classification model GRUD...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "GRUD") + model_save_name = "saved_GRUD_model.pypots" + + # initialize an Adam optimizer + optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a GRUD model + grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + n_classes=DATA["n_classes"], + rnn_hidden_size=256, + epochs=EPOCHS, + saving_path=saving_path, + optimizer=optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="classification-grud") + def test_0_fit(self): + self.grud.fit(TRAIN_SET, VAL_SET) + + @pytest.mark.xdist_group(name="classification-grud") + def test_1_classify(self): + predictions = self.grud.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) + logger.info( + f'ROC_AUC: {metrics["roc_auc"]}, \n' + f'PR_AUC: {metrics["pr_auc"]},\n' + f'F1: {metrics["f1"]},\n' + f'Precision: {metrics["precision"]},\n' + f'Recall: {metrics["recall"]},\n' + ) + assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + + @pytest.mark.xdist_group(name="classification-grud") + def test_2_parameters(self): + assert hasattr(self.grud, "model") and self.grud.model is not None + + assert hasattr(self.grud, "optimizer") and self.grud.optimizer is not None + + assert hasattr(self.grud, "best_loss") + self.assertNotEqual(self.grud.best_loss, float("inf")) + + assert ( + hasattr(self.grud, "best_model_dict") + and self.grud.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="classification-grud") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.grud) + + # save the trained model into file, and check if the path exists + self.grud.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.grud.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/classification/raindrop.py 
b/tests/classification/raindrop.py new file mode 100644 index 00000000..277164dc --- /dev/null +++ b/tests/classification/raindrop.py @@ -0,0 +1,110 @@ +""" +Test cases for Raindrop classification model. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os +import unittest + +import pytest + +from pypots.classification import Raindrop +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_binary_classification_metrics +from tests.classification.config import ( + EPOCHS, + TRAIN_SET, + VAL_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_CLASSIFICATION, +) +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) + + +class TestRaindrop(unittest.TestCase): + logger.info("Running tests for a classification model Raindrop...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "Raindrop") + model_save_name = "saved_Raindrop_model.pypots" + + # initialize a Raindrop model + raindrop = Raindrop( + DATA["n_steps"], + DATA["n_features"], + DATA["n_classes"], + n_layers=2, + d_model=DATA["n_features"] * 4, + d_inner=256, + n_heads=2, + dropout=0.3, + d_static=0, + aggregation="mean", + sensor_wise_mask=False, + static=False, + epochs=EPOCHS, + saving_path=saving_path, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="classification-raindrop") + def test_0_fit(self): + self.raindrop.fit(TRAIN_SET, VAL_SET) + + @pytest.mark.xdist_group(name="classification-raindrop") + def test_1_classify(self): + predictions = self.raindrop.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) + logger.info( + f'ROC_AUC: {metrics["roc_auc"]}, \n' + f'PR_AUC: {metrics["pr_auc"]},\n' + f'F1: {metrics["f1"]},\n' + f'Precision: {metrics["precision"]},\n' + f'Recall: {metrics["recall"]},\n' + ) + assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + + @pytest.mark.xdist_group(name="classification-raindrop") + def test_2_parameters(self): + assert hasattr(self.raindrop, "model") and self.raindrop.model is not None + + assert ( + hasattr(self.raindrop, "optimizer") and self.raindrop.optimizer is not None + ) + + assert hasattr(self.raindrop, "best_loss") + self.assertNotEqual(self.raindrop.best_loss, float("inf")) + + assert ( + hasattr(self.raindrop, "best_model_dict") + and self.raindrop.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="classification-raindrop") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.raindrop) + + # save the trained model into file, and check if the path exists + self.raindrop.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.raindrop.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cli/__init__.py b/tests/cli/__init__.py new file mode 100644 index 00000000..f0b4685e --- /dev/null +++ b/tests/cli/__init__.py @@ -0,0 +1,6 @@ +""" + +""" + +# Created by Wenjie Du +# License: GLP-v3 diff --git a/tests/cli/config.py b/tests/cli/config.py new file mode 100644 index 00000000..defdb211 --- /dev/null +++ 
b/tests/cli/config.py @@ -0,0 +1,11 @@ +""" +Test configs for CLI tools. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os + + +PROJECT_ROOT_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), "../../..")) diff --git a/tests/cli/dev.py b/tests/cli/dev.py new file mode 100644 index 00000000..4387be29 --- /dev/null +++ b/tests/cli/dev.py @@ -0,0 +1,92 @@ +""" +Test cases for the functions and classes in package `pypots.cli.dev`. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os +import threading +import unittest +from argparse import Namespace +from copy import copy + +import pytest + +from pypots.cli.dev import dev_command_factory +from tests.cli.config import PROJECT_ROOT_DIR + + +def callback_func(): + raise TimeoutError("Time out.") + + +def time_out(interval, callback): + def decorator(func): + def wrapper(*args, **kwargs): + t = threading.Thread(target=func, args=args, kwargs=kwargs) + t.setDaemon(True) + t.start() + t.join(interval) # wait for interval seconds + if t.is_alive(): + return threading.Timer(0, callback).start() # invoke callback() + else: + return + + return wrapper + + return decorator + + +@pytest.mark.xfail(reason="Allow tests for CLI to fail") +class TestPyPOTSCLIDev(unittest.TestCase): + # set up the default arguments + default_arguments = { + "build": False, + "cleanup": False, + "run_tests": False, + "k": None, + "show_coverage": False, + "lint_code": False, + } + # `pypots-cli dev` must run under the project root dir + os.chdir(PROJECT_ROOT_DIR) + + @pytest.mark.xdist_group(name="cli-dev") + def test_0_build(self): + arguments = copy(self.default_arguments) + arguments["build"] = True + args = Namespace(**arguments) + dev_command_factory(args).run() + + @pytest.mark.xdist_group(name="cli-dev") + def test_1_run_tests(self): + arguments = copy(self.default_arguments) + arguments["run_tests"] = True + arguments["k"] = "try_to_find_a_non_existing_test_case" + args = Namespace(**arguments) + try: + dev_command_factory(args).run() + except RuntimeError: # try to find a non-existing test case, so RuntimeError will be raised + pass + except Exception as e: # other exceptions will cause an error and result in failed testing + raise e + + # Don't test --lint-code because Black will reformat the code and cause error when generating the coverage report + # @pytest.mark.xdist_group(name="cli-dev") + # def test_2_lint_code(self): + # arguments = copy(self.default_arguments) + # arguments["lint_code"] = True + # args = Namespace(**arguments) + # dev_command_factory(args).run() + + @pytest.mark.xdist_group(name="cli-dev") + def test_3_cleanup(self): + arguments = copy(self.default_arguments) + arguments["cleanup"] = True + args = Namespace(**arguments) + dev_command_factory(args).run() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cli/doc.py b/tests/cli/doc.py new file mode 100644 index 00000000..85e4e190 --- /dev/null +++ b/tests/cli/doc.py @@ -0,0 +1,104 @@ +""" +Test cases for the functions and classes in package `pypots.cli.doc`. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os +import threading +import unittest +from argparse import Namespace +from copy import copy + +import pytest + +from pypots.cli.doc import doc_command_factory +from pypots.utils.logging import logger +from tests.cli.config import PROJECT_ROOT_DIR + + +def callback_func(): + raise TimeoutError("Time out.") + + +def time_out(interval, callback): + def decorator(func): + def wrapper(*args, **kwargs): + t = threading.Thread(target=func, args=args, kwargs=kwargs) + t.setDaemon(True) + t.start() + t.join(interval) # wait for interval seconds + if t.is_alive(): + return threading.Timer(0, callback).start() # invoke callback() + else: + return + + return wrapper + + return decorator + + +@pytest.mark.xfail(reason="Allow tests for CLI to fail") +class TestPyPOTSCLIDoc(unittest.TestCase): + # set up the default arguments + default_arguments = { + "gene_rst": False, + "branch": "main", + "gene_html": False, + "view_doc": False, + "port": 9075, + "cleanup": False, + } + # `pypots-cli doc` must run under the project root dir + os.chdir(PROJECT_ROOT_DIR) + + @pytest.mark.xdist_group(name="cli-doc") + def test_0_gene_rst(self): + arguments = copy(self.default_arguments) + arguments["gene_rst"] = True + args = Namespace(**arguments) + doc_command_factory(args).run() + + logger.info("run again under a non-root dir") + try: + os.chdir(os.path.abspath(os.path.join(PROJECT_ROOT_DIR, "pypots"))) + doc_command_factory(args).run() + except RuntimeError: # try to run under a non-root dir, so RuntimeError will be raised + pass + except Exception as e: # other exceptions will cause an error and result in failed testing + raise e + finally: + os.chdir(PROJECT_ROOT_DIR) + + @pytest.mark.xdist_group(name="cli-doc") + def test_1_gene_html(self): + arguments = copy(self.default_arguments) + arguments["gene_html"] = True + args = Namespace(**arguments) + try: + doc_command_factory(args).run() + except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below + logger.error(e) + + @pytest.mark.xdist_group(name="cli-doc") + @time_out(2, callback_func) # wait for two seconds + def test_2_view_doc(self): + arguments = copy(self.default_arguments) + arguments["view_doc"] = True + args = Namespace(**arguments) + try: + doc_command_factory(args).run() + except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below + logger.error(e) + + @pytest.mark.xdist_group(name="cli-doc") + def test_3_cleanup(self): + arguments = copy(self.default_arguments) + arguments["cleanup"] = True + args = Namespace(**arguments) + doc_command_factory(args).run() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cli/env.py b/tests/cli/env.py new file mode 100644 index 00000000..36b5b20e --- /dev/null +++ b/tests/cli/env.py @@ -0,0 +1,49 @@ +""" +Test cases for the functions and classes in package `pypots.cli.env`. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os +import unittest +from argparse import Namespace +from copy import copy + +import pytest + +from pypots.cli.env import env_command_factory +from pypots.utils.logging import logger +from tests.cli.config import PROJECT_ROOT_DIR + + +@pytest.mark.xfail(reason="Allow tests for CLI to fail") +class TestPyPOTSCLIEnv(unittest.TestCase): + # set up the default arguments + default_arguments = { + "install": "optional", + "tool": "conda", + } + + # `pypots-cli env` must run under the project root dir + os.chdir(PROJECT_ROOT_DIR) + + @pytest.mark.xdist_group(name="cli-env") + def test_0_install_with_conda(self): + arguments = copy(self.default_arguments) + arguments["tool"] = "conda" + args = Namespace(**arguments) + try: + env_command_factory(args).run() + except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below + logger.error(e) + + @pytest.mark.xdist_group(name="cli-env") + def test_1_install_with_pip(self): + arguments = copy(self.default_arguments) + arguments["tool"] = "pip" + args = Namespace(**arguments) + try: + env_command_factory(args).run() + except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below + logger.error(e) diff --git a/tests/clustering/__init__.py b/tests/clustering/__init__.py new file mode 100644 index 00000000..f0b4685e --- /dev/null +++ b/tests/clustering/__init__.py @@ -0,0 +1,6 @@ +""" + +""" + +# Created by Wenjie Du +# License: GLP-v3 diff --git a/tests/clustering/config.py b/tests/clustering/config.py new file mode 100644 index 00000000..aa43d7dd --- /dev/null +++ b/tests/clustering/config.py @@ -0,0 +1,22 @@ +""" +Test configs for clustering models. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os + +from tests.global_test_config import ( + DATA, + RESULT_SAVING_DIR, +) + + +EPOCHS = 5 + +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = {"X": DATA["val_X"]} +TEST_SET = {"X": DATA["test_X"]} + +RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(RESULT_SAVING_DIR, "clustering") diff --git a/tests/clustering/crli.py b/tests/clustering/crli.py new file mode 100644 index 00000000..923911fd --- /dev/null +++ b/tests/clustering/crli.py @@ -0,0 +1,103 @@ +""" +Test cases for CRLI clustering model. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + + +import os +import unittest + +import pytest + +from pypots.clustering import CRLI +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_rand_index, cal_cluster_purity +from tests.clustering.config import ( + EPOCHS, + TRAIN_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_CLUSTERING, +) +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) + + +class TestCRLI(unittest.TestCase): + logger.info("Running tests for a clustering model CRLI...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLUSTERING, "CRLI") + model_save_name = "saved_CRLI_model.pypots" + + # initialize an Adam optimizer + G_optimizer = Adam(lr=0.001, weight_decay=1e-5) + D_optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a CRLI model + crli = CRLI( + n_steps=DATA["n_steps"], + n_features=DATA["n_features"], + n_clusters=DATA["n_classes"], + n_generator_layers=2, + rnn_hidden_size=128, + epochs=EPOCHS, + saving_path=saving_path, + G_optimizer=G_optimizer, + D_optimizer=D_optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="clustering-crli") + def test_0_fit(self): + self.crli.fit(TRAIN_SET) + + @pytest.mark.xdist_group(name="clustering-crli") + def test_1_parameters(self): + assert hasattr(self.crli, "model") and self.crli.model is not None + + assert hasattr(self.crli, "G_optimizer") and self.crli.G_optimizer is not None + assert hasattr(self.crli, "D_optimizer") and self.crli.D_optimizer is not None + + assert hasattr(self.crli, "best_loss") + self.assertNotEqual(self.crli.best_loss, float("inf")) + + assert ( + hasattr(self.crli, "best_model_dict") + and self.crli.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="clustering-crli") + def test_2_cluster(self): + clustering = self.crli.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) + logger.info(f"RI: {RI}\nCP: {CP}") + + @pytest.mark.xdist_group(name="clustering-crli") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.crli) + + # save the trained model into file, and check if the path exists + self.crli.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.crli.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_clustering.py b/tests/clustering/vader.py similarity index 51% rename from tests/test_clustering.py rename to tests/clustering/vader.py index bbd4d014..71a6a91d 100644 --- a/tests/test_clustering.py +++ b/tests/clustering/vader.py @@ -1,5 +1,5 @@ """ -Test cases for clustering models. +Test cases for VaDER clustering model. 
""" # Created by Wenjie Du @@ -12,94 +12,22 @@ import numpy as np import pytest -from pypots.clustering import VaDER, CRLI +from pypots.clustering import VaDER from pypots.optim import Adam from pypots.utils.logging import logger from pypots.utils.metrics import cal_rand_index, cal_cluster_purity +from tests.clustering.config import ( + EPOCHS, + TRAIN_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_CLUSTERING, +) from tests.global_test_config import ( DATA, - RESULT_SAVING_DIR, + DEVICE, check_tb_and_model_checkpoints_existence, ) -EPOCHS = 5 - -TRAIN_SET = {"X": DATA["train_X"]} -VAL_SET = {"X": DATA["val_X"]} -TEST_SET = {"X": DATA["test_X"]} - -RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(RESULT_SAVING_DIR, "clustering") - - -class TestCRLI(unittest.TestCase): - logger.info("Running tests for a clustering model CRLI...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLUSTERING, "CRLI") - model_save_name = "saved_CRLI_model.pypots" - - # initialize an Adam optimizer - G_optimizer = Adam(lr=0.001, weight_decay=1e-5) - D_optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a CRLI model - crli = CRLI( - n_steps=DATA["n_steps"], - n_features=DATA["n_features"], - n_clusters=DATA["n_classes"], - n_generator_layers=2, - rnn_hidden_size=128, - epochs=EPOCHS, - saving_path=saving_path, - G_optimizer=G_optimizer, - D_optimizer=D_optimizer, - ) - - @pytest.mark.xdist_group(name="clustering-crli") - def test_0_fit(self): - self.crli.fit(TRAIN_SET) - - @pytest.mark.xdist_group(name="clustering-crli") - def test_1_parameters(self): - assert hasattr(self.crli, "model") and self.crli.model is not None - - assert hasattr(self.crli, "G_optimizer") and self.crli.G_optimizer is not None - assert hasattr(self.crli, "D_optimizer") and self.crli.D_optimizer is not None - - assert hasattr(self.crli, "best_loss") - self.assertNotEqual(self.crli.best_loss, float("inf")) - - assert ( - hasattr(self.crli, "best_model_dict") - and self.crli.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="clustering-crli") - def test_2_cluster(self): - clustering = self.crli.cluster(TEST_SET) - RI = cal_rand_index(clustering, DATA["test_y"]) - CP = cal_cluster_purity(clustering, DATA["test_y"]) - logger.info(f"RI: {RI}\nCP: {CP}") - - @pytest.mark.xdist_group(name="clustering-crli") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.crli) - - # save the trained model into file, and check if the path exists - self.crli.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.crli.load_model(saved_model_path) - class TestVaDER(unittest.TestCase): logger.info("Running tests for a clustering model Transformer...") @@ -120,8 +48,9 @@ class TestVaDER(unittest.TestCase): d_mu_stddev=5, pretrain_epochs=20, epochs=EPOCHS, - saving_path=saving_path, optimizer=optimizer, + saving_path=saving_path, + device=DEVICE, ) @pytest.mark.xdist_group(name="clustering-vader") diff --git a/tests/data/__init__.py b/tests/data/__init__.py new file mode 100644 index 00000000..f0b4685e --- /dev/null +++ b/tests/data/__init__.py @@ -0,0 +1,6 @@ +""" + +""" 
+ +# Created by Wenjie Du +# License: GLP-v3 diff --git a/tests/test_data.py b/tests/data/lazy_loading_strategy.py similarity index 56% rename from tests/test_data.py rename to tests/data/lazy_loading_strategy.py index 27531098..8db1080c 100644 --- a/tests/test_data.py +++ b/tests/data/lazy_loading_strategy.py @@ -8,31 +8,28 @@ import os import unittest -import h5py import pytest from pypots.classification import BRITS, GRUD +from pypots.data.saving import save_dict_into_h5 from pypots.imputation import SAITS -from tests.global_test_config import DATA, DATA_SAVING_DIR from pypots.utils.logging import logger +from tests.global_test_config import DATA, DATA_SAVING_DIR - -TRAIN_SET = f"{DATA_SAVING_DIR}/train_set.h5" -VAL_SET = f"{DATA_SAVING_DIR}/val_set.h5" -TEST_SET = f"{DATA_SAVING_DIR}/test_set.h5" -IMPUTATION_TRAIN_SET = f"{DATA_SAVING_DIR}/imputation_train_set.h5" -IMPUTATION_VAL_SET = f"{DATA_SAVING_DIR}/imputation_val_set.h5" +TRAIN_SET_NAME = "train_set.h5" +TRAIN_SET_PATH = f"{DATA_SAVING_DIR}/{TRAIN_SET_NAME}" +VAL_SET_NAME = "val_set.h5" +VAL_SET_PATH = f"{DATA_SAVING_DIR}/{VAL_SET_NAME}" +TEST_SET_NAME = "test_set.h5" +TEST_SET_PATH = f"{DATA_SAVING_DIR}/{TEST_SET_NAME}" +IMPUTATION_TRAIN_SET_NAME = "imputation_train_set.h5" +IMPUTATION_TRAIN_SET_PATH = f"{DATA_SAVING_DIR}/{IMPUTATION_TRAIN_SET_NAME}" +IMPUTATION_VAL_SET_NAME = "imputation_val_set.h5" +IMPUTATION_VAL_SET_PATH = f"{DATA_SAVING_DIR}/{IMPUTATION_VAL_SET_NAME}" EPOCHS = 1 -def save_data_set_into_h5(data, path): - with h5py.File(path, "w") as hf: - for i in data.keys(): - tp = int if i == "y" else "float32" - hf.create_dataset(i, data=data[i].astype(tp)) - - class TestLazyLoadingClasses(unittest.TestCase): logger.info("Running tests for Dataset classes with lazy-loading strategy...") @@ -73,53 +70,63 @@ def test_0_save_datasets_into_files(self): # create the dir for saving files os.makedirs(DATA_SAVING_DIR, exist_ok=True) - if not os.path.exists(TRAIN_SET): - save_data_set_into_h5( - {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET + if not os.path.exists(TRAIN_SET_PATH): + save_dict_into_h5( + {"X": DATA["train_X"], "y": DATA["train_y"].astype(float)}, + DATA_SAVING_DIR, + TRAIN_SET_NAME, ) - if not os.path.exists(VAL_SET): - save_data_set_into_h5( - {"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET + if not os.path.exists(VAL_SET_PATH): + save_dict_into_h5( + {"X": DATA["val_X"], "y": DATA["val_y"].astype(float)}, + DATA_SAVING_DIR, + VAL_SET_NAME, ) - if not os.path.exists(IMPUTATION_TRAIN_SET): - save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) + if not os.path.exists(IMPUTATION_TRAIN_SET_PATH): + save_dict_into_h5( + {"X": DATA["train_X"]}, DATA_SAVING_DIR, IMPUTATION_TRAIN_SET_NAME + ) - if not os.path.exists(IMPUTATION_VAL_SET): - save_data_set_into_h5( + if not os.path.exists(IMPUTATION_VAL_SET_PATH): + save_dict_into_h5( { "X": DATA["val_X"], "X_intact": DATA["val_X_intact"], "indicating_mask": DATA["val_X_indicating_mask"], }, - IMPUTATION_VAL_SET, + DATA_SAVING_DIR, + IMPUTATION_VAL_SET_NAME, ) - if not os.path.exists(TEST_SET): - save_data_set_into_h5( + if not os.path.exists(TEST_SET_PATH): + save_dict_into_h5( { "X": DATA["test_X"], "X_intact": DATA["test_X_intact"], "indicating_mask": DATA["test_X_indicating_mask"], }, - TEST_SET, + DATA_SAVING_DIR, + TEST_SET_NAME, ) @pytest.mark.xdist_group(name="data-lazy-loading") def test_1_DatasetForMIT_BaseDataset(self): - self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) - _ = 
self.saits.impute(X=TEST_SET) + self.saits.fit( + train_set=IMPUTATION_TRAIN_SET_PATH, val_set=IMPUTATION_VAL_SET_PATH + ) + _ = self.saits.impute(X=TEST_SET_PATH) @pytest.mark.xdist_group(name="data-lazy-loading") def test_2_DatasetForBRITS(self): - self.brits.fit(train_set=TRAIN_SET, val_set=VAL_SET) - _ = self.brits.classify(X=TEST_SET) + self.brits.fit(train_set=TRAIN_SET_PATH, val_set=VAL_SET_PATH) + _ = self.brits.classify(X=TEST_SET_PATH) @pytest.mark.xdist_group(name="data-lazy-loading") def test_3_DatasetForGRUD(self): - self.grud.fit(train_set=TRAIN_SET, val_set=VAL_SET) - _ = self.grud.classify(X=TEST_SET) + self.grud.fit(train_set=TRAIN_SET_PATH, val_set=VAL_SET_PATH) + _ = self.grud.classify(X=TEST_SET_PATH) if __name__ == "__main__": diff --git a/tests/forecasting/__init__.py b/tests/forecasting/__init__.py new file mode 100644 index 00000000..f0b4685e --- /dev/null +++ b/tests/forecasting/__init__.py @@ -0,0 +1,6 @@ +""" + +""" + +# Created by Wenjie Du +# License: GLP-v3 diff --git a/tests/test_forecasting.py b/tests/forecasting/bttf.py similarity index 78% rename from tests/test_forecasting.py rename to tests/forecasting/bttf.py index d2e8e14b..8e6946e7 100644 --- a/tests/test_forecasting.py +++ b/tests/forecasting/bttf.py @@ -1,5 +1,5 @@ """ -Test cases for forecasting models. +Test cases for BTTF forecasting model. """ # Created by Wenjie Du @@ -12,12 +12,13 @@ from pypots.forecasting import BTTF from pypots.utils.logging import logger from pypots.utils.metrics import cal_mae +from tests.forecasting.config import ( + TEST_SET, + TEST_SET_INTACT, + N_PRED_STEP, +) from tests.global_test_config import DATA -EPOCHS = 5 -N_PRED_STEP = 4 -TEST_SET = {"X": DATA["test_X"][:, :-N_PRED_STEP]} - class TestBTTF(unittest.TestCase): logger.info("Running tests for a forecasting model BTTF...") @@ -37,8 +38,7 @@ class TestBTTF(unittest.TestCase): @pytest.mark.xdist_group(name="forecasting-bttf") def test_0_forecasting(self): predictions = self.bttf.forecast(TEST_SET) - logger.info(f"prediction shape: {predictions.shape}") - mae = cal_mae(predictions, DATA["test_X_intact"][:, -N_PRED_STEP:]) + mae = cal_mae(predictions, TEST_SET_INTACT["X"][:, -N_PRED_STEP:]) logger.info(f"prediction MAE: {mae}") diff --git a/tests/forecasting/config.py b/tests/forecasting/config.py new file mode 100644 index 00000000..0a2a9e78 --- /dev/null +++ b/tests/forecasting/config.py @@ -0,0 +1,23 @@ +""" +Test configs for forecasting models. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os + +from tests.global_test_config import ( + DATA, + RESULT_SAVING_DIR, +) + +EPOCHS = 5 +N_PRED_STEP = 4 + +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = {"X": DATA["val_X"]} +TEST_SET = {"X": DATA["test_X"][:, :-N_PRED_STEP]} +TEST_SET_INTACT = {"X": DATA["test_X_intact"]} + +RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(RESULT_SAVING_DIR, "forecasting") diff --git a/tests/global_test_config.py b/tests/global_test_config.py index f3349483..5e152734 100644 --- a/tests/global_test_config.py +++ b/tests/global_test_config.py @@ -7,7 +7,10 @@ import os +import torch + from pypots.data.generating import gene_incomplete_random_walk_dataset +from pypots.utils.logging import logger # Generate the unified data for testing and cache it first, DATA here is a singleton # Otherwise, file lock will cause bug if running test parallely with pytest-xdist. 
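For reviewers of the lazy-loading tests above: the workflow they exercise is to dump datasets into h5 files with the new `pypots.data.saving.save_dict_into_h5` helper, and then hand file paths (instead of in-memory dicts) to `fit()`/`impute()`/`classify()`. A hedged sketch with toy data (shapes and paths are placeholders):

```python
import os

import numpy as np

from pypots.data.saving import save_dict_into_h5
from pypots.imputation import SAITS

train_X = np.random.randn(32, 24, 10)  # toy stand-in for DATA["train_X"]
os.makedirs("testing_results/data", exist_ok=True)
save_dict_into_h5({"X": train_X}, "testing_results/data", "train_set.h5")

# passing a path makes the Dataset classes read samples lazily from the h5 file
saits = SAITS(n_steps=24, n_features=10, n_layers=2, d_model=64, d_inner=32,
              n_heads=4, d_k=16, d_v=16, dropout=0.1, epochs=1)
saits.fit(train_set="testing_results/data/train_set.h5")
```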
@@ -20,6 +23,16 @@
 
 RESULT_SAVING_DIR = "testing_results"
 
 
+# set DEVICE to None unless multiple cuda devices are available, to avoid initialization failure while importing test classes
+cuda_devices = [torch.device(i) for i in range(torch.cuda.device_count())]
+if len(cuda_devices) > 1:
+    logger.info("❗️Detected multiple cuda devices, using all of them to run testing.")
+    DEVICE = cuda_devices
+else:
+    # if there is no more than one cuda device, leave it as None to use the default device
+    DEVICE = None
+
+
 def check_tb_and_model_checkpoints_existence(model):
     # check the tensorboard file existence
     saved_files = os.listdir(model.saving_path)
diff --git a/tests/imputation/__init__.py b/tests/imputation/__init__.py
new file mode 100644
index 00000000..f0b4685e
--- /dev/null
+++ b/tests/imputation/__init__.py
@@ -0,0 +1,6 @@
+"""
+
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
diff --git a/tests/imputation/brits.py b/tests/imputation/brits.py
new file mode 100644
index 00000000..bf0a70c3
--- /dev/null
+++ b/tests/imputation/brits.py
@@ -0,0 +1,104 @@
+"""
+Test cases for BRITS imputation model.
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
+
+
+import os.path
+import unittest
+
+import numpy as np
+import pytest
+
+from pypots.imputation import BRITS
+from pypots.optim import Adam
+from pypots.utils.logging import logger
+from pypots.utils.metrics import cal_mae
+from tests.global_test_config import (
+    DATA,
+    DEVICE,
+    check_tb_and_model_checkpoints_existence,
+)
+from tests.imputation.config import (
+    TRAIN_SET,
+    VAL_SET,
+    TEST_SET,
+    RESULT_SAVING_DIR_FOR_IMPUTATION,
+    EPOCHS,
+)
+
+
+class TestBRITS(unittest.TestCase):
+    logger.info("Running tests for an imputation model BRITS...")
+
+    # set the log and model saving path
+    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "BRITS")
+    model_save_name = "saved_BRITS_model.pypots"
+
+    # initialize an Adam optimizer
+    optimizer = Adam(lr=0.001, weight_decay=1e-5)
+
+    # initialize a BRITS model
+    brits = BRITS(
+        DATA["n_steps"],
+        DATA["n_features"],
+        256,
+        epochs=EPOCHS,
+        saving_path=saving_path,
+        optimizer=optimizer,
+        device=DEVICE,
+    )
+
+    @pytest.mark.xdist_group(name="imputation-brits")
+    def test_0_fit(self):
+        self.brits.fit(TRAIN_SET, VAL_SET)
+
+    @pytest.mark.xdist_group(name="imputation-brits")
+    def test_1_impute(self):
+        imputed_X = self.brits.impute(TEST_SET)
+        assert not np.isnan(
+            imputed_X
+        ).any(), "Output still has missing values after running impute()."
+ test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"BRITS test_MAE: {test_MAE}") + + @pytest.mark.xdist_group(name="imputation-brits") + def test_2_parameters(self): + assert hasattr(self.brits, "model") and self.brits.model is not None + + assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None + + assert hasattr(self.brits, "best_loss") + self.assertNotEqual(self.brits.best_loss, float("inf")) + + assert ( + hasattr(self.brits, "best_model_dict") + and self.brits.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="imputation-brits") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.brits) + + # save the trained model into file, and check if the path exists + self.brits.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.brits.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/imputation/config.py b/tests/imputation/config.py new file mode 100644 index 00000000..c225598b --- /dev/null +++ b/tests/imputation/config.py @@ -0,0 +1,25 @@ +""" +Test configs for imputation models. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import os + +from tests.global_test_config import ( + DATA, + RESULT_SAVING_DIR, +) + +EPOCHS = 5 + +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = { + "X": DATA["val_X"], + "X_intact": DATA["val_X_intact"], + "indicating_mask": DATA["val_X_indicating_mask"], +} +TEST_SET = {"X": DATA["test_X"]} + +RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(RESULT_SAVING_DIR, "imputation") diff --git a/tests/imputation/gpvae.py b/tests/imputation/gpvae.py new file mode 100644 index 00000000..9c59c5b2 --- /dev/null +++ b/tests/imputation/gpvae.py @@ -0,0 +1,104 @@ +""" +Test cases for GP-VAE imputation model. 
+""" + +# Created by Wenjie Du +# License: GPL-v3 + + +import os.path +import unittest + +import numpy as np +import pytest + +from pypots.imputation import GPVAE +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) +from tests.imputation.config import ( + TRAIN_SET, + VAL_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_IMPUTATION, + EPOCHS, +) + + +class TestGPVAE(unittest.TestCase): + logger.info("Running tests for an imputation model GP-VAE...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "GP-VAE") + model_save_name = "saved_GPVAE_model.pypots" + + # initialize an Adam optimizer + optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a GP-VAE model + gp_vae = GPVAE( + DATA["n_steps"], + DATA["n_features"], + 256, + epochs=EPOCHS, + saving_path=saving_path, + optimizer=optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="imputation-gpvae") + def test_0_fit(self): + self.gp_vae.fit(TRAIN_SET, VAL_SET) + + @pytest.mark.xdist_group(name="imputation-gpvae") + def test_1_impute(self): + imputed_X = self.gp_vae.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"GP-VAE test_MAE: {test_MAE}") + + @pytest.mark.xdist_group(name="imputation-gpvae") + def test_2_parameters(self): + assert hasattr(self.gp_vae, "model") and self.gp_vae.model is not None + + assert hasattr(self.gp_vae, "optimizer") and self.gp_vae.optimizer is not None + + assert hasattr(self.gp_vae, "best_loss") + self.assertNotEqual(self.gp_vae.best_loss, float("inf")) + + assert ( + hasattr(self.gp_vae, "best_model_dict") + and self.gp_vae.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="imputation-gpvae") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.gp_vae) + + # save the trained model into file, and check if the path exists + self.gp_vae.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.gp_vae.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/imputation/locf.py b/tests/imputation/locf.py new file mode 100644 index 00000000..8e54fbe0 --- /dev/null +++ b/tests/imputation/locf.py @@ -0,0 +1,46 @@ +""" +Test cases for LOCF imputation method. 
+""" + +# Created by Wenjie Du +# License: GPL-v3 + + +import unittest + +import numpy as np +import pytest + +from pypots.imputation import LOCF +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import ( + DATA, +) +from tests.imputation.config import ( + TEST_SET, +) + + +class TestLOCF(unittest.TestCase): + logger.info("Running tests for an imputation model LOCF...") + locf = LOCF(nan=0) + + @pytest.mark.xdist_group(name="imputation-locf") + def test_0_impute(self): + test_X_imputed = self.locf.impute(TEST_SET) + assert not np.isnan( + test_X_imputed + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + test_X_imputed, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"LOCF test_MAE: {test_MAE}") + + @pytest.mark.xdist_group(name="imputation-locf") + def test_1_parameters(self): + assert hasattr(self.locf, "nan") and self.locf.nan is not None + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/imputation/mrnn.py b/tests/imputation/mrnn.py new file mode 100644 index 00000000..681a9121 --- /dev/null +++ b/tests/imputation/mrnn.py @@ -0,0 +1,104 @@ +""" +Test cases for MRNN imputation model. +""" + +# Created by Wenjie Du +# License: GPL-v3 + + +import os.path +import unittest + +import numpy as np +import pytest + +from pypots.imputation import MRNN +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) +from tests.imputation.config import ( + TRAIN_SET, + VAL_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_IMPUTATION, + EPOCHS, +) + + +class TestMRNN(unittest.TestCase): + logger.info("Running tests for an imputation model MRNN...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "MRNN") + model_save_name = "saved_MRNN_model.pypots" + + # initialize an Adam optimizer + optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a MRNN model + mrnn = MRNN( + DATA["n_steps"], + DATA["n_features"], + 256, + epochs=EPOCHS, + saving_path=saving_path, + optimizer=optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="imputation-mrnn") + def test_0_fit(self): + self.mrnn.fit(TRAIN_SET, VAL_SET) + + @pytest.mark.xdist_group(name="imputation-mrnn") + def test_1_impute(self): + imputed_X = self.mrnn.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." 
+ test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"MRNN test_MAE: {test_MAE}") + + @pytest.mark.xdist_group(name="imputation-mrnn") + def test_2_parameters(self): + assert hasattr(self.mrnn, "model") and self.mrnn.model is not None + + assert hasattr(self.mrnn, "optimizer") and self.mrnn.optimizer is not None + + assert hasattr(self.mrnn, "best_loss") + self.assertNotEqual(self.mrnn.best_loss, float("inf")) + + assert ( + hasattr(self.mrnn, "best_model_dict") + and self.mrnn.best_model_dict is not None + ) + + @pytest.mark.xdist_group(name="imputation-mrnn") + def test_3_saving_path(self): + # whether the root saving dir exists, which should be created by save_log_into_tb_file + assert os.path.exists( + self.saving_path + ), f"file {self.saving_path} does not exist" + + # check if the tensorboard file and model checkpoints exist + check_tb_and_model_checkpoints_existence(self.mrnn) + + # save the trained model into file, and check if the path exists + self.mrnn.save_model( + saving_dir=self.saving_path, file_name=self.model_save_name + ) + + # test loading the saved model, not necessary, but need to test + saved_model_path = os.path.join(self.saving_path, self.model_save_name) + self.mrnn.load_model(saved_model_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/imputation/saits.py b/tests/imputation/saits.py new file mode 100644 index 00000000..647e8657 --- /dev/null +++ b/tests/imputation/saits.py @@ -0,0 +1,110 @@ +""" +Test cases for SAITS imputation model. +""" + +# Created by Wenjie Du +# License: GPL-v3 + + +import os.path +import unittest + +import numpy as np +import pytest + +from pypots.imputation import SAITS +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import ( + DATA, + DEVICE, + check_tb_and_model_checkpoints_existence, +) +from tests.imputation.config import ( + TRAIN_SET, + VAL_SET, + TEST_SET, + RESULT_SAVING_DIR_FOR_IMPUTATION, + EPOCHS, +) + + +class TestSAITS(unittest.TestCase): + logger.info("Running tests for an imputation model SAITS...") + + # set the log and model saving path + saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "SAITS") + model_save_name = "saved_saits_model.pypots" + + # initialize an Adam optimizer + optimizer = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a SAITS model + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_heads=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCHS, + saving_path=saving_path, + optimizer=optimizer, + device=DEVICE, + ) + + @pytest.mark.xdist_group(name="imputation-saits") + def test_0_fit(self): + self.saits.fit(TRAIN_SET, VAL_SET) + + @pytest.mark.xdist_group(name="imputation-saits") + def test_1_impute(self): + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." 
+        test_MAE = cal_mae(
+            imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"]
+        )
+        logger.info(f"SAITS test_MAE: {test_MAE}")
+
+    @pytest.mark.xdist_group(name="imputation-saits")
+    def test_2_parameters(self):
+        assert hasattr(self.saits, "model") and self.saits.model is not None
+
+        assert hasattr(self.saits, "optimizer") and self.saits.optimizer is not None
+
+        assert hasattr(self.saits, "best_loss")
+        self.assertNotEqual(self.saits.best_loss, float("inf"))
+
+        assert (
+            hasattr(self.saits, "best_model_dict")
+            and self.saits.best_model_dict is not None
+        )
+
+    @pytest.mark.xdist_group(name="imputation-saits")
+    def test_3_saving_path(self):
+        # check whether the root saving dir exists; it should have been created by save_log_into_tb_file
+        assert os.path.exists(
+            self.saving_path
+        ), f"file {self.saving_path} does not exist"
+
+        # check if the tensorboard file and model checkpoints exist
+        check_tb_and_model_checkpoints_existence(self.saits)
+
+        # save the trained model into file, and check if the path exists
+        self.saits.save_model(
+            saving_dir=self.saving_path, file_name=self.model_save_name
+        )
+
+        # loading the saved model is not strictly necessary, but test it anyway
+        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
+        self.saits.load_model(saved_model_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/imputation/transformer.py b/tests/imputation/transformer.py
new file mode 100644
index 00000000..965b2cf7
--- /dev/null
+++ b/tests/imputation/transformer.py
@@ -0,0 +1,113 @@
+"""
+Test cases for the Transformer imputation model.
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
+
+
+import os.path
+import unittest
+
+import numpy as np
+import pytest
+
+from pypots.imputation import Transformer
+from pypots.optim import Adam
+from pypots.utils.logging import logger
+from pypots.utils.metrics import cal_mae
+from tests.global_test_config import (
+    DATA,
+    DEVICE,
+    check_tb_and_model_checkpoints_existence,
+)
+from tests.imputation.config import (
+    TRAIN_SET,
+    VAL_SET,
+    TEST_SET,
+    RESULT_SAVING_DIR_FOR_IMPUTATION,
+    EPOCHS,
+)
+
+
+class TestTransformer(unittest.TestCase):
+    logger.info("Running tests for an imputation model Transformer...")
+
+    # set the log and model saving path
+    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "Transformer")
+    model_save_name = "saved_transformer_model.pypots"
+
+    # initialize an Adam optimizer
+    optimizer = Adam(lr=0.001, weight_decay=1e-5)
+
+    # initialize a Transformer model
+    transformer = Transformer(
+        DATA["n_steps"],
+        DATA["n_features"],
+        n_layers=2,
+        d_model=256,
+        d_inner=128,
+        n_heads=4,
+        d_k=64,
+        d_v=64,
+        dropout=0.1,
+        epochs=EPOCHS,
+        saving_path=saving_path,
+        optimizer=optimizer,
+        device=DEVICE,
+    )
+
+    @pytest.mark.xdist_group(name="imputation-transformer")
+    def test_0_fit(self):
+        self.transformer.fit(TRAIN_SET, VAL_SET)
+
+    @pytest.mark.xdist_group(name="imputation-transformer")
+    def test_1_impute(self):
+        imputed_X = self.transformer.impute(TEST_SET)
+        assert not np.isnan(
+            imputed_X
+        ).any(), "Output still has missing values after running impute()."
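+        # note: by the naming convention in these tests, DATA["test_X"] is the
+        # input with some values artificially removed, while
+        # DATA["test_X_intact"] presumably holds the complete series that the
+        # imputation is scored against.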
+        test_MAE = cal_mae(
+            imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"]
+        )
+        logger.info(f"Transformer test_MAE: {test_MAE}")
+
+    @pytest.mark.xdist_group(name="imputation-transformer")
+    def test_2_parameters(self):
+        assert hasattr(self.transformer, "model") and self.transformer.model is not None
+
+        assert (
+            hasattr(self.transformer, "optimizer")
+            and self.transformer.optimizer is not None
+        )
+
+        assert hasattr(self.transformer, "best_loss")
+        self.assertNotEqual(self.transformer.best_loss, float("inf"))
+
+        assert (
+            hasattr(self.transformer, "best_model_dict")
+            and self.transformer.best_model_dict is not None
+        )
+
+    @pytest.mark.xdist_group(name="imputation-transformer")
+    def test_3_saving_path(self):
+        # check whether the root saving dir exists; it should have been created by save_log_into_tb_file
+        assert os.path.exists(
+            self.saving_path
+        ), f"file {self.saving_path} does not exist"
+
+        # check if the tensorboard file and model checkpoints exist
+        check_tb_and_model_checkpoints_existence(self.transformer)
+
+        # save the trained model into file, and check if the path exists
+        self.transformer.save_model(
+            saving_dir=self.saving_path, file_name=self.model_save_name
+        )
+
+        # loading the saved model is not strictly necessary, but test it anyway
+        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
+        self.transformer.load_model(saved_model_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/imputation/usgan.py b/tests/imputation/usgan.py
new file mode 100644
index 00000000..c91a17a1
--- /dev/null
+++ b/tests/imputation/usgan.py
@@ -0,0 +1,111 @@
+"""
+Test cases for the US-GAN imputation model.
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
+
+
+import os.path
+import unittest
+
+import numpy as np
+import pytest
+
+from pypots.imputation import USGAN
+from pypots.optim import Adam
+from pypots.utils.logging import logger
+from pypots.utils.metrics import cal_mae
+from tests.global_test_config import (
+    DATA,
+    DEVICE,
+    check_tb_and_model_checkpoints_existence,
+)
+from tests.imputation.config import (
+    TRAIN_SET,
+    VAL_SET,
+    TEST_SET,
+    RESULT_SAVING_DIR_FOR_IMPUTATION,
+    EPOCHS,
+)
+
+
+class TestUSGAN(unittest.TestCase):
+    logger.info("Running tests for an imputation model US-GAN...")
+
+    # set the log and model saving path
+    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "US-GAN")
+    model_save_name = "saved_USGAN_model.pypots"
+
+    # initialize Adam optimizers for the generator and the discriminator
+    G_optimizer = Adam(lr=0.001, weight_decay=1e-5)
+    D_optimizer = Adam(lr=0.001, weight_decay=1e-5)
+
+    # initialize a US-GAN model
+    us_gan = USGAN(
+        DATA["n_steps"],
+        DATA["n_features"],
+        256,
+        epochs=EPOCHS,
+        saving_path=saving_path,
+        G_optimizer=G_optimizer,
+        D_optimizer=D_optimizer,
+        device=DEVICE,
+    )
+
+    @pytest.mark.xdist_group(name="imputation-usgan")
+    def test_0_fit(self):
+        self.us_gan.fit(TRAIN_SET, VAL_SET)
+
+    @pytest.mark.xdist_group(name="imputation-usgan")
+    def test_1_impute(self):
+        imputed_X = self.us_gan.impute(TEST_SET)
+        assert not np.isnan(
+            imputed_X
+        ).any(), "Output still has missing values after running impute()."
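+        # note: impute() is assumed to return a plain NumPy array here, since
+        # np.isnan() is applied to it directly above.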
+        test_MAE = cal_mae(
+            imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"]
+        )
+        logger.info(f"US-GAN test_MAE: {test_MAE}")
+
+    @pytest.mark.xdist_group(name="imputation-usgan")
+    def test_2_parameters(self):
+        assert hasattr(self.us_gan, "model") and self.us_gan.model is not None
+
+        assert (
+            hasattr(self.us_gan, "G_optimizer") and self.us_gan.G_optimizer is not None
+        )
+        assert (
+            hasattr(self.us_gan, "D_optimizer") and self.us_gan.D_optimizer is not None
+        )
+
+        assert hasattr(self.us_gan, "best_loss")
+        self.assertNotEqual(self.us_gan.best_loss, float("inf"))
+
+        assert (
+            hasattr(self.us_gan, "best_model_dict")
+            and self.us_gan.best_model_dict is not None
+        )
+
+    @pytest.mark.xdist_group(name="imputation-usgan")
+    def test_3_saving_path(self):
+        # check whether the root saving dir exists; it should have been created by save_log_into_tb_file
+        assert os.path.exists(
+            self.saving_path
+        ), f"file {self.saving_path} does not exist"
+
+        # check if the tensorboard file and model checkpoints exist
+        check_tb_and_model_checkpoints_existence(self.us_gan)
+
+        # save the trained model into file, and check if the path exists
+        self.us_gan.save_model(
+            saving_dir=self.saving_path, file_name=self.model_save_name
+        )
+
+        # loading the saved model is not strictly necessary, but test it anyway
+        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
+        self.us_gan.load_model(saved_model_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/optim/__init__.py b/tests/optim/__init__.py
new file mode 100644
index 00000000..f0b4685e
--- /dev/null
+++ b/tests/optim/__init__.py
@@ -0,0 +1,6 @@
+"""
+
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
diff --git a/tests/optim/adadelta.py b/tests/optim/adadelta.py
new file mode 100644
index 00000000..b69e5ea4
--- /dev/null
+++ b/tests/optim/adadelta.py
@@ -0,0 +1,56 @@
+"""
+Test cases for the optimizer Adadelta.
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
+
+import unittest
+
+import numpy as np
+import pytest
+
+from pypots.imputation import SAITS
+from pypots.optim import Adadelta
+from pypots.utils.logging import logger
+from pypots.utils.metrics import cal_mae
+from tests.global_test_config import DATA
+from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET
+
+
+class TestAdadelta(unittest.TestCase):
+    logger.info("Running tests for Adadelta...")
+
+    # initialize an Adadelta optimizer
+    adadelta = Adadelta(lr=0.001, weight_decay=1e-5)
+
+    # initialize a SAITS model for testing DatasetForMIT and BaseDataset
+    saits = SAITS(
+        DATA["n_steps"],
+        DATA["n_features"],
+        n_layers=1,
+        d_model=128,
+        d_inner=64,
+        n_heads=2,
+        d_k=64,
+        d_v=64,
+        dropout=0.1,
+        optimizer=adadelta,
+        epochs=EPOCHS,
+    )
+
+    @pytest.mark.xdist_group(name="optim-adadelta")
+    def test_0_fit(self):
+        self.saits.fit(TRAIN_SET, VAL_SET)
+        imputed_X = self.saits.impute(TEST_SET)
+        assert not np.isnan(
+            imputed_X
+        ).any(), "Output still has missing values after running impute()."
+        test_MAE = cal_mae(
+            imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"]
+        )
+        logger.info(f"SAITS test_MAE: {test_MAE}")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/optim/adagrad.py b/tests/optim/adagrad.py
new file mode 100644
index 00000000..21b4696a
--- /dev/null
+++ b/tests/optim/adagrad.py
@@ -0,0 +1,56 @@
+"""
+Test cases for the optimizer Adagrad.
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +import numpy as np +import pytest + +from pypots.imputation import SAITS +from pypots.optim import Adagrad +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import DATA +from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET + + +class TestAdagrad(unittest.TestCase): + logger.info("Running tests for Adagrad...") + + # initialize an Adagrad optimizer + adagrad = Adagrad(lr=0.001, weight_decay=1e-5) + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=1, + d_model=128, + d_inner=64, + n_heads=2, + d_k=64, + d_v=64, + dropout=0.1, + optimizer=adagrad, + epochs=EPOCHS, + ) + + @pytest.mark.xdist_group(name="optim-adagrad") + def test_0_fit(self): + self.saits.fit(TRAIN_SET, VAL_SET) + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/optim/adam.py b/tests/optim/adam.py new file mode 100644 index 00000000..448f92b9 --- /dev/null +++ b/tests/optim/adam.py @@ -0,0 +1,56 @@ +""" +Test cases for the optimizer Adam. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +import numpy as np +import pytest + +from pypots.imputation import SAITS +from pypots.optim import Adam +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import DATA +from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET + + +class TestAdam(unittest.TestCase): + logger.info("Running tests for Adam...") + + # initialize an Adam optimizer + adam = Adam(lr=0.001, weight_decay=1e-5) + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=1, + d_model=128, + d_inner=64, + n_heads=2, + d_k=64, + d_v=64, + dropout=0.1, + optimizer=adam, + epochs=EPOCHS, + ) + + @pytest.mark.xdist_group(name="optim-adam") + def test_0_fit(self): + self.saits.fit(TRAIN_SET, VAL_SET) + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/optim/adamw.py b/tests/optim/adamw.py new file mode 100644 index 00000000..a7941f43 --- /dev/null +++ b/tests/optim/adamw.py @@ -0,0 +1,56 @@ +""" +Test cases for the optimizer AdamW. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +import numpy as np +import pytest + +from pypots.imputation import SAITS +from pypots.optim import AdamW +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import DATA +from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET + + +class TestAdamW(unittest.TestCase): + logger.info("Running tests for AdamW...") + + # initialize an AdamW optimizer + adamw = AdamW(lr=0.001, weight_decay=1e-5) + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=1, + d_model=128, + d_inner=64, + n_heads=2, + d_k=64, + d_v=64, + dropout=0.1, + optimizer=adamw, + epochs=EPOCHS, + ) + + @pytest.mark.xdist_group(name="optim-adamw") + def test_0_fit(self): + self.saits.fit(TRAIN_SET, VAL_SET) + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/optim/config.py b/tests/optim/config.py new file mode 100644 index 00000000..a0391027 --- /dev/null +++ b/tests/optim/config.py @@ -0,0 +1,19 @@ +""" +Test configs for optimizers. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +from tests.global_test_config import DATA + +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = { + "X": DATA["val_X"], + "X_intact": DATA["val_X_intact"], + "indicating_mask": DATA["val_X_indicating_mask"], +} +TEST_SET = {"X": DATA["test_X"]} + + +EPOCHS = 1 diff --git a/tests/optim/rmsprop.py b/tests/optim/rmsprop.py new file mode 100644 index 00000000..1fe61a0d --- /dev/null +++ b/tests/optim/rmsprop.py @@ -0,0 +1,56 @@ +""" +Test cases for the optimizer RMSprop. +""" + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +import numpy as np +import pytest + +from pypots.imputation import SAITS +from pypots.optim import RMSprop +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import DATA +from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET + + +class TestRMSprop(unittest.TestCase): + logger.info("Running tests for RMSprop...") + + # initialize a RMSprop optimizer + rmsprop = RMSprop(lr=0.001, weight_decay=1e-5) + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=1, + d_model=128, + d_inner=64, + n_heads=2, + d_k=64, + d_v=64, + dropout=0.1, + optimizer=rmsprop, + epochs=EPOCHS, + ) + + @pytest.mark.xdist_group(name="optim-rmsprop") + def test_0_fit(self): + self.saits.fit(TRAIN_SET, VAL_SET) + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/optim/sgd.py b/tests/optim/sgd.py new file mode 100644 index 00000000..4b1c1998 --- /dev/null +++ b/tests/optim/sgd.py @@ -0,0 +1,56 @@ +""" +Test cases for the optimizer SGD. 
+""" + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +import numpy as np +import pytest + +from pypots.imputation import SAITS +from pypots.optim import SGD +from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae +from tests.global_test_config import DATA +from tests.optim.config import EPOCHS, TEST_SET, TRAIN_SET, VAL_SET + + +class TestSGD(unittest.TestCase): + logger.info("Running tests for SGD...") + + # initialize a SGD optimizer + sgd = SGD(lr=0.001, weight_decay=1e-5) + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=1, + d_model=128, + d_inner=64, + n_heads=2, + d_k=64, + d_v=64, + dropout=0.1, + optimizer=sgd, + epochs=EPOCHS, + ) + + @pytest.mark.xdist_group(name="optim-sgd") + def test_0_fit(self): + self.saits.fit(TRAIN_SET, VAL_SET) + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_classification.py b/tests/test_classification.py deleted file mode 100644 index 2ef9c6d1..00000000 --- a/tests/test_classification.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -Test cases for classification models. -""" - -# Created by Wenjie Du -# License: GLP-v3 - -import os -import unittest - -import pytest - -from pypots.classification import BRITS, GRUD, Raindrop -from pypots.optim import Adam -from pypots.utils.logging import logger -from pypots.utils.metrics import cal_binary_classification_metrics -from tests.global_test_config import ( - DATA, - RESULT_SAVING_DIR, - check_tb_and_model_checkpoints_existence, -) - -EPOCHS = 5 - -TRAIN_SET = {"X": DATA["train_X"], "y": DATA["train_y"]} -VAL_SET = {"X": DATA["val_X"], "y": DATA["val_y"]} -TEST_SET = {"X": DATA["test_X"]} - -RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(RESULT_SAVING_DIR, "classification") - - -class TestBRITS(unittest.TestCase): - logger.info("Running tests for a classification model BRITS...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "BRITS") - model_save_name = "saved_BRITS_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a BRITS model - brits = BRITS( - DATA["n_steps"], - DATA["n_features"], - n_classes=DATA["n_classes"], - rnn_hidden_size=256, - epochs=EPOCHS, - saving_path=saving_path, - model_saving_strategy="better", - optimizer=optimizer, - ) - - @pytest.mark.xdist_group(name="classification-brits") - def test_0_fit(self): - self.brits.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="classification-brits") - def test_1_classify(self): - predictions = self.brits.classify(TEST_SET) - metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) - logger.info( - f'ROC_AUC: {metrics["roc_auc"]}, \n' - f'PR_AUC: {metrics["pr_auc"]},\n' - f'F1: {metrics["f1"]},\n' - f'Precision: {metrics["precision"]},\n' - f'Recall: {metrics["recall"]},\n' - ) - assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - - @pytest.mark.xdist_group(name="classification-brits") - def test_2_parameters(self): - assert hasattr(self.brits, "model") and self.brits.model is not None - - assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None - - 
assert hasattr(self.brits, "best_loss") - self.assertNotEqual(self.brits.best_loss, float("inf")) - - assert ( - hasattr(self.brits, "best_model_dict") - and self.brits.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="classification-brits") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.brits) - - # save the trained model into file, and check if the path exists - self.brits.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.brits.load_model(saved_model_path) - - -class TestGRUD(unittest.TestCase): - logger.info("Running tests for a classification model GRUD...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "GRUD") - model_save_name = "saved_GRUD_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a GRUD model - grud = GRUD( - DATA["n_steps"], - DATA["n_features"], - n_classes=DATA["n_classes"], - rnn_hidden_size=256, - epochs=EPOCHS, - saving_path=saving_path, - optimizer=optimizer, - ) - - @pytest.mark.xdist_group(name="classification-grud") - def test_0_fit(self): - self.grud.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="classification-grud") - def test_1_classify(self): - predictions = self.grud.classify(TEST_SET) - metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) - logger.info( - f'ROC_AUC: {metrics["roc_auc"]}, \n' - f'PR_AUC: {metrics["pr_auc"]},\n' - f'F1: {metrics["f1"]},\n' - f'Precision: {metrics["precision"]},\n' - f'Recall: {metrics["recall"]},\n' - ) - assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - - @pytest.mark.xdist_group(name="classification-grud") - def test_2_parameters(self): - assert hasattr(self.grud, "model") and self.grud.model is not None - - assert hasattr(self.grud, "optimizer") and self.grud.optimizer is not None - - assert hasattr(self.grud, "best_loss") - self.assertNotEqual(self.grud.best_loss, float("inf")) - - assert ( - hasattr(self.grud, "best_model_dict") - and self.grud.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="classification-grud") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.grud) - - # save the trained model into file, and check if the path exists - self.grud.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.grud.load_model(saved_model_path) - - -class TestRaindrop(unittest.TestCase): - logger.info("Running tests for a classification model Raindrop...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "Raindrop") - model_save_name = "saved_Raindrop_model.pypots" - - # initialize a Raindrop model - 
raindrop = Raindrop( - DATA["n_steps"], - DATA["n_features"], - DATA["n_classes"], - n_layers=2, - d_model=DATA["n_features"] * 4, - d_inner=256, - n_heads=2, - dropout=0.3, - d_static=0, - aggregation="mean", - sensor_wise_mask=False, - static=False, - epochs=EPOCHS, - saving_path=saving_path, - ) - - @pytest.mark.xdist_group(name="classification-raindrop") - def test_0_fit(self): - self.raindrop.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="classification-raindrop") - def test_1_classify(self): - predictions = self.raindrop.classify(TEST_SET) - metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) - logger.info( - f'ROC_AUC: {metrics["roc_auc"]}, \n' - f'PR_AUC: {metrics["pr_auc"]},\n' - f'F1: {metrics["f1"]},\n' - f'Precision: {metrics["precision"]},\n' - f'Recall: {metrics["recall"]},\n' - ) - assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - - @pytest.mark.xdist_group(name="classification-raindrop") - def test_2_parameters(self): - assert hasattr(self.raindrop, "model") and self.raindrop.model is not None - - assert ( - hasattr(self.raindrop, "optimizer") and self.raindrop.optimizer is not None - ) - - assert hasattr(self.raindrop, "best_loss") - self.assertNotEqual(self.raindrop.best_loss, float("inf")) - - assert ( - hasattr(self.raindrop, "best_model_dict") - and self.raindrop.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="classification-raindrop") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.raindrop) - - # save the trained model into file, and check if the path exists - self.raindrop.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.raindrop.load_model(saved_model_path) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 4e9e9927..00000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -Test cases for the functions and classes in package `pypots.cli`. 
-""" - -# Created by Wenjie Du -# License: GLP-v3 - -import os -import threading -import unittest -from argparse import Namespace -from copy import copy - -import pytest - -from pypots.cli.dev import dev_command_factory -from pypots.cli.doc import doc_command_factory -from pypots.cli.env import env_command_factory -from pypots.utils.logging import logger - -PROJECT_ROOT_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), "../..")) - - -def callback_func(): - raise TimeoutError("Time out.") - - -def time_out(interval, callback): - def decorator(func): - def wrapper(*args, **kwargs): - t = threading.Thread(target=func, args=args, kwargs=kwargs) - t.setDaemon(True) - t.start() - t.join(interval) # wait for interval seconds - if t.is_alive(): - return threading.Timer(0, callback).start() # invoke callback() - else: - return - - return wrapper - - return decorator - - -@pytest.mark.xfail(reason="Allow tests for CLI to fail") -class TestPyPOTSCLIDev(unittest.TestCase): - # set up the default arguments - default_arguments = { - "build": False, - "cleanup": False, - "run_tests": False, - "k": None, - "show_coverage": False, - "lint_code": False, - } - # `pypots-cli dev` must run under the project root dir - os.chdir(PROJECT_ROOT_DIR) - - @pytest.mark.xdist_group(name="cli-dev") - def test_0_build(self): - arguments = copy(self.default_arguments) - arguments["build"] = True - args = Namespace(**arguments) - dev_command_factory(args).run() - - @pytest.mark.xdist_group(name="cli-dev") - def test_1_run_tests(self): - arguments = copy(self.default_arguments) - arguments["run_tests"] = True - arguments["k"] = "try_to_find_a_non_existing_test_case" - args = Namespace(**arguments) - try: - dev_command_factory(args).run() - except RuntimeError: # try to find a non-existing test case, so RuntimeError will be raised - pass - except Exception as e: # other exceptions will cause an error and result in failed testing - raise e - - # Don't test --lint-code because Black will reformat the code and cause error when generating the coverage report - # @pytest.mark.xdist_group(name="cli-dev") - # def test_2_lint_code(self): - # arguments = copy(self.default_arguments) - # arguments["lint_code"] = True - # args = Namespace(**arguments) - # dev_command_factory(args).run() - - @pytest.mark.xdist_group(name="cli-dev") - def test_3_cleanup(self): - arguments = copy(self.default_arguments) - arguments["cleanup"] = True - args = Namespace(**arguments) - dev_command_factory(args).run() - - -@pytest.mark.xfail(reason="Allow tests for CLI to fail") -class TestPyPOTSCLIDoc(unittest.TestCase): - # set up the default arguments - default_arguments = { - "gene_rst": False, - "branch": "main", - "gene_html": False, - "view_doc": False, - "port": 9075, - "cleanup": False, - } - # `pypots-cli doc` must run under the project root dir - os.chdir(PROJECT_ROOT_DIR) - - @pytest.mark.xdist_group(name="cli-doc") - def test_0_gene_rst(self): - arguments = copy(self.default_arguments) - arguments["gene_rst"] = True - args = Namespace(**arguments) - doc_command_factory(args).run() - - logger.info("run again under a non-root dir") - try: - os.chdir(os.path.abspath(os.path.join(PROJECT_ROOT_DIR, "pypots"))) - doc_command_factory(args).run() - except RuntimeError: # try to run under a non-root dir, so RuntimeError will be raised - pass - except Exception as e: # other exceptions will cause an error and result in failed testing - raise e - finally: - os.chdir(PROJECT_ROOT_DIR) - - @pytest.mark.xdist_group(name="cli-doc") - def 
test_1_gene_html(self): - arguments = copy(self.default_arguments) - arguments["gene_html"] = True - args = Namespace(**arguments) - try: - doc_command_factory(args).run() - except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below - logger.error(e) - - @pytest.mark.xdist_group(name="cli-doc") - @time_out(2, callback_func) # wait for two seconds - def test_2_view_doc(self): - arguments = copy(self.default_arguments) - arguments["view_doc"] = True - args = Namespace(**arguments) - try: - doc_command_factory(args).run() - except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below - logger.error(e) - - @pytest.mark.xdist_group(name="cli-doc") - def test_3_cleanup(self): - arguments = copy(self.default_arguments) - arguments["cleanup"] = True - args = Namespace(**arguments) - doc_command_factory(args).run() - - -@pytest.mark.xfail(reason="Allow tests for CLI to fail") -class TestPyPOTSCLIEnv(unittest.TestCase): - # set up the default arguments - default_arguments = { - "install": "optional", - "tool": "conda", - } - - # `pypots-cli env` must run under the project root dir - os.chdir(PROJECT_ROOT_DIR) - - @pytest.mark.xdist_group(name="cli-env") - def test_0_install_with_conda(self): - arguments = copy(self.default_arguments) - arguments["tool"] = "conda" - args = Namespace(**arguments) - try: - env_command_factory(args).run() - except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below - logger.error(e) - - @pytest.mark.xdist_group(name="cli-env") - def test_1_install_with_pip(self): - arguments = copy(self.default_arguments) - arguments["tool"] = "pip" - args = Namespace(**arguments) - try: - env_command_factory(args).run() - except Exception as e: # somehow we have some error when testing on Windows, so just print and pass below - logger.error(e) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_imputation.py b/tests/test_imputation.py deleted file mode 100644 index 6094ce62..00000000 --- a/tests/test_imputation.py +++ /dev/null @@ -1,356 +0,0 @@ -""" -Test cases for imputation models. 
-""" - -# Created by Wenjie Du -# License: GPL-v3 - - -import os.path -import unittest - -import numpy as np -import pytest - -from pypots.imputation import ( - SAITS, - Transformer, - BRITS, - MRNN, - LOCF, -) -from pypots.optim import Adam -from pypots.utils.logging import logger -from pypots.utils.metrics import cal_mae -from tests.global_test_config import ( - DATA, - RESULT_SAVING_DIR, - check_tb_and_model_checkpoints_existence, -) - -EPOCH = 5 - -TRAIN_SET = {"X": DATA["train_X"]} -VAL_SET = { - "X": DATA["val_X"], - "X_intact": DATA["val_X_intact"], - "indicating_mask": DATA["val_X_indicating_mask"], -} -TEST_SET = {"X": DATA["test_X"]} - -RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(RESULT_SAVING_DIR, "imputation") - - -class TestSAITS(unittest.TestCase): - logger.info("Running tests for an imputation model SAITS...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "SAITS") - model_save_name = "saved_saits_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_heads=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCH, - saving_path=saving_path, - optimizer=optimizer, - ) - - @pytest.mark.xdist_group(name="imputation-saits") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-saits") - def test_1_impute(self): - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-saits") - def test_2_parameters(self): - assert hasattr(self.saits, "model") and self.saits.model is not None - - assert hasattr(self.saits, "optimizer") and self.saits.optimizer is not None - - assert hasattr(self.saits, "best_loss") - self.assertNotEqual(self.saits.best_loss, float("inf")) - - assert ( - hasattr(self.saits, "best_model_dict") - and self.saits.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-saits") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.saits) - - # save the trained model into file, and check if the path exists - self.saits.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.saits.load_model(saved_model_path) - - -class TestTransformer(unittest.TestCase): - logger.info("Running tests for an imputation model Transformer...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "Transformer") - model_save_name = "saved_transformer_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a Transformer model - transformer = Transformer( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_heads=4, - d_k=64, - d_v=64, - dropout=0.1, 
- epochs=EPOCH, - saving_path=saving_path, - optimizer=optimizer, - ) - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_0_fit(self): - self.transformer.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_1_impute(self): - imputed_X = self.transformer.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"Transformer test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_2_parameters(self): - assert hasattr(self.transformer, "model") and self.transformer.model is not None - - assert ( - hasattr(self.transformer, "optimizer") - and self.transformer.optimizer is not None - ) - - assert hasattr(self.transformer, "best_loss") - self.assertNotEqual(self.transformer.best_loss, float("inf")) - - assert ( - hasattr(self.transformer, "best_model_dict") - and self.transformer.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.transformer) - - # save the trained model into file, and check if the path exists - self.transformer.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.transformer.load_model(saved_model_path) - - -class TestBRITS(unittest.TestCase): - logger.info("Running tests for an imputation model BRITS...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "BRITS") - model_save_name = "saved_BRITS_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a BRITS model - brits = BRITS( - DATA["n_steps"], - DATA["n_features"], - 256, - epochs=EPOCH, - saving_path=f"{RESULT_SAVING_DIR_FOR_IMPUTATION}/BRITS", - optimizer=optimizer, - ) - - @pytest.mark.xdist_group(name="imputation-brits") - def test_0_fit(self): - self.brits.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-brits") - def test_1_impute(self): - imputed_X = self.brits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"BRITS test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-brits") - def test_2_parameters(self): - assert hasattr(self.brits, "model") and self.brits.model is not None - - assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None - - assert hasattr(self.brits, "best_loss") - self.assertNotEqual(self.brits.best_loss, float("inf")) - - assert ( - hasattr(self.brits, "best_model_dict") - and self.brits.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-brits") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.brits) - - # save the trained model into file, and check if the path exists - self.brits.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.brits.load_model(saved_model_path) - - -class TestMRNN(unittest.TestCase): - logger.info("Running tests for an imputation model MRNN...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "MRNN") - model_save_name = "saved_MRNN_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a MRNN model - mrnn = MRNN( - DATA["n_steps"], - DATA["n_features"], - 256, - epochs=EPOCH, - saving_path=f"{RESULT_SAVING_DIR_FOR_IMPUTATION}/MRNN", - optimizer=optimizer, - ) - - @pytest.mark.xdist_group(name="imputation-mrnn") - def test_0_fit(self): - self.mrnn.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-mrnn") - def test_1_impute(self): - imputed_X = self.mrnn.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"MRNN test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-mrnn") - def test_2_parameters(self): - assert hasattr(self.mrnn, "model") and self.mrnn.model is not None - - assert hasattr(self.mrnn, "optimizer") and self.mrnn.optimizer is not None - - assert hasattr(self.mrnn, "best_loss") - self.assertNotEqual(self.mrnn.best_loss, float("inf")) - - assert ( - hasattr(self.mrnn, "best_model_dict") - and self.mrnn.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-mrnn") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.mrnn) - - # save the trained model into file, and check if the path exists - self.mrnn.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.mrnn.load_model(saved_model_path) - - -class TestLOCF(unittest.TestCase): - logger.info("Running tests for an imputation model LOCF...") - locf = LOCF(nan=0) - - @pytest.mark.xdist_group(name="imputation-locf") - def test_0_impute(self): - test_X_imputed = self.locf.impute(TEST_SET) - assert not np.isnan( - test_X_imputed - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - test_X_imputed, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"LOCF test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-locf") - def test_1_parameters(self): - assert hasattr(self.locf, "nan") and self.locf.nan is not None - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_optim.py b/tests/test_optim.py deleted file mode 100644 index 9be096fb..00000000 --- a/tests/test_optim.py +++ /dev/null @@ -1,244 +0,0 @@ -""" -Test cases for optimizers. 
-""" - -# Created by Wenjie Du -# License: GLP-v3 - -import unittest - -import h5py -import numpy as np -import pytest - -from pypots.imputation import SAITS -from pypots.optim import Adam, AdamW, Adagrad, Adadelta, SGD, RMSprop -from pypots.utils.logging import logger -from pypots.utils.metrics import cal_mae -from tests.global_test_config import DATA - -TRAIN_SET = {"X": DATA["train_X"]} -VAL_SET = { - "X": DATA["val_X"], - "X_intact": DATA["val_X_intact"], - "indicating_mask": DATA["val_X_indicating_mask"], -} -TEST_SET = {"X": DATA["test_X"]} - - -EPOCHS = 3 - - -def save_data_set_into_h5(data, path): - with h5py.File(path, "w") as hf: - for i in data.keys(): - tp = int if i == "y" else "float32" - hf.create_dataset(i, data=data[i].astype(tp)) - - -class TestAdam(unittest.TestCase): - logger.info("Running tests for Adam...") - - # initialize an Adam optimizer - adam = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model for testing DatasetForMIT and BaseDataset - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=1, - d_model=128, - d_inner=64, - n_heads=2, - d_k=64, - d_v=64, - dropout=0.1, - optimizer=adam, - epochs=EPOCHS, - ) - - @pytest.mark.xdist_group(name="optim-adam") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestAdamW(unittest.TestCase): - logger.info("Running tests for AdamW...") - - # initialize an AdamW optimizer - adamw = AdamW(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model for testing DatasetForMIT and BaseDataset - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=1, - d_model=128, - d_inner=64, - n_heads=2, - d_k=64, - d_v=64, - dropout=0.1, - optimizer=adamw, - epochs=EPOCHS, - ) - - @pytest.mark.xdist_group(name="optim-adamw") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestAdagrad(unittest.TestCase): - logger.info("Running tests for Adagrad...") - - # initialize an Adagrad optimizer - adagrad = Adagrad(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model for testing DatasetForMIT and BaseDataset - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=1, - d_model=128, - d_inner=64, - n_heads=2, - d_k=64, - d_v=64, - dropout=0.1, - optimizer=adagrad, - epochs=EPOCHS, - ) - - @pytest.mark.xdist_group(name="optim-adagrad") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestAdadelta(unittest.TestCase): - logger.info("Running tests for Adadelta...") - - # initialize an Adadelta optimizer - adadelta = Adadelta(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model for testing DatasetForMIT and BaseDataset - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=1, - d_model=128, - d_inner=64, - n_heads=2, - d_k=64, - d_v=64, - dropout=0.1, - optimizer=adadelta, - epochs=EPOCHS, - ) - - @pytest.mark.xdist_group(name="optim-adadelta") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestSGD(unittest.TestCase): - logger.info("Running tests for SGD...") - - # initialize a SGD optimizer - sgd = SGD(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model for testing DatasetForMIT and BaseDataset - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=1, - d_model=128, - d_inner=64, - n_heads=2, - d_k=64, - d_v=64, - dropout=0.1, - optimizer=sgd, - epochs=EPOCHS, - ) - - @pytest.mark.xdist_group(name="optim-sgd") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestRMSprop(unittest.TestCase): - logger.info("Running tests for RMSprop...") - - # initialize a RMSprop optimizer - rmsprop = RMSprop(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model for testing DatasetForMIT and BaseDataset - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=1, - d_model=128, - d_inner=64, - n_heads=2, - d_k=64, - d_v=64, - dropout=0.1, - optimizer=rmsprop, - epochs=EPOCHS, - ) - - @pytest.mark.xdist_group(name="optim-rmsprop") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_training_on_multi_gpus.py b/tests/test_training_on_multi_gpus.py deleted file mode 100644 index b076cbfe..00000000 --- a/tests/test_training_on_multi_gpus.py +++ /dev/null @@ -1,783 +0,0 @@ -""" -Test cases for running models on multi cuda devices. 
-""" - -# Created by Wenjie Du -# License: GPL-v3 - - -import os.path -import unittest - -import numpy as np -import pytest -import torch - -from pypots.classification import BRITS, GRUD, Raindrop -from pypots.clustering import VaDER, CRLI -from pypots.forecasting import BTTF -from pypots.imputation import BRITS as ImputationBRITS -from pypots.imputation import ( - SAITS, - Transformer, - MRNN, - LOCF, -) -from pypots.optim import Adam -from pypots.utils.logging import logger -from pypots.utils.metrics import cal_binary_classification_metrics -from pypots.utils.metrics import cal_mae -from pypots.utils.metrics import cal_rand_index, cal_cluster_purity -from tests.global_test_config import ( - DATA, - RESULT_SAVING_DIR, - check_tb_and_model_checkpoints_existence, -) - -EPOCHS = 5 - -cuda_devices = [torch.device(i) for i in range(torch.cuda.device_count())] - -# set DEVICES to None if no cuda device is available, to avoid initialization failed while importing test classes -DEVICES = None if cuda_devices == [] else cuda_devices - -# global skip test if less than two cuda-enabled devices -LESS_THAN_TWO_DEVICES = len(cuda_devices) < 2 -pytestmark = pytest.mark.skipif( - LESS_THAN_TWO_DEVICES, reason="not enough cuda devices to run tests" -) - - -TRAIN_SET = {"X": DATA["train_X"], "y": DATA["train_y"]} - -VAL_SET = { - "X": DATA["val_X"], - "X_intact": DATA["val_X_intact"], - "indicating_mask": DATA["val_X_indicating_mask"], - "y": DATA["val_y"], -} -TEST_SET = {"X": DATA["test_X"]} - -RESULT_SAVING_DIR_FOR_IMPUTATION = os.path.join(RESULT_SAVING_DIR, "imputation") -RESULT_SAVING_DIR_FOR_CLASSIFICATION = os.path.join(RESULT_SAVING_DIR, "classification") -RESULT_SAVING_DIR_FOR_CLUSTERING = os.path.join(RESULT_SAVING_DIR, "clustering") - - -class TestSAITS(unittest.TestCase): - logger.info("Running tests for an imputation model SAITS...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "SAITS") - model_save_name = "saved_saits_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a SAITS model - saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_heads=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCHS, - saving_path=saving_path, - optimizer=optimizer, - num_workers=2, - device=DEVICES, - ) - - @pytest.mark.xdist_group(name="imputation-saits") - def test_0_fit(self): - self.saits.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-saits") - def test_1_impute(self): - imputed_X = self.saits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-saits") - def test_2_parameters(self): - assert hasattr(self.saits, "model") and self.saits.model is not None - - assert hasattr(self.saits, "optimizer") and self.saits.optimizer is not None - - assert hasattr(self.saits, "best_loss") - self.assertNotEqual(self.saits.best_loss, float("inf")) - - assert ( - hasattr(self.saits, "best_model_dict") - and self.saits.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-saits") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.saits) - - # save the trained model into file, and check if the path exists - self.saits.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.saits.load_model(saved_model_path) - - -class TestTransformer(unittest.TestCase): - logger.info("Running tests for an imputation model Transformer...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "Transformer") - model_save_name = "saved_transformer_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a Transformer model - transformer = Transformer( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_heads=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCHS, - saving_path=saving_path, - optimizer=optimizer, - num_workers=2, - device=DEVICES, - ) - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_0_fit(self): - self.transformer.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_1_impute(self): - imputed_X = self.transformer.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"Transformer test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_2_parameters(self): - assert hasattr(self.transformer, "model") and self.transformer.model is not None - - assert ( - hasattr(self.transformer, "optimizer") - and self.transformer.optimizer is not None - ) - - assert hasattr(self.transformer, "best_loss") - self.assertNotEqual(self.transformer.best_loss, float("inf")) - - assert ( - hasattr(self.transformer, "best_model_dict") - and self.transformer.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-transformer") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.transformer) - - # save the trained model into file, and check if the path exists - self.transformer.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.transformer.load_model(saved_model_path) - - -class TestImputationBRITS(unittest.TestCase): - logger.info("Running tests for an imputation model BRITS...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "BRITS") - model_save_name = "saved_BRITS_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a BRITS model - brits = ImputationBRITS( - DATA["n_steps"], - DATA["n_features"], - 256, - epochs=EPOCHS, - saving_path=f"{RESULT_SAVING_DIR_FOR_IMPUTATION}/BRITS", - optimizer=optimizer, - num_workers=2, - device=DEVICES, - ) - - @pytest.mark.xdist_group(name="imputation-brits") - def test_0_fit(self): - self.brits.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-brits") - def test_1_impute(self): - imputed_X = self.brits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"BRITS test_MAE: {test_MAE}") - - @pytest.mark.xdist_group(name="imputation-brits") - def test_2_parameters(self): - assert hasattr(self.brits, "model") and self.brits.model is not None - - assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None - - assert hasattr(self.brits, "best_loss") - self.assertNotEqual(self.brits.best_loss, float("inf")) - - assert ( - hasattr(self.brits, "best_model_dict") - and self.brits.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="imputation-brits") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.brits) - - # save the trained model into file, and check if the path exists - self.brits.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.brits.load_model(saved_model_path) - - -class TestMRNN(unittest.TestCase): - logger.info("Running tests for an imputation model MRNN...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "MRNN") - model_save_name = "saved_MRNN_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a MRNN model - mrnn = MRNN( - DATA["n_steps"], - DATA["n_features"], - 256, - epochs=EPOCHS, - saving_path=f"{RESULT_SAVING_DIR_FOR_IMPUTATION}/MRNN", - optimizer=optimizer, - num_workers=2, - device=DEVICES, - ) - - @pytest.mark.xdist_group(name="imputation-mrnn") - def test_0_fit(self): - self.mrnn.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="imputation-mrnn") - def test_1_impute(self): - imputed_X = self.mrnn.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
-        test_MAE = cal_mae(
-            imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"]
-        )
-        logger.info(f"MRNN test_MAE: {test_MAE}")
-
-    @pytest.mark.xdist_group(name="imputation-mrnn")
-    def test_2_parameters(self):
-        assert hasattr(self.mrnn, "model") and self.mrnn.model is not None
-
-        assert hasattr(self.mrnn, "optimizer") and self.mrnn.optimizer is not None
-
-        assert hasattr(self.mrnn, "best_loss")
-        self.assertNotEqual(self.mrnn.best_loss, float("inf"))
-
-        assert (
-            hasattr(self.mrnn, "best_model_dict")
-            and self.mrnn.best_model_dict is not None
-        )
-
-    @pytest.mark.xdist_group(name="imputation-mrnn")
-    def test_3_saving_path(self):
-        # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
-
-        # check if the tensorboard file and model checkpoints exist
-        check_tb_and_model_checkpoints_existence(self.mrnn)
-
-        # save the trained model into file, and check if the path exists
-        self.mrnn.save_model(
-            saving_dir=self.saving_path, file_name=self.model_save_name
-        )
-
-        # test loading the saved model, not necessary, but need to test
-        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
-        self.mrnn.load_model(saved_model_path)
-
-
-class TestLOCF(unittest.TestCase):
-    logger.info("Running tests for an imputation model LOCF...")
-    locf = LOCF(nan=0)
-
-    @pytest.mark.xdist_group(name="imputation-locf")
-    def test_0_impute(self):
-        test_X_imputed = self.locf.impute(TEST_SET)
-        assert not np.isnan(
-            test_X_imputed
-        ).any(), "Output still has missing values after running impute()."
-        test_MAE = cal_mae(
-            test_X_imputed, DATA["test_X_intact"], DATA["test_X_indicating_mask"]
-        )
-        logger.info(f"LOCF test_MAE: {test_MAE}")
-
-    @pytest.mark.xdist_group(name="imputation-locf")
-    def test_1_parameters(self):
-        assert hasattr(self.locf, "nan") and self.locf.nan is not None
-
-
-class TestClassificationBRITS(unittest.TestCase):
-    logger.info("Running tests for a classification model BRITS...")
-
-    # set the log and model saving path
-    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "BRITS")
-    model_save_name = "saved_BRITS_model.pypots"
-
-    # initialize an Adam optimizer
-    optimizer = Adam(lr=0.001, weight_decay=1e-5)
-
-    # initialize a BRITS model
-    brits = BRITS(
-        DATA["n_steps"],
-        DATA["n_features"],
-        n_classes=DATA["n_classes"],
-        rnn_hidden_size=256,
-        epochs=EPOCHS,
-        saving_path=saving_path,
-        model_saving_strategy="better",
-        optimizer=optimizer,
-        num_workers=2,
-        device=DEVICES,
-    )
-
-    @pytest.mark.xdist_group(name="classification-brits")
-    def test_0_fit(self):
-        self.brits.fit(TRAIN_SET, VAL_SET)
-
-    @pytest.mark.xdist_group(name="classification-brits")
-    def test_1_classify(self):
-        predictions = self.brits.classify(TEST_SET)
-        metrics = cal_binary_classification_metrics(predictions, DATA["test_y"])
-        logger.info(
-            f'ROC_AUC: {metrics["roc_auc"]}, \n'
-            f'PR_AUC: {metrics["pr_auc"]},\n'
-            f'F1: {metrics["f1"]},\n'
-            f'Precision: {metrics["precision"]},\n'
-            f'Recall: {metrics["recall"]},\n'
-        )
-        assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5"
-
-    @pytest.mark.xdist_group(name="classification-brits")
-    def test_2_parameters(self):
-        assert hasattr(self.brits, "model") and self.brits.model is not None
-
-        assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None
-
-        assert hasattr(self.brits, "best_loss")
-        self.assertNotEqual(self.brits.best_loss, float("inf"))
-
"best_model_dict") - and self.brits.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="classification-brits") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.brits) - - # save the trained model into file, and check if the path exists - self.brits.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.brits.load_model(saved_model_path) - - -class TestGRUD(unittest.TestCase): - logger.info("Running tests for a classification model GRUD...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "GRUD") - model_save_name = "saved_GRUD_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a GRUD model - grud = GRUD( - DATA["n_steps"], - DATA["n_features"], - n_classes=DATA["n_classes"], - rnn_hidden_size=256, - epochs=EPOCHS, - saving_path=saving_path, - optimizer=optimizer, - num_workers=2, - device=DEVICES, - ) - - @pytest.mark.xdist_group(name="classification-grud") - def test_0_fit(self): - self.grud.fit(TRAIN_SET, VAL_SET) - - @pytest.mark.xdist_group(name="classification-grud") - def test_1_classify(self): - predictions = self.grud.classify(TEST_SET) - metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) - logger.info( - f'ROC_AUC: {metrics["roc_auc"]}, \n' - f'PR_AUC: {metrics["pr_auc"]},\n' - f'F1: {metrics["f1"]},\n' - f'Precision: {metrics["precision"]},\n' - f'Recall: {metrics["recall"]},\n' - ) - assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - - @pytest.mark.xdist_group(name="classification-grud") - def test_2_parameters(self): - assert hasattr(self.grud, "model") and self.grud.model is not None - - assert hasattr(self.grud, "optimizer") and self.grud.optimizer is not None - - assert hasattr(self.grud, "best_loss") - self.assertNotEqual(self.grud.best_loss, float("inf")) - - assert ( - hasattr(self.grud, "best_model_dict") - and self.grud.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="classification-grud") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.grud) - - # save the trained model into file, and check if the path exists - self.grud.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.grud.load_model(saved_model_path) - - -class TestRaindrop(unittest.TestCase): - logger.info("Running tests for a classification model Raindrop...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLASSIFICATION, "Raindrop") - model_save_name = "saved_Raindrop_model.pypots" - - # initialize a Raindrop model - raindrop = Raindrop( - DATA["n_steps"], - DATA["n_features"], - DATA["n_classes"], - n_layers=2, 
-        d_model=DATA["n_features"] * 4,
-        d_inner=256,
-        n_heads=2,
-        dropout=0.3,
-        d_static=0,
-        aggregation="mean",
-        sensor_wise_mask=False,
-        static=False,
-        epochs=EPOCHS,
-        saving_path=saving_path,
-    )
-
-    @pytest.mark.xdist_group(name="classification-raindrop")
-    def test_0_fit(self):
-        self.raindrop.fit(TRAIN_SET, VAL_SET)
-
-    @pytest.mark.xdist_group(name="classification-raindrop")
-    def test_1_classify(self):
-        predictions = self.raindrop.classify(TEST_SET)
-        metrics = cal_binary_classification_metrics(predictions, DATA["test_y"])
-        logger.info(
-            f'ROC_AUC: {metrics["roc_auc"]}, \n'
-            f'PR_AUC: {metrics["pr_auc"]},\n'
-            f'F1: {metrics["f1"]},\n'
-            f'Precision: {metrics["precision"]},\n'
-            f'Recall: {metrics["recall"]},\n'
-        )
-        assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5"
-
-    @pytest.mark.xdist_group(name="classification-raindrop")
-    def test_2_parameters(self):
-        assert hasattr(self.raindrop, "model") and self.raindrop.model is not None
-
-        assert (
-            hasattr(self.raindrop, "optimizer") and self.raindrop.optimizer is not None
-        )
-
-        assert hasattr(self.raindrop, "best_loss")
-        self.assertNotEqual(self.raindrop.best_loss, float("inf"))
-
-        assert (
-            hasattr(self.raindrop, "best_model_dict")
-            and self.raindrop.best_model_dict is not None
-        )
-
-    @pytest.mark.xdist_group(name="classification-raindrop")
-    def test_3_saving_path(self):
-        # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
-
-        # check if the tensorboard file and model checkpoints exist
-        check_tb_and_model_checkpoints_existence(self.raindrop)
-
-        # save the trained model into file, and check if the path exists
-        self.raindrop.save_model(
-            saving_dir=self.saving_path, file_name=self.model_save_name
-        )
-
-        # test loading the saved model, not necessary, but need to test
-        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
-        self.raindrop.load_model(saved_model_path)
-
-
-class TestCRLI(unittest.TestCase):
-    logger.info("Running tests for a clustering model CRLI...")
-
-    # set the log and model saving path
-    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLUSTERING, "CRLI")
-    model_save_name = "saved_CRLI_model.pypots"
-
-    # initialize an Adam optimizer
-    G_optimizer = Adam(lr=0.001, weight_decay=1e-5)
-    D_optimizer = Adam(lr=0.001, weight_decay=1e-5)
-
-    # initialize a CRLI model
-    crli = CRLI(
-        n_steps=DATA["n_steps"],
-        n_features=DATA["n_features"],
-        n_clusters=DATA["n_classes"],
-        n_generator_layers=2,
-        rnn_hidden_size=128,
-        epochs=EPOCHS,
-        saving_path=saving_path,
-        G_optimizer=G_optimizer,
-        D_optimizer=D_optimizer,
-    )
-
-    @pytest.mark.xdist_group(name="clustering-crli")
-    def test_0_fit(self):
-        self.crli.fit(TRAIN_SET)
-
-    @pytest.mark.xdist_group(name="clustering-crli")
-    def test_1_parameters(self):
-        assert hasattr(self.crli, "model") and self.crli.model is not None
-
-        assert hasattr(self.crli, "G_optimizer") and self.crli.G_optimizer is not None
-        assert hasattr(self.crli, "D_optimizer") and self.crli.D_optimizer is not None
-
-        assert hasattr(self.crli, "best_loss")
-        self.assertNotEqual(self.crli.best_loss, float("inf"))
-
-        assert (
-            hasattr(self.crli, "best_model_dict")
-            and self.crli.best_model_dict is not None
-        )
-
-    @pytest.mark.xdist_group(name="clustering-crli")
-    def test_2_cluster(self):
-        clustering = self.crli.cluster(TEST_SET)
-        RI = cal_rand_index(clustering, DATA["test_y"])
-        CP = cal_cluster_purity(clustering, DATA["test_y"])
-        logger.info(f"RI: {RI}\nCP: {CP}")
-
-    @pytest.mark.xdist_group(name="clustering-crli")
-    def test_3_saving_path(self):
-        # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
-
-        # check if the tensorboard file and model checkpoints exist
-        check_tb_and_model_checkpoints_existence(self.crli)
-
-        # save the trained model into file, and check if the path exists
-        self.crli.save_model(
-            saving_dir=self.saving_path, file_name=self.model_save_name
-        )
-
-        # test loading the saved model, not necessary, but need to test
-        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
-        self.crli.load_model(saved_model_path)
-
-
-class TestVaDER(unittest.TestCase):
-    logger.info("Running tests for a clustering model Transformer...")
-
-    # set the log and model saving path
-    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLUSTERING, "VaDER")
-    model_save_name = "saved_VaDER_model.pypots"
-
-    # initialize an Adam optimizer
-    optimizer = Adam(lr=0.001, weight_decay=1e-5)
-
-    # initialize a VaDER model
-    vader = VaDER(
-        n_steps=DATA["n_steps"],
-        n_features=DATA["n_features"],
-        n_clusters=DATA["n_classes"],
-        rnn_hidden_size=64,
-        d_mu_stddev=5,
-        pretrain_epochs=20,
-        epochs=EPOCHS,
-        saving_path=saving_path,
-        optimizer=optimizer,
-        num_workers=2,
-        device=DEVICES,
-    )
-
-    @pytest.mark.xdist_group(name="clustering-vader")
-    def test_0_fit(self):
-        self.vader.fit(TRAIN_SET)
-
-    @pytest.mark.xdist_group(name="clustering-vader")
-    def test_1_cluster(self):
-        try:
-            clustering = self.vader.cluster(TEST_SET)
-            RI = cal_rand_index(clustering, DATA["test_y"])
-            CP = cal_cluster_purity(clustering, DATA["test_y"])
-            logger.info(f"RI: {RI}\nCP: {CP}")
-        except np.linalg.LinAlgError as e:
-            logger.error(
-                f"{e}\n"
-                "Got singular matrix, please try to retrain the model to fix this"
-            )
-
-    @pytest.mark.xdist_group(name="clustering-vader")
-    def test_2_parameters(self):
-        assert hasattr(self.vader, "model") and self.vader.model is not None
-
-        assert hasattr(self.vader, "optimizer") and self.vader.optimizer is not None
-
-        assert hasattr(self.vader, "best_loss")
-        self.assertNotEqual(self.vader.best_loss, float("inf"))
-
-        assert (
-            hasattr(self.vader, "best_model_dict")
-            and self.vader.best_model_dict is not None
-        )
-
-    @pytest.mark.xdist_group(name="clustering-vader")
-    def test_3_saving_path(self):
-        # whether the root saving dir exists, which should be created by save_log_into_tb_file
-        assert os.path.exists(
-            self.saving_path
-        ), f"file {self.saving_path} does not exist"
-
-        # check if the tensorboard file and model checkpoints exist
-        check_tb_and_model_checkpoints_existence(self.vader)
-
-        # save the trained model into file, and check if the path exists
-        self.vader.save_model(
-            saving_dir=self.saving_path, file_name=self.model_save_name
-        )
-
-        # test loading the saved model, not necessary, but need to test
-        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
-        self.vader.load_model(saved_model_path)
-
-
-class TestBTTF(unittest.TestCase):
-    logger.info("Running tests for a forecasting model BTTF...")
-
-    # initialize a BTTF model
-    pred_step = 4
-    bttf = BTTF(
-        n_steps=DATA["n_steps"] - pred_step,
-        n_features=10,
-        pred_step=pred_step,
-        rank=10,
-        time_lags=[1, 2, 3, 5, 5 + 1, 5 + 2, 10, 10 + 1, 10 + 2],
-        burn_iter=5,
-        gibbs_iter=5,
-        multi_step=1,
-    )
-
-    @pytest.mark.xdist_group(name="forecasting-bttf")
-    def test_0_forecasting(self):
-        predictions = self.bttf.forecast({"X": DATA["test_X"][:, : -self.pred_step]})
-        logger.info(f"prediction shape: {predictions.shape}")
-        mae = cal_mae(predictions, DATA["test_X_intact"][:, -self.pred_step :])
-        logger.info(f"prediction MAE: {mae}")
-
-
-if __name__ == "__main__":
-    unittest.main()
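For reference, the TestBTTF case above holds out the last pred_step time steps of each test series: the model is built with n_steps reduced by pred_step, forecast() receives only the truncated input, and cal_mae scores the forecast against the held-out tail of test_X_intact. Below is a minimal NumPy sketch of that slicing convention; only pred_step=4 is taken from the test, the other shapes are hypothetical.

# Sketch of the hold-out slicing used by TestBTTF.test_0_forecasting above.
# Only pred_step=4 comes from the test; the other shapes are made up.
import numpy as np

pred_step = 4
n_samples, n_steps, n_features = 8, 24, 10  # hypothetical shapes

test_X = np.random.rand(n_samples, n_steps, n_features)

history = test_X[:, :-pred_step]  # what forecast() sees: n_steps - pred_step steps
target = test_X[:, -pred_step:]   # held-out ground truth the MAE is computed on

assert history.shape == (n_samples, n_steps - pred_step, n_features)
assert target.shape == (n_samples, pred_step, n_features)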
logger.info(f"RI: {RI}\nCP: {CP}") - - @pytest.mark.xdist_group(name="clustering-crli") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.crli) - - # save the trained model into file, and check if the path exists - self.crli.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.crli.load_model(saved_model_path) - - -class TestVaDER(unittest.TestCase): - logger.info("Running tests for a clustering model Transformer...") - - # set the log and model saving path - saving_path = os.path.join(RESULT_SAVING_DIR_FOR_CLUSTERING, "VaDER") - model_save_name = "saved_VaDER_model.pypots" - - # initialize an Adam optimizer - optimizer = Adam(lr=0.001, weight_decay=1e-5) - - # initialize a VaDER model - vader = VaDER( - n_steps=DATA["n_steps"], - n_features=DATA["n_features"], - n_clusters=DATA["n_classes"], - rnn_hidden_size=64, - d_mu_stddev=5, - pretrain_epochs=20, - epochs=EPOCHS, - saving_path=saving_path, - optimizer=optimizer, - num_workers=2, - device=DEVICES, - ) - - @pytest.mark.xdist_group(name="clustering-vader") - def test_0_fit(self): - self.vader.fit(TRAIN_SET) - - @pytest.mark.xdist_group(name="clustering-vader") - def test_1_cluster(self): - try: - clustering = self.vader.cluster(TEST_SET) - RI = cal_rand_index(clustering, DATA["test_y"]) - CP = cal_cluster_purity(clustering, DATA["test_y"]) - logger.info(f"RI: {RI}\nCP: {CP}") - except np.linalg.LinAlgError as e: - logger.error( - f"{e}\n" - "Got singular matrix, please try to retrain the model to fix this" - ) - - @pytest.mark.xdist_group(name="clustering-vader") - def test_2_parameters(self): - assert hasattr(self.vader, "model") and self.vader.model is not None - - assert hasattr(self.vader, "optimizer") and self.vader.optimizer is not None - - assert hasattr(self.vader, "best_loss") - self.assertNotEqual(self.vader.best_loss, float("inf")) - - assert ( - hasattr(self.vader, "best_model_dict") - and self.vader.best_model_dict is not None - ) - - @pytest.mark.xdist_group(name="clustering-vader") - def test_3_saving_path(self): - # whether the root saving dir exists, which should be created by save_log_into_tb_file - assert os.path.exists( - self.saving_path - ), f"file {self.saving_path} does not exist" - - # check if the tensorboard file and model checkpoints exist - check_tb_and_model_checkpoints_existence(self.vader) - - # save the trained model into file, and check if the path exists - self.vader.save_model( - saving_dir=self.saving_path, file_name=self.model_save_name - ) - - # test loading the saved model, not necessary, but need to test - saved_model_path = os.path.join(self.saving_path, self.model_save_name) - self.vader.load_model(saved_model_path) - - -class TestBTTF(unittest.TestCase): - logger.info("Running tests for a forecasting model BTTF...") - - # initialize a BTTF model - pred_step = 4 - bttf = BTTF( - n_steps=DATA["n_steps"] - pred_step, - n_features=10, - pred_step=pred_step, - rank=10, - time_lags=[1, 2, 3, 5, 5 + 1, 5 + 2, 10, 10 + 1, 10 + 2], - burn_iter=5, - gibbs_iter=5, - multi_step=1, - ) - - @pytest.mark.xdist_group(name="forecasting-bttf") - def 
diff --git a/tests/utils/random.py b/tests/utils/random.py
new file mode 100644
index 00000000..0d1a0ca0
--- /dev/null
+++ b/tests/utils/random.py
@@ -0,0 +1,36 @@
+"""
+Test cases for the functions and classes in package `pypots.utils.random`.
+"""
+
+# Created by Wenjie Du
+# License: GPL-v3
+
+import unittest
+
+import torch
+
+from pypots.utils.random import set_random_seed
+
+
+class TestRandom(unittest.TestCase):
+    def test_set_random_seed(self):
+        random_state1 = torch.get_rng_state()
+        torch.rand(
+            1, 3
+        )  # randomly generate something so the RNG state advances and the two captured states differ
+        random_state2 = torch.get_rng_state()
+        assert not torch.equal(
+            random_state1, random_state2
+        ), "The random seed hasn't been set, so the two random states should differ."
+
+        set_random_seed(26)
+        random_state1 = torch.get_rng_state()
+        set_random_seed(26)
+        random_state2 = torch.get_rng_state()
+        assert torch.equal(
+            random_state1, random_state2
+        ), "The random seed has been set, so the two random states should be identical."
+
+
+if __name__ == "__main__":
+    unittest.main()
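The new tests/utils/random.py verifies reproducibility by comparing the raw torch RNG state tensors captured before and after reseeding. The sketch below shows what a helper like set_random_seed typically does; the actual pypots implementation is not shown in this diff and may seed further generators (e.g. CUDA).

# Hedged sketch of a set_random_seed-style helper; pypots' real implementation
# may differ in which generators it seeds.
import random

import numpy as np
import torch


def set_random_seed_sketch(seed: int = 26) -> None:
    """Seed the stdlib, NumPy, and PyTorch RNGs so results are repeatable."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


# Reseeding twice with the same value yields identical torch RNG states,
# which is exactly what TestRandom.test_set_random_seed asserts.
set_random_seed_sketch(26)
state1 = torch.get_rng_state()
set_random_seed_sketch(26)
state2 = torch.get_rng_state()
assert torch.equal(state1, state2)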