From 8bcee52b3e1b0938b7a5e14cd84217b5ee0b43bb Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 19 Mar 2024 15:32:11 +0800 Subject: [PATCH 1/3] feat: add naive imputation methods, Mean and Median; --- pypots/base.py | 2 + pypots/imputation/__init__.py | 13 ++- pypots/imputation/mean/__init__.py | 12 +++ pypots/imputation/mean/model.py | 143 ++++++++++++++++++++++++++ pypots/imputation/median/__init__.py | 12 +++ pypots/imputation/median/model.py | 144 +++++++++++++++++++++++++++ tests/imputation/mean.py | 74 ++++++++++++++ tests/imputation/median.py | 74 ++++++++++++++ 8 files changed, 472 insertions(+), 2 deletions(-) create mode 100644 pypots/imputation/mean/__init__.py create mode 100644 pypots/imputation/mean/model.py create mode 100644 pypots/imputation/median/__init__.py create mode 100644 pypots/imputation/median/model.py create mode 100644 tests/imputation/mean.py create mode 100644 tests/imputation/median.py diff --git a/pypots/base.py b/pypots/base.py index 7f7aa188..19eec050 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -146,6 +146,8 @@ def _setup_device(self, device: Union[None, str, torch.device, list]) -> None: def _setup_path(self, saving_path) -> None: MODEL_NO_NEED_TO_SAVE = [ "LOCF", + "Median", + "Mean", ] # if the model is no need to save (e.g. 
LOCF), then skip the following steps if self.__class__.__name__ in MODEL_NO_NEED_TO_SAVE: diff --git a/pypots/imputation/__init__.py b/pypots/imputation/__init__.py index f065f0f9..4e7605f6 100644 --- a/pypots/imputation/__init__.py +++ b/pypots/imputation/__init__.py @@ -5,24 +5,33 @@ # Created by Wenjie Du # License: BSD-3-Clause +# neural network imputation methods from .brits import BRITS from .csdi import CSDI from .gpvae import GPVAE -from .locf import LOCF from .mrnn import MRNN from .saits import SAITS from .timesnet import TimesNet from .transformer import Transformer from .usgan import USGAN +# naive imputation methods +from .locf import LOCF +from .mean import Mean +from .median import Median + __all__ = [ + # neural network imputation methods "SAITS", "Transformer", "TimesNet", "BRITS", "MRNN", - "LOCF", "GPVAE", "USGAN", "CSDI", + # naive imputation methods + "LOCF", + "Mean", + "Median", ] diff --git a/pypots/imputation/mean/__init__.py b/pypots/imputation/mean/__init__.py new file mode 100644 index 00000000..9a8579bb --- /dev/null +++ b/pypots/imputation/mean/__init__.py @@ -0,0 +1,12 @@ +""" +The package of the partially-observed time-series imputation method Mean. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from .model import Mean + +__all__ = [ + "Mean", +] diff --git a/pypots/imputation/mean/model.py b/pypots/imputation/mean/model.py new file mode 100644 index 00000000..2594df88 --- /dev/null +++ b/pypots/imputation/mean/model.py @@ -0,0 +1,143 @@ +""" +The implementation of Mean value imputation. 
+ +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +import warnings +from typing import Union, Optional + +import h5py +import numpy as np +import torch + +from ..base import BaseImputer +from ...utils.logging import logger + + +class Mean(BaseImputer): + """Mean value imputation method.""" + + def __init__( + self, + ): + super().__init__() + + def fit( + self, + train_set: Union[dict, str], + val_set: Optional[Union[dict, str]] = None, + file_type: str = "h5py", + ) -> None: + """Train the imputer on the given data. + + Warnings + -------- + Mean imputation class does not need to run fit(). + Please run func ``predict()`` directly. + + """ + warnings.warn( + "Mean imputation class has no parameter to train. " + "Please run func `predict()` directly." + ) + + def predict( + self, + test_set: Union[dict, str], + file_type: str = "h5py", + ) -> dict: + """Make predictions for the input data with the trained model. + + Parameters + ---------- + test_set : dict or str + The dataset for testing, should be a dictionary including the key 'X', + or a path string locating a data file supported by PyPOTS (e.g. h5 file). + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data to be imputed and can contain missing values. Only 'X' is read here, + so any other keys in the dict (e.g. labels 'y') are ignored. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str + The type of the given file if test_set is a path string. + + Returns + ------- + result_dict: dict + Prediction results in a Python dictionary for the given samples. + For this imputation method it contains only the key 'imputation', whose value is the imputed data + in the same shape as the input X. 
+ """ + if isinstance(test_set, str): + with h5py.File(test_set, "r") as f: + X = f["X"][:] + else: + X = test_set["X"] + + assert len(X.shape) == 3, ( + f"Input X should have 3 dimensions [n_samples, n_steps, n_features], " + f"but the actual shape of X: {X.shape}" + ) + if isinstance(X, list): + X = np.asarray(X) + + n_samples, n_steps, n_features = X.shape + + if isinstance(X, np.ndarray): + X_imputed_reshaped = np.copy(X).reshape(-1, n_features) + mean_values = np.nanmean(X_imputed_reshaped, axis=0) + for i, v in enumerate(mean_values): + X_imputed_reshaped[:, i] = np.nan_to_num( + X_imputed_reshaped[:, i], nan=v + ) + imputed_data = X_imputed_reshaped.reshape(n_samples, n_steps, n_features) + elif isinstance(X, torch.Tensor): + X_imputed_reshaped = torch.clone(X).reshape(-1, n_features) + mean_values = torch.nanmean(X_imputed_reshaped, dim=0).numpy() + for i, v in enumerate(mean_values): + X_imputed_reshaped[:, i] = torch.nan_to_num( + X_imputed_reshaped[:, i], nan=v + ) + imputed_data = X_imputed_reshaped.reshape(n_samples, n_steps, n_features) + else: + raise ValueError() + + result_dict = { + "imputation": imputed_data, + } + return result_dict + + def impute( + self, + X: Union[dict, str], + file_type="h5py", + ) -> np.ndarray: + """Impute missing values in the given data with the trained model. + + Warnings + -------- + The method impute is deprecated. Please use `predict()` instead. + + Parameters + ---------- + X : + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ + logger.warning( + "🚨DeprecationWarning: The method impute is deprecated. Please use `predict` instead." 
+ ) + results_dict = self.predict(X, file_type=file_type) + return results_dict["imputation"] diff --git a/pypots/imputation/median/__init__.py b/pypots/imputation/median/__init__.py new file mode 100644 index 00000000..8d536e17 --- /dev/null +++ b/pypots/imputation/median/__init__.py @@ -0,0 +1,12 @@ +""" +The package of the partially-observed time-series imputation method Median. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +from .model import Median + +__all__ = [ + "Median", +] diff --git a/pypots/imputation/median/model.py b/pypots/imputation/median/model.py new file mode 100644 index 00000000..fc22cd41 --- /dev/null +++ b/pypots/imputation/median/model.py @@ -0,0 +1,144 @@ +""" +The implementation of Median value imputation. + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + +import warnings +from typing import Union, Optional + +import h5py +import numpy as np +import torch + +from ..base import BaseImputer +from ...utils.logging import logger + + +class Median(BaseImputer): + """Median value imputation method.""" + + def __init__( + self, + ): + super().__init__() + + def fit( + self, + train_set: Union[dict, str], + val_set: Optional[Union[dict, str]] = None, + file_type: str = "h5py", + ) -> None: + """Train the imputer on the given data. + + Warnings + -------- + Median imputation class does not need to run fit(). + Please run func ``predict()`` directly. + + """ + warnings.warn( + "Median imputation class has no parameter to train. " + "Please run func `predict()` directly." + ) + + def predict( + self, + test_set: Union[dict, str], + file_type: str = "h5py", + ) -> dict: + """Make predictions for the input data with the trained model. + + Parameters + ---------- + test_set : dict or str + The dataset for model validating, should be a dictionary including keys as 'X', + or a path string locating a data file supported by PyPOTS (e.g. h5 file). 
+ If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str + The type of the given file if test_set is a path string. + + Returns + ------- + result_dict: dict + Prediction results in a Python Dictionary for the given samples. + It should be a dictionary including keys as 'imputation', 'classification', 'clustering', and 'forecasting'. + For sure, only the keys that relevant tasks are supported by the model will be returned. + """ + if isinstance(test_set, str): + with h5py.File(test_set, "r") as f: + X = f["X"][:] + else: + X = test_set["X"] + + assert len(X.shape) == 3, ( + f"Input X should have 3 dimensions [n_samples, n_steps, n_features], " + f"but the actual shape of X: {X.shape}" + ) + if isinstance(X, list): + X = np.asarray(X) + + n_samples, n_steps, n_features = X.shape + + if isinstance(X, np.ndarray): + X_imputed_reshaped = np.copy(X).reshape(-1, n_features) + median_values = np.nanmedian(X_imputed_reshaped, axis=0) + for i, v in enumerate(median_values): + X_imputed_reshaped[:, i] = np.nan_to_num( + X_imputed_reshaped[:, i], nan=v + ) + imputed_data = X_imputed_reshaped.reshape(n_samples, n_steps, n_features) + elif isinstance(X, torch.Tensor): + X_imputed_reshaped = torch.clone(X).reshape(-1, n_features) + median_values = torch.nanmedian(X_imputed_reshaped, dim=0).numpy() + for i, v in enumerate(median_values): + X_imputed_reshaped[:, i] = torch.nan_to_num( + X_imputed_reshaped[:, i], nan=v + ) + imputed_data = X_imputed_reshaped.reshape(n_samples, n_steps, n_features) + + else: + raise ValueError() + + result_dict = { + "imputation": imputed_data, + } + return 
result_dict + + def impute( + self, + X: Union[dict, str], + file_type="h5py", + ) -> np.ndarray: + """Impute missing values in the given data with the trained model. + + Warnings + -------- + The method impute is deprecated. Please use `predict()` instead. + + Parameters + ---------- + X : + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ + logger.warning( + "🚨DeprecationWarning: The method impute is deprecated. Please use `predict` instead." + ) + results_dict = self.predict(X, file_type=file_type) + return results_dict["imputation"] diff --git a/tests/imputation/mean.py b/tests/imputation/mean.py new file mode 100644 index 00000000..31747c71 --- /dev/null +++ b/tests/imputation/mean.py @@ -0,0 +1,74 @@ +""" +Test cases for Mean imputation method. +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +import unittest + +import numpy as np +import pytest +import torch + +from pypots.imputation import Mean +from pypots.utils.logging import logger +from pypots.utils.metrics import calc_mse +from tests.global_test_config import ( + DATA, + TEST_SET, + H5_TRAIN_SET_PATH, + H5_VAL_SET_PATH, + H5_TEST_SET_PATH, +) + + +class TestMean(unittest.TestCase): + logger.info("Running tests for an imputation model Mean...") + mean = Mean() + + @pytest.mark.xdist_group(name="imputation-mean") + def test_0_impute(self): + # if input data is numpy ndarray + test_X_imputed = self.mean.predict(TEST_SET)["imputation"] + assert not np.isnan( + test_X_imputed + ).any(), "Output still has missing values after running impute()." 
+ test_MSE = calc_mse( + test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"] + ) + logger.info(f"Mean test_MSE: {test_MSE}") + + # if input data is torch tensor + X = torch.from_numpy(np.copy(TEST_SET["X"])) + test_X_ori = torch.from_numpy(np.copy(DATA["test_X_ori"])) + test_X_indicating_mask = torch.from_numpy( + np.copy(DATA["test_X_indicating_mask"]) + ) + + test_X_imputed = self.mean.predict({"X": X})["imputation"] + assert not torch.isnan( + test_X_imputed + ).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse(test_X_imputed, test_X_ori, test_X_indicating_mask) + logger.info(f"Mean test_MSE: {test_MSE}") + + @pytest.mark.xdist_group(name="imputation-mean") + def test_4_lazy_loading(self): + self.mean.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) + imputation_results = self.mean.predict(H5_TEST_SET_PATH) + assert not np.isnan( + imputation_results["imputation"] + ).any(), "Output still has missing values after running impute()." + + test_MSE = calc_mse( + imputation_results["imputation"], + DATA["test_X_ori"], + DATA["test_X_indicating_mask"], + ) + logger.info(f"Lazy-loading Mean test_MSE: {test_MSE}") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/imputation/median.py b/tests/imputation/median.py new file mode 100644 index 00000000..c11ab3d3 --- /dev/null +++ b/tests/imputation/median.py @@ -0,0 +1,74 @@ +""" +Test cases for Median imputation method. 
+""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +import unittest + +import numpy as np +import pytest +import torch + +from pypots.imputation import Median +from pypots.utils.logging import logger +from pypots.utils.metrics import calc_mse +from tests.global_test_config import ( + DATA, + TEST_SET, + H5_TRAIN_SET_PATH, + H5_VAL_SET_PATH, + H5_TEST_SET_PATH, +) + + +class TestMedian(unittest.TestCase): + logger.info("Running tests for an imputation model Median...") + median = Median() + + @pytest.mark.xdist_group(name="imputation-median") + def test_0_impute(self): + # if input data is numpy ndarray + test_X_imputed = self.median.predict(TEST_SET)["imputation"] + assert not np.isnan( + test_X_imputed + ).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse( + test_X_imputed, DATA["test_X_ori"], DATA["test_X_indicating_mask"] + ) + logger.info(f"Median test_MSE: {test_MSE}") + + # if input data is torch tensor + X = torch.from_numpy(np.copy(TEST_SET["X"])) + test_X_ori = torch.from_numpy(np.copy(DATA["test_X_ori"])) + test_X_indicating_mask = torch.from_numpy( + np.copy(DATA["test_X_indicating_mask"]) + ) + + test_X_imputed = self.median.predict({"X": X})["imputation"] + assert not torch.isnan( + test_X_imputed + ).any(), "Output still has missing values after running impute()." + test_MSE = calc_mse(test_X_imputed, test_X_ori, test_X_indicating_mask) + logger.info(f"Median test_MSE: {test_MSE}") + + @pytest.mark.xdist_group(name="imputation-median") + def test_4_lazy_loading(self): + self.median.fit(H5_TRAIN_SET_PATH, H5_VAL_SET_PATH) + imputation_results = self.median.predict(H5_TEST_SET_PATH) + assert not np.isnan( + imputation_results["imputation"] + ).any(), "Output still has missing values after running impute()." 
+ + test_MSE = calc_mse( + imputation_results["imputation"], + DATA["test_X_ori"], + DATA["test_X_indicating_mask"], + ) + logger.info(f"Lazy-loading Median test_MSE: {test_MSE}") + + +if __name__ == "__main__": + unittest.main() From 7777efa5af648b537a52c0fb19e1bacd527d23ff Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 19 Mar 2024 15:55:30 +0800 Subject: [PATCH 2/3] fix: fetch values from torch.nanmedian() return; --- pypots/imputation/median/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypots/imputation/median/model.py b/pypots/imputation/median/model.py index fc22cd41..6d5db169 100644 --- a/pypots/imputation/median/model.py +++ b/pypots/imputation/median/model.py @@ -97,7 +97,7 @@ def predict( imputed_data = X_imputed_reshaped.reshape(n_samples, n_steps, n_features) elif isinstance(X, torch.Tensor): X_imputed_reshaped = torch.clone(X).reshape(-1, n_features) - median_values = torch.nanmedian(X_imputed_reshaped, dim=0).numpy() + median_values = torch.nanmedian(X_imputed_reshaped, dim=0).values.numpy() for i, v in enumerate(median_values): X_imputed_reshaped[:, i] = torch.nan_to_num( X_imputed_reshaped[:, i], nan=v From 13f2caf4d4ba60fc7d79e7ff73e333a07f36db79 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 19 Mar 2024 17:03:50 +0800 Subject: [PATCH 3/3] docs: update the docs; --- README.md | 7 ++++--- docs/examples.rst | 4 ++-- docs/index.rst | 27 +++++++++++++++++---------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d7281f31..179de9d3 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@

- Python version + Python version powered by Pytorch @@ -144,7 +144,7 @@ Alternatively, you can install from the latest source code with the latest featu ## ❖ Usage Besides [BrewPOTS](https://github.com/WenjieDu/BrewPOTS), you can also find a simple and quick-start tutorial notebook -on Google Colab with [this link](https://colab.research.google.com/drive/1HEFjylEy05-r47jRy0H9jiS_WhD0UWmQ?usp=sharing). +on Google Colab Colab tutorials. If you have further questions, please refer to PyPOTS documentation [docs.pypots.com](https://docs.pypots.com). You can also [raise an issue](https://github.com/WenjieDu/PyPOTS/issues) or [ask in our community](#-community). @@ -265,7 +265,8 @@ By committing your code, you'll Take a look at our [inclusion criteria](https://docs.pypots.com/en/latest/faq.html#inclusion-criteria). You can utilize the `template` folder in each task package (e.g. [pypots/imputation/template](https://github.com/WenjieDu/PyPOTS/tree/main/pypots/imputation/template)) to quickly start; -2. be listed as one of [PyPOTS contributors](https://pypots.com/about/#all-contributors); +2. become one of [PyPOTS contributors](https://github.com/WenjieDu/PyPOTS/graphs/contributors) and + be listed as a volunteer developer [on the PyPOTS website](https://pypots.com/about/#volunteer-developers); 3. get mentioned in our [release notes](https://github.com/WenjieDu/PyPOTS/releases); You can also contribute to PyPOTS by simply staring🌟 this repo to help more people notice it. diff --git a/docs/examples.rst b/docs/examples.rst index 69fd6a28..4e4dd421 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -10,11 +10,11 @@ Quick-start Examples We put some examples here to help our users to get started quickly. Please refer to `BrewPOTS `_ for detailed PyPOTS tutorials. -You can also find a simple and quick-start tutorial notebook on Google Colab with -`this link `_. +You can also find a simple and quick-start tutorial notebook on Google Colab .. raw:: html +
diff --git a/docs/index.rst b/docs/index.rst index 93b84fb9..de4f61ab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,7 +12,7 @@ Welcome to PyPOTS docs! **A Python Toolbox for Data Mining on Partially-Observed Time Series** -.. image:: https://img.shields.io/badge/Python->=v3.7-E97040?logo=python&logoColor=white +.. image:: https://img.shields.io/badge/Python-v3.7+-E97040?logo=python&logoColor=white :alt: Python version :target: https://docs.pypots.com/en/latest/install.html#reasons-of-version-limitations-on-dependencies @@ -88,12 +88,12 @@ if it helps with your research. This really means a lot to our open-source resea The rest of this readme file is organized as follows: `❖ PyPOTS Ecosystem <#id1>`_, -`❖ Installation <#id2>`_, -`❖ Usage <#id4>`_, -`❖ Available Algorithms <#id6>`_, -`❖ Citing PyPOTS <#id19>`_, -`❖ Contribution <#id20>`_, -`❖ Community <#id21>`_. +`❖ Installation <#id3>`_, +`❖ Usage <#id5>`_, +`❖ Available Algorithms <#id7>`_, +`❖ Citing PyPOTS <#id22>`_, +`❖ Contribution <#id23>`_, +`❖ Community <#id24>`_. ❖ PyPOTS Ecosystem @@ -136,7 +136,13 @@ Considering the future workload, PyPOTS tutorials is released in a single repo, and you can find them in `BrewPOTS `_. Take a look at it now, and learn how to brew your POTS datasets! -☕️ Welcome to the universe of PyPOTS. Enjoy it and have fun! +**☕️ Welcome to the universe of PyPOTS. Enjoy it and have fun!** + +.. image:: https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png + :width: 95% + :alt: BrewPOTS logo + :align: center + :target: https://pypots.com/ecosystem/ ❖ Installation @@ -149,7 +155,7 @@ Refer to the page `Installation `_ to see different ways of instal ❖ Usage ^^^^^^^^ Besides `BrewPOTS `_, you can also find a simple and quick-start tutorial notebook -on Google Colab with `this link `_. +on Google Colab with `this link `_. You can also `raise an issue `_ or `ask in our community <#id21>`_. 
Additionally, we present you a usage example of imputing missing values in time series with PyPOTS in @@ -227,7 +233,8 @@ By committing your code, you'll Take a look at our `inclusion criteria `_. You can utilize the ``template`` folder in each task package (e.g. `pypots/imputation/template `_) to quickly start; -2. be listed as one of `PyPOTS contributors `_: +2. become one of `PyPOTS contributors `_ and + be listed as a volunteer developer `on the PyPOTS website `_; 3. get mentioned in our `release notes `_; You can also contribute to PyPOTS by simply staring🌟 this repo to help more people notice it.