Merge pull request #216 from WenjieDu/dev

Merge the docs of PyPOTS ecosystem, and replace pycorruptor with pygrinder in pypots
WenjieDu · Oct 13, 2023 · c3fd6ef · c3fd6ef
2 parents 1eac993 + adb6d38
commit c3fd6ef
Show file tree

Hide file tree

Showing 20 changed files with 118 additions and 148 deletions.
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -1,4 +1,5 @@
 # This file is used to help customize the PyPOTS documentation building process on ReadTheDocs.
+# https://docs.readthedocs.io/en/stable/build-customization.html
 
 version: 2
 
@@ -21,7 +22,7 @@ build:
       pre_install:
          - python -m pip install --upgrade pip
          - pip install torch==1.13.1 -f https://download.pytorch.org/whl/cpu
-         - pip install torch-geometric torch-scatter torch-sparse -f "https://data.pyg.org/whl/torch-1.13.1+cpu.html"
+         - pip install torch-geometric==2.3.1 torch-scatter==2.1.1 torch-sparse==0.6.17 -f "https://data.pyg.org/whl/torch-1.13.1+cpu.html"
          - pip install pypots
          - pip install sphinx==6.2.1 docutils==0.19 sphinxcontrib-bibtex==2.1.4 sphinxcontrib-gtagjs sphinx-autodoc-typehints furo==2023.07.26
 

diff --git a/README.md b/README.md
@@ -131,7 +131,8 @@ We present you a usage example of imputing missing values in time series with Py
 ``` python
 import numpy as np
 from sklearn.preprocessing import StandardScaler
-from pypots.data import load_specific_dataset, mcar, masked_fill
+from pygrinder import mcar, masked_fill
+from pypots.data import load_specific_dataset
 from pypots.imputation import SAITS
 from pypots.utils.metrics import cal_mae
 # Data preprocessing. Tedious, but PyPOTS can help.

diff --git a/docs/about_us.rst b/docs/about_us.rst
@@ -29,7 +29,7 @@ PyPOTS exists thanks to all the nice people (sorted by contribution time) who co
 `PyPOTS <https://github.com/WenjieDu/PyPOTS/graphs/contributors>`_,
 `BrewPOTS <https://github.com/WenjieDu/BrewPOTS/graphs/contributors>`_,
 `TSDB <https://github.com/WenjieDu/TSDB/graphs/contributors>`_,
-`PyCorruptor <https://github.com/WenjieDu/PyCorruptor/graphs/contributors>`_):
+`PyGrinder <https://github.com/WenjieDu/PyGrinder/graphs/contributors>`_):
 
 .. raw:: html
 

diff --git a/docs/examples.rst b/docs/examples.rst
@@ -22,7 +22,8 @@ You can also find a simple and quick-start tutorial notebook on Google Colab wit
 
     import numpy as np
     from sklearn.preprocessing import StandardScaler
-    from pypots.data import load_specific_dataset, mcar, masked_fill
+    from pygrinder import mcar, masked_fill
+    from pypots.data import load_specific_dataset
     from pypots.imputation import SAITS
     from pypots.utils.metrics import cal_mae
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -254,6 +254,8 @@ PyPOTS community is open, transparent, and surely friendly. Let's work together
 
    model_api
    pypots
+   tsdb
+   pygrinder
 
 .. toctree::
    :maxdepth: 2

diff --git a/docs/install.rst b/docs/install.rst
@@ -34,7 +34,7 @@ Required Dependencies
 * tensorboard
 * h5py
 * tsdb
-* pycorruptor
+* pygrinder
 
 
 Optional Dependencies

diff --git a/docs/pygrinder.rst b/docs/pygrinder.rst
@@ -0,0 +1,11 @@
+All APIs of PyGrinder
+=======================
+
+PyGrinder
+---------
+
+.. automodule:: pygrinder
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :inherited-members:
diff --git a/docs/tsdb.rst b/docs/tsdb.rst
@@ -0,0 +1,11 @@
+All APIs of TSDB
+=======================
+
+TSDB
+----
+
+.. automodule:: tsdb
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :inherited-members:
diff --git a/environment-dev.yml b/environment-dev.yml
@@ -16,7 +16,7 @@ dependencies:
     #- conda-forge::pandas <2.0.0
     #- conda-forge::h5py
     #- conda-forge::tensorboard
-    #- conda-forge::pycorruptor
+    #- conda-forge::pygrinder
     #- conda-forge::tsdb
     #- pytorch::pytorch >=1.10.0
     ## Below we install the latest pypots because we need pypots-cli in it for development.

diff --git a/pypots/classification/grud/data.py b/pypots/classification/grud/data.py
@@ -11,7 +11,7 @@
 import torch
 
 from ...data.base import BaseDataset
-from ...data.utils import torch_parse_delta
+from ...data.utils import _parse_delta_torch
 from ...imputation.locf import LOCF
 
 
@@ -55,7 +55,7 @@ def __init__(
             self.missing_mask = (~torch.isnan(self.X)).to(torch.float32)
             self.X_filledLOCF = self.locf._locf_torch(self.X)
             self.X = torch.nan_to_num(self.X)
-            self.deltas = torch_parse_delta(self.missing_mask)
+            self.deltas = _parse_delta_torch(self.missing_mask)
             self.empirical_mean = torch.sum(
                 self.missing_mask * self.X, dim=[0, 1]
             ) / torch.sum(self.missing_mask, dim=[0, 1])
@@ -127,7 +127,7 @@ def _fetch_data_from_file(self, idx: int) -> Iterable:
         missing_mask = (~torch.isnan(X)).to(torch.float32)
         X_filledLOCF = self.locf._locf_torch(X.unsqueeze(dim=0)).squeeze()
         X = torch.nan_to_num(X)
-        deltas = torch_parse_delta(missing_mask)
+        deltas = _parse_delta_torch(missing_mask)
         empirical_mean = torch.sum(missing_mask * X, dim=[0]) / torch.sum(
             missing_mask, dim=[0]
         )

diff --git a/pypots/cli/env.py b/pypots/cli/env.py
@@ -16,7 +16,7 @@
     # import scipy
     # import h5py
     # import tsdb
-    # import pycorruptor
+    # import pygrinder
 except ImportError:
     raise ImportError(
         "Torch not installed. Using this tool supposes that you've already installed `pypots` "

diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
@@ -17,13 +17,8 @@
     list_supported_datasets,
     load_specific_dataset,
 )
-from .utils import (
-    masked_fill,
-    mcar,
-    pickle_load,
-    pickle_dump,
-)
 from .saving import save_dict_into_h5
+from .utils import parse_delta, sliding_window
 
 __all__ = [
     # datasets
@@ -38,10 +33,8 @@
     "list_supported_datasets",
     "load_specific_dataset",
     # utils
-    "masked_fill",
-    "mcar",
-    "pickle_load",
-    "pickle_dump",
+    "parse_delta",
+    "sliding_window",
     # saving
     "save_dict_into_h5",
 ]
diff --git a/pypots/data/generating.py b/pypots/data/generating.py
@@ -10,12 +10,12 @@
 
 import numpy as np
 import torch
+from pygrinder import mcar, masked_fill
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_random_state
 
 from .load_specific_datasets import load_specific_dataset
-from .utils import mcar, masked_fill
 
 
 def gene_complete_random_walk(

diff --git a/pypots/data/utils.py b/pypots/data/utils.py
@@ -5,127 +5,33 @@
 # Created by Wenjie Du <[email protected]>
 # License: GPL-v3
 
-
 from typing import Union
-
 import numpy as np
-import pycorruptor as corruptor
 import torch
-from tsdb import (
-    pickle_load as _pickle_load,
-    pickle_dump as _pickle_dump,
-)
 
-pickle_load = _pickle_load
-pickle_dump = _pickle_dump
 
-
-def cal_missing_rate(X: Union[np.ndarray, torch.Tensor, list]) -> float:
-    """Calculate the missing rate of the given data.
+def _parse_delta_torch(missing_mask: torch.Tensor) -> torch.Tensor:
+    """Generate the time-gap matrix (i.e. the delta matrix) from the missing mask.
+    Please refer to :cite:`che2018GRUD` for its math definition.
 
     Parameters
     ----------
-    X :
-        The data to calculate missing rate.
+    missing_mask : shape of [n_steps, n_features] or [n_samples, n_steps, n_features]
+        Binary masks indicate missing data (0 means missing values, 1 means observed values).
 
     Returns
     -------
-    missing_rate :
-        The missing rate of the given data.
-
-    """
-    missing_rate = corruptor.cal_missing_rate(X)
-    return missing_rate
-
-
-def masked_fill(
-    X: Union[np.ndarray, torch.Tensor, list],
-    mask: Union[np.ndarray, torch.Tensor, list],
-    value: float,
-) -> Union[np.ndarray, torch.Tensor]:
-    """Fill the masked values in ``X`` according to ``mask`` with the given ``value``.
+    delta :
+        The delta matrix indicates the time gaps between observed values.
+        With the same shape of missing_mask.
 
-    Parameters
+    References
     ----------
-    X :
-        The data to be filled.
-
-    mask :
-        The mask for filling the given data.
-
-    value :
-        The value to fill the masked values.
-
-    Returns
-    -------
-    filled_X :
-        The filled data.
-
-    """
-    filled_X = corruptor.masked_fill(X, mask, value)
-    return filled_X
-
-
-def mcar(
-    X: Union[np.ndarray, torch.Tensor, list],
-    rate: float,
-    nan: float = 0,
-) -> Union[np.ndarray, torch.Tensor]:
-    """Generate missing values in the given data with MCAR (Missing Completely At Random) mechanism.
-
-    Parameters
-    ----------
-    X :
-        Data vector. If X has any missing values, they should be numpy.nan.
-
-    rate :
-        Artificially missing rate, rate of the observed values which will be artificially masked as missing.
-
-        Note that,
-        `rate` = (number of artificially missing values) / np.sum(~np.isnan(self.data)),
-        not (number of artificially missing values) / np.product(self.data.shape),
-        considering that the given data may already contain missing values,
-        the latter way may be confusing because if the original missing rate >= `rate`,
-        the function will do nothing, i.e. it won't play the role it has to be.
-
-    nan :
-        Value used to fill NaN values.
-
-    Returns
-    -------
-    X_intact : array,
-        Original data with missing values (nan) filled with given parameter `nan`, with observed values intact.
-        X_intact is for loss calculation in the masked imputation task.
-
-    X : array,
-        Original X with artificial missing values. X is for model input.
-        Both originally-missing and artificially-missing values are filled with given parameter `nan`.
-
-    missing_mask : array,
-        The mask indicates all missing values in X.
-        In it, 1 indicates observed values, and 0 indicates missing values.
+    .. [1] `Che, Zhengping, Sanjay Purushotham, Kyunghyun Cho, David Sontag, and Yan Liu.
+        "Recurrent neural networks for multivariate time series with missing values."
+        Scientific reports 8, no. 1 (2018): 6085.
+        <https://www.nature.com/articles/s41598-018-24271-9.pdf>`_
 
-    indicating_mask : array,
-        The mask indicates the artificially-missing values in X, namely missing parts different from X_intact.
-        In it, 1 indicates artificially missing values, and other values are indicated as 0.
-    """
-    X = corruptor.mcar(X, rate, nan)
-    return X
-
-
-def torch_parse_delta(missing_mask: torch.Tensor) -> torch.Tensor:
-    """Generate time-gap (delta) matrix from missing masks.
-    Please refer to :cite:`che2018GRUD` for its math definition.
-
-    Parameters
-    ----------
-    missing_mask :
-        Binary masks indicate missing values. Shape of [n_steps, n_features] or [n_samples, n_steps, n_features]
-
-    Returns
-    -------
-    delta
-        Delta matrix indicates time gaps of missing values.
     """
 
     def cal_delta_for_single_sample(mask: torch.Tensor) -> torch.Tensor:
@@ -156,18 +62,28 @@ def cal_delta_for_single_sample(mask: torch.Tensor) -> torch.Tensor:
     return delta
 
 
-def numpy_parse_delta(missing_mask: np.ndarray) -> np.ndarray:
-    """Generate time-gap (delta) matrix from missing masks. Please refer to :cite:`che2018GRUD` for its math definition.
+def _parse_delta_numpy(missing_mask: np.ndarray) -> np.ndarray:
+    """Generate the time-gap matrix (i.e. the delta matrix) from the missing mask.
+    Please refer to :cite:`che2018GRUD` for its math definition.
 
     Parameters
     ----------
-    missing_mask :
-        Binary masks indicate missing values. Shape of [n_steps, n_features] or [n_samples, n_steps, n_features].
+    missing_mask : shape of [n_steps, n_features] or [n_samples, n_steps, n_features]
+        Binary masks indicate missing data (0 means missing values, 1 means observed values).
 
     Returns
     -------
-    delta
-        Delta matrix indicates time gaps of missing values.
+    delta :
+        The delta matrix indicates the time gaps between observed values.
+        With the same shape of missing_mask.
+
+    References
+    ----------
+    .. [1] `Che, Zhengping, Sanjay Purushotham, Kyunghyun Cho, David Sontag, and Yan Liu.
+        "Recurrent neural networks for multivariate time series with missing values."
+        Scientific reports 8, no. 1 (2018): 6085.
+        <https://www.nature.com/articles/s41598-018-24271-9.pdf>`_
+
     """
 
     def cal_delta_for_single_sample(mask: np.ndarray) -> np.ndarray:
@@ -194,6 +110,40 @@ def cal_delta_for_single_sample(mask: np.ndarray) -> np.ndarray:
     return delta
 
 
+def parse_delta(
+    missing_mask: Union[np.ndarray, torch.Tensor]
+) -> Union[np.ndarray, torch.Tensor]:
+    """Generate the time-gap matrix (i.e. the delta matrix) from the missing mask.
+    Please refer to :cite:`che2018GRUD` for its math definition.
+
+    Parameters
+    ----------
+    missing_mask : shape of [n_steps, n_features] or [n_samples, n_steps, n_features]
+        Binary masks indicate missing data (0 means missing values, 1 means observed values).
+
+    Returns
+    -------
+    delta :
+        The delta matrix indicates the time gaps between observed values.
+        With the same shape of missing_mask.
+
+    References
+    ----------
+    .. [1] `Che, Zhengping, Sanjay Purushotham, Kyunghyun Cho, David Sontag, and Yan Liu.
+        "Recurrent neural networks for multivariate time series with missing values."
+        Scientific reports 8, no. 1 (2018): 6085.
+        <https://www.nature.com/articles/s41598-018-24271-9.pdf>`_
+
+    """
+    if isinstance(missing_mask, np.ndarray):
+        delta = _parse_delta_numpy(missing_mask)
+    elif isinstance(missing_mask, torch.Tensor):
+        delta = _parse_delta_torch(missing_mask)
+    else:
+        raise RuntimeError
+    return delta
+
+
 def sliding_window(time_series, window_len, sliding_len=None):
     """Generate time series samples with sliding window method, truncating windows from time-series data
     with a given sequence length.