Merge pull request #210 from WenjieDu/dev

Adding CSDI, updating the docs
WenjieDu · Oct 11, 2023 · b7b2f25 · b7b2f25
2 parents 069bb11 + a735559
commit b7b2f25
Show file tree

Hide file tree

Showing 18 changed files with 1,475 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -162,6 +162,7 @@ PyPOTS supports imputation, classification, clustering, and forecasting tasks on
 |        **Type**        |  **Abbr.**  |                                                                    **Full name of the algorithm/model/paper**                                                                     | **Year** |
 |       Neural Net       |    SAITS    |                                                               Self-Attention-based Imputation for Time Series [^1]                                                                |   2023   |
 |       Neural Net       | Transformer | Attention is All you Need [^2];<br>Self-Attention-based Imputation for Time Series [^1];<br><sub>Note: proposed in [^2], and re-implemented as an imputation model in [^1].</sub> |   2017   |
+|       Neural Net       |    CSDI     |                                              Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation [^12]                                              |   2021   |
 |       Neural Net       |   US-GAN    |                                                 Generative Semi-supervised Learning for Multivariate Time Series Imputation [^10]                                                 |   2021   |
 |       Neural Net       |   GP-VAE    |                                                              GP-VAE: Deep Probabilistic Time Series Imputation [^11]                                                              |   2020   |
 |       Neural Net       |    BRITS    |                                                              Bidirectional Recurrent Imputation for Time Series [^3]                                                              |   2018   |
@@ -284,7 +285,9 @@ PyPOTS community is open, transparent, and surely friendly. Let's work together
 [^8]: Chen, X., & Sun, L. (2021). [Bayesian Temporal Factorization for Multidimensional Time Series Prediction](https://arxiv.org/abs/1910.06366). *IEEE transactions on pattern analysis and machine intelligence*.
 [^9]: Yoon, J., Zame, W. R., & van der Schaar, M. (2019). [Estimating Missing Data in Temporal Data Streams Using Multi-Directional Recurrent Neural Networks](https://ieeexplore.ieee.org/document/8485748). *IEEE Transactions on Biomedical Engineering*.
 [^10]: Miao, X., Wu, Y., Wang, J., Gao, Y., Mao, X., & Yin, J. (2021). [Generative Semi-supervised Learning for Multivariate Time Series Imputation](https://ojs.aaai.org/index.php/AAAI/article/view/17086). *AAAI 2021*.
-[^11]: Fortuin, V., Baranchuk, D., Raetsch, G. & Mandt, S.. (2020). [GP-VAE: Deep Probabilistic Time Series Imputation](https://proceedings.mlr.press/v108/fortuin20a.html). *AISTATS 2020*.
+[^11]: Fortuin, V., Baranchuk, D., Raetsch, G. & Mandt, S. (2020). [GP-VAE: Deep Probabilistic Time Series Imputation](https://proceedings.mlr.press/v108/fortuin20a.html). *AISTATS 2020*.
+[^12]: Tashiro, Y., Song, J., Song, Y., & Ermon, S. (2021). [CSDI: Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation](https://proceedings.neurips.cc/paper/2021/hash/cfe8504bda37b575c70ee1a8276f3486-Abstract.html). *NeurIPS 2021*.
+
 
 <details>
 <summary>🏠 Visits</summary>

diff --git a/docs/pypots.imputation.rst b/docs/pypots.imputation.rst
@@ -19,6 +19,15 @@ pypots.imputation.transformer
    :show-inheritance:
    :inherited-members:
 
+pypots.imputation.csdi
+------------------------------
+
+.. automodule:: pypots.imputation.csdi
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :inherited-members:
+
 pypots.imputation.usgan
 ------------------------------
 

diff --git a/docs/pypots.modules.rst b/docs/pypots.modules.rst
@@ -0,0 +1,14 @@
+pypots.modules package
+======================
+
+pypots.modules.rnn
+------------------
+
+.. automodule:: pypots.modules.rnn
+   :members:
+
+pypots.modules.self_attention
+-----------------------------
+
+.. automodule:: pypots.modules.self_attention
+   :members:
diff --git a/docs/pypots.rst b/docs/pypots.rst
@@ -11,6 +11,7 @@ Subpackages
    pypots.classification
    pypots.clustering
    pypots.forecasting
+   pypots.modules
    pypots.optim
    pypots.data
    pypots.utils
diff --git a/docs/references.bib b/docs/references.bib
@@ -336,20 +336,6 @@ @article{tang2019JointModeling
 keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}
 }
 
-@article{tashiro2021CSDI,
-title = {{{CSDI}}: {{Conditional Score-based Diffusion Models}} for {{Probabilistic Time Series Imputation}}},
-author = {Tashiro, Yusuke and Song, Jiaming and Song, Yang and Ermon, Stefano},
-year = {2021},
-month = oct,
-journal = {arXiv:2107.03502 [cs, stat]},
-eprint = {2107.03502},
-eprinttype = {arxiv},
-primaryclass = {cs, stat},
-url = {http://arxiv.org/abs/2107.03502},
-archiveprefix = {arXiv},
-keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}
-}
-
 @inproceedings{vaswani2017Transformer,
 author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, \L ukasz and Polosukhin, Illia},
 booktitle = {Advances in Neural Information Processing Systems},
@@ -449,4 +435,13 @@ @article{calinski1974
   pages={1--27},
   year={1974},
   publisher={Taylor \& Francis}
-}
+}
+
+@inproceedings{tashiro2021csdi,
+title={{CSDI}: Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation},
+author={Yusuke Tashiro and Jiaming Song and Yang Song and Stefano Ermon},
+booktitle={Advances in Neural Information Processing Systems},
+editor={A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
+year={2021},
+url={https://openreview.net/forum?id=VzuIzbRDrum}
+}
diff --git a/pypots/classification/grud/modules/__init__.py b/pypots/classification/grud/modules/__init__.py
@@ -6,7 +6,7 @@
 # License: GPL-v3
 
 from .core import _GRUD
-from .submodules import TemporalDecay
+from pypots.modules.rnn import TemporalDecay
 
 __all__ = [
     "_GRUD",

diff --git a/pypots/classification/grud/modules/core.py b/pypots/classification/grud/modules/core.py
@@ -16,7 +16,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from .submodules import TemporalDecay
+from pypots.modules.rnn import TemporalDecay
 
 
 class _GRUD(nn.Module):

diff --git a/pypots/imputation/__init__.py b/pypots/imputation/__init__.py
@@ -12,6 +12,7 @@
 from .saits import SAITS
 from .transformer import Transformer
 from .usgan import USGAN
+from .csdi import CSDI
 
 __all__ = [
     "SAITS",
@@ -21,4 +22,5 @@
     "LOCF",
     "GPVAE",
     "USGAN",
+    "CSDI",
 ]
diff --git a/pypots/imputation/brits/modules/core.py b/pypots/imputation/brits/modules/core.py
@@ -20,7 +20,7 @@
 import torch.nn as nn
 
 from .submodules import FeatureRegression
-from ....classification.grud.modules import TemporalDecay
+from ....modules.rnn import TemporalDecay
 from ....utils.metrics import cal_mae
 
 

diff --git a/pypots/imputation/csdi/__init__.py b/pypots/imputation/csdi/__init__.py
@@ -0,0 +1,12 @@
+"""
+The package of the CSDI imputation model, exposing the ``CSDI`` class.
+"""
+
+# Created by Wenjie Du <[email protected]>
+# License: GPL-v3
+
+from .model import CSDI
+
+__all__ = [
+    "CSDI",
+]
diff --git a/pypots/imputation/csdi/data.py b/pypots/imputation/csdi/data.py
@@ -0,0 +1,152 @@
+"""
+Dataset class for the CSDI imputation model.
+"""
+
+# Created by Wenjie Du <[email protected]>
+# License: GPL-v3
+
+from typing import Union, Iterable
+
+import torch
+from pycorruptor import mcar
+
+from ...data.base import BaseDataset
+
+
+class DatasetForCSDI(BaseDataset):
+    """Dataset for CSDI model."""
+
+    def __init__(
+        self,
+        data: Union[dict, str],
+        return_labels: bool = True,
+        file_type: str = "h5py",
+        rate: float = 0.1,
+    ):
+        super().__init__(data, return_labels, file_type)
+        self.time_points = (
+            None if "time_points" not in data.keys() else data["time_points"]
+        )
+        # _, self.time_points = self._check_input(self.X, time_points)
+        self.for_pattern_mask = (
+            None if "for_pattern_mask" not in data.keys() else data["for_pattern_mask"]
+        )
+        # _, self.for_pattern_mask = self._check_input(self.X, for_pattern_mask)
+        self.cut_length = (
+            None if "cut_length" not in data.keys() else data["cut_length"]
+        )
+        # _, self.cut_length = self._check_input(self.X, cut_length)
+        self.rate = rate
+
+    def _fetch_data_from_array(self, idx: int) -> Iterable:
+        """Fetch data according to index.
+
+        Parameters
+        ----------
+        idx : int,
+            The index to fetch the specified sample.
+
+        Returns
+        -------
+        sample : list,
+            A list contains
+
+            index : int tensor,
+                The index of the sample.
+
+            X_intact : tensor,
+                Original time-series for calculating mask imputation loss.
+
+            X : tensor,
+                Time-series data with artificially missing values for model input.
+
+            missing_mask : tensor,
+                The mask records all missing values in X.
+
+            indicating_mask : tensor.
+                The mask indicates artificially missing values in X.
+        """
+        X = self.X[idx].to(torch.float32)
+        X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate)
+
+        observed_data = X_intact
+        observed_mask = missing_mask + indicating_mask
+        observed_tp = (
+            torch.arange(0, self.n_steps, dtype=torch.float32)
+            if self.time_points is None
+            else self.time_points[idx].to(torch.float32)
+        )
+        gt_mask = indicating_mask
+        for_pattern_mask = (
+            gt_mask if self.for_pattern_mask is None else self.for_pattern_mask[idx]
+        )
+        cut_length = (
+            torch.zeros(len(observed_data)).long()
+            if self.cut_length is None
+            else self.cut_length[idx]
+        )
+
+        sample = [
+            torch.tensor(idx),
+            observed_data,
+            observed_mask,
+            observed_tp,
+            gt_mask,
+            for_pattern_mask,
+            cut_length,
+        ]
+
+        if self.y is not None and self.return_labels:
+            sample.append(self.y[idx].to(torch.long))
+
+        return sample
+
+    def _fetch_data_from_file(self, idx: int) -> Iterable:
+        """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples.
+        Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice.
+
+        Parameters
+        ----------
+        idx : int,
+            The index of the sample to be return.
+
+        Returns
+        -------
+        sample : list,
+            The collated data sample, a list including all necessary sample info.
+        """
+
+        if self.file_handle is None:
+            self.file_handle = self._open_file_handle()
+
+        X = torch.from_numpy(self.file_handle["X"][idx]).to(torch.float32)
+        X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate)
+
+        observed_data = X_intact
+        observed_mask = missing_mask + indicating_mask
+        observed_tp = self.time_points[idx].to(torch.float32)
+        gt_mask = indicating_mask
+        for_pattern_mask = (
+            gt_mask if self.for_pattern_mask is None else self.for_pattern_mask[idx]
+        )
+        cut_length = (
+            torch.zeros(len(observed_data)).long()
+            if self.cut_length is None
+            else self.cut_length[idx]
+        )
+
+        sample = [
+            torch.tensor(idx),
+            observed_data,
+            observed_mask,
+            observed_tp,
+            gt_mask,
+            for_pattern_mask,
+            cut_length,
+        ]
+
+        # if the dataset has labels and is for training, then fetch it from the file
+        if "y" in self.file_handle.keys() and self.return_labels:
+            sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long))
+
+        return sample