From cecffea68b75700cac6682d1ea8000f16b2ec186 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 21 Aug 2023 20:51:12 +0800 Subject: [PATCH 01/15] docs: update the faq page; --- docs/faq.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/faq.rst b/docs/faq.rst index 79b3da2e..e506c817 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -38,7 +38,7 @@ and **NOTE**: The maintainer role is not permanent. The role is called "maintainer" because it actively maintains the project. You can take a leave of absence from the role with notice at any time. -But if you're inactive for a long time (more than three months. With reasons, a longer period is allowed for sure), your role will be deactivated. +But if you're inactive for a long time (more than three months. With reasons, a longer period is allowed for sure), your role may be deactivated. Becoming a Lead """"""""""""""" @@ -52,7 +52,8 @@ The lead is a permanent role unless your research is no longer related to the fi you no longer want to get involved with affairs at PyPOTS. If you believe you want to do this, you can drop an email with anything you want to tell and your CV attachment to -`team@pypots.com `_. We will schedule a meeting for you and all other leads at PyPOTS for further discussion. +`team@pypots.com `_. We will schedule a meeting for you and all other members at PyPOTS for further discussion. +This is absolutely not a so-called interview, please don't take it formal and we just would like to listen to your thoughts about the field of POTS ;-) Our Development Principles @@ -64,7 +65,7 @@ Our Development Principles We develop PyPOTS and we should try the best to use it in any scenarios related to POTS data. Only in this way, we can figure out how it tastes like, if it is a good toolset for users, and what other features and models should be included into PyPOTS; 3. `No silver bullet `_ and `No free launch `_. - There is no one solution to all problems in the Universe. 
In PyPOTS, we keep things modular, so one can easily try and replace parts of the pipeline + There is no one solution to all problems in the universe. In PyPOTS, we keep things modular, so one can easily try and replace parts of the pipeline in search for the optimal combination for the particular task; 4. Keep things easy to use and familiar. We try to keep PyPOTS intuitive without compromising flexibility and without forcing users to learn a completely new technology. We do this by keeping the toolkit close to APIs in scikit-learn and pytorch that people know and love; From 6730e026640816bd9b3c4e9131c5e59fa415e58d Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 22 Aug 2023 11:20:06 +0800 Subject: [PATCH 02/15] feat: inverse attention masking, 0 means to mask, 1 means to not mask; --- pypots/imputation/saits/model.py | 2 +- pypots/modules/self_attention.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pypots/imputation/saits/model.py b/pypots/imputation/saits/model.py index 233fc780..85731df7 100644 --- a/pypots/imputation/saits/model.py +++ b/pypots/imputation/saits/model.py @@ -156,7 +156,7 @@ def forward( if (training and self.diagonal_attention_mask) or ( (not training) and diagonal_attention_mask ): - diagonal_attention_mask = torch.eye(self.n_steps).to(X.device) + diagonal_attention_mask = (1 - torch.eye(self.n_steps)).to(X.device) # then broadcast on the batch axis diagonal_attention_mask = diagonal_attention_mask.unsqueeze(0) else: diff --git a/pypots/modules/self_attention.py b/pypots/modules/self_attention.py index e3dbe34a..8f42f799 100644 --- a/pypots/modules/self_attention.py +++ b/pypots/modules/self_attention.py @@ -42,7 +42,7 @@ def forward( # apply masking on the attention map, this is optional if attn_mask is not None: - attn = attn.masked_fill(attn_mask == 1, -1e9) + attn = attn.masked_fill(attn_mask == 0, -1e9) # compute attention score [0, 1], then apply dropout attn = self.dropout(F.softmax(attn, dim=-1)) From 
5659c6e357eaa03a4f458b183eea98c990440163 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Tue, 22 Aug 2023 23:07:25 +0800 Subject: [PATCH 03/15] fix: typos in variable names; --- pypots/modules/self_attention.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypots/modules/self_attention.py b/pypots/modules/self_attention.py index fa3bbe86..e3dbe34a 100644 --- a/pypots/modules/self_attention.py +++ b/pypots/modules/self_attention.py @@ -207,10 +207,10 @@ def forward( dec_enc_attn_mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: dec_output, dec_slf_attn = self.slf_attn( - dec_input, dec_input, dec_input, mask=slf_attn_mask + dec_input, dec_input, dec_input, attn_mask=slf_attn_mask ) dec_output, dec_enc_attn = self.enc_attn( - dec_output, enc_output, enc_output, mask=dec_enc_attn_mask + dec_output, enc_output, enc_output, attn_mask=dec_enc_attn_mask ) dec_output = self.pos_ffn(dec_output) return dec_output, dec_slf_attn, dec_enc_attn @@ -288,7 +288,7 @@ def __init__( self.embedding = nn.Linear(n_features, d_model) self.dropout = nn.Dropout(dropout) self.position_enc = PositionalEncoding(d_model, n_position=n_steps) - self.dec_layer_stack = nn.ModuleList( + self.layer_stack = nn.ModuleList( [ DecoderLayer( d_model, From e31f73a72f924052d3150fee4e3126718ff874e8 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 25 Aug 2023 19:51:20 +0800 Subject: [PATCH 04/15] docs: add `.gitignore`; --- .gitignore | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..0841fdef --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +# ignore special files or folds +*~ +.idea +.DS_Store + +# ignore all building results +dist +build +docs/_build +*.egg-info + +# ignore all testing/running results +.run +.coverage +.pytest_cache +*__pycache__* +*testing_results* + +# ignore specific kinds of files like all PDFs +*.pdf From 
df3eba32ea604c42640952b64730576fc327d175 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 25 Aug 2023 20:23:57 +0800 Subject: [PATCH 05/15] feat: ignore all readme files in pypots package while building for release; --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index f2f5b609..a65e0605 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ prune tests prune pypots/*/template +prune pypots/*/README.md From 54cf6bb1051fb840f8da23db66415df6e5d13174 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 28 Aug 2023 17:02:13 +0800 Subject: [PATCH 06/15] docs: update PyPI downloads badge link; --- README.md | 2 +- docs/index.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 56df9e5d..851fe16d 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Conda downloads - PyPI downloads + PyPI downloads

diff --git a/docs/index.rst b/docs/index.rst index 6254b4c5..499bc710 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,7 +10,7 @@ Welcome to PyPOTS docs! :target: https://github.com/WenjieDu/PyPOTS :alt: PyPOTS logo -.. centered:: A Python Toolbox for Data Mining on Partially-Observed Time Series +**A Python Toolbox for Data Mining on Partially-Observed Time Series** .. image:: https://img.shields.io/badge/Python-v3.7--3.10-E97040?logo=python&logoColor=white :alt: Python version @@ -62,7 +62,7 @@ Welcome to PyPOTS docs! :alt: Conda downloads :target: https://anaconda.org/conda-forge/pypots -.. image:: https://static.pepy.tech/personalized-badge/pypots?period=total&units=international_system&left_color=grey&right_color=blue&left_text=PyPI%20Downloads +.. image:: https://img.shields.io/endpoint?url=https%3A%2F%2Fraw.githubusercontent.com%2FWenjieDu%2FWenjieDu%2Fmain%2Ffigs%2Fprojects%2Fpypots_downloads.json :alt: PyPI downloads :target: https://pepy.tech/project/pypots From 737cc58fe1b72a19d36447c19c2c7e27ac0f611a Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 28 Aug 2023 21:18:32 +0800 Subject: [PATCH 07/15] feat: add save_dict_into_h5(); --- pypots/data/__init__.py | 3 +++ pypots/data/saving.py | 43 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 pypots/data/saving.py diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py index 6052b1fa..a3a68be9 100644 --- a/pypots/data/__init__.py +++ b/pypots/data/__init__.py @@ -22,6 +22,7 @@ pickle_load, pickle_dump, ) +from .saving import save_dict_into_h5 __all__ = [ # datasets @@ -39,4 +40,6 @@ "mcar", "pickle_load", "pickle_dump", + # saving + "save_dict_into_h5", ] diff --git a/pypots/data/saving.py b/pypots/data/saving.py new file mode 100644 index 00000000..8581ad50 --- /dev/null +++ b/pypots/data/saving.py @@ -0,0 +1,43 @@ +""" +Data saving utilities. 
# pypots/data/saving.py -- data saving utilities.

# Created by Wenjie Du
# License: GPL-v3


import os

import h5py

from pypots.utils.file import create_dir_if_not_exist
from pypots.utils.logging import logger


def save_dict_into_h5(data_dict: dict, saving_dir: str) -> None:
    """Write a (possibly nested) dictionary into an HDF5 file.

    The output file is always named ``datasets.h5`` and is placed inside
    ``saving_dir``. The file is opened in ``"w"`` mode, so an existing
    ``datasets.h5`` in that directory is replaced.

    Parameters
    ----------
    data_dict : dict,
        The data to be saved. Every value that is itself a dictionary becomes
        an HDF5 group (recursively); every other value is stored as an HDF5
        dataset under its key.

    saving_dir : str,
        The directory in which ``datasets.h5`` is written. Note that this is
        a directory path, not the path of the h5 file itself.

    """

    def _write_entry(parent, entry_name, entry_value):
        # Leaves are stored as datasets; h5py decides how to serialise them.
        if not isinstance(entry_value, dict):
            parent.create_dataset(entry_name, data=entry_value)
            return

        # Nested dictionaries map onto HDF5 groups, one level per dict.
        group = parent.create_group(entry_name)
        for child_name, child_value in entry_value.items():
            _write_entry(group, child_name, child_value)

    # NOTE(review): presumably creates `saving_dir` when it is missing;
    # the helper lives in pypots.utils.file -- confirm there.
    create_dir_if_not_exist(saving_dir)
    saving_path = os.path.join(saving_dir, "datasets.h5")

    with h5py.File(saving_path, "w") as h5_file:
        for top_name, top_value in data_dict.items():
            _write_entry(h5_file, top_name, top_value)

    logger.info(f"Successfully saved the given data into {saving_path}.")
image:: https://img.shields.io/badge/PyTorch-❤️-F8C6B5?logo=pytorch&logoColor=white :alt: powered by Pytorch + :target: https://github.com/WenjieDu/PyPOTS -.. image:: https://img.shields.io/github/v/release/wenjiedu/pypots?color=EE781F&include_prereleases&label=Release +.. image:: https://img.shields.io/github/v/release/wenjiedu/pypots?color=EE781F&include_prereleases&label=Release&logo=github&logoColor=white :alt: the latest release version - :target: https://pypi.org/project/pypots + :target: https://github.com/WenjieDu/PyPOTS/releases -.. image:: https://img.shields.io/badge/License-GPL--v3-E9BB41 +.. image:: https://img.shields.io/badge/License-GPL--v3-E9BB41?logo=opensourceinitiative&logoColor=white :alt: GPL-v3 license :target: https://github.com/WenjieDu/PyPOTS/blob/main/LICENSE diff --git a/docs/pypots.utils.rst b/docs/pypots.utils.rst index 47dac4b8..50c9c875 100644 --- a/docs/pypots.utils.rst +++ b/docs/pypots.utils.rst @@ -1,10 +1,10 @@ pypots.utils package ==================== -pypots.utils.files module +pypots.utils.file module ------------------------- -.. automodule:: pypots.utils.files +.. 
# ---------------------------------------------------------------------------
# pypots/utils/metrics.py -- internal clustering validation measures, for use
# when ground-truth cluster labels are not known.
# NOTE(review): `metrics` is the module-level import of that file (presumably
# `sklearn.metrics`, whose function names these calls match) -- confirm there.
# ---------------------------------------------------------------------------


def cal_silhouette(latent_rep: np.ndarray, class_predictions: np.ndarray) -> float:
    """Compute the mean Silhouette Coefficient of all samples.

    Parameters
    ----------
    latent_rep :
        Latent representation learned by a clusterer.

    class_predictions :
        Clustering results returned by a clusterer.

    Returns
    -------
    silhouette :
        Mean Silhouette Coefficient for all samples.

    """
    silhouette = metrics.silhouette_score(latent_rep, class_predictions)
    return silhouette


def cal_chs(latent_rep: np.ndarray, class_predictions: np.ndarray) -> float:
    """Compute the Calinski and Harabasz score (also known as the Variance Ratio Criterion).

    Parameters
    ----------
    latent_rep :
        Latent representation learned by a clusterer.

    class_predictions :
        Clustering results returned by a clusterer.

    Returns
    -------
    chs :
        The resulting Calinski-Harabasz score.

    """
    chs = metrics.calinski_harabasz_score(latent_rep, class_predictions)
    return chs


def cal_dbs(latent_rep: np.ndarray, class_predictions: np.ndarray) -> float:
    """Compute the Davies-Bouldin score.

    Parameters
    ----------
    latent_rep :
        Latent representation learned by a clusterer.

    class_predictions :
        Clustering results returned by a clusterer.

    Returns
    -------
    dbs :
        The resulting Davies-Bouldin score.

    """
    dbs = metrics.davies_bouldin_score(latent_rep, class_predictions)
    return dbs


# ---------------------------------------------------------------------------
# pypots/data/utils.py -- sliding-window sample generation.
# ---------------------------------------------------------------------------


def sliding_window(time_series, n_steps, sliding_len=None):
    """Generate time series samples with sliding window method, truncating windows from time-series data
    with a given sequence length.

    Given a time series of shape [seq_len, n_features] (seq_len is the total sequence length of the time series), this
    sliding_window function will generate time-series samples from this given time series with sliding window method.
    Candidate windows start at 0, sliding_len, 2*sliding_len, ... (seq_len//sliding_len candidates in total); every
    candidate that does not have n_steps time steps left in the series is discarded, so only complete windows are
    returned and the number of generated samples is at most seq_len//sliding_len.

    Parameters
    ----------
    time_series : np.ndarray,
        time series data, len(shape)=2, [total_length, feature_num]

    n_steps : int,
        The number of time steps in the generated data samples.

    sliding_len : int, default = None,
        The size of the sliding window. It will be set as the same with n_steps if None.

    Returns
    -------
    samples : np.ndarray,
        The generated time-series data samples (dtype float32) of shape [n_samples, n_steps, n_features],
        where n_samples <= seq_len//sliding_len. n_samples is 0 if no complete window fits.

    Raises
    ------
    ValueError
        If n_steps or sliding_len is not a positive integer.

    """
    sliding_len = n_steps if sliding_len is None else sliding_len
    if n_steps <= 0 or sliding_len <= 0:
        raise ValueError(
            f"n_steps and sliding_len must be positive, "
            f"but got n_steps={n_steps} and sliding_len={sliding_len}."
        )

    time_series = np.asarray(time_series)
    total_len = time_series.shape[0]

    # Candidate start positions are already expressed in time steps.
    start_indices = np.arange(total_len // sliding_len) * sliding_len

    # Keep only the starts that leave room for a complete window. The original
    # check multiplied the last start by `sliding_len` a second time and removed
    # at most one window, which dropped valid windows and could leave ragged ones
    # when sliding_len < n_steps.
    start_indices = start_indices[start_indices + n_steps <= total_len]

    if len(start_indices) == 0:
        # No complete window fits: return an empty batch with a consistent shape
        # rather than failing on an empty index array.
        return np.empty((0, n_steps) + time_series.shape[1:], dtype="float32")

    sample_collector = [time_series[idx : idx + n_steps] for idx in start_indices]
    samples = np.asarray(sample_collector).astype("float32")

    return samples