add synchronized_brainwave_dataset and its test case, add readme

AlexanderVNikitin · Jun 14, 2024 · b5df408 · b5df408
1 parent b91cf21
commit b5df408
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -146,18 +146,19 @@ TSGM implements many metrics for synthetic time series evaluation. Check Section
 
 
 ## :floppy_disk: Datasets
-| Dataset  | API | Description     |
-| ------------- | ------------- | ------------- |
-| UCR Dataset  | `tsgm.utils.UCRDataManager` | https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/  |
-| Mauna Loa  | `tsgm.utils.get_mauna_loa()` | https://gml.noaa.gov/ccgg/trends/data.html |
-| EEG & Eye state  | `tsgm.utils.get_eeg()` | https://archive.ics.uci.edu/ml/datasets/EEG+Eye+State  |
-| Power consumption dataset  | `tsgm.utils.get_power_consumption()` | https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption  |
-| Stock data  | `tsgm.utils.get_stock_data(ticker_name)` | Gets historical stock data from YFinance  |
-| COVID-19 over the US  | `tsgm.utils.get_covid_19()` | Covid-19 distribution over the US  |
-| Energy Data (UCI)  | `tsgm.utils.get_energy_data()` | https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction  |
-| MNIST as time series  | `tsgm.utils.get_mnist_data()` | https://en.wikipedia.org/wiki/MNIST_database  |
-| Samples from GPs  | `tsgm.utils.get_gp_samples_data()` | https://en.wikipedia.org/wiki/Gaussian_process |
-| Physionet 2012  | `tsgm.utils.get_physionet2012()` | https://archive.physionet.org/pn3/challenge/2012/ |
+| Dataset | API                                               | Description     |
+| - |---------------------------------------------------| ------------- |
+| UCR Dataset | `tsgm.utils.UCRDataManager`                       | https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/  |
+| Mauna Loa | `tsgm.utils.get_mauna_loa()`                      | https://gml.noaa.gov/ccgg/trends/data.html |
+| EEG & Eye state | `tsgm.utils.get_eeg()`                            | https://archive.ics.uci.edu/ml/datasets/EEG+Eye+State  |
+| Power consumption dataset | `tsgm.utils.get_power_consumption()`              | https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption  |
+| Stock data | `tsgm.utils.get_stock_data(ticker_name)`          | Gets historical stock data from YFinance  |
+| COVID-19 over the US | `tsgm.utils.get_covid_19()`                       | Covid-19 distribution over the US  |
+| Energy Data (UCI) | `tsgm.utils.get_energy_data()`                    | https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction  |
+| MNIST as time series | `tsgm.utils.get_mnist_data()`                     | https://en.wikipedia.org/wiki/MNIST_database  |
+| Samples from GPs | `tsgm.utils.get_gp_samples_data()`                | https://en.wikipedia.org/wiki/Gaussian_process |
+| Physionet 2012 | `tsgm.utils.get_physionet2012()`                  | https://archive.physionet.org/pn3/challenge/2012/ |
+| Synchronized Brainwave Dataset  | `tsgm.utils.get_synchronized_brainwave_dataset()` | https://www.kaggle.com/datasets/berkeley-biosense/synchronized-brainwave-dataset |
 
 TSGM provides an API for convenient use of many time-series datasets (currently more than 140 datasets). The comprehensive list of the datasets is available in the [documentation](https://tsgm.readthedocs.io/en/latest/guides/datasets.html).
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -367,3 +367,8 @@ def test_extract_targz():
 
 def test_version():
     # The package must expose its version as a plain string.
     assert isinstance(tsgm.__version__, str)
+
+
+def test_get_synchronized_brainwave_dataset():
+    df = tsgm.utils.get_synchronized_brainwave_dataset()
+    assert df.shape == (30013, 13)
diff --git a/tsgm/utils/datasets.py b/tsgm/utils/datasets.py
@@ -19,6 +19,7 @@
 
 from tsgm.utils import covid19_data_utils
 from tsgm.utils import file_utils
+import requests
 
 
 logger = logging.getLogger('utils')
@@ -296,6 +297,39 @@ def get_eeg() -> T.Tuple[TensorLike, TensorLike]:
     return X, y
 
 
+def get_synchronized_brainwave_dataset() -> pd.DataFrame:
+    # TODO: we need a better url
+    url = ("https://storage.googleapis.com/kaggle-data-sets/267/799894/compressed/eeg-data.csv.zip?X-Goog-Algorithm"
+           "=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240612"
+           "%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240612T131629Z&X-Goog-Expires=259200&X-Goog-SignedHeaders"
+           "=host&X-Goog-Signature"
+           "=678772c7339d5116478fbc1c2dc377567284c4c0d97aff46533b7d64dee0c4336c9ebf82a0c3c980ead3bbfbfc426135bf1dee685eb544c9a36074199e4a760a311379bd1f8996c5906481a72aa19c19af3cfdecd7bd565c434fc626f2c43ce6ccbdfaa58eee2a3c7668708fc8c93364499b5b083c668288840eace09b6267bb2a5c6208ed9fb1e66cda405e1900dc35a6fbb9f02562a8ab351dd88a0b346a32c1941cf5ad5f4cf2a10ccd36a35bdbc3620d1402d26b407acec6eab9a7dde4b355cf977e588f36eeca0046a36331e2798f3a5074b6423c830cbfeac12f8a1d8b45911963e0ff4f7e0879755d1bf2054434e48d043ff4cca5e96f5e6a4f55e2e1")
+    cur_path = os.path.dirname(__file__)
+    path_to_folder = os.path.join(cur_path, "../../data/")
+    path_to_resource = os.path.join(path_to_folder, 'synchronized_brainwave_dataset.zip')
+    path_to_renamed_csv = os.path.join(path_to_folder, "synchronized_brainwave_dataset.csv")
+    if not os.path.exists(path_to_renamed_csv):
+        # TODO: utils.py is not used here, maybe need md5 checking
+        response = requests.get(url)
+        with open(path_to_resource, 'wb') as f:
+            f.write(response.content)
+        print("Download completed.")
+        file_utils.extract_archive(path_to_resource, path_to_folder)
+        print("Extraction completed.")
+
+        original_csv = os.path.join(path_to_folder, "eeg-data.csv")
+
+        if os.path.exists(original_csv):
+            os.rename(original_csv, path_to_renamed_csv)
+            print(f"File renamed to {path_to_renamed_csv}")
+        else:
+            print("The expected CSV file was not found.")
+    else:
+        print("File exist")
+    df = pd.read_csv(path_to_renamed_csv)
+    return df
+
+
 def get_power_consumption() -> npt.NDArray:
     """
     Retrieves the household power consumption dataset.