From 74897ccdad36056e136a183bdcbf0827e7ddf8c6 Mon Sep 17 00:00:00 2001 From: shawnboltz <73495214+shawnboltz@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:52:40 -0700 Subject: [PATCH] Add ability to pass alternate index_path to `WaveBank` and `EventBank` (#274) * Add ability to pass a custom index path to a WaveBank * Add ability to specify index_path to EventBank, update docs accordingly --- CHANGELOG.txt | 4 ++- docs/notebooks/interfaces/eventbank.ipynb | 27 ++++++++++++++-- docs/notebooks/interfaces/wavebank.ipynb | 25 ++++++++++++++- src/obsplus/bank/core.py | 7 +++-- src/obsplus/bank/eventbank.py | 5 +++ src/obsplus/bank/wavebank.py | 9 ++++-- src/obsplus/utils/testing.py | 22 +++++++++++++ tests/test_bank/test_eventbank.py | 38 ++++++++++++++++++++++- tests/test_bank/test_wavebank.py | 35 +++++++++++++++------ 9 files changed, 152 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 9229549e..448aa480 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,7 +1,9 @@ master + - obsplus.bank + * Tweaked the banks to allow for custom index paths. See #258. 
- obsplus.utils.stations * Fixed df_to_inventory to use a local copy of the NRL for compatibility - with ObsPy and NRLv2 (Note that NRLv1 is no longer accessible) + with ObsPy and NRLv2 (Note that NRLv1 is no longer accessible) (#271) obsplus 0.2.5 - obsplus diff --git a/docs/notebooks/interfaces/eventbank.ipynb b/docs/notebooks/interfaces/eventbank.ipynb index 3d88f88b..836c6b6c 100644 --- a/docs/notebooks/interfaces/eventbank.ipynb +++ b/docs/notebooks/interfaces/eventbank.ipynb @@ -49,7 +49,7 @@ "bank = obsplus.EventBank(event_path)\n", "\n", "# ensure index is up-to-date\n", - "bank.update_index() " + "bank.update_index()" ] }, { @@ -86,6 +86,29 @@ "print(index.columns)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are working from a data directory that doesn't have write access, you can specify a custom location for the index path:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "index_path = Path(tempfile.mkdtemp()) / \"index.db\"\n", + "cust_ind_bank = obsplus.EventBank(event_path, index_path=index_path)\n", + "cust_ind_bank.update_index()\n", + "ind = cust_ind_bank.read_index()\n", + "ind # Note that paths in the index are relative to event_path" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -213,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/notebooks/interfaces/wavebank.ipynb b/docs/notebooks/interfaces/wavebank.ipynb index 4e568d79..09df844a 100644 --- a/docs/notebooks/interfaces/wavebank.ipynb +++ b/docs/notebooks/interfaces/wavebank.ipynb @@ -82,6 +82,29 @@ "bank.update_index()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using a custom index path\n", + "\n", + "If you are working from a data directory that doesn't have write 
access, you can specify a custom location for the index path:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "index_path = Path(tempfile.mkdtemp()) / \"index.h5\"\n", + "cust_ind_bank = obsplus.WaveBank(crandall_path, index_path=index_path)\n", + "cust_ind_bank.update_index()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -344,7 +367,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/src/obsplus/bank/core.py b/src/obsplus/bank/core.py index c06da549..773b2559 100644 --- a/src/obsplus/bank/core.py +++ b/src/obsplus/bank/core.py @@ -38,7 +38,8 @@ class _Bank(ABC): ext = "" bank_path: Path = "" namespace = "" - index_name = ".index.h5" # name of index file + index_name = ".index.h5" # default name of index file + _index_path: Path = None # Override for the index path executor = None # an executor for using parallelism # optional str defining the directory structure and file name schemes path_structure = None @@ -89,7 +90,7 @@ def _read_metadata(self) -> pd.DataFrame: @property def index_path(self): """Return the expected path to the index file.""" - return Path(self.bank_path) / self.index_name + return self._index_path or Path(self.bank_path) / self.index_name @property def _index_node(self): @@ -319,6 +320,6 @@ def load_example_bank( def __repr__(self): """Return the class name with bank path.""" name = type(self).__name__ - return f"{name}(base_path={self.bank_path})" + return f"{name}(base_path={self.bank_path}, index_path={self.index_path})" __str__ = __repr__ diff --git a/src/obsplus/bank/eventbank.py b/src/obsplus/bank/eventbank.py index 0de51776..1c3789c6 100644 --- a/src/obsplus/bank/eventbank.py +++ b/src/obsplus/bank/eventbank.py @@ -102,6 +102,9 @@ class EventBank(_Bank): variables and a slash cannot be used in 
a file name on most operating systems. The default extension (.xml) will be added. The default is {time}_{event_id_short}. + index_path + The path to the index file containing the contents of the directory. + By default it will be created in the top-level of the data directory. format The anticipated format of the event files. Any format supported by the obspy.read_events function is permitted. @@ -160,6 +163,7 @@ def __init__( base_path: Union[str, Path, "EventBank"] = ".", path_structure: Optional[str] = None, name_structure: Optional[str] = None, + index_path: Optional[Union[str, Path]] = None, format="quakeml", ext=".xml", executor: Optional[Executor] = None, @@ -181,6 +185,7 @@ def __init__( self.path_structure = ps ns = name_structure or self._name_structure or EVENT_NAME_STRUCTURE self.name_structure = ns + self._index_path = index_path self.executor = executor # enforce min version and warn on newer self._enforce_min_version() diff --git a/src/obsplus/bank/wavebank.py b/src/obsplus/bank/wavebank.py index 83825958..18ad6815 100644 --- a/src/obsplus/bank/wavebank.py +++ b/src/obsplus/bank/wavebank.py @@ -9,7 +9,7 @@ from functools import partial from itertools import chain from pathlib import Path -from typing import Optional, Union +from typing import Any, Optional, Union import numpy as np import obspy @@ -90,6 +90,9 @@ class WaveBank(_Bank): variables but requires a period as the separation character. The default extension (.mseed) will be added. The default is {time} example : {seedid}.{time} + index_path : str or Path + The path to the index file containing the contents of the directory. + By default it will be created in the top-level of the data directory. cache_size : int The number of queries to store. 
Avoids having to read the index of the bank multiple times for queries involving the same start and end @@ -175,6 +178,7 @@ def __init__( base_path: Union[str, Path, "WaveBank"] = ".", path_structure: Optional[str] = None, name_structure: Optional[str] = None, + index_path: Optional[Union[str, Path]] = None, cache_size: int = 5, format="mseed", ext=None, @@ -191,6 +195,7 @@ def __init__( path_structure if path_structure is not None else WAVEFORM_STRUCTURE ) self.name_structure = name_structure or WAVEFORM_NAME_STRUCTURE + self._index_path = index_path self.executor = executor # initialize cache self._index_cache = _IndexCache(self, cache_size=cache_size) @@ -227,7 +232,7 @@ def hdf_kwargs(self) -> dict: bar_description=bar_parameter_description, paths_description=paths_description ) def update_index( - self, bar: Optional = None, paths: Optional[bank_subpaths_type] = None + self, bar: Optional[Any] = None, paths: Optional[bank_subpaths_type] = None ) -> "WaveBank": """ Iterate files in bank and add any modified since last update to index. 
diff --git a/src/obsplus/utils/testing.py b/src/obsplus/utils/testing.py index ef05a2cc..ca154f92 100644 --- a/src/obsplus/utils/testing.py +++ b/src/obsplus/utils/testing.py @@ -12,7 +12,10 @@ import obspy import pandas as pd +from obsplus.bank.core import _Bank from obsplus.constants import NSLC, utc_able_type +from obsplus.utils.bank import _natify_paths +from obsplus.utils.misc import iter_files from obsplus.utils.time import make_time_chunks, to_utc @@ -302,3 +305,22 @@ def __call__(self, st1, st2): for tr1, tr2 in zip(st1, st2): self._assert_stats_equal(tr1, tr2) self._assert_arrays_almost_equal(tr1, tr2) + + +def check_index_paths(bank: _Bank): + """ + Make sure every mseed file found under the bank path is in the bank's index. + + Parameters + ---------- + bank: + A Bank (either WaveBank or EventBank) to verify + """ + bank_path = bank.bank_path + index = bank.read_index() + index_paths = _natify_paths(index["path"]) + file_paths = set([bank.bank_path / pth for pth in index_paths]) + for file_path in iter_files(str(bank_path), ext="mseed"): + # convert to a Path so it compares equal to the index entries + file_path = Path(file_path) + assert file_path in file_paths diff --git a/tests/test_bank/test_eventbank.py b/tests/test_bank/test_eventbank.py index 1980c54e..41a9d840 100644 --- a/tests/test_bank/test_eventbank.py +++ b/tests/test_bank/test_eventbank.py @@ -20,7 +20,7 @@ from obsplus.constants import EVENT_DTYPES from obsplus.exceptions import UnsupportedKeyword from obsplus.utils.events import get_preferred -from obsplus.utils.testing import instrument_methods +from obsplus.utils.testing import check_index_paths, instrument_methods from obsplus.utils.misc import suppress_warnings, get_progressbar @@ -130,6 +130,24 @@ def ebank_low_version(self, tmpdir, monkeypatch): assert obsplus.__last_version__ != self.low_version_str return ebank + @pytest.fixture + def cust_ebank_index_path(self, tmpdir_factory): + """Path for a custom index location""" + return 
tmpdir_factory.mktemp("custom_index") / ".index.db" + + @pytest.fixture + def cust_index_ebank(self, tmpdir_factory, cust_ebank_index_path): + """ + Create a copy of the bingham_test data set. Then return an initialized event + bank on the temporary copy that keeps its index at the custom index path. + """ + new = Path(str(tmpdir_factory.mktemp("bingham_test"))) + copy_dataset("bingham_test", new) + path = new / "bingham_test" / "events" + ebank = EventBank(path, index_path=cust_ebank_index_path) + ebank.update_index() + return ebank + @pytest.fixture(scope="class") def ebank_with_event_no_time(self, tmp_path_factory): """Create an event bank which has one file with no time.""" @@ -177,6 +195,24 @@ def test_read_index(self, bing_ebank, bingham_catalog): assert isinstance(df, pd.DataFrame) assert len(bingham_catalog) == len(df) + def test_custom_index_path( + self, cust_index_ebank, cust_ebank_index_path, bingham_catalog + ): + """ + Ensure the custom index path is used and exists, and that the length of + the index read from it matches the catalog. + """ + index_path = cust_index_ebank.index_path + # Make sure the new path got passed correctly + assert index_path == cust_ebank_index_path + assert os.path.exists(index_path) + # Make sure paths got written to the index properly + check_index_paths(cust_index_ebank) + # As an extra check, verify the length of the index matches the data catalog + df = cust_index_ebank.read_index() + assert isinstance(df, pd.DataFrame) + assert len(bingham_catalog) == len(df) + def test_read_timestamp(self, bing_ebank): """read the current timestamp (after index has been updated)""" bing_ebank.update_index() diff --git a/tests/test_bank/test_wavebank.py b/tests/test_bank/test_wavebank.py index 1b255ce0..f2335bfc 100644 --- a/tests/test_bank/test_wavebank.py +++ b/tests/test_bank/test_wavebank.py @@ -31,9 +31,8 @@ from obsplus.bank.wavebank import WaveBank from obsplus.constants import NSLC, EMPTYTD64, WAVEFORM_DTYPES from obsplus.exceptions import BankDoesNotExistError, UnsupportedKeyword -from obsplus.utils.misc 
import iter_files from obsplus.utils.time import to_datetime64, to_timedelta64, to_utc -from obsplus.utils.bank import _natify_paths +from obsplus.utils.testing import check_index_paths from obsplus import get_reference_time # ----------------------------------- Helper functions @@ -146,6 +145,19 @@ def default_bank_low_version(self, default_wbank, monkeypatch): assert Path(default_wbank.bank_path).exists() return default_wbank + @pytest.fixture + def cust_wbank_index_path(self, tmpdir_factory): + """Path for a custom index location""" + return tmpdir_factory.mktemp("custom_index") / ".index.h5" + + @pytest.fixture + def cust_index_wbank(self, tmp_ta_dir, cust_wbank_index_path): + """WaveBank that uses a custom index path""" + bank_path = os.path.join(tmp_ta_dir, "waveforms") + bank = WaveBank(bank_path, index_path=cust_wbank_index_path) + bank.update_index() + return bank + @pytest.fixture def legacy_path_index(self, default_wbank, monkeypatch): """ @@ -172,21 +184,24 @@ def test_index(self, ta_bank_index): assert os.path.exists(ta_bank_index.index_path) assert isinstance(ta_bank_index.last_updated_timestamp, float) + def test_custom_index_path(self, cust_index_wbank, cust_wbank_index_path): + """ensure a custom index path can be used""" + index_path = cust_index_wbank.index_path + # Make sure the new path got passed correctly + assert index_path == cust_wbank_index_path + assert os.path.exists(index_path) + assert isinstance(cust_index_wbank.last_updated_timestamp, float) + # Make sure paths got written to the index properly + check_index_paths(cust_index_wbank) + def test_create_index(self, ta_bank_no_index): """make sure a fresh index can be created""" # test that just trying to get an index that doesnt exists creates it ta_bank_no_index.read_index() index_path = ta_bank_no_index.index_path - bank_path = ta_bank_no_index.bank_path assert os.path.exists(index_path) # make sure all file paths are in the index - index = ta_bank_no_index.read_index() - index_paths 
= _natify_paths(index["path"]) - file_paths = set([ta_bank_no_index.bank_path / pth for pth in index_paths]) - for file_path in iter_files(str(bank_path), ext="mseed"): - # go up two levels to match path reference - file_path = Path(file_path) - assert file_path in file_paths + check_index_paths(ta_bank_no_index) def test_update_index_bumps_only_for_new_files(self, ta_bank_index): """