Skip to content

Commit

Permalink
Add ability to pass alternate index_path to WaveBank and EventBank (
Browse files Browse the repository at this point in the history
#274)

* Add ability to pass a custom index path to a WaveBank

* Add ability to specify index_path to EventBank, update docs accordingly
  • Loading branch information
shawnboltz authored Apr 3, 2024
1 parent 54c3fc8 commit 74897cc
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 20 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
master
- obsplus.bank
* Tweaked the banks to allow for custom index paths. See #258.
- obsplus.utils.stations
* Fixed df_to_inventory to use a local copy of the NRL for compatibility
with ObsPy and NRLv2 (Note that NRLv1 is no longer accessible)
with ObsPy and NRLv2 (Note that NRLv1 is no longer accessible) (#271)

obsplus 0.2.5
- obsplus
Expand Down
27 changes: 25 additions & 2 deletions docs/notebooks/interfaces/eventbank.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"bank = obsplus.EventBank(event_path)\n",
"\n",
"# ensure index is up-to-date\n",
"bank.update_index() "
"bank.update_index()"
]
},
{
Expand Down Expand Up @@ -86,6 +86,29 @@
"print(index.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you are working from a data directory that doesn't have write access, you can specify a custom location for the index path:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tempfile\n",
"from pathlib import Path\n",
"\n",
"index_path = Path(tempfile.mkdtemp()) / \"index.db\"\n",
"cust_ind_bank = obsplus.EventBank(event_path, index_path=index_path)\n",
"cust_ind_bank.update_index()\n",
"ind = cust_ind_bank.read_index()\n",
"ind # Note that paths in the index are relative to event_path"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -213,7 +236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
25 changes: 24 additions & 1 deletion docs/notebooks/interfaces/wavebank.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,29 @@
"bank.update_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using a custom index path\n",
"\n",
"If you are working from a data directory that doesn't have write access, you can specify a custom location for the index path:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tempfile\n",
"from pathlib import Path\n",
"\n",
"index_path = Path(tempfile.mkdtemp()) / \"index.h5\"\n",
"cust_ind_bank = obsplus.WaveBank(crandall_path, index_path=index_path)\n",
"cust_ind_bank.update_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -344,7 +367,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
7 changes: 4 additions & 3 deletions src/obsplus/bank/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ class _Bank(ABC):
ext = ""
bank_path: Path = ""
namespace = ""
index_name = ".index.h5" # name of index file
index_name = ".index.h5" # default name of index file
_index_path: Path = None # Override for the index path
executor = None # an executor for using parallelism
# optional str defining the directory structure and file name schemes
path_structure = None
Expand Down Expand Up @@ -89,7 +90,7 @@ def _read_metadata(self) -> pd.DataFrame:
@property
def index_path(self):
    """
    Return the expected path to the index file.

    A custom location stored on ``_index_path`` takes precedence;
    otherwise the index is expected at ``bank_path / index_name``.
    The result is always a ``Path``, even when the custom location was
    stored as a plain string.
    """
    # Same truthiness rule as ``a or b``: None (or an empty string) falls
    # back to the default location inside the bank directory.
    if self._index_path:
        return Path(self._index_path)
    return Path(self.bank_path) / self.index_name

@property
def _index_node(self):
Expand Down Expand Up @@ -319,6 +320,6 @@ def load_example_bank(
def __repr__(self):
"""Return the class name with bank path."""
name = type(self).__name__
return f"{name}(base_path={self.bank_path})"
return f"{name}(base_path={self.bank_path}, index_path={self.index_path})"

__str__ = __repr__
5 changes: 5 additions & 0 deletions src/obsplus/bank/eventbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ class EventBank(_Bank):
variables and a slash cannot be used in a file name on most operating
systems. The default extension (.xml) will be added.
The default is {time}_{event_id_short}.
index_path
Optional path to the index file. If not provided, the index file is
created in the top level of the bank directory.
format
The anticipated format of the event files. Any format supported by the
obspy.read_events function is permitted.
Expand Down Expand Up @@ -160,6 +163,7 @@ def __init__(
base_path: Union[str, Path, "EventBank"] = ".",
path_structure: Optional[str] = None,
name_structure: Optional[str] = None,
index_path: Optional[Union[str, Path]] = None,
format="quakeml",
ext=".xml",
executor: Optional[Executor] = None,
Expand All @@ -181,6 +185,7 @@ def __init__(
self.path_structure = ps
ns = name_structure or self._name_structure or EVENT_NAME_STRUCTURE
self.name_structure = ns
self._index_path = index_path
self.executor = executor
# enforce min version and warn on newer
self._enforce_min_version()
Expand Down
9 changes: 7 additions & 2 deletions src/obsplus/bank/wavebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from functools import partial
from itertools import chain
from pathlib import Path
from typing import Optional, Union
from typing import Any, Optional, Union

import numpy as np
import obspy
Expand Down Expand Up @@ -90,6 +90,9 @@ class WaveBank(_Bank):
variables but requires a period as the separation character. The
default extension (.mseed) will be added. The default is {time}
example : {seedid}.{time}
index_path : str or Path, optional
The path to the index file. If not provided, the index file is created
in the top level of the bank directory.
cache_size : int
The number of queries to store. Avoids having to read the index of
the bank multiple times for queries involving the same start and end
Expand Down Expand Up @@ -175,6 +178,7 @@ def __init__(
base_path: Union[str, Path, "WaveBank"] = ".",
path_structure: Optional[str] = None,
name_structure: Optional[str] = None,
index_path: Optional[Union[str, Path]] = None,
cache_size: int = 5,
format="mseed",
ext=None,
Expand All @@ -191,6 +195,7 @@ def __init__(
path_structure if path_structure is not None else WAVEFORM_STRUCTURE
)
self.name_structure = name_structure or WAVEFORM_NAME_STRUCTURE
self._index_path = index_path
self.executor = executor
# initialize cache
self._index_cache = _IndexCache(self, cache_size=cache_size)
Expand Down Expand Up @@ -227,7 +232,7 @@ def hdf_kwargs(self) -> dict:
bar_description=bar_parameter_description, paths_description=paths_description
)
def update_index(
self, bar: Optional = None, paths: Optional[bank_subpaths_type] = None
self, bar: Optional[Any] = None, paths: Optional[bank_subpaths_type] = None
) -> "WaveBank":
"""
Iterate files in bank and add any modified since last update to index.
Expand Down
22 changes: 22 additions & 0 deletions src/obsplus/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
import obspy
import pandas as pd

from obsplus.bank.core import _Bank
from obsplus.constants import NSLC, utc_able_type
from obsplus.utils.bank import _natify_paths
from obsplus.utils.misc import iter_files
from obsplus.utils.time import make_time_chunks, to_utc


Expand Down Expand Up @@ -302,3 +305,22 @@ def __call__(self, st1, st2):
for tr1, tr2 in zip(st1, st2):
self._assert_stats_equal(tr1, tr2)
self._assert_arrays_almost_equal(tr1, tr2)


def check_index_paths(bank: _Bank, ext=None):
    """
    Make sure the paths in a bank's index can be resolved correctly.

    Every file found under the bank directory with the selected extension
    must appear, once joined onto the bank path, among the paths stored
    in the bank's index.

    Parameters
    ----------
    bank
        A Bank (either WaveBank or EventBank) to verify.
    ext
        The file extension used to select files on disk. If None, the
        bank's own ``ext`` attribute is used when it is set, otherwise
        "mseed" (the previously hard-coded value).

    Raises
    ------
    AssertionError
        If a matching file on disk is not referenced by the index.
    """
    if ext is None:
        # A fixed "mseed" extension selects no files in a bank whose files
        # use another extension (EventBank defaults to ".xml"), so the loop
        # below would pass without comparing anything.
        # NOTE(review): assumes iter_files accepts the bank's ext value
        # as-is (e.g. with a leading dot) - confirm against iter_files.
        ext = getattr(bank, "ext", None) or "mseed"
    bank_path = Path(bank.bank_path)
    index = bank.read_index()
    # Index paths are joined onto the bank path before comparison.
    indexed_files = {bank_path / pth for pth in _natify_paths(index["path"])}
    for file_path in iter_files(str(bank_path), ext=ext):
        file_path = Path(file_path)
        assert file_path in indexed_files, f"{file_path} is missing from the index"
38 changes: 37 additions & 1 deletion tests/test_bank/test_eventbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from obsplus.constants import EVENT_DTYPES
from obsplus.exceptions import UnsupportedKeyword
from obsplus.utils.events import get_preferred
from obsplus.utils.testing import instrument_methods
from obsplus.utils.testing import check_index_paths, instrument_methods
from obsplus.utils.misc import suppress_warnings, get_progressbar


Expand Down Expand Up @@ -130,6 +130,24 @@ def ebank_low_version(self, tmpdir, monkeypatch):
assert obsplus.__last_version__ != self.low_version_str
return ebank

@pytest.fixture
def cust_ebank_index_path(self, tmpdir_factory):
    """Return the location of an index file inside a fresh temporary directory."""
    index_dir = tmpdir_factory.mktemp("custom_index")
    return index_dir / ".index.db"

@pytest.fixture
def cust_index_ebank(self, tmpdir_factory, cust_ebank_index_path):
    """
    Copy the bingham_test data set into a temporary directory and return
    an EventBank over its events whose index lives at the custom path.
    The index is updated before the bank is returned.
    """
    dataset_root = Path(str(tmpdir_factory.mktemp("bingham_test")))
    copy_dataset("bingham_test", dataset_root)
    event_dir = dataset_root / "bingham_test" / "events"
    bank = EventBank(event_dir, index_path=cust_ebank_index_path)
    bank.update_index()
    return bank

@pytest.fixture(scope="class")
def ebank_with_event_no_time(self, tmp_path_factory):
"""Create an event bank which has one file with no time."""
Expand Down Expand Up @@ -177,6 +195,24 @@ def test_read_index(self, bing_ebank, bingham_catalog):
assert isinstance(df, pd.DataFrame)
assert len(bingham_catalog) == len(df)

def test_custom_index_path(
    self, cust_index_ebank, cust_ebank_index_path, bingham_catalog
):
    """
    Ensure an EventBank honors a custom index path: the bank reports the
    path it was given, the index file exists there, the paths stored in
    the index resolve, and the index length matches the catalog length.
    """
    resolved = cust_index_ebank.index_path
    # the bank must report exactly the path it was given
    assert resolved == cust_ebank_index_path
    assert os.path.exists(resolved)
    # paths recorded in the index must resolve against the bank
    check_index_paths(cust_index_ebank)
    # extra check: the index length must equal the catalog length
    index_df = cust_index_ebank.read_index()
    assert isinstance(index_df, pd.DataFrame)
    assert len(bingham_catalog) == len(index_df)

def test_read_timestamp(self, bing_ebank):
"""read the current timestamp (after index has been updated)"""
bing_ebank.update_index()
Expand Down
35 changes: 25 additions & 10 deletions tests/test_bank/test_wavebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@
from obsplus.bank.wavebank import WaveBank
from obsplus.constants import NSLC, EMPTYTD64, WAVEFORM_DTYPES
from obsplus.exceptions import BankDoesNotExistError, UnsupportedKeyword
from obsplus.utils.misc import iter_files
from obsplus.utils.time import to_datetime64, to_timedelta64, to_utc
from obsplus.utils.bank import _natify_paths
from obsplus.utils.testing import check_index_paths
from obsplus import get_reference_time

# ----------------------------------- Helper functions
Expand Down Expand Up @@ -146,6 +145,19 @@ def default_bank_low_version(self, default_wbank, monkeypatch):
assert Path(default_wbank.bank_path).exists()
return default_wbank

@pytest.fixture
def cust_wbank_index_path(self, tmpdir_factory):
    """Return the location of an index file inside a fresh temporary directory."""
    index_dir = tmpdir_factory.mktemp("custom_index")
    return index_dir / ".index.h5"

@pytest.fixture
def cust_index_wbank(self, tmp_ta_dir, cust_wbank_index_path):
    """Return a WaveBank, with an updated index, that uses a custom index path."""
    wbank = WaveBank(
        os.path.join(tmp_ta_dir, "waveforms"), index_path=cust_wbank_index_path
    )
    wbank.update_index()
    return wbank

@pytest.fixture
def legacy_path_index(self, default_wbank, monkeypatch):
"""
Expand All @@ -172,21 +184,24 @@ def test_index(self, ta_bank_index):
assert os.path.exists(ta_bank_index.index_path)
assert isinstance(ta_bank_index.last_updated_timestamp, float)

def test_custom_index_path(self, cust_index_wbank, cust_wbank_index_path):
    """Ensure a WaveBank can keep its index at a custom path."""
    resolved = cust_index_wbank.index_path
    # the bank must report exactly the path it was given
    assert resolved == cust_wbank_index_path
    assert os.path.exists(resolved)
    timestamp = cust_index_wbank.last_updated_timestamp
    assert isinstance(timestamp, float)
    # paths recorded in the index must resolve against the bank
    check_index_paths(cust_index_wbank)

def test_create_index(self, ta_bank_no_index):
"""make sure a fresh index can be created"""
# test that just trying to get an index that doesnt exists creates it
ta_bank_no_index.read_index()
index_path = ta_bank_no_index.index_path
bank_path = ta_bank_no_index.bank_path
assert os.path.exists(index_path)
# make sure all file paths are in the index
index = ta_bank_no_index.read_index()
index_paths = _natify_paths(index["path"])
file_paths = set([ta_bank_no_index.bank_path / pth for pth in index_paths])
for file_path in iter_files(str(bank_path), ext="mseed"):
# go up two levels to match path reference
file_path = Path(file_path)
assert file_path in file_paths
check_index_paths(ta_bank_no_index)

def test_update_index_bumps_only_for_new_files(self, ta_bank_index):
"""
Expand Down

0 comments on commit 74897cc

Please sign in to comment.