diff --git a/cfgrib/__main__.py b/cfgrib/__main__.py
index 0404c4e6..73b6c62f 100644
--- a/cfgrib/__main__.py
+++ b/cfgrib/__main__.py
@@ -176,5 +176,28 @@ def dump(inpaths, variable, cdm, engine):
     print(ds_or_da)
 
 
+@cfgrib_cli.command("build_index")
+@click.argument("inpaths", nargs=-1, required=True)
+@click.option(
+    "--index-basedir",
+    default=None,
+    help="Folder that receives the index files, defaults to the cfgrib index location.",
+)
+@click.option(
+    "--force-index-creation",
+    is_flag=True,
+    default=False,
+    help="Delete any existing index file and build it again.",
+)
+def build_index(inpaths, index_basedir, force_index_creation):
+    # type: (T.List[str], T.Optional[str], bool) -> None
+    """Create the index file of every GRIB file in INPATHS."""
+    from .dataset import get_or_create_index
+
+    for fp in inpaths:
+        print(f"{fp}: Creating index")
+        get_or_create_index(str(fp), index_basedir, force_index_creation)
+
+
 if __name__ == "__main__":  # pragma: no cover
     cfgrib_cli()
diff --git a/cfgrib/dataset.py b/cfgrib/dataset.py
index bf5eea9a..209dc459 100644
--- a/cfgrib/dataset.py
+++ b/cfgrib/dataset.py
@@ -23,6 +23,7 @@
 import logging
 import os
 import typing as T
+from pathlib import Path
 
 import attr
 import numpy as np
@@ -797,3 +798,33 @@ def open_file(
     index = open_fileindex(stream, indexpath, index_keys, filter_by_keys=filter_by_keys)
 
     return open_from_index(index, read_keys, time_dims, extra_coords, errors=errors, **kwargs)
+
+
+def get_or_create_index(
+    fp: T.Union[str, Path],
+    index_basedir: T.Union[str, Path, None] = None,
+    force_index_creation: bool = False,
+) -> messages.FileIndex:
+    """Return the index of a GRIB file, writing the index file to disk when possible.
+
+    The index file is stored as ``<index_basedir>/<fp>.idx``. When ``index_basedir``
+    is ``None`` the ``messages.DEFAULT_INDEXPATH`` template is used instead.
+    With ``force_index_creation`` an existing index file is deleted and built again.
+    """
+    index_keys = compute_index_keys()
+    stream = messages.FileStream(str(fp))
+    if index_basedir is None:
+        # os.path.join(None, ...) raises TypeError, fall back to the library default
+        indexpath = messages.DEFAULT_INDEXPATH
+    else:
+        indexpath = os.path.join(str(index_basedir), "{path}.idx")
+        # the GRIB path is nested below the base folder, which may not exist yet
+        index_dir = os.path.dirname(indexpath.format(path=stream.path))
+        if index_dir:
+            os.makedirs(index_dir, exist_ok=True)
+    return messages.FileIndex.from_indexpath_or_filestream(
+        filestream=stream,
+        index_keys=index_keys,
+        indexpath=indexpath,
+        force_index_creation=force_index_creation,
+    )
diff --git a/cfgrib/messages.py b/cfgrib/messages.py
index f7d725fb..6aa365df 100644
--- a/cfgrib/messages.py
+++ b/cfgrib/messages.py
@@ -520,9 +520,10 @@ class FileIndex(FieldsetIndex):
 
     @classmethod
     def from_indexpath_or_filestream(
-        cls, filestream, index_keys, indexpath=DEFAULT_INDEXPATH, computed_keys={}, log=LOG
+        cls, filestream, index_keys, indexpath=DEFAULT_INDEXPATH, computed_keys={}, log=LOG,
+        force_index_creation=False
     ):
-        # type: (FileStream, T.Sequence[str], str, ComputedKeysType, logging.Logger) -> FileIndex
+        # type: (FileStream, T.Sequence[str], str, ComputedKeysType, logging.Logger, bool) -> FileIndex
 
         # Reading and writing the index can be explicitly suppressed by passing indexpath==''.
         if not indexpath:
@@ -530,6 +531,14 @@ def from_indexpath_or_filestream(
 
         hash = hashlib.md5(repr(index_keys).encode("utf-8")).hexdigest()
         indexpath = indexpath.format(path=filestream.path, hash=hash, short_hash=hash[:5])
+
+        if force_index_creation:
+            # drop the stale index so the exclusive create below builds it again
+            try:
+                os.unlink(indexpath)
+            except FileNotFoundError:
+                pass
+
         try:
             with compat_create_exclusive(indexpath) as new_index_file:
                 self = cls.from_fieldset(filestream, index_keys, computed_keys)
diff --git a/tests/test_30_dataset.py b/tests/test_30_dataset.py
index 5523d3ee..94acb316 100644
--- a/tests/test_30_dataset.py
+++ b/tests/test_30_dataset.py
@@ -324,3 +324,16 @@ def test_missing_field_values() -> None:
     t2 = res.variables["t2m"]
     assert np.isclose(np.nanmean(t2.data[0, :, :]), 268.375)
     assert np.isclose(np.nanmean(t2.data[1, :, :]), 270.716)
+
+
+def test_get_or_create_index(tmpdir) -> None:
+    index_basedir = os.path.join(str(tmpdir), "indexes")
+    indexpath = os.path.join(index_basedir, "{path}.idx").format(path=TEST_DATA)
+
+    index = dataset.get_or_create_index(TEST_DATA, index_basedir)
+    assert isinstance(index, messages.FileIndex)
+    assert os.path.exists(indexpath)
+
+    index = dataset.get_or_create_index(TEST_DATA, index_basedir, force_index_creation=True)
+    assert isinstance(index, messages.FileIndex)
+    assert os.path.exists(indexpath)