Skip to content

Commit

Permalink
Merge pull request #553 from djhoese/cache-area-slices
Browse files Browse the repository at this point in the history
Add optional caching to AreaDefinition.get_area_slices
  • Loading branch information
mraspaud authored Nov 20, 2023
2 parents 6a8afc0 + b0a2579 commit d8f45cd
Show file tree
Hide file tree
Showing 8 changed files with 411 additions and 148 deletions.
61 changes: 61 additions & 0 deletions docs/source/howtos/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,67 @@ Or for specific blocks of code:
Similarly, if you need to access one of the values you can
use the ``pyresample.config.get`` method.
Cache Directory
^^^^^^^^^^^^^^^
* **Environment variable**: ``PYRESAMPLE_CACHE_DIR``
* **YAML/Config Key**: ``cache_dir``
* **Default**: See below
Directory where any files cached by Pyresample will be stored. This
directory is not necessarily cleared out by Pyresample, but is rarely used
without explicitly being enabled by the user. This
defaults to a different path depending on your operating system following
the `platformdirs <https://github.com/platformdirs/platformdirs#example-output>`_
"user cache dir".
.. note::
Some resampling algorithms provide caching functionality when the user
provides a directory to cache to. These resamplers do not currently use this
configuration option.
.. _config_cache_sensor_angles_setting:
Cache Geometry Slices
^^^^^^^^^^^^^^^^^^^^^
* **Environment variable**: ``PYRESAMPLE_CACHE_GEOMETRY_SLICES``
* **YAML/Config Key**: ``cache_geometry_slices``
* **Default**: ``False``
Whether or not generated slices for geometry objects are cached to disk.
These slices are used in various parts of Pyresample like
cropping or overlap calculations including those performed in some resampling
algorithms. At the time of writing this is only performed on
``AreaDefinition`` objects through their
:meth:`~pyresample.geometry.AreaDefinition.get_area_slices` method.
Slices are stored in ``cache_dir`` (see above).
Unlike other caching performed in Pyresample where potentially large arrays
are cached, this option saves a pair of ``slice`` objects that consist of
only 3 integers each. This makes the amount of space used in the cache very
small for many cached results.
The slicing operations in Pyresample typically involve finding the intersection
between two geometries. This requires generating bounding polygons for the
geometries and doing polygon intersection calculations that can handle
projection anti-meridians. At the time of writing these calculations can take
as long as 15 seconds depending on number of vertices used in the bounding
polygons. One use case for these slices is reducing input data to only the
overlap of the target area. This can be done before or during resampling as
part of the algorithm or as part of a third-party resampling interface
(ex. Satpy). In the future as optimizations are made to the polygon
intersection logic this caching option should hopefully not be needed.
When setting this as an environment variable, this should be set with the
string equivalent of the Python boolean values ``="True"`` or ``="False"``.
.. warning::
This caching does not limit the number of entries nor does it expire old
entries. It is up to the user to manage the contents of the cache
directory.
Feature Flags
-------------
Expand Down
128 changes: 128 additions & 0 deletions pyresample/_caching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Various tools for caching.
These tools are rarely needed by users and are used where they make sense
throughout pyresample.
"""
from __future__ import annotations

import hashlib
import json
import os
import warnings
from functools import update_wrapper
from glob import glob
from pathlib import Path
from typing import Any, Callable

import pyresample


class JSONCacheHelper:
"""Decorator class to cache results to a JSON file on-disk."""

def __init__(
self,
func: Callable,
cache_config_key: str,
cache_version: int = 1,
):
self._callable = func
self._cache_config_key = cache_config_key
self._cache_version = cache_version
self._uncacheable_arg_type_names = ("",)

@staticmethod
def cache_clear(cache_dir: str | None = None):
"""Remove all on-disk files associated with this function.
Intended to mimic the :func:`functools.cache` behavior.
"""
cache_dir = _get_cache_dir_from_config(cache_dir=cache_dir, cache_version="*")
for json_file in glob(str(cache_dir / "*.json")):
os.remove(json_file)

def __call__(self, *args):
"""Call decorated function and cache the result to JSON."""
should_cache = pyresample.config.get(self._cache_config_key, False)
if not should_cache:
return self._callable(*args)

try:
arg_hash = _hash_args(args)
except TypeError as err:
warnings.warn("Cannot cache function due to unhashable argument: " + str(err),
stacklevel=2)
return self._callable(*args)

return self._run_and_cache(arg_hash, args)

def _run_and_cache(self, arg_hash: str, args: tuple[Any]) -> Any:
base_cache_dir = _get_cache_dir_from_config(cache_version=self._cache_version)
json_path = base_cache_dir / f"{arg_hash}.json"
if not json_path.is_file():
res = self._callable(*args)
json_path.parent.mkdir(exist_ok=True)
with open(json_path, "w") as json_cache:
json.dump(res, json_cache, cls=_JSONEncoderWithSlice)

# for consistency, always load the cached result
with open(json_path, "r") as json_cache:
res = json.load(json_cache, object_hook=_object_hook)
return res


def _get_cache_dir_from_config(cache_dir: str | None = None, cache_version: int | str = 1) -> Path:
cache_dir = cache_dir or pyresample.config.get("cache_dir")
if cache_dir is None:
raise RuntimeError("Can't use JSON caching. No 'cache_dir' configured.")
subdir = f"geometry_slices_v{cache_version}"
return Path(cache_dir) / subdir


def _hash_args(args: tuple[Any]) -> str:
from pyresample.future.geometry import AreaDefinition, SwathDefinition
from pyresample.geometry import AreaDefinition as LegacyAreaDefinition
from pyresample.geometry import SwathDefinition as LegacySwathDefinition

hashable_args = []
for arg in args:
if isinstance(arg, (SwathDefinition, LegacySwathDefinition)):
raise TypeError(f"Unhashable type ({type(arg)})")
if isinstance(arg, (AreaDefinition, LegacyAreaDefinition)):
arg = hash(arg)
hashable_args.append(arg)
arg_hash = hashlib.sha1() # nosec
arg_hash.update(json.dumps(tuple(hashable_args)).encode("utf8"))
return arg_hash.hexdigest()


class _JSONEncoderWithSlice(json.JSONEncoder):
def default(self, obj: Any) -> Any:
if isinstance(obj, slice):
return {"__slice__": True, "start": obj.start, "stop": obj.stop, "step": obj.step}
return super().default(obj)


def _object_hook(obj: object) -> Any:
if isinstance(obj, dict) and obj.get("__slice__", False):
return slice(obj["start"], obj["stop"], obj["step"])
return obj


def cache_to_json_if(cache_config_key: str) -> Callable:
"""Decorate a function and cache the results to a JSON file on disk.
This caching only happens if the ``pyresample.config`` boolean value for
the provided key is ``True`` as well as some other conditions. See
:class:`JSONCacheHelper` for more information. Most importantly this
decorator does not limit how many items can be cached and does not clear
out old entries. It is up to the user to manage the size of the cache.
"""
def _decorator(func: Callable) -> Callable:
zarr_cacher = JSONCacheHelper(func, cache_config_key)
wrapper = update_wrapper(zarr_cacher, func)
return wrapper

return _decorator
3 changes: 2 additions & 1 deletion pyresample/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from donfig import Config

BASE_PATH = os.path.dirname(os.path.realpath(__file__))
# FIXME: Use package_resources?
PACKAGE_CONFIG_PATH = os.path.join(BASE_PATH, 'etc')

_user_config_dir = platformdirs.user_config_dir("pyresample", "pytroll")
Expand All @@ -36,6 +35,8 @@
config = Config(
"pyresample",
defaults=[{
"cache_dir": platformdirs.user_cache_dir("pyresample", "pytroll"),
"cache_geometry_slices": False,
"features": {
"future_geometries": False,
},
Expand Down
131 changes: 131 additions & 0 deletions pyresample/future/geometry/_subset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""Functions and tools for subsetting a geometry object."""
from __future__ import annotations

import math
from typing import TYPE_CHECKING, Any

import numpy as np

# this caching module imports the geometries so this subset module
# must be imported inside functions in the geometry modules if needed
# to avoid circular dependencies
from pyresample._caching import cache_to_json_if
from pyresample.boundary import Boundary
from pyresample.geometry import get_geostationary_bounding_box_in_lonlats, logger
from pyresample.utils import check_slice_orientation

if TYPE_CHECKING:
from pyresample import AreaDefinition


@cache_to_json_if("cache_geometry_slices")
def get_area_slices(
src_area: AreaDefinition,
area_to_cover: AreaDefinition,
shape_divisible_by: int | None,
) -> tuple[slice, slice]:
"""Compute the slice to read based on an `area_to_cover`."""
if not _is_area_like(src_area):
raise NotImplementedError(f"Only AreaDefinitions are supported, not {type(src_area)}")
if not _is_area_like(area_to_cover):
raise NotImplementedError(f"Only AreaDefinitions are supported, not {type(area_to_cover)}")

# Intersection only required for two different projections
proj_def_to_cover = area_to_cover.crs
proj_def = src_area.crs
if proj_def_to_cover == proj_def:
logger.debug('Projections for data and slice areas are identical: %s',
proj_def_to_cover)
# Get slice parameters
xstart, xstop, ystart, ystop = _get_slice_starts_stops(src_area, area_to_cover)

x_slice = check_slice_orientation(slice(xstart, xstop))
y_slice = check_slice_orientation(slice(ystart, ystop))
x_slice = _ensure_integer_slice(x_slice)
y_slice = _ensure_integer_slice(y_slice)
return x_slice, y_slice

data_boundary = _get_area_boundary(src_area)
area_boundary = _get_area_boundary(area_to_cover)
intersection = data_boundary.contour_poly.intersection(
area_boundary.contour_poly)
if intersection is None:
logger.debug('Cannot determine appropriate slicing. '
"Data and projection area do not overlap.")
raise NotImplementedError
x, y = src_area.get_array_indices_from_lonlat(
np.rad2deg(intersection.lon), np.rad2deg(intersection.lat))
x_slice = slice(np.ma.min(x), np.ma.max(x) + 1)
y_slice = slice(np.ma.min(y), np.ma.max(y) + 1)
x_slice = _ensure_integer_slice(x_slice)
y_slice = _ensure_integer_slice(y_slice)
if shape_divisible_by is not None:
x_slice = _make_slice_divisible(x_slice, src_area.width,
factor=shape_divisible_by)
y_slice = _make_slice_divisible(y_slice, src_area.height,
factor=shape_divisible_by)

return (check_slice_orientation(x_slice),
check_slice_orientation(y_slice))


def _is_area_like(area_obj: Any) -> bool:
return hasattr(area_obj, "crs") and hasattr(area_obj, "area_extent")


def _get_slice_starts_stops(src_area, area_to_cover):
"""Get x and y start and stop points for slicing."""
llx, lly, urx, ury = area_to_cover.area_extent
x, y = src_area.get_array_coordinates_from_projection_coordinates([llx, urx], [lly, ury])

# we use `round` because we want the *exterior* of the pixels to contain the area_to_cover's area extent.
if (src_area.area_extent[0] > src_area.area_extent[2]) ^ (llx > urx):
xstart = max(0, round(x[1]))
xstop = min(src_area.width, round(x[0]) + 1)
else:
xstart = max(0, round(x[0]))
xstop = min(src_area.width, round(x[1]) + 1)
if (src_area.area_extent[1] > src_area.area_extent[3]) ^ (lly > ury):
ystart = max(0, round(y[0]))
ystop = min(src_area.height, round(y[1]) + 1)
else:
ystart = max(0, round(y[1]))
ystop = min(src_area.height, round(y[0]) + 1)

return xstart, xstop, ystart, ystop


def _get_area_boundary(area_to_cover: AreaDefinition) -> Boundary:
try:
if area_to_cover.is_geostationary:
return Boundary(*get_geostationary_bounding_box_in_lonlats(area_to_cover))
boundary_shape = max(max(*area_to_cover.shape) // 100 + 1, 3)
return area_to_cover.boundary(frequency=boundary_shape, force_clockwise=True)
except ValueError:
raise NotImplementedError("Can't determine boundary of area to cover")


def _make_slice_divisible(sli, max_size, factor=2):
"""Make the given slice even in size."""
rem = (sli.stop - sli.start) % factor
if rem != 0:
adj = factor - rem
if sli.stop + 1 + rem < max_size:
sli = slice(sli.start, sli.stop + adj)
elif sli.start > 0:
sli = slice(sli.start - adj, sli.stop)
else:
sli = slice(sli.start, sli.stop - rem)

return sli


def _ensure_integer_slice(sli):
start = sli.start
stop = sli.stop
step = sli.step
return slice(
math.floor(start) if start is not None else None,
math.ceil(stop) if stop is not None else None,
math.floor(step) if step is not None else None
)
Loading

0 comments on commit d8f45cd

Please sign in to comment.