From 1535202ed6ee0c82f9e30ca253aa086e8e8afcc0 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Mon, 20 May 2024 23:32:10 -0400 Subject: [PATCH] Raise exception for trying to set too large chunk in local cache --- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 8 ++--- .../LindiReferenceFileSystemStore.py | 8 ++--- lindi/LindiRemfile/LindiRemfile.py | 30 ++++++++----------- lindi/LocalCache/LocalCache.py | 14 +++++---- lindi/__init__.py | 2 +- tests/test_local_cache.py | 2 +- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index afec442..e40fcef 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -20,7 +20,7 @@ from ..conversion.h5_filters_to_codecs import h5_filters_to_codecs from ..conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data from ..LindiH5pyFile.LindiReferenceFileSystemStore import LindiReferenceFileSystemStore -from ..LocalCache.LocalCache import LocalCache +from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache from ..LindiRemfile.LindiRemfile import LindiRemfile from .LindiH5ZarrStoreOpts import LindiH5ZarrStoreOpts @@ -346,15 +346,15 @@ def _get_chunk_file_bytes(self, key_parent: str, key_name: str): buf = _read_bytes(self._file, byte_offset, byte_count) if self._local_cache is not None: assert self._url is not None, "Unexpected: url is None but local_cache is not None" - if byte_count < 1000 * 1000 * 900: + try: self._local_cache.put_remote_chunk( url=self._url, offset=byte_offset, size=byte_count, data=buf ) - else: - print(f"Warning: Not storing chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})") + except ChunkTooLargeError: + print(f"Warning: Unable to store chunk of size {byte_count} in local cache in LindiH5ZarrStore (key: {key_parent}/{key_name})") return buf def _get_chunk_file_bytes_data(self, key_parent: str, key_name: str): diff --git a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py index 19b2e80..48b2801 100644 --- a/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py +++ b/lindi/LindiH5pyFile/LindiReferenceFileSystemStore.py @@ -4,7 +4,7 @@ import requests from zarr.storage import Store as ZarrStore -from ..LocalCache.LocalCache import LocalCache +from ..LocalCache.LocalCache import ChunkTooLargeError, LocalCache class LindiReferenceFileSystemStore(ZarrStore): @@ -141,10 +141,10 @@ def __getitem__(self, key: str): return x val = _read_bytes_from_url_or_path(url, offset, length) if self.local_cache is not None: - if length < 1000 * 1000 * 900: + try: self.local_cache.put_remote_chunk(url=url, offset=offset, size=length, data=val) - else: - print(f'Warning: not caching chunk of size {length} om LindiReferenceFileSystemStore (key: {key})') + except ChunkTooLargeError: + print(f'Warning: unable to cache chunk of size {length} on LocalCache (key: {key})') return val else: # should not happen given checks in __init__, but self.rfs is mutable diff --git a/lindi/LindiRemfile/LindiRemfile.py b/lindi/LindiRemfile/LindiRemfile.py index af11f60..f19eb95 100644 --- a/lindi/LindiRemfile/LindiRemfile.py +++ b/lindi/LindiRemfile/LindiRemfile.py @@ -218,15 +218,12 @@ def _load_chunk(self, chunk_index: int) -> bytes: self._memory_chunks[chunk_index] = x if self._local_cache is not None: size = min(self._min_chunk_size, self.length - chunk_index * self._min_chunk_size) - if size < 1000 * 1000 * 900: - self._local_cache.put_remote_chunk( - url=self._url, - offset=chunk_index * self._min_chunk_size, - size=size, - data=x - ) - else: - raise Exception(f'Unexpected large size for chunk when caching: {size}') + self._local_cache.put_remote_chunk( + url=self._url, + offset=chunk_index * self._min_chunk_size, + size=size, + data=x + ) self._memory_chunk_indices.append(chunk_index) else: for i in range(self._smart_loader_chunk_sequence_length): @@ -242,15 +239,12 @@ def _load_chunk(self, chunk_index: int) -> bytes: data = x[i * self._min_chunk_size: (i + 1) * self._min_chunk_size] if len(data) != size: raise ValueError(f'Unexpected: len(data) != size: {len(data)} != {size}') - if size < 1000 * 1000 * 900: - self._local_cache.put_remote_chunk( - url=self._url, - offset=(chunk_index + i) * self._min_chunk_size, - size=size, - data=data - ) - else: - raise Exception(f'Unexpected large size for chunk when caching: {size}') + self._local_cache.put_remote_chunk( + url=self._url, + offset=(chunk_index + i) * self._min_chunk_size, + size=size, + data=data + ) self._smart_loader_last_chunk_index_accessed = ( chunk_index + self._smart_loader_chunk_sequence_length - 1 ) diff --git a/lindi/LocalCache/LocalCache.py b/lindi/LocalCache/LocalCache.py index 9b65586..f675f30 100644 --- a/lindi/LocalCache/LocalCache.py +++ b/lindi/LocalCache/LocalCache.py @@ -18,14 +18,13 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int): def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): if len(data) != size: raise ValueError("data size does not match size") - if size >= 1000 * 1000 * 900: - # This is a sqlite limitation/configuration - # https://www.sqlite.org/limits.html - # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine - raise ValueError("Cannot store blobs larger than 900 MB in LocalCache") self._sqlite_client.put_remote_chunk(url=url, offset=offset, size=size, data=data) +class ChunkTooLargeError(Exception): + pass + + class LocalCacheSQLiteClient: def __init__(self, *, db_fname: str): import sqlite3 @@ -63,6 +62,11 @@ def get_remote_chunk(self, *, url: str, offset: int, size: int): return row[0] def put_remote_chunk(self, *, url: str, offset: int, size: int, data: bytes): + if size >= 1000 * 1000 * 900: + # This is a sqlite limitation/configuration + # https://www.sqlite.org/limits.html + # For some reason 1000 * 1000 * 1000 seems to be too large, whereas 1000 * 1000 * 900 is fine + raise ChunkTooLargeError("Cannot store blobs larger than 900 MB in LocalCache") self._cursor.execute( """ INSERT OR REPLACE INTO remote_chunks (url, offset, size, data) VALUES (?, ?, ?, ?) diff --git a/lindi/__init__.py b/lindi/__init__.py index 628432e..f300066 100644 --- a/lindi/__init__.py +++ b/lindi/__init__.py @@ -1,5 +1,5 @@ from .LindiH5ZarrStore import LindiH5ZarrStore, LindiH5ZarrStoreOpts # noqa: F401 from .LindiH5pyFile import LindiH5pyFile, LindiH5pyGroup, LindiH5pyDataset, LindiH5pyHardLink, LindiH5pySoftLink # noqa: F401 from .LindiStagingStore import LindiStagingStore, StagingArea # noqa: F401 -from .LocalCache.LocalCache import LocalCache # noqa: F401 +from .LocalCache.LocalCache import LocalCache, ChunkTooLargeError # noqa: F401 from .File.File import File # noqa: F401 diff --git a/tests/test_local_cache.py b/tests/test_local_cache.py index eda23ec..7fdb857 100644 --- a/tests/test_local_cache.py +++ b/tests/test_local_cache.py @@ -62,7 +62,7 @@ def test_put_local_cache(): ) assert data == data2 data_too_large = b'x' * (1000 * 1000 * 900) - with pytest.raises(ValueError): + with pytest.raises(lindi.ChunkTooLargeError): local_cache.put_remote_chunk( url='dummy', offset=0,