diff --git a/docs/source/_static/images/logo_small.png b/docs/source/_static/images/logo_small.png index 3f3e885..101b6aa 100644 Binary files a/docs/source/_static/images/logo_small.png and b/docs/source/_static/images/logo_small.png differ diff --git a/extras/fileformats/extras/application/archive.py b/extras/fileformats/extras/application/archive.py index 1d7bff1..5ca5bba 100644 --- a/extras/fileformats/extras/application/archive.py +++ b/extras/fileformats/extras/application/archive.py @@ -238,4 +238,4 @@ def relative_path(path: PathType, base_dir: PathType) -> str: f"Cannot add {path} to archive as it is not a " f"subdirectory of {base_dir}" ) - return relpath # type: ignore[no-any-return] + return relpath diff --git a/extras/fileformats/extras/application/serialization.py b/extras/fileformats/extras/application/serialization.py index 65999d9..0fdb189 100644 --- a/extras/fileformats/extras/application/serialization.py +++ b/extras/fileformats/extras/application/serialization.py @@ -24,7 +24,7 @@ def convert_data_serialization( output_path = out_dir / ( in_file.fspath.stem + (output_format.ext if output_format.ext else "") ) - return output_format.save_new(output_path, dct) # type: ignore[no-any-return] + return output_format.save_new(output_path, dct) @extra_implementation(DataSerialization.load) diff --git a/extras/fileformats/extras/image/converters.py b/extras/fileformats/extras/image/converters.py index 89266b9..db01286 100644 --- a/extras/fileformats/extras/image/converters.py +++ b/extras/fileformats/extras/image/converters.py @@ -25,4 +25,4 @@ def convert_image( output_path = out_dir / ( in_file.fspath.stem + (output_format.ext if output_format.ext else "") ) - return output_format.save_new(output_path, data_array) # type: ignore[no-any-return] + return output_format.save_new(output_path, data_array) diff --git a/fileformats/core/fileset.py b/fileformats/core/fileset.py index 5d87abb..7b27475 100644 --- a/fileformats/core/fileset.py +++ b/fileformats/core/fileset.py @@ -17,6 +17,7 @@ from fileformats.core.typing import Self from .utils import ( classproperty, + mtime_cached_property, fspaths_converter, describe_task, matching_source, @@ -67,7 +68,10 @@ class FileSet(DataType): a set of file-system paths pointing to all the resources in the file-set metadata : dict[str, Any] metadata associated with the file-set, typically lazily loaded via `read_metadata` - extra hook + extra hook but can be provided directly at the time of instantiation + metadata_keys : list[str] + the keys of the metadata to load when the `metadata` property is called. Provided + to allow for selective loading of metadata fields for performance reasons. """ # Class attributes @@ -87,14 +91,17 @@ class FileSet(DataType): # Member attributes fspaths: ty.FrozenSet[Path] - _metadata: ty.Union[ty.Mapping[str, ty.Any], bool, None] + _explicit_metadata: ty.Optional[ty.Mapping[str, ty.Any]] + _metadata_keys: ty.Optional[ty.List[str]] def __init__( self, fspaths: FspathsInputType, - metadata: ty.Union[ty.Dict[str, ty.Any], bool, None] = False, + metadata: ty.Optional[ty.Dict[str, ty.Any]] = None, + metadata_keys: ty.Optional[ty.List[str]] = None, ): - self._metadata = metadata + self._explicit_metadata = metadata + self._metadata_keys = metadata_keys self._validate_class() self.fspaths = fspaths_converter(fspaths) self._validate_fspaths() @@ -179,6 +186,18 @@ def relative_fspaths(self) -> ty.Iterator[Path]: "Paths for all top-level paths in the file-set relative to the common parent directory" return (p.relative_to(self.parent) for p in self.fspaths) + @property + def mtimes(self) -> ty.Tuple[ty.Tuple[str, float], ...]: + """Modification times of all fspaths in the file-set + + Returns + ------- + tuple[tuple[str, float], ...] + a tuple of tuples containing the file paths and the modification time sorted + by the file path + """ + return tuple((str(p), p.stat().st_mtime) for p in sorted(self.fspaths)) + @classproperty def mime_type(cls) -> str: """Generates a MIME type (IANA) identifier from a format class. If an official @@ -225,41 +244,22 @@ def possible_exts(cls) -> ty.List[ty.Optional[str]]: pass return possible - @property + @mtime_cached_property def metadata(self) -> ty.Mapping[str, ty.Any]: """Lazily load metadata from `read_metadata` extra if implemented, returning an empty metadata array if not""" - if self._metadata is not False: - assert self._metadata is not True - return self._metadata if self._metadata else {} + if self._explicit_metadata is not None: + return self._explicit_metadata try: - self._metadata = self.read_metadata() + metadata = self.read_metadata(selected_keys=self._metadata_keys) except FileFormatsExtrasPkgUninstalledError: raise except FileFormatsExtrasPkgNotCheckedError as e: logger.warning(str(e)) - self._metadata = None + metadata = {} except FileFormatsExtrasError: - self._metadata = None - return self._metadata if self._metadata else {} - - def select_metadata( - self, selected_keys: ty.Union[ty.Sequence[str], None] = None - ) -> None: - """Selects a subset of the metadata to be read and stored instead all available - (i.e for performance reasons). - - Parameters - ---------- - selected_keys : Union[Sequence[str], None] - the keys of the values to load. If None, all values are loaded - """ - if not ( - isinstance(self._metadata, dict) - and selected_keys is not None - and set(selected_keys).issubset(self._metadata) - ): - self._metadata = dict(self.read_metadata(selected_keys)) + metadata = {} + return metadata @extra def read_metadata( @@ -672,7 +672,7 @@ def formats_by_iana_mime(cls) -> ty.Dict[str, ty.Type["FileSet"]]: """a dictionary containing all formats by their IANA MIME type (if applicable)""" if cls._formats_by_iana_mime is None: cls._formats_by_iana_mime = { - f.iana_mime: f # type: ignore + f.iana_mime: f # type: ignore[misc] for f in FileSet.all_formats if f.__dict__.get("iana_mime") is not None } diff --git a/fileformats/core/tests/test_metadata.py b/fileformats/core/tests/test_metadata.py index 98cc0a2..90a1674 100644 --- a/fileformats/core/tests/test_metadata.py +++ b/fileformats/core/tests/test_metadata.py @@ -1,5 +1,6 @@ import typing as ty import pytest +import time from fileformats.core import FileSet, extra_implementation from fileformats.generic import File @@ -21,7 +22,7 @@ def aformat_read_metadata( @pytest.fixture -def file_with_metadata(tmp_path): +def file_with_metadata_fspath(tmp_path): metadata = { "a": 1, "b": 2, @@ -32,27 +33,46 @@ def file_with_metadata(tmp_path): fspath = tmp_path / "metadata-file.mf" with open(fspath, "w") as f: f.write("\n".join("{}:{}".format(*t) for t in metadata.items())) - return FileWithMetadata(fspath) + return fspath -def test_metadata(file_with_metadata): +def test_metadata(file_with_metadata_fspath): + file_with_metadata = FileWithMetadata(file_with_metadata_fspath) assert file_with_metadata.metadata["a"] == "1" assert sorted(file_with_metadata.metadata) == ["a", "b", "c", "d", "e"] -def test_select_metadata(file_with_metadata): - file_with_metadata.select_metadata(["a", "b", "c"]) +def test_select_metadata(file_with_metadata_fspath): + file_with_metadata = FileWithMetadata( + file_with_metadata_fspath, metadata_keys=["a", "b", "c"] + ) assert file_with_metadata.metadata["a"] == "1" assert sorted(file_with_metadata.metadata) == ["a", "b", "c"] -def test_select_metadata_reload(file_with_metadata): - file_with_metadata.select_metadata(["a", "b", "c"]) +def test_explicit_metadata(file_with_metadata_fspath): + file_with_metadata = FileWithMetadata( + file_with_metadata_fspath, + metadata={ + "a": 1, + "b": 2, + "c": 3, + }, + ) + # Check that we use the explicitly provided metadata and not one from the file + # contents assert sorted(file_with_metadata.metadata) == ["a", "b", "c"] - # add new metadata line to check that it isn't loaded + # add new metadata line to check and check that it isn't reloaded with open(file_with_metadata, "a") as f: f.write("\nf:6") - file_with_metadata.select_metadata(["a", "b"]) assert sorted(file_with_metadata.metadata) == ["a", "b", "c"] - file_with_metadata.select_metadata(None) + + +def test_metadata_reload(file_with_metadata_fspath): + file_with_metadata = FileWithMetadata(file_with_metadata_fspath) + assert sorted(file_with_metadata.metadata) == ["a", "b", "c", "d", "e"] + # add new metadata line to check and check that it is reloaded + time.sleep(2) + with open(file_with_metadata, "a") as f: + f.write("\nf:6") assert sorted(file_with_metadata.metadata) == ["a", "b", "c", "d", "e", "f"] diff --git a/fileformats/core/tests/test_utils.py b/fileformats/core/tests/test_utils.py index 071fdf7..21ce5fc 100644 --- a/fileformats/core/tests/test_utils.py +++ b/fileformats/core/tests/test_utils.py @@ -8,6 +8,7 @@ from fileformats.generic import File, Directory, FsObject from fileformats.core.mixin import WithSeparateHeader from fileformats.core.exceptions import UnsatisfiableCopyModeError +from fileformats.core.utils import mtime_cached_property from conftest import write_test_file @@ -365,3 +366,44 @@ def test_hash_files(fsobject: FsObject, work_dir: Path, dest_dir: Path): ) cpy = fsobject.copy(dest_dir) assert cpy.hash_files() == fsobject.hash_files() + + +class MtimeTestFile(File): + + flag: int + + @mtime_cached_property + def cached_prop(self): + return self.flag + + +def test_mtime_cached_property(tmp_path: Path): + fspath = tmp_path / "file_1.txt" + fspath.write_text("hello") + + file = MtimeTestFile(fspath) + + file.flag = 0 + assert file.cached_prop == 0 + # Need a long delay to ensure the mtime changes on Ubuntu and particularly on Windows + # On MacOS, the mtime resolution is much higher so not usually an issue. Use + # explicitly cache clearing if needed + time.sleep(2) + file.flag = 1 + assert file.cached_prop == 0 + time.sleep(2) + fspath.write_text("world") + assert file.cached_prop == 1 + + +def test_mtime_cached_property_force_clear(tmp_path: Path): + fspath = tmp_path / "file_1.txt" + fspath.write_text("hello") + + file = MtimeTestFile(fspath) + + file.flag = 0 + assert file.cached_prop == 0 + file.flag = 1 + MtimeTestFile.cached_prop.clear(file) + assert file.cached_prop == 1 diff --git a/fileformats/core/typing.py b/fileformats/core/typing.py index d7de7b6..c3bc219 100644 --- a/fileformats/core/typing.py +++ b/fileformats/core/typing.py @@ -22,4 +22,10 @@ PathType: TypeAlias = ty.Union[str, Path] -__all__ = ["CryptoMethod", "FspathsInputType", "PathType", "TypeAlias", "Self"] +__all__ = [ + "CryptoMethod", + "FspathsInputType", + "PathType", + "TypeAlias", + "Self", +] diff --git a/fileformats/core/utils.py b/fileformats/core/utils.py index 98c7101..e7a5845 100644 --- a/fileformats/core/utils.py +++ b/fileformats/core/utils.py @@ -6,6 +6,7 @@ from types import ModuleType import urllib.request import urllib.error +from threading import RLock import os import logging import pkgutil @@ -97,22 +98,22 @@ def fspaths_converter(fspaths: FspathsInputType) -> ty.FrozenSet[Path]: PropReturn = ty.TypeVar("PropReturn") + +def classproperty(meth: ty.Callable[..., PropReturn]) -> PropReturn: + """Access a @classmethod like a @property.""" + # mypy doesn't understand class properties yet: https://github.com/python/mypy/issues/2563 + return classmethod(property(meth)) # type: ignore + + if sys.version_info[:2] < (3, 9): - class classproperty(object): + class classproperty(object): # type: ignore[no-redef] # noqa def __init__(self, f: ty.Callable[[ty.Type[ty.Any]], ty.Any]): self.f = f def __get__(self, obj: ty.Any, owner: ty.Any) -> ty.Any: return self.f(owner) -else: - - def classproperty(meth: ty.Callable[..., PropReturn]) -> PropReturn: - """Access a @classmethod like a @property.""" - # mypy doesn't understand class properties yet: https://github.com/python/mypy/issues/2563 - return classmethod(property(meth)) # type: ignore - def add_exc_note(e: Exception, note: str) -> Exception: """Adds a note to an exception in a Python <3.11 compatible way @@ -248,3 +249,54 @@ def import_extras_module(klass: ty.Type["fileformats.core.DataType"]) -> ExtrasM else: extras_imported = True return ExtrasModule(extras_imported, extras_pkg, extras_pypi) + + +ReturnType = ty.TypeVar("ReturnType") + + +class mtime_cached_property: + """A property that is cached until the mtimes of the files in the fileset are changed""" + + def __init__(self, func: ty.Callable[..., ty.Any]): + self.func = func + self.__doc__ = func.__doc__ + self.lock = RLock() + + @property + def _cache_name(self) -> str: + return f"_{self.func.__name__}_mtime_cache" + + def clear(self, instance: "fileformats.core.FileSet") -> None: + """Forcibly clear the cache""" + del instance.__dict__[self._cache_name] + + def __get__( + self, + instance: ty.Optional["fileformats.core.FileSet"], + owner: ty.Optional[ty.Type["fileformats.core.FileSet"]] = None, + ) -> ty.Any: + if instance is None: # if accessing property from class not instance + return self + assert isinstance(instance, fileformats.core.FileSet), ( + "Cannot use mtime_cached_property instance with " + f"{type(instance).__name__!r} object, only FileSet objects." + ) + try: + mtimes, value = instance.__dict__[self._cache_name] + except KeyError: + pass + else: + if instance.mtimes == mtimes: + return value + with self.lock: + # check if another thread filled cache while we awaited lock + try: + mtimes, value = instance.__dict__[self._cache_name] + except KeyError: + pass + else: + if instance.mtimes == mtimes: + return value + value = self.func(instance) + instance.__dict__[self._cache_name] = (instance.mtimes, value) + return value diff --git a/fileformats/generic/file.py b/fileformats/generic/file.py index 22e5b6e..ba096eb 100644 --- a/fileformats/generic/file.py +++ b/fileformats/generic/file.py @@ -5,7 +5,7 @@ FormatMismatchError, UnconstrainedExtensionException, ) -from fileformats.core.utils import classproperty +from fileformats.core.utils import classproperty, mtime_cached_property from .fsobject import FsObject @@ -79,7 +79,7 @@ def copy_ext( ) return Path(new_path).with_suffix(suffix) - @property + @mtime_cached_property def contents(self) -> ty.Union[str, bytes]: return self.read_contents() @@ -125,7 +125,7 @@ def actual_ext(self) -> str: "(i.e. matches the None extension)" ) # Return the longest matching extension, useful for optional extensions - return sorted(matching, key=len)[-1] # type: ignore[no-any-return] + return sorted(matching, key=len)[-1] @property def stem(self) -> str: diff --git a/fileformats/generic/set.py b/fileformats/generic/set.py index 200038d..2da783b 100644 --- a/fileformats/generic/set.py +++ b/fileformats/generic/set.py @@ -3,6 +3,7 @@ from fileformats.core.exceptions import ( FormatMismatchError, ) +from fileformats.core.utils import mtime_cached_property from fileformats.core.mixin import WithClassifiers @@ -12,7 +13,7 @@ class TypedSet(FileSet): content_types: ty.Tuple[ty.Type[FileSet], ...] = () - @property + @mtime_cached_property def contents(self) -> ty.Iterable[FileSet]: for content_type in self.content_types: for p in self.fspaths: diff --git a/fileformats/testing/classifiers/file.py b/fileformats/testing/classifiers/file.py index a71b778..955dd33 100644 --- a/fileformats/testing/classifiers/file.py +++ b/fileformats/testing/classifiers/file.py @@ -84,9 +84,10 @@ class N(WithClassifiers, File): @attrs.define -class TestField(Singular[ty.Any, ty.Any]): +class TestField(Singular[ty.Any, str]): value: ty.Any + primitive = str class P(WithClassifiers, File):