Skip to content

Commit

Permalink
Merge pull request #76 from ArcanaFramework/negative-file-offsets
Browse files Browse the repository at this point in the history
Hardens mtime_cached_property to invalidate the cache for poor mtime resolutions
  • Loading branch information
tclose authored Sep 12, 2024
2 parents 5b84d2b + 356ca38 commit 86f0f68
Show file tree
Hide file tree
Showing 18 changed files with 253 additions and 171 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@ exclude =
docs/source/conf.py
max-line-length = 88
select = C,E,F,W,B,B950
extend-ignore = E203,E501,E129,W503
extend-ignore = E203,E501,E129,W503,E704
per-file-ignores =
setup.py:F401
73 changes: 37 additions & 36 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,39 +1,40 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
exclude: ^(fileformats/core/_version\.py)$
args:
- -l 88
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
exclude: ^(fileformats/core/_version\.py)$
args:
- --ignore-words=.codespell-ignorewords
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
- id: flake8
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.2
hooks:
- id: mypy
args:
[
--strict,
--install-types,
--non-interactive,
--no-warn-unused-ignores,
]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
exclude: ^(fileformats/core/_version\.py)$
args:
- -l 88
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
exclude: ^(fileformats/core/_version\.py)$
args:
- --ignore-words=.codespell-ignorewords
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
- id: flake8
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.2
hooks:
- id: mypy
args:
[
--strict,
--install-types,
--non-interactive,
--no-warn-unused-ignores,
]
exclude: tests
2 changes: 2 additions & 0 deletions fileformats/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from .sampling import SampleFileGenerator
from .extras import extra, extra_implementation, converter
from .decorators import mtime_cached_property

__all__ = [
"__version__",
Expand All @@ -27,4 +28,5 @@
"extra",
"extra_implementation",
"converter",
"mtime_cached_property",
]
2 changes: 1 addition & 1 deletion fileformats/core/classifier.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import typing as ty
from .utils import classproperty
from .decorators import classproperty
from .exceptions import FormatDefinitionError


Expand Down
2 changes: 1 addition & 1 deletion fileformats/core/datatype.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
FormatConversionError,
FormatRecognitionError,
)
from .decorators import classproperty
from .utils import (
classproperty,
subpackages,
add_exc_note,
)
Expand Down
114 changes: 114 additions & 0 deletions fileformats/core/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import sys
import typing as ty
from pathlib import Path
import time
from threading import RLock
import fileformats.core
from .fs_mount_identifier import FsMountIdentifier


PropReturn = ty.TypeVar("PropReturn")


__all__ = ["mtime_cached_property", "classproperty"]


class mtime_cached_property:
"""A property that is cached until the mtimes of the files in the fileset are changed"""

def __init__(self, func: ty.Callable[..., ty.Any]):
self.func = func
self.__doc__ = func.__doc__
self.lock = RLock()
self._cache_name = f"_{func.__name__}_mtime_cache"

def __get__(
self,
instance: ty.Optional["fileformats.core.FileSet"],
owner: ty.Optional[ty.Type["fileformats.core.FileSet"]] = None,
) -> ty.Any:
if instance is None: # if accessing property from class not instance
return self
assert isinstance(instance, fileformats.core.FileSet), (
"Cannot use mtime_cached_property instance with "
f"{type(instance).__name__!r} object, only FileSet objects."
)
try:
mtimes, value = instance.__dict__[self._cache_name]
except KeyError:
pass
else:
if (
instance.mtimes == mtimes
and enough_time_has_elapsed_given_mtime_resolution(mtimes)
):
return value
else:
del instance.__dict__[self._cache_name]
with self.lock:
# check if another thread filled cache while we awaited lock
try:
mtimes, value = instance.__dict__[self._cache_name]
except KeyError:
pass
else:
if (
instance.mtimes == mtimes
and enough_time_has_elapsed_given_mtime_resolution(mtimes)
):
return value
value = self.func(instance)
instance.__dict__[self._cache_name] = (instance.mtimes, value)
return value


def classproperty(meth: ty.Callable[..., PropReturn]) -> PropReturn:
"""Access a @classmethod like a @property."""
# mypy doesn't understand class properties yet: https://github.com/python/mypy/issues/2563
return classmethod(property(meth)) # type: ignore


if sys.version_info[:2] < (3, 9):

class classproperty(object): # type: ignore[no-redef] # noqa
def __init__(self, f: ty.Callable[[ty.Type[ty.Any]], ty.Any]):
self.f = f

def __get__(self, obj: ty.Any, owner: ty.Any) -> ty.Any:
return self.f(owner)


def enough_time_has_elapsed_given_mtime_resolution(
mtimes: ty.Iterable[ty.Tuple[Path, ty.Union[int, float]]],
current_time: ty.Optional[int] = None,
) -> bool:
"""Determines whether enough time has elapsed since the the last of the cached mtimes
to be sure that changes in mtimes will be detected given the resolution of the mtimes
on the file system. For example, on systems with a mtime resolution of a second,
a change in mtime may not be detected if the cache is re-read within a second and
the file is modified in the intervening period (probably only likely during tests).
So this function guesses the resolution of the mtimes by the
minimum number of trailing zeros in the mtimes and then checks if enough time has
passed to be sure that any changes in mtimes will be detected.
This may result in false negatives for systems with low mtime resolutions, but
this will just result in (presumably minor) performance hit via unnecessary cache
invalidations.
Parameters
----------
mtimes : Iterable[tuple[Path, int]]
the path-mtime pairs in nanoseconds to guess the resolution of
Returns
-------
bool
whether enough time has elapsed since the lagiven the guessed resolution of the mtimes
"""
if current_time is None:
current_time = time.time_ns()
for fspath, mtime in mtimes:
mtime_resolution = FsMountIdentifier.get_mtime_resolution(fspath)
if current_time - mtime <= mtime_resolution:
return False
return True
4 changes: 1 addition & 3 deletions fileformats/core/field.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from __future__ import annotations
import typing as ty
from .utils import (
classproperty,
)
from .decorators import classproperty
from .datatype import DataType
from .exceptions import FormatMismatchError

Expand Down
13 changes: 6 additions & 7 deletions fileformats/core/fileset.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@
import logging
from fileformats.core.typing import Self
from .utils import (
classproperty,
mtime_cached_property,
fspaths_converter,
describe_task,
matching_source,
import_extras_module,
)
from .decorators import mtime_cached_property, classproperty
from .typing import FspathsInputType, CryptoMethod, PathType
from .sampling import SampleFileGenerator
from .identification import (
Expand Down Expand Up @@ -187,16 +186,16 @@ def relative_fspaths(self) -> ty.Iterator[Path]:
return (p.relative_to(self.parent) for p in self.fspaths)

@property
def mtimes(self) -> ty.Tuple[ty.Tuple[str, float], ...]:
def mtimes(self) -> ty.Tuple[ty.Tuple[str, int], ...]:
"""Modification times of all fspaths in the file-set
Returns
-------
tuple[tuple[str, float], ...]
a tuple of tuples containing the file paths and the modification time sorted
by the file path
tuple[tuple[str, int], ...]
a tuple of tuples containing the file paths and the modification time (ns)
sorted by the file path
"""
return tuple((str(p), p.stat().st_mtime) for p in sorted(self.fspaths))
return tuple((str(p), p.stat().st_mtime_ns) for p in sorted(self.fspaths))

@classproperty
def mime_type(cls) -> str:
Expand Down
35 changes: 35 additions & 0 deletions fileformats/core/fs_mount_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,39 @@ def patch_table(cls, mount_table: ty.List[ty.Tuple[str, str]]) -> ty.Iterator[No
finally:
cls._mount_table = orig_table

@classmethod
def get_mtime_resolution(cls, path: PathLike) -> int:
"""Get the mount point for a given file-system path
Parameters
----------
path: os.PathLike
the file-system path to identify the mount of
Returns
-------
mount_point: os.PathLike
the root of the mount the path sits on
fstype : str
the type of the file-system (e.g. ext4 or cifs)"""
mount_point, fstype = cls.get_mount(path)
try:
resolution = cls.FS_MAX_MTIME_NS_RESOLUTION[fstype]
except KeyError:
resolution = max(cls.FS_MAX_MTIME_NS_RESOLUTION.values())
return resolution

_mount_table: ty.Optional[ty.List[ty.Tuple[str, str]]] = None

# Define a table of file system types and their mtime resolutions (in seconds)
FS_MAX_MTIME_NS_RESOLUTION: ty.Dict[str, int] = {
"ext4": int(1e9), # docs say 1 nanosecond but in found 1 sec often in practice
"xfs": 1,
"btrfs": 1,
"ntfs": 100,
"hfs": 1,
"apfs": 1,
"fat32": int(2e9), # 2 seconds
"exfat": int(1e9), # 1 second
# Add more file systems and their resolutions as needed
}
3 changes: 2 additions & 1 deletion fileformats/core/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import logging
from .datatype import DataType
import fileformats.core
from .utils import classproperty, describe_task, matching_source
from .utils import describe_task, matching_source
from .decorators import classproperty
from .identification import to_mime_format_name
from .converter_helpers import SubtypeVar, ConverterSpec
from .classifier import Classifier
Expand Down
45 changes: 45 additions & 0 deletions fileformats/core/tests/test_decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from pathlib import Path
import time
from fileformats.core.decorators import (
mtime_cached_property,
enough_time_has_elapsed_given_mtime_resolution,
)
from fileformats.generic import File


class MtimeTestFile(File):

flag: int

@mtime_cached_property
def cached_prop(self):
return self.flag


def test_mtime_cached_property(tmp_path: Path):
fspath = tmp_path / "file_1.txt"
fspath.write_text("hello")

file = MtimeTestFile(fspath)
time.sleep(
2
) # ensure enough time has elapsed since file creation/modification for mtime to increment

file.flag = 0
assert file.cached_prop == 0
file.flag = 1
assert file.cached_prop == 0
fspath.write_text("world")
assert file.cached_prop == 1


def test_enough_time_has_elapsed_given_mtime_resolution():
assert enough_time_has_elapsed_given_mtime_resolution(
[("", 110), ("", 220), ("", 300)], int(3e9) # need to make it high for windows
)


def test_not_enough_time_has_elapsed_given_mtime_resolution():
assert not enough_time_has_elapsed_given_mtime_resolution(
[("", 110), ("", 220), ("", 300)], 301
)
Loading

0 comments on commit 86f0f68

Please sign in to comment.