diff --git a/dandi/download.py b/dandi/download.py
index f29328e67..6749a8e35 100644
--- a/dandi/download.py
+++ b/dandi/download.py
@@ -17,7 +17,7 @@
 from threading import Lock
 import time
 from types import TracebackType
-from typing import IO, Any, Literal, Protocol
+from typing import IO, Any, Literal
 
 from dandischema.models import DigestType
 from fasteners import InterProcessLock
@@ -36,6 +36,7 @@
 from .support.iterators import IteratorWithAggregation
 from .support.pyout import naturalsize
 from .utils import (
+    Hasher,
     abbrev_prompt,
     ensure_datetime,
     exclude_from_zarr,
@@ -488,14 +489,6 @@ def _populate_dandiset_yaml(
     }
 
 
-class Hasher(Protocol):
-    def update(self, data: bytes) -> None:
-        ...
-
-    def hexdigest(self) -> str:
-        ...
-
-
 def _download_file(
     downloader: Callable[[int], Iterator[bytes]],
     path: Path,
diff --git a/dandi/support/digests.py b/dandi/support/digests.py
index b1ce72f4d..99e01f1d3 100644
--- a/dandi/support/digests.py
+++ b/dandi/support/digests.py
@@ -11,6 +11,8 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+from dataclasses import dataclass, field
 import hashlib
 import logging
 import os.path
@@ -21,12 +23,12 @@
 from zarr_checksum import ZarrChecksumTree
 
 from .threaded_walk import threaded_walk
-from ..utils import auto_repr, exclude_from_zarr
+from ..utils import Hasher, exclude_from_zarr
 
 lgr = logging.getLogger("dandi.support.digests")
 
 
-@auto_repr
+@dataclass
 class Digester:
     """Helper to compute multiple digests in one pass for a file"""
 
@@ -36,28 +38,18 @@ class Digester:
     # Ideally we should find an efficient way to parallelize this but
     # atm this one is sufficiently speedy
 
-    DEFAULT_DIGESTS = ["md5", "sha1", "sha256", "sha512"]
+    #: List of any supported algorithm labels, such as md5, sha1, etc.
+    digests: list[str] = field(
+        default_factory=lambda: ["md5", "sha1", "sha256", "sha512"]
+    )
 
-    def __init__(
-        self, digests: list[str] | None = None, blocksize: int = 1 << 16
-    ) -> None:
-        """
-        Parameters
-        ----------
-        digests : list or None
-          List of any supported algorithm labels, such as md5, sha1, etc.
-          If None, a default set of hashes will be computed (md5, sha1,
-          sha256, sha512).
-        blocksize : int
-          Chunk size (in bytes) by which to consume a file.
-        """
-        self._digests = digests or self.DEFAULT_DIGESTS
-        self._digest_funcs = [getattr(hashlib, digest) for digest in self._digests]
-        self.blocksize = blocksize
+    #: Chunk size (in bytes) by which to consume a file.
+    blocksize: int = 1 << 16
 
-    @property
-    def digests(self) -> list[str]:
-        return self._digests
+    digest_funcs: list[Callable[[], Hasher]] = field(init=False, repr=False)
+
+    def __post_init__(self) -> None:
+        self.digest_funcs = [getattr(hashlib, digest) for digest in self.digests]
 
     def __call__(self, fpath: str | Path) -> dict[str, str]:
         """
@@ -70,14 +62,14 @@     def __call__(self, fpath: str | Path) -> dict[str, str]:
          Keys are algorithm labels, and values are checksum strings
         """
         lgr.debug("Estimating digests for %s" % fpath)
-        digests = [x() for x in self._digest_funcs]
+        digests = [x() for x in self.digest_funcs]
         with open(fpath, "rb") as f:
             while True:
                 block = f.read(self.blocksize)
                 if not block:
                     break
-                [d.update(block) for d in digests]
-
+                for d in digests:
+                    d.update(block)
         return {n: d.hexdigest() for n, d in zip(self.digests, digests)}
 
 
diff --git a/dandi/utils.py b/dandi/utils.py
index eb279d1d1..d4f769f83 100644
--- a/dandi/utils.py
+++ b/dandi/utils.py
@@ -19,7 +19,7 @@
 import subprocess
 import sys
 import types
-from typing import IO, Any, List, Optional, TypeVar, Union
+from typing import IO, Any, List, Optional, Protocol, TypeVar, Union
 from urllib.parse import parse_qs, urlparse, urlunparse
 
 import dateutil.parser
@@ -60,6 +60,14 @@
 )
 
 
+class Hasher(Protocol):
+    def update(self, data: bytes) -> None:
+        ...
+
+    def hexdigest(self) -> str:
+        ...
+
+
 def is_interactive() -> bool:
     """Return True if all in/outs are tty"""
     # TODO: check on windows if hasattr check would work correctly and add value:
@@ -511,42 +519,6 @@ def shortened_repr(value: Any, length: int = 30) -> str:
     return value_repr
 
 
-def __auto_repr__(obj: Any) -> str:
-    attr_names: tuple[str, ...] = ()
-    if hasattr(obj, "__dict__"):
-        attr_names += tuple(obj.__dict__.keys())
-    if hasattr(obj, "__slots__"):
-        attr_names += tuple(obj.__slots__)
-
-    items = []
-    for attr in sorted(set(attr_names)):
-        if attr.startswith("_"):
-            continue
-        value = getattr(obj, attr)
-        # TODO: should we add this feature to minimize some talktative reprs
-        # such as of URL?
-        # if value is None:
-        #     continue
-        items.append("%s=%s" % (attr, shortened_repr(value)))
-
-    return "%s(%s)" % (obj.__class__.__name__, ", ".join(items))
-
-
-TT = TypeVar("TT", bound=type)
-
-
-def auto_repr(cls: TT) -> TT:
-    """Decorator for a class to assign it an automagic quick and dirty __repr__
-
-    It uses public class attributes to prepare repr of a class
-
-    Original idea: http://stackoverflow.com/a/27799004/1265472
-    """
-
-    cls.__repr__ = __auto_repr__  # type: ignore[assignment]
-    return cls
-
-
 def Parallel(**kwargs: Any) -> Any:  # TODO: disable lint complaint
     """Adapter for joblib.Parallel so we could if desired, centralize control"""
     # ATM just a straight invocation
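Note: for illustration, a minimal usage sketch of `Digester` as refactored above, assuming the post-patch layout; the input path is hypothetical:

    from dandi.support.digests import Digester

    # Defaults come from the dataclass field: md5, sha1, sha256, sha512.
    digester = Digester()

    # digest_funcs is populated in __post_init__ and kept out of the
    # dataclass-generated repr via field(init=False, repr=False).
    sha_only = Digester(digests=["sha256"])
    checksums = sha_only("some/file.bin")  # hypothetical path
    print(checksums["sha256"])

Since `Hasher` is a structural `typing.Protocol`, the objects returned by `hashlib.md5()` and friends satisfy it without any registration, which is what lets `digest_funcs` be typed as `list[Callable[[], Hasher]]`.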