Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use dataclass instead of auto_repr #1349

Merged
merged 1 commit into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 2 additions & 9 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from threading import Lock
import time
from types import TracebackType
from typing import IO, Any, Literal, Protocol
from typing import IO, Any, Literal

from dandischema.models import DigestType
from fasteners import InterProcessLock
Expand All @@ -36,6 +36,7 @@
from .support.iterators import IteratorWithAggregation
from .support.pyout import naturalsize
from .utils import (
Hasher,
abbrev_prompt,
ensure_datetime,
exclude_from_zarr,
Expand Down Expand Up @@ -488,14 +489,6 @@ def _populate_dandiset_yaml(
}


class Hasher(Protocol):
    """Structural type for hashlib-style incremental hash objects.

    Any object offering ``update``/``hexdigest`` (e.g. the result of
    ``hashlib.sha256()``) satisfies this protocol.
    """

    def update(self, data: bytes) -> None:
        """Feed more bytes into the running digest."""
        ...

    def hexdigest(self) -> str:
        """Return the digest of all data fed so far, as a hex string."""
        ...


def _download_file(
downloader: Callable[[int], Iterator[bytes]],
path: Path,
Expand Down
42 changes: 17 additions & 25 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass, field
import hashlib
import logging
import os.path
Expand All @@ -21,12 +23,12 @@
from zarr_checksum import ZarrChecksumTree

from .threaded_walk import threaded_walk
from ..utils import auto_repr, exclude_from_zarr
from ..utils import Hasher, exclude_from_zarr

lgr = logging.getLogger("dandi.support.digests")


@auto_repr
@dataclass
class Digester:
"""Helper to compute multiple digests in one pass for a file"""

Expand All @@ -36,28 +38,18 @@ class Digester:
# Ideally we should find an efficient way to parallelize this but
# atm this one is sufficiently speedy

DEFAULT_DIGESTS = ["md5", "sha1", "sha256", "sha512"]
#: List of any supported algorithm labels, such as md5, sha1, etc.
digests: list[str] = field(
default_factory=lambda: ["md5", "sha1", "sha256", "sha512"]
)

def __init__(
self, digests: list[str] | None = None, blocksize: int = 1 << 16
) -> None:
"""
Parameters
----------
digests : list or None
List of any supported algorithm labels, such as md5, sha1, etc.
If None, a default set of hashes will be computed (md5, sha1,
sha256, sha512).
blocksize : int
Chunk size (in bytes) by which to consume a file.
"""
self._digests = digests or self.DEFAULT_DIGESTS
self._digest_funcs = [getattr(hashlib, digest) for digest in self._digests]
self.blocksize = blocksize
#: Chunk size (in bytes) by which to consume a file.
blocksize: int = 1 << 16

@property
def digests(self) -> list[str]:
return self._digests
digest_funcs: list[Callable[[], Hasher]] = field(init=False, repr=False)

def __post_init__(self) -> None:
self.digest_funcs = [getattr(hashlib, digest) for digest in self.digests]

def __call__(self, fpath: str | Path) -> dict[str, str]:
"""
Expand All @@ -70,14 +62,14 @@ def __call__(self, fpath: str | Path) -> dict[str, str]:
Keys are algorithm labels, and values are checksum strings
"""
lgr.debug("Estimating digests for %s" % fpath)
digests = [x() for x in self._digest_funcs]
digests = [x() for x in self.digest_funcs]
with open(fpath, "rb") as f:
while True:
block = f.read(self.blocksize)
if not block:
break
[d.update(block) for d in digests]

for d in digests:
d.update(block)
return {n: d.hexdigest() for n, d in zip(self.digests, digests)}


Expand Down
46 changes: 9 additions & 37 deletions dandi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import subprocess
import sys
import types
from typing import IO, Any, List, Optional, TypeVar, Union
from typing import IO, Any, List, Optional, Protocol, TypeVar, Union
from urllib.parse import parse_qs, urlparse, urlunparse

import dateutil.parser
Expand Down Expand Up @@ -60,6 +60,14 @@
)


class Hasher(Protocol):
    """Structural type for hashlib-style incremental hash objects.

    Any object offering ``update``/``hexdigest`` (e.g. the result of
    ``hashlib.sha256()``) satisfies this protocol.
    """

    def update(self, data: bytes) -> None:
        """Feed more bytes into the running digest."""
        ...

    def hexdigest(self) -> str:
        """Return the digest of all data fed so far, as a hex string."""
        ...


def is_interactive() -> bool:
"""Return True if all in/outs are tty"""
# TODO: check on windows if hasattr check would work correctly and add value:
Expand Down Expand Up @@ -511,42 +519,6 @@ def shortened_repr(value: Any, length: int = 30) -> str:
return value_repr


def __auto_repr__(obj: Any) -> str:
    """Render ``obj`` as ``ClassName(attr=value, ...)`` from its public attributes."""
    names: set[str] = set()
    if hasattr(obj, "__dict__"):
        names.update(obj.__dict__)
    if hasattr(obj, "__slots__"):
        names.update(obj.__slots__)
    # Public attributes only, in sorted order, each value abbreviated for brevity.
    # TODO: should we skip None values to quiet down talkative reprs such as URL?
    parts = [
        f"{name}={shortened_repr(getattr(obj, name))}"
        for name in sorted(names)
        if not name.startswith("_")
    ]
    return f"{obj.__class__.__name__}({', '.join(parts)})"


TT = TypeVar("TT", bound=type)


def auto_repr(cls: TT) -> TT:
    """Class decorator installing a quick-and-dirty ``__repr__``.

    The generated repr is built from the instance's public attributes.

    Original idea: http://stackoverflow.com/a/27799004/1265472
    """
    # setattr sidesteps the "incompatible assignment" complaint that a direct
    # ``cls.__repr__ = ...`` triggers in mypy.
    setattr(cls, "__repr__", __auto_repr__)
    return cls


def Parallel(**kwargs: Any) -> Any: # TODO: disable lint complaint
"""Adapter for joblib.Parallel so we could if desired, centralize control"""
# ATM just a straight invocation
Expand Down