Commit 96f20f7

address PR comments
sneakers-the-rat committed Nov 23, 2023
1 parent b1a694e commit 96f20f7
Showing 3 changed files with 40 additions and 39 deletions.
32 changes: 19 additions & 13 deletions dandi/download.py
@@ -583,7 +583,7 @@ def _download_file(
                 and "sha256" in digests
             ):
                 if key_parts[-1].partition(".")[0] == digests["sha256"]:
-                    lgr.debug("already exists - matching digest in filename")
+                    lgr.debug(f"{path!r} already exists - matching digest in filename")
                     yield _skip_file("already exists")
                     return
                 else:
@@ -592,8 +592,9 @@ def _download_file(
                     path,
                 )
             elif digests is not None and check_digests(path, digests):
-                lgr.debug("already exists - matching digest")
-                yield _skip_file("already exists")
+                lgr.debug(f"{path!r} already exists - matching digest")
+                yield _skip_file("matching digest")
+                yield {"checksum": "ok"}
                 return
             else:
                 lgr.debug(
@@ -606,8 +607,9 @@ def _download_file(
         # If we have no expected mtime, warn and check file digests if present
         if mtime is None:
             if digests is not None and check_digests(path, digests):
-                lgr.debug("already exists - matching digest")
-                yield _skip_file("already exists")
+                lgr.debug(f"{path!r} already exists - matching digest")
+                yield _skip_file("matching digest")
+                yield {"checksum": "ok"}
                 return
             else:
                 lgr.warning(
@@ -629,18 +631,19 @@ def _download_file(
             # TODO: add recording and handling of .nwb object_id
             # if we have digests, check those before deciding not to redownload
             if digests is not None and check_digests(path, digests):
-                lgr.debug("already exists - same time, size, and digest")
-                yield _skip_file("already exists - same time, size, and digest")
+                lgr.debug(
+                    f"{path!r} already exists - same time, size, and digest"
+                )
+                yield _skip_file("same time, size, and digest")
+                yield {"checksum": "ok"}
                 return

             # if we don't have digests but size and mtime match, don't redownload
-            elif digests is None or len(digests) == 0:
+            elif digests is None:
                 lgr.debug(
-                    "already exists - same time and size, but missing digests"
-                )
-                yield _skip_file(
-                    "already exists - same time and size, but missing digests"
+                    f"{path!r} already exists - same time and size, but missing digests"
                 )
+                yield _skip_file("same time and size, missing digests")
                 return

             # otherwise we're redownloading
@@ -649,7 +652,10 @@ def _download_file(
                     f"{path!r} - same time and size, but hashes dont match. Redownloading"
                 )
             else:
-                lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")
+                differing = {"mtime", "size"} - set(same)
+                lgr.debug(
+                    f"{path!r} - {', '.join(differing)} doesn't match. Redownloading"
+                )

     if size is not None:
         yield {"size": size}
39 changes: 20 additions & 19 deletions dandi/support/digests.py
@@ -139,9 +139,9 @@ def md5file_nocache(filepath: str | Path) -> str:
     return Digester(["md5"])(filepath)["md5"]


-DIGEST_PRIORITY = ("dandi-etag", "zarr-checksum", "sha512", "sha256", "sha1", "md5")
+DIGEST_PRIORITY = ("dandi-etag", "sha512", "sha256", "sha1", "md5")
 """
-Default order to check file digests against.
+Default order to prioritize digest types for :func:`.check_digests`
 """

@@ -151,28 +151,29 @@ def check_digests(
     priority: tuple[str, ...] = DIGEST_PRIORITY,
 ) -> bool:
     """
-    Given some set of digests from DANDI metadata, eg that given by
+    Given some set of digests from DANDI metadata, e.g., that given by
     :meth:`~dandi.download.Downloader.download_generator` to ``_download_file`` ,
-    check that a file matches.
-    Only check the first digest that is present in ``digests`` , rather than
-    checking all and returning if any are ``True`` - save compute time and
-    honor the notion of priority - some hash algos are better than others.
+    check that a file matches the highest priority digest.
     Parameters
     ----------
     filepath : str, :class:`pathlib.Path`
         File to checksum
     digests : dict
         Digest hashes to check against, with keys matching entries in ``priority``
     priority : list(str), tuple(str)
         Order of hashes to check, highest priority first. Default :const:`.DIGEST_PRIORITY`
     """

     # find the digest type to use
-    digest_type = None
-    digest = None
-    for dtype in priority:
-        if dtype in digests.keys():
-            digest_type = dtype
-            digest = digests[dtype]
-            break
-    if digest_type is None:
+    try:
+        digest_type = next(dtype for dtype in priority if dtype in digests)
+    except StopIteration:
         raise RuntimeError(
-            f"No digest found matching those in priority list. "
-            f"Got {digests.keys()} digests with priority list {priority}"
+            f"No digest found matching those in priority list."
+            f"\ndigests: {', '.join(digests.keys())}"
+            f"\npriority: {', '.join(priority)}"
         )
+    digest = digests[digest_type]

     # return simple comparison
     return bool(get_digest(filepath, digest_type) == digest)
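As a usage illustration of the reworked helper (the file name and contents below are made up, and this assumes the branch from this commit is installed):

from pathlib import Path

from dandi.support.digests import DIGEST_PRIORITY, Digester, check_digests

# hypothetical local file, for illustration only
path = Path("example.txt")
path.write_text("hello dandi\n")

# pair the file's real sha256 with a deliberately bogus md5;
# sha256 outranks md5 in DIGEST_PRIORITY, so only the sha256 digest is compared
digests = {
    "sha256": Digester(["sha256"])(path)["sha256"],
    "md5": "0" * 32,
}

assert "zarr-checksum" not in DIGEST_PRIORITY            # dropped in this commit
assert check_digests(path, digests)                      # highest-priority digest matches
assert not check_digests(path, digests, priority=("md5",))  # md5 alone does not match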
8 changes: 1 addition & 7 deletions dandi/tests/test_download.py
@@ -2,7 +2,6 @@

 from collections.abc import Callable
 import json
-import logging
 import os
 import os.path
 from pathlib import Path
@@ -31,7 +30,7 @@
     download,
 )
 from ..exceptions import NotFoundError
-from ..support.digests import check_digests
+from ..support.digests import Digester, check_digests
 from ..utils import list_paths

@@ -156,11 +155,6 @@ def test_download_000027_resume(
 def test_download_000027_digest(
     tmp_path: Path, dlmode: DownloadExisting, caplog: pytest.LogCaptureFixture
 ) -> None:
-    from ..support.digests import Digester
-
-    # capture logs to test whether downloads happen
-    caplog.set_level(logging.DEBUG, logger="dandi")
-
     # Should redownload if size and mtime match but content doesn't match
     digester = Digester()
     url = "https://dandiarchive.org/dandiset/000027/0.210831.2033"
