Commit 96f20f7

address PR comments
sneakers-the-rat committed Nov 23, 2023
1 parent b1a694e commit 96f20f7
Showing 3 changed files with 40 additions and 39 deletions.
32 changes: 19 additions & 13 deletions dandi/download.py
@@ -583,7 +583,7 @@ def _download_file(
                 and "sha256" in digests
             ):
                 if key_parts[-1].partition(".")[0] == digests["sha256"]:
-                    lgr.debug("already exists - matching digest in filename")
+                    lgr.debug(f"{path!r} already exists - matching digest in filename")
                     yield _skip_file("already exists")
                     return
                 else:
@@ -592,8 +592,9 @@ def _download_file(
                     path,
                 )
             elif digests is not None and check_digests(path, digests):
-                lgr.debug("already exists - matching digest")
-                yield _skip_file("already exists")
+                lgr.debug(f"{path!r} already exists - matching digest")
+                yield _skip_file("matching digest")
+                yield {"checksum": "ok"}
                 return
             else:
                 lgr.debug(
@@ -606,8 +607,9 @@ def _download_file(
         # If we have no expected mtime, warn and check file digests if present
         if mtime is None:
             if digests is not None and check_digests(path, digests):
-                lgr.debug("already exists - matching digest")
-                yield _skip_file("already exists")
+                lgr.debug(f"{path!r} already exists - matching digest")
+                yield _skip_file("matching digest")
+                yield {"checksum": "ok"}
                 return
             else:
                 lgr.warning(
@@ -629,18 +631,19 @@ def _download_file(
             # TODO: add recording and handling of .nwb object_id
             # if we have digests, check those before deciding not to redownload
             if digests is not None and check_digests(path, digests):
-                lgr.debug("already exists - same time, size, and digest")
-                yield _skip_file("already exists - same time, size, and digest")
+                lgr.debug(
+                    f"{path!r} already exists - same time, size, and digest"
+                )
+                yield _skip_file("same time, size, and digest")
+                yield {"checksum": "ok"}
                 return

             # if we don't have digests but size and mtime match, don't redownload
-            elif digests is None or len(digests) == 0:
+            elif digests is None:
                 lgr.debug(
-                    "already exists - same time and size, but missing digests"
-                )
-                yield _skip_file(
-                    "already exists - same time and size, but missing digests"
+                    f"{path!r} already exists - same time and size, but missing digests"
                 )
+                yield _skip_file("same time and size, missing digests")
                 return

             # otherwise we're redownloading
@@ -649,7 +652,10 @@ def _download_file(
                     f"{path!r} - same time and size, but hashes dont match. Redownloading"
                 )
             else:
-                lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")
+                differing = {"mtime", "size"} - set(same)
+                lgr.debug(
+                    f"{path!r} - {', '.join(differing)} doesn't match. Redownloading"
+                )

     if size is not None:
         yield {"size": size}
39 changes: 20 additions & 19 deletions dandi/support/digests.py
@@ -139,9 +139,9 @@ def md5file_nocache(filepath: str | Path) -> str:
     return Digester(["md5"])(filepath)["md5"]


-DIGEST_PRIORITY = ("dandi-etag", "zarr-checksum", "sha512", "sha256", "sha1", "md5")
+DIGEST_PRIORITY = ("dandi-etag", "sha512", "sha256", "sha1", "md5")
 """
-Default order to check file digests against.
+Default order to prioritize digest types for :func:`.check_digests`
 """

@@ -151,28 +151,29 @@ def check_digests(
     priority: tuple[str, ...] = DIGEST_PRIORITY,
 ) -> bool:
     """
-    Given some set of digests from DANDI metadata, eg that given by
+    Given some set of digests from DANDI metadata, e.g., that given by
     :meth:`~dandi.download.Downloader.download_generator` to ``_download_file`` ,
-    check that a file matches.
-    Only check the first digest that is present in ``digests`` , rather than
-    checking all and returning if any are ``True`` - save compute time and
-    honor the notion of priority - some hash algos are better than others.
+    check that a file matches the highest priority digest.
     Parameters
     ----------
     filepath : str, :class:`pathlib.Path`
         File to checksum
     digests : dict
         Digest hashes to check against, with keys matching entries in ``priority``
     priority : list(str), tuple(str)
         Order of hashes to check, highest priority first. Default :const:`.DIGEST_PRIORITY`
     """

     # find the digest type to use
-    digest_type = None
-    digest = None
-    for dtype in priority:
-        if dtype in digests.keys():
-            digest_type = dtype
-            digest = digests[dtype]
-            break
-    if digest_type is None:
+    try:
+        digest_type = next(dtype for dtype in priority if dtype in digests)
+    except StopIteration:
         raise RuntimeError(
-            f"No digest found matching those in priority list. "
-            f"Got {digests.keys()} digests with priority list {priority}"
+            f"No digest found matching those in priority list."
+            f"\ndigests: {', '.join(digests.keys())}"
+            f"\npriority: {', '.join(priority)}"
         )
+    digest = digests[digest_type]

     # return simple comparison
     return bool(get_digest(filepath, digest_type) == digest)
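As a usage illustration of the reworked helper (the file name and contents below are made up, and this assumes the branch from this commit is installed):

from pathlib import Path

from dandi.support.digests import DIGEST_PRIORITY, Digester, check_digests

# hypothetical local file, for illustration only
path = Path("example.txt")
path.write_text("hello dandi\n")

# pair the file's real sha256 with a deliberately bogus md5;
# sha256 outranks md5 in DIGEST_PRIORITY, so only the sha256 digest is compared
digests = {
    "sha256": Digester(["sha256"])(path)["sha256"],
    "md5": "0" * 32,
}

assert "zarr-checksum" not in DIGEST_PRIORITY            # dropped in this commit
assert check_digests(path, digests)                      # highest-priority digest matches
assert not check_digests(path, digests, priority=("md5",))  # md5 alone does not match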
8 changes: 1 addition & 7 deletions dandi/tests/test_download.py
@@ -2,7 +2,6 @@

 from collections.abc import Callable
 import json
-import logging
 import os
 import os.path
 from pathlib import Path
@@ -31,7 +30,7 @@
     download,
 )
 from ..exceptions import NotFoundError
-from ..support.digests import check_digests
+from ..support.digests import Digester, check_digests
 from ..utils import list_paths

@@ -156,11 +155,6 @@ def test_download_000027_resume(
 def test_download_000027_digest(
     tmp_path: Path, dlmode: DownloadExisting, caplog: pytest.LogCaptureFixture
 ) -> None:
-    from ..support.digests import Digester
-
-    # capture logs to test whether downloads happen
-    caplog.set_level(logging.DEBUG, logger="dandi")
-
     # Should redownload if size and mtime match but content doesn't match
     digester = Digester()
     url = "https://dandiarchive.org/dandiset/000027/0.210831.2033"
