Skip to content

Commit

Permalink
BF: use total zarr_size to compute done% for zarr
Browse files Browse the repository at this point in the history
Since maxsize is dynamically computed as we go through the files.
The idea, I guess, was that it would grow rapidly before actual
downloads commense but it is not the case, so we endup with done%
being always close to 100% since we get those reports on final
downloads completed close to when individual files are downloaded.

So this should close #1407 .

But for total zarr file to be used, we needed to account also for
skipped files.  I added reporting of sizes for skipped files as well.
It seems there is no negative side effect on regular files download.
So now for the %done of zarr we might be getting to 100% of original
size having downloaded nothing.  But IMHO it is ok since user does
not care as much of how many "subparts" are downloaded, but rather
to have adequate progress report back.

There also could be side effects if  -e skip  and we skip
download of some updated files which would be smaller than the local
ones so altogether we would get over 100% total at the end.
  • Loading branch information
yarikoptic committed May 14, 2024
1 parent c0df9af commit 743d034
Showing 1 changed file with 24 additions and 10 deletions.
34 changes: 24 additions & 10 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,8 +481,8 @@ def agg_done(self, done_sizes: Iterator[int]) -> list[str]:
return v


def _skip_file(msg: Any) -> dict:
return {"status": "skipped", "message": str(msg)}
def _skip_file(msg: Any, **kwargs: Any) -> dict:
return dict(**kwargs, status="skipped", message=str(msg))


def _populate_dandiset_yaml(
Expand Down Expand Up @@ -514,7 +514,7 @@ def _populate_dandiset_yaml(
existing is DownloadExisting.REFRESH
and os.lstat(dandiset_yaml).st_mtime >= mtime.timestamp()
):
yield _skip_file("already exists")
yield _skip_file("already exists", size=os.lstat(dandiset_yaml).st_mtime)
return
ds = Dandiset(dandiset_path, allow_empty=True)
ds.path_obj.mkdir(exist_ok=True) # exist_ok in case of parallel race
Expand Down Expand Up @@ -637,7 +637,7 @@ def _download_file(
# but mtime is different
if same == ["mtime", "size"]:
# TODO: add recording and handling of .nwb object_id
yield _skip_file("same time and size")
yield _skip_file("same time and size", size=size)
return
lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")

Expand Down Expand Up @@ -1028,11 +1028,17 @@ def get_done(self) -> dict:
total_downloaded = sum(
s.downloaded
for s in self.files.values()
if s.state in (DLState.DOWNLOADING, DLState.CHECKSUM_ERROR, DLState.DONE)
if s.state
in (
DLState.DOWNLOADING,
DLState.CHECKSUM_ERROR,
DLState.SKIPPED,
DLState.DONE,
)
)
return {
"done": total_downloaded,
"done%": total_downloaded / self.maxsize * 100,
"done%": total_downloaded / self.zarr_size * 100,
}

def set_status(self, statusdict: dict) -> None:
Expand Down Expand Up @@ -1061,16 +1067,24 @@ def set_status(self, statusdict: dict) -> None:
def feed(self, path: str, status: dict) -> Iterator[dict]:
keys = list(status.keys())
self.files.setdefault(path, DownloadProgress())
size = status.get("size")
if size is not None:
if not self.yielded_size:
# this thread will yield
self.yielded_size = True
yield {"size": self.zarr_size}
if status.get("status") == "skipped":
self.files[path].state = DLState.SKIPPED
out = {"message": self.message}
# Treat skipped as "downloaded" for the matter of accounting
if size is not None:
self.files[path].downloaded = size
self.maxsize += size
self.set_status(out)
yield out
yield self.get_done()
elif keys == ["size"]:
if not self.yielded_size:
yield {"size": self.zarr_size}
self.yielded_size = True
self.files[path].size = status["size"]
self.files[path].size = size
self.maxsize += status["size"]
if any(s.state is DLState.DOWNLOADING for s in self.files.values()):
yield self.get_done()
Expand Down

0 comments on commit 743d034

Please sign in to comment.