Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve retrying downloads code and testing #1559

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dandi/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def urls(self) -> Iterator[str]:
"linc-staging",
"https://staging.lincbrain.org",
"https://staging-api.lincbrain.org/api",
)
),
}
# to map back url: name
known_instances_rev = {
Expand All @@ -160,7 +160,7 @@ def urls(self) -> Iterator[str]:

#: HTTP response status codes that should always be retried (until we run out
#: of retries)
RETRY_STATUSES = (500, 502, 503, 504)
RETRY_STATUSES = (429, 500, 502, 503, 504)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding 429 here will also affect requests made by dandiapi.py and dandiarchive.py. Unless you want to add support for Retry-After there as well, it'd be better to just add 429 next to RETRY_STATUSES in download.py.


VIDEO_FILE_EXTENSIONS = [".mp4", ".avi", ".wmv", ".mov", ".flv", ".mkv"]
VIDEO_FILE_MODULES = ["processing", "acquisition"]
Expand Down
64 changes: 37 additions & 27 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from collections import Counter, deque
from collections.abc import Callable, Iterable, Iterator, Sequence
from dataclasses import InitVar, dataclass, field
from datetime import datetime
from datetime import UTC, datetime
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

datetime.UTC requires Python 3.11. Use datetime.timezone.utc instead.

from email.utils import parsedate_to_datetime
from enum import Enum
from functools import partial
import hashlib
Expand Down Expand Up @@ -1085,10 +1086,10 @@
exc: requests.RequestException,
attempt: int,
attempts_allowed: int,
downloaded_in_attempt: int,
downloaded_in_attempt: int = 0,
) -> int | None:
"""Check if we should retry the download, return potentially adjusted 'attempts_allowed'"""
sleep_amount = random.random() * 5 * attempt
sleep_amount: float = -1.0
if os.environ.get("DANDI_DOWNLOAD_AGGRESSIVE_RETRY"):
# in such a case if we downloaded a little more --
# consider it a successful attempt
Expand Down Expand Up @@ -1122,19 +1123,36 @@
return None
elif retry_after := exc.response.headers.get("Retry-After"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not about the changes in this PR. However, since are trying to improve this function, we may as well consider replace elif with if here. The last if clause ended with a return statement, so the elif here is about confusing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So can be said about the containing elif clause at line 1111 at which I can't place a comment.

# playing safe
if not str(retry_after).isdigit():
# our code is wrong, do not crash but issue warning so
# we might get report/fix it up
lgr.warning(
"%s - download failed due to response %d with non-integer"
" Retry-After=%r: %s",
path,
exc.response.status_code,
retry_after,
exc,
)
return None
sleep_amount = int(retry_after)
retry_after = str(retry_after)
yarikoptic marked this conversation as resolved.
Show resolved Hide resolved
if retry_after.isdigit():
sleep_amount = int(retry_after)
Comment on lines +1127 to +1128
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we are assuming that this client always connect to our trusted DANDI API, I think this is fine, but if we want to want to play even "safer" I think we may want to ensure the retry_after string is really a non-negative integer in a decimal expression.

If retry_after is "²³" will crash this client at this point.

This safe guard can be excessive though since we have control of both the client and server.

I think use of str.isdecimal() and ensuring the return of int() to be non-negative is better.

else:
# else if it is a datestamp like "Wed, 21 Oct 2015 07:28:00 GMT"
# we could parse it and calculate how long to sleep
try:
retry_after_date = parsedate_to_datetime(retry_after)
current_date = datetime.now(UTC)
difference = retry_after_date - current_date
sleep_amount = int(difference.total_seconds())
if sleep_amount < 0:
lgr.warning(

Check warning on line 1138 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L1138

Added line #L1138 was not covered by tests
"%s - date in Retry-After=%r is in the past (current is %r)",
path,
retry_after,
current_date,
)
except ValueError as exc_ve:
# our code or response is wrong, do not crash but issue warning
# and continue with "default" sleep logic
lgr.warning(
"%s - download failed due to response %d with non-integer or future date"
" Retry-After=%r: %s with %s upon converting assuming it is a date",
path,
exc.response.status_code,
retry_after,
exc,
exc_ve,
)
Comment on lines +1132 to +1155
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
try:
retry_after_date = parsedate_to_datetime(retry_after)
current_date = datetime.now(UTC)
difference = retry_after_date - current_date
sleep_amount = int(difference.total_seconds())
if sleep_amount < 0:
lgr.warning(
"%s - date in Retry-After=%r is in the past (current is %r)",
path,
retry_after,
current_date,
)
except ValueError as exc_ve:
# our code or response is wrong, do not crash but issue warning
# and continue with "default" sleep logic
lgr.warning(
"%s - download failed due to response %d with non-integer or future date"
" Retry-After=%r: %s with %s upon converting assuming it is a date",
path,
exc.response.status_code,
retry_after,
exc,
exc_ve,
)
try:
retry_after_date = parsedate_to_datetime(retry_after)
except (ValueError, TypeError) as exc_ve:
# our code or response is wrong, do not crash but issue warning
# and continue with "default" sleep logic
lgr.warning(
"%s - download failed due to response %d with non-integer or future date"
" Retry-After=%r: %s with %s upon converting assuming it is a date",
path,
exc.response.status_code,
retry_after,
exc,
exc_ve,
)
else:
current_date = datetime.now(timezone.utc)
difference = retry_after_date - current_date
sleep_amount = int(difference.total_seconds())
if sleep_amount < 0: # todo: `sleep_amount < 0` may not be an error. It the server set a `Retry-After` datetime extremely close and the network is slow, the client may receive the response after the `Retry-After` datetime.
lgr.warning(
"%s - date in Retry-After=%r is in the past (current is %r)",
path,
retry_after,
current_date,
)

This suggestion make the try clause

  1. more targeted
  2. catches TypeErrors from parsedate_to_datetime. (Try parsedate_to_datetime('Wed, 21 O3t 2015 07:280 GMT'))
  3. adds a note with a todo comment regarding sleep_amount < 0

lgr.debug(
"%s - download failed due to response %d with "
"Retry-After=%d: %s, will sleep and retry",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the "Retry-After=[]" in this log is now insistent and sleep_amount may not be a meaningful number if retry_after failed to be parsed into a datetime object

Expand All @@ -1143,17 +1161,9 @@
sleep_amount,
exc,
)
else:
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
# if is_access_denied(exc) or attempt >= 2:
# raise
# sleep a little and retry
else:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By removing this else, the lgr.debug() call on line 1138/1156 will be followed by the call on line 1157/1167, resulting in two similar log messages.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think it would only be in the case of non-conformant "retry-after" we would get 3 messages "on the matter"

  • "to response %d with non-integer or future date"
  • "to response %d with Retry-After"
  • the one below.

if there is a good Retry-after, we would get a non-negative value for sleep_amount, and log only at 1138/1156 (but we might want it into else: for exception, to not log with /1147) as sleep_amount condition on /1164 would not trigger that other message where we would also define the random sleep.

But for above "3 msgs" I will adjust... thanks!

if sleep_amount < 0:
# it was not Retry-after set, so we come up with random duration to sleep
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the current arrangement, the execution flow can reach here event if Retry-After is set. It can be set to an extremely close datetime or some string can't be parsed properly in to a datetime object

sleep_amount = random.random() * 5 * attempt
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
Expand Down
94 changes: 94 additions & 0 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from collections.abc import Callable
from contextlib import nullcontext
from email.utils import parsedate_to_datetime
from functools import partial
from glob import glob
import json
import logging
Expand All @@ -12,10 +14,13 @@
import re
from shutil import rmtree
import time
from unittest import mock

import numpy as np
import pytest
from pytest_mock import MockerFixture
import requests
from requests.exceptions import HTTPError
import responses
import zarr

Expand All @@ -32,6 +37,7 @@
PathType,
ProgressCombiner,
PYOUTHelper,
_check_if_more_attempts_allowed,
download,
)
from ..exceptions import NotFoundError
Expand Down Expand Up @@ -1169,3 +1175,91 @@ def test_DownloadDirectory_exc(
assert dl.dirpath.exists()
assert dl.fp is None
assert dl.writefile.read_bytes() == b"456"


def test__check_if_more_attempts_allowed():
f = partial(_check_if_more_attempts_allowed, "some/path")

response403 = requests.Response()
response403.status_code = 403 # no retry

response500 = requests.Response()
response500.status_code = 500

# we do retry if cause is unknown (no response)
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0

# or if some 500
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response500), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0

# do not bother if already at limit
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(), attempt=2, attempts_allowed=2) is None
mock_sleep.assert_not_called()

# do not bother if 403
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response403), attempt=1, attempts_allowed=2) is None
mock_sleep.assert_not_called()

# And in case of "Aggressive setting" when DANDI_DOWNLOAD_AGGRESSIVE_RETRY
# env var is set to 1, we retry if there was extra content downloaded
# patch env var DANDI_DOWNLOAD_AGGRESSIVE_RETRY
with mock.patch.dict(os.environ, {"DANDI_DOWNLOAD_AGGRESSIVE_RETRY": "1"}):
with mock.patch("time.sleep") as mock_sleep:
assert (
f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=0)
is None
)
mock_sleep.assert_not_called()

assert (
f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=1)
== 3
)
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0


@pytest.mark.parametrize("status_code", [429, 503])
def test__check_if_more_attempts_allowed_retries(status_code):
f = partial(_check_if_more_attempts_allowed, "some/path")

response = requests.Response()
response.status_code = status_code

response.headers["Retry-After"] = "10"
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] == 10

response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:28:00 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.download.datetime"
) as mock_datetime:
# shifted by 2 minutes
mock_datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:26:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] == 120

# we would still sleep some time if Retry-After is not decypherable
response.headers["Retry-After"] = "nondecypherable"
with mock.patch("time.sleep") as mock_sleep:
assert (
_check_if_more_attempts_allowed(
"some/path", HTTPError(response=response), attempt=1, attempts_allowed=2
)
== 2
)
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] > 0
Loading