Merge pull request #1559 from dandi/gh-1556
Improve retrying downloads code and testing
yarikoptic authored Feb 12, 2025
2 parents 50e2d69 + 2482103 commit 527347c
Showing 6 changed files with 250 additions and 59 deletions.
2 changes: 1 addition & 1 deletion dandi/consts.py
@@ -170,7 +170,7 @@ def urls(self) -> Iterator[str]:

#: HTTP response status codes that should always be retried (until we run out
#: of retries)
RETRY_STATUSES = (500, 502, 503, 504)
RETRY_STATUSES = (429, 500, 502, 503, 504)

VIDEO_FILE_EXTENSIONS = [".mp4", ".avi", ".wmv", ".mov", ".flv", ".mkv"]
VIDEO_FILE_MODULES = ["processing", "acquisition"]
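Adding 429 ("Too Many Requests") means throttling responses are now retried along with the transient 5xx errors. For orientation only, here is a minimal sketch of how such a status list can drive a tenacity-based retry policy; the predicate and the wait/stop settings below are illustrative assumptions, not the client's actual configuration.

import requests
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential

RETRY_STATUSES = (429, 500, 502, 503, 504)

def _is_retriable(exc: BaseException) -> bool:
    # Retry on transport-level errors and on the HTTP statuses listed above.
    if not isinstance(exc, requests.RequestException):
        return False
    response = getattr(exc, "response", None)
    return response is None or response.status_code in RETRY_STATUSES

@retry(
    retry=retry_if_exception(_is_retriable),
    wait=wait_exponential(multiplier=0.5, max=30),
    stop=stop_after_attempt(5),
    reraise=True,
)
def head(url: str) -> requests.Response:
    r = requests.head(url, allow_redirects=True, timeout=30)
    r.raise_for_status()
    return r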
8 changes: 8 additions & 0 deletions dandi/dandiapi.py
@@ -43,6 +43,7 @@
chunked,
ensure_datetime,
get_instance,
get_retry_after,
is_interactive,
is_page2_url,
joinurl,
@@ -236,6 +237,13 @@ def request(
)
if data is not None and hasattr(data, "seek"):
data.seek(0)
if retry_after := get_retry_after(result):
lgr.debug(
"Sleeping for %d seconds as instructed in response "
"(in addition to tenacity imposed)",
retry_after,
)
sleep(retry_after)
result.raise_for_status()
except Exception as e:
if isinstance(e, requests.HTTPError):
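The get_retry_after helper used above is added in dandi/utils.py, which is not expanded in this view. Judging from its call sites and from the tests further down, a minimal sketch of such a helper could look roughly like this; the actual implementation (including any cap on very long waits) may differ.

import datetime
from email.utils import parsedate_to_datetime

import requests

def get_retry_after(response: requests.Response) -> int | None:
    """Seconds to wait per the Retry-After header, or None if absent or unparseable."""
    value = response.headers.get("Retry-After")
    if value is None:
        return None
    if str(value).isdigit():
        # Integer form, e.g. "Retry-After: 10"
        return int(value)
    try:
        # HTTP-date form, e.g. "Retry-After: Wed, 21 Oct 2015 07:28:00 GMT"
        when = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return None
    delta = when - datetime.datetime.now(datetime.timezone.utc)
    # A date in the past means "retry immediately"; never return a negative value.
    return max(0, int(delta.total_seconds()))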
30 changes: 21 additions & 9 deletions dandi/dandiarchive.py
@@ -51,7 +51,7 @@
)
from .dandiapi import BaseRemoteAsset, DandiAPIClient, RemoteDandiset
from .exceptions import FailedToConnectError, NotFoundError, UnknownURLError
from .utils import get_instance
from .utils import get_instance, get_retry_after

lgr = get_logger()

@@ -893,14 +893,26 @@ def follow_redirect(url: str) -> str:
while True:
r = requests.head(url, allow_redirects=True)
if r.status_code in RETRY_STATUSES and i < 4:
delay = 0.1 * 10**i
lgr.warning(
"HEAD request to %s returned %d; sleeping for %f seconds and then retrying...",
url,
r.status_code,
delay,
)
sleep(delay)
retry_after = get_retry_after(r)
if retry_after is not None:
delay = retry_after
else:
delay = 0.1 * 10**i
if delay:
lgr.warning(
"HEAD request to %s returned %d; "
"sleeping for %f seconds and then retrying...",
url,
r.status_code,
delay,
)
sleep(delay)
else:
lgr.warning(
"HEAD request to %s returned %d; retrying...",
url,
r.status_code,
)
i += 1
continue
elif r.status_code == 404:
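With this change, follow_redirect prefers a server-supplied Retry-After delay and falls back to the pre-existing exponential schedule only when the header is absent. For reference, that fallback schedule over the at most four retries is:

# Fallback delays when no usable Retry-After header is present (i = 0..3):
fallback_delays = [0.1 * 10**i for i in range(4)]  # roughly 0.1 s, 1 s, 10 s, 100 s

A Retry-After that works out to 0 seconds (for example an HTTP-date already in the past) makes delay falsy, so the loop logs and retries immediately without sleeping.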
83 changes: 34 additions & 49 deletions dandi/download.py
@@ -49,6 +49,7 @@
ensure_datetime,
exclude_from_zarr,
flattened,
get_retry_after,
is_same_time,
path_is_subpath,
pluralize,
@@ -796,7 +797,7 @@ def _download_file(
# Catching RequestException lets us retry on timeout & connection
# errors (among others) in addition to HTTP status errors.
except requests.RequestException as exc:
attempts_allowed_or_not = _check_if_more_attempts_allowed(
attempts_allowed_or_not = _check_attempts_and_sleep(
path=path,
exc=exc,
attempt=attempt,
@@ -1080,15 +1081,29 @@ def downloads_gen():
yield {"status": "done"}


def _check_if_more_attempts_allowed(
def _check_attempts_and_sleep(
path: Path,
exc: requests.RequestException,
attempt: int,
attempts_allowed: int,
downloaded_in_attempt: int,
downloaded_in_attempt: int = 0,
) -> int | None:
"""Check if we should retry the download, return potentially adjusted 'attempts_allowed'"""
sleep_amount = random.random() * 5 * attempt
"""
Check if we should retry the download, sleep if still allowed,
and return potentially adjusted 'attempts_allowed'
:param path: Destination of the download
:param exc: Exception raised during the last download attempt
:param attempt: The index of the last download attempt
:param attempts_allowed: The number of download attempts currently allowed
:param downloaded_in_attempt: The number of bytes downloaded in the last attempt
:returns: The number of download attempts allowed, potentially adjusted, if download
should be retried. None if download should not be retried.
Note: If download should be retried, this function sleeps before returning;
otherwise, it returns immediately.
"""
sleep_amount: float | None = None
if os.environ.get("DANDI_DOWNLOAD_AGGRESSIVE_RETRY"):
# in such a case if we downloaded a little more --
# consider it a successful attempt
@@ -1104,12 +1119,11 @@ def _check_if_more_attempts_allowed(
)
attempts_allowed += 1
if attempt >= attempts_allowed:
lgr.debug("%s - download failed after %d attempts: %s", path, attempt, exc)
lgr.debug("%s - download failed after %d attempts: %s", path, attempt + 1, exc)
return None
# TODO: actually we should probably retry only on selected codes,
elif exc.response is not None:
if exc.response is not None:
if exc.response.status_code not in (
400, # Bad Request, but happened with gider:
400, # Bad Request, but happened with girder:
# https://github.com/dandi/dandi-cli/issues/87
*RETRY_STATUSES,
):
Expand All @@ -1120,46 +1134,17 @@ def _check_if_more_attempts_allowed(
exc,
)
return None
elif retry_after := exc.response.headers.get("Retry-After"):
# playing safe
if not str(retry_after).isdigit():
# our code is wrong, do not crash but issue warning so
# we might get report/fix it up
lgr.warning(
"%s - download failed due to response %d with non-integer"
" Retry-After=%r: %s",
path,
exc.response.status_code,
retry_after,
exc,
)
return None
sleep_amount = int(retry_after)
lgr.debug(
"%s - download failed due to response %d with "
"Retry-After=%d: %s, will sleep and retry",
path,
exc.response.status_code,
sleep_amount,
exc,
)
else:
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
# if is_access_denied(exc) or attempt >= 2:
# raise
# sleep a little and retry
else:
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep a bit and retry",
path,
attempt,
exc,
)
sleep_amount = get_retry_after(exc.response)
if sleep_amount is None:
# Retry-After was not set, so we come up with a random duration to sleep
sleep_amount = random.random() * 5 * attempt
lgr.debug(
"%s - download failed on attempt #%d: %s, will sleep %f and retry",
path,
attempt,
exc,
sleep_amount,
)
time.sleep(sleep_amount)
return attempts_allowed

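The renamed helper now performs the sleep itself, so a caller only has to decide whether to retry. Below is a hedged usage sketch of the calling convention; download_with_retries, its naive restart-from-scratch behavior, and the requests streaming code are illustrative stand-ins, not the actual _download_file logic (which resumes partial downloads and reports progress).

from pathlib import Path

import requests

from dandi.download import _check_attempts_and_sleep

def download_with_retries(url: str, outpath: Path, attempts_allowed: int = 3) -> None:
    attempt = 0
    while True:
        downloaded_in_attempt = 0
        try:
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(outpath, "wb") as f:
                    for block in r.iter_content(chunk_size=1 << 20):
                        downloaded_in_attempt += len(block)
                        f.write(block)
            return  # success
        except requests.RequestException as exc:
            # Sleeps before returning whenever a retry is still allowed
            new_allowed = _check_attempts_and_sleep(
                path=outpath,
                exc=exc,
                attempt=attempt,
                attempts_allowed=attempts_allowed,
                downloaded_in_attempt=downloaded_in_attempt,
            )
            if new_allowed is None:
                raise  # non-retriable status or attempts exhausted
            attempts_allowed = new_allowed
            attempt += 1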
123 changes: 123 additions & 0 deletions dandi/tests/test_download.py
@@ -2,6 +2,8 @@

from collections.abc import Callable
from contextlib import nullcontext
from email.utils import parsedate_to_datetime
from functools import partial
from glob import glob
import json
import logging
@@ -12,10 +14,13 @@
import re
from shutil import rmtree
import time
from unittest import mock

import numpy as np
import pytest
from pytest_mock import MockerFixture
import requests
from requests.exceptions import HTTPError
import responses
import zarr

@@ -32,6 +37,7 @@
PathType,
ProgressCombiner,
PYOUTHelper,
_check_attempts_and_sleep,
download,
)
from ..exceptions import NotFoundError
@@ -1169,3 +1175,120 @@ def test_DownloadDirectory_exc(
assert dl.dirpath.exists()
assert dl.fp is None
assert dl.writefile.read_bytes() == b"456"


def test__check_attempts_and_sleep() -> None:
f = partial(_check_attempts_and_sleep, Path("some/path"))

response403 = requests.Response()
response403.status_code = 403 # no retry

response500 = requests.Response()
response500.status_code = 500

# we do retry if cause is unknown (no response)
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0

# or if some 500
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response500), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0

# do not bother if already at limit
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(), attempt=2, attempts_allowed=2) is None
mock_sleep.assert_not_called()

# do not bother if 403
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response403), attempt=1, attempts_allowed=2) is None
mock_sleep.assert_not_called()

# In the "aggressive" mode, i.e. when the DANDI_DOWNLOAD_AGGRESSIVE_RETRY
# env var is set to 1, we retry if extra content was downloaded in the last attempt
# patch env var DANDI_DOWNLOAD_AGGRESSIVE_RETRY
with mock.patch.dict(os.environ, {"DANDI_DOWNLOAD_AGGRESSIVE_RETRY": "1"}):
with mock.patch("time.sleep") as mock_sleep:
assert (
f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=0)
is None
)
mock_sleep.assert_not_called()

assert (
f(HTTPError(), attempt=2, attempts_allowed=2, downloaded_in_attempt=1)
== 3
)
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] >= 0


@pytest.mark.parametrize("status_code", [429, 503])
def test__check_attempts_and_sleep_retries(status_code: int) -> None:
f = partial(_check_attempts_and_sleep, Path("some/path"))

response = requests.Response()
response.status_code = status_code

response.headers["Retry-After"] = "10"
with mock.patch("time.sleep") as mock_sleep:
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] == 10

response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:28:00 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.utils.datetime"
) as mock_datetime:
# shifted by 2 minutes
mock_datetime.datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:26:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] == 120

# we would still sleep some time if Retry-After is not decipherable
response.headers["Retry-After"] = "indecipherable"
with mock.patch("time.sleep") as mock_sleep:
assert (
_check_attempts_and_sleep(
Path("some/path"),
HTTPError(response=response),
attempt=1,
attempts_allowed=2,
)
== 2
)
mock_sleep.assert_called_once()
assert mock_sleep.call_args.args[0] > 0

# shifted by 1 year! (too long)
response.headers["Retry-After"] = "Wed, 21 Oct 2016 07:28:00 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.utils.datetime"
) as mock_datetime:
mock_datetime.datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:28:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
# and we do sleep some time
assert mock_sleep.call_args.args[0] > 0

# in the past second (too quick)
response.headers["Retry-After"] = "Wed, 21 Oct 2015 07:27:59 GMT"
with mock.patch("time.sleep") as mock_sleep, mock.patch(
"dandi.utils.datetime"
) as mock_datetime:
mock_datetime.datetime.now.return_value = parsedate_to_datetime(
"Wed, 21 Oct 2015 07:28:00 GMT"
)
assert f(HTTPError(response=response), attempt=1, attempts_allowed=2) == 2
mock_sleep.assert_called_once()
# and we do not sleep really
assert not mock_sleep.call_args.args[0]