mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-13 09:01:54 +00:00
fix(vision): fail fast on non-retryable image download errors (#35221)
_download_image() wrapped every download attempt in a blanket `except Exception` and retried 3x with 2s/4s/8s backoff regardless of cause. A 404/403 image URL would never resolve on retry, so it just burned up to 6s of wall-clock + extra GETs before failing — inflating latency for a deterministic failure (issue #32296, umbrella #35114). Add _is_retryable_download_error(): 4xx client errors (except 429), website-policy PermissionError, and too-large/SSRF ValueError now raise on the first attempt. 429, 5xx, and unclassified network errors stay retryable. Removed the now-unreachable fall-through branch since the loop always returns on success or re-raises on the final/terminal attempt.
This commit is contained in:
parent
e481b15333
commit
b4cf114f68
2 changed files with 127 additions and 14 deletions
|
|
@ -917,3 +917,84 @@ class TestIsImageSizeError:
|
|||
|
||||
def test_empty_message(self):
|
||||
assert not _is_image_size_error(Exception(""))
|
||||
|
||||
|
||||
class TestDownloadRetryClassification:
|
||||
"""Error-class-aware retry: 4xx fail-fast, 429/5xx/transient retried (issue #32296)."""
|
||||
|
||||
@staticmethod
|
||||
def _status_error(status_code):
|
||||
import httpx
|
||||
|
||||
request = httpx.Request("GET", "https://example.com/img.jpg")
|
||||
response = httpx.Response(status_code, request=request)
|
||||
return httpx.HTTPStatusError(
|
||||
f"{status_code}", request=request, response=response
|
||||
)
|
||||
|
||||
def _make_client_raising_status(self, status_code):
|
||||
"""AsyncClient whose response.raise_for_status() raises HTTPStatusError."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.raise_for_status = MagicMock(
|
||||
side_effect=self._status_error(status_code)
|
||||
)
|
||||
mock_client = AsyncMock()
|
||||
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||
mock_client.get = AsyncMock(return_value=mock_response)
|
||||
return mock_client
|
||||
|
||||
def test_is_retryable_classification(self):
|
||||
from tools.vision_tools import _is_retryable_download_error
|
||||
|
||||
# Non-retryable client errors
|
||||
for code in (400, 403, 404, 410):
|
||||
assert _is_retryable_download_error(self._status_error(code)) is False
|
||||
# Retryable: rate limit + server errors
|
||||
for code in (429, 500, 502, 503):
|
||||
assert _is_retryable_download_error(self._status_error(code)) is True
|
||||
# Policy/SSRF/size errors are terminal
|
||||
assert _is_retryable_download_error(PermissionError("blocked")) is False
|
||||
assert _is_retryable_download_error(ValueError("too large")) is False
|
||||
# Unclassified (network blip) is retryable
|
||||
assert _is_retryable_download_error(ConnectionError("reset")) is True
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_404_fails_fast_without_retry(self, tmp_path):
|
||||
"""A 404 must raise on the first attempt — no backoff sleep, no extra GETs."""
|
||||
import httpx
|
||||
from tools.vision_tools import _download_image
|
||||
|
||||
mock_client = self._make_client_raising_status(404)
|
||||
with (
|
||||
patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client),
|
||||
patch("tools.vision_tools.check_website_access", return_value=None),
|
||||
patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep,
|
||||
pytest.raises(httpx.HTTPStatusError),
|
||||
):
|
||||
await _download_image(
|
||||
"https://example.com/missing.jpg", tmp_path / "x.jpg", max_retries=3
|
||||
)
|
||||
# Exactly one attempt, zero backoff sleeps.
|
||||
assert mock_client.get.await_count == 1
|
||||
mock_sleep.assert_not_called()
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_503_retries_then_raises(self, tmp_path):
|
||||
"""A 5xx is retried up to max_retries, sleeping between attempts."""
|
||||
import httpx
|
||||
from tools.vision_tools import _download_image
|
||||
|
||||
mock_client = self._make_client_raising_status(503)
|
||||
with (
|
||||
patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client),
|
||||
patch("tools.vision_tools.check_website_access", return_value=None),
|
||||
patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep,
|
||||
pytest.raises(httpx.HTTPStatusError),
|
||||
):
|
||||
await _download_image(
|
||||
"https://example.com/flaky.jpg", tmp_path / "y.jpg", max_retries=3
|
||||
)
|
||||
# All three attempts used, two backoff sleeps between them.
|
||||
assert mock_client.get.await_count == 3
|
||||
assert mock_sleep.await_count == 2
|
||||
|
|
|
|||
|
|
@ -127,6 +127,30 @@ def _detect_image_mime_type(image_path: Path) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def _is_retryable_download_error(error: Exception) -> bool:
|
||||
"""Return True only for transient image-download failures worth retrying.
|
||||
|
||||
Non-retryable (fail-fast):
|
||||
- httpx.HTTPStatusError with a 4xx status other than 429 (404/403/410/...):
|
||||
the resource is missing or forbidden; retrying can't change that.
|
||||
- PermissionError: blocked by website policy / SSRF guard.
|
||||
- ValueError: image too large or blocked redirect — deterministic.
|
||||
|
||||
Retryable (transient):
|
||||
- httpx 429 (rate limited) and 5xx (server-side) errors.
|
||||
- Connection/timeout/transport errors (httpx.TransportError) and any
|
||||
other unclassified exception, which may be a flaky network blip.
|
||||
"""
|
||||
if isinstance(error, (PermissionError, ValueError)):
|
||||
return False
|
||||
if isinstance(error, httpx.HTTPStatusError):
|
||||
status = error.response.status_code
|
||||
if 400 <= status < 500 and status != 429:
|
||||
return False
|
||||
return True
|
||||
return True
|
||||
|
||||
|
||||
async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path:
|
||||
"""
|
||||
Download an image from a URL to a local destination (async) with retry logic.
|
||||
|
|
@ -210,24 +234,32 @@ async def _download_image(image_url: str, destination: Path, max_retries: int =
|
|||
return destination
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s
|
||||
logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
|
||||
logger.warning("Retrying in %ss...", wait_time)
|
||||
await asyncio.sleep(wait_time)
|
||||
else:
|
||||
# Error-class-aware retry: only retry transient failures. A 4xx
|
||||
# client error (404/403/410, etc.) will never succeed on retry —
|
||||
# the resource isn't there or we're not allowed — so burning 3
|
||||
# attempts with 2s/4s/8s backoff just inflates latency. 429 (rate
|
||||
# limit) and 5xx remain retryable. PermissionError (policy block)
|
||||
# and ValueError (too-large / SSRF redirect) are also terminal.
|
||||
if not _is_retryable_download_error(e) or attempt >= max_retries - 1:
|
||||
logger.error(
|
||||
"Image download failed after %s attempts: %s",
|
||||
max_retries,
|
||||
"Image download failed after %s attempt(s): %s",
|
||||
attempt + 1,
|
||||
str(e)[:100],
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
if last_error is None:
|
||||
raise RuntimeError(
|
||||
f"_download_image exited retry loop without attempting (max_retries={max_retries})"
|
||||
)
|
||||
raise last_error
|
||||
raise
|
||||
wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s
|
||||
logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
|
||||
logger.warning("Retrying in %ss...", wait_time)
|
||||
await asyncio.sleep(wait_time)
|
||||
|
||||
# The loop always returns on success or re-raises on the final/non-retryable
|
||||
# attempt, so reaching here means max_retries was non-positive.
|
||||
if last_error is not None:
|
||||
raise last_error
|
||||
raise RuntimeError(
|
||||
f"_download_image exited retry loop without attempting (max_retries={max_retries})"
|
||||
)
|
||||
|
||||
|
||||
def _determine_mime_type(image_path: Path) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue