fix(vision): fail fast on non-retryable image download errors (#35221)

_download_image() wrapped every download attempt in a blanket
`except Exception` and retried 3x with 2s/4s/8s backoff regardless of
cause. A 404/403 image URL would never resolve on retry, so it just
burned up to 6s of wall-clock + extra GETs before failing — inflating
latency for a deterministic failure (issue #32296, umbrella #35114).

Add _is_retryable_download_error(): 4xx client errors (except 429),
website-policy PermissionError, and too-large/SSRF ValueError now raise
on the first attempt. 429, 5xx, and unclassified network errors stay
retryable. Removed the now-unreachable fall-through branch since the
loop always returns on success or re-raises on the final/terminal attempt.
This commit is contained in:
Teknium 2026-05-30 01:40:39 -07:00 committed by GitHub
parent e481b15333
commit b4cf114f68
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 127 additions and 14 deletions

View file

@ -917,3 +917,84 @@ class TestIsImageSizeError:
def test_empty_message(self):
assert not _is_image_size_error(Exception(""))
class TestDownloadRetryClassification:
"""Error-class-aware retry: 4xx fail-fast, 429/5xx/transient retried (issue #32296)."""
@staticmethod
def _status_error(status_code):
import httpx
request = httpx.Request("GET", "https://example.com/img.jpg")
response = httpx.Response(status_code, request=request)
return httpx.HTTPStatusError(
f"{status_code}", request=request, response=response
)
def _make_client_raising_status(self, status_code):
"""AsyncClient whose response.raise_for_status() raises HTTPStatusError."""
mock_response = MagicMock()
mock_response.raise_for_status = MagicMock(
side_effect=self._status_error(status_code)
)
mock_client = AsyncMock()
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client.get = AsyncMock(return_value=mock_response)
return mock_client
def test_is_retryable_classification(self):
from tools.vision_tools import _is_retryable_download_error
# Non-retryable client errors
for code in (400, 403, 404, 410):
assert _is_retryable_download_error(self._status_error(code)) is False
# Retryable: rate limit + server errors
for code in (429, 500, 502, 503):
assert _is_retryable_download_error(self._status_error(code)) is True
# Policy/SSRF/size errors are terminal
assert _is_retryable_download_error(PermissionError("blocked")) is False
assert _is_retryable_download_error(ValueError("too large")) is False
# Unclassified (network blip) is retryable
assert _is_retryable_download_error(ConnectionError("reset")) is True
@pytest.mark.asyncio
async def test_404_fails_fast_without_retry(self, tmp_path):
"""A 404 must raise on the first attempt — no backoff sleep, no extra GETs."""
import httpx
from tools.vision_tools import _download_image
mock_client = self._make_client_raising_status(404)
with (
patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client),
patch("tools.vision_tools.check_website_access", return_value=None),
patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep,
pytest.raises(httpx.HTTPStatusError),
):
await _download_image(
"https://example.com/missing.jpg", tmp_path / "x.jpg", max_retries=3
)
# Exactly one attempt, zero backoff sleeps.
assert mock_client.get.await_count == 1
mock_sleep.assert_not_called()
@pytest.mark.asyncio
async def test_503_retries_then_raises(self, tmp_path):
"""A 5xx is retried up to max_retries, sleeping between attempts."""
import httpx
from tools.vision_tools import _download_image
mock_client = self._make_client_raising_status(503)
with (
patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client),
patch("tools.vision_tools.check_website_access", return_value=None),
patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep,
pytest.raises(httpx.HTTPStatusError),
):
await _download_image(
"https://example.com/flaky.jpg", tmp_path / "y.jpg", max_retries=3
)
# All three attempts used, two backoff sleeps between them.
assert mock_client.get.await_count == 3
assert mock_sleep.await_count == 2

View file

@ -127,6 +127,30 @@ def _detect_image_mime_type(image_path: Path) -> Optional[str]:
return None
def _is_retryable_download_error(error: Exception) -> bool:
"""Return True only for transient image-download failures worth retrying.
Non-retryable (fail-fast):
- httpx.HTTPStatusError with a 4xx status other than 429 (404/403/410/...):
the resource is missing or forbidden; retrying can't change that.
- PermissionError: blocked by website policy / SSRF guard.
- ValueError: image too large or blocked redirect deterministic.
Retryable (transient):
- httpx 429 (rate limited) and 5xx (server-side) errors.
- Connection/timeout/transport errors (httpx.TransportError) and any
other unclassified exception, which may be a flaky network blip.
"""
if isinstance(error, (PermissionError, ValueError)):
return False
if isinstance(error, httpx.HTTPStatusError):
status = error.response.status_code
if 400 <= status < 500 and status != 429:
return False
return True
return True
async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path:
"""
Download an image from a URL to a local destination (async) with retry logic.
@ -210,24 +234,32 @@ async def _download_image(image_url: str, destination: Path, max_retries: int =
return destination
except Exception as e:
last_error = e
if attempt < max_retries - 1:
wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s
logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
logger.warning("Retrying in %ss...", wait_time)
await asyncio.sleep(wait_time)
else:
# Error-class-aware retry: only retry transient failures. A 4xx
# client error (404/403/410, etc.) will never succeed on retry —
# the resource isn't there or we're not allowed — so burning 3
# attempts with 2s/4s/8s backoff just inflates latency. 429 (rate
# limit) and 5xx remain retryable. PermissionError (policy block)
# and ValueError (too-large / SSRF redirect) are also terminal.
if not _is_retryable_download_error(e) or attempt >= max_retries - 1:
logger.error(
"Image download failed after %s attempts: %s",
max_retries,
"Image download failed after %s attempt(s): %s",
attempt + 1,
str(e)[:100],
exc_info=True,
)
if last_error is None:
raise RuntimeError(
f"_download_image exited retry loop without attempting (max_retries={max_retries})"
)
raise last_error
raise
wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s
logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
logger.warning("Retrying in %ss...", wait_time)
await asyncio.sleep(wait_time)
# The loop always returns on success or re-raises on the final/non-retryable
# attempt, so reaching here means max_retries was non-positive.
if last_error is not None:
raise last_error
raise RuntimeError(
f"_download_image exited retry loop without attempting (max_retries={max_retries})"
)
def _determine_mime_type(image_path: Path) -> str: