From b4cf114f68da5d1de6b53cdc4a208d270e0654d7 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 01:40:39 -0700 Subject: [PATCH] fix(vision): fail fast on non-retryable image download errors (#35221) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _download_image() wrapped every download attempt in a blanket `except Exception` and retried 3x with 2s/4s/8s backoff regardless of cause. A 404/403 image URL would never resolve on retry, so it just burned up to 6s of wall-clock + extra GETs before failing — inflating latency for a deterministic failure (issue #32296, umbrella #35114). Add _is_retryable_download_error(): 4xx client errors (except 429), website-policy PermissionError, and too-large/SSRF ValueError now raise on the first attempt. 429, 5xx, and unclassified network errors stay retryable. Removed the now-unreachable fall-through branch since the loop always returns on success or re-raises on the final/terminal attempt. --- tests/tools/test_vision_tools.py | 81 ++++++++++++++++++++++++++++++++ tools/vision_tools.py | 60 +++++++++++++++++------ 2 files changed, 127 insertions(+), 14 deletions(-) diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index e3bff50d56f..7a50a4b4630 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -917,3 +917,84 @@ class TestIsImageSizeError: def test_empty_message(self): assert not _is_image_size_error(Exception("")) + + +class TestDownloadRetryClassification: + """Error-class-aware retry: 4xx fail-fast, 429/5xx/transient retried (issue #32296).""" + + @staticmethod + def _status_error(status_code): + import httpx + + request = httpx.Request("GET", "https://example.com/img.jpg") + response = httpx.Response(status_code, request=request) + return httpx.HTTPStatusError( + f"{status_code}", request=request, response=response + ) + + def _make_client_raising_status(self, status_code): + """AsyncClient whose response.raise_for_status() raises HTTPStatusError.""" + mock_response = MagicMock() + mock_response.raise_for_status = MagicMock( + side_effect=self._status_error(status_code) + ) + mock_client = AsyncMock() + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + mock_client.get = AsyncMock(return_value=mock_response) + return mock_client + + def test_is_retryable_classification(self): + from tools.vision_tools import _is_retryable_download_error + + # Non-retryable client errors + for code in (400, 403, 404, 410): + assert _is_retryable_download_error(self._status_error(code)) is False + # Retryable: rate limit + server errors + for code in (429, 500, 502, 503): + assert _is_retryable_download_error(self._status_error(code)) is True + # Policy/SSRF/size errors are terminal + assert _is_retryable_download_error(PermissionError("blocked")) is False + assert _is_retryable_download_error(ValueError("too large")) is False + # Unclassified (network blip) is retryable + assert _is_retryable_download_error(ConnectionError("reset")) is True + + @pytest.mark.asyncio + async def test_404_fails_fast_without_retry(self, tmp_path): + """A 404 must raise on the first attempt — no backoff sleep, no extra GETs.""" + import httpx + from tools.vision_tools import _download_image + + mock_client = self._make_client_raising_status(404) + with ( + patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client), + patch("tools.vision_tools.check_website_access", return_value=None), + patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + pytest.raises(httpx.HTTPStatusError), + ): + await _download_image( + "https://example.com/missing.jpg", tmp_path / "x.jpg", max_retries=3 + ) + # Exactly one attempt, zero backoff sleeps. + assert mock_client.get.await_count == 1 + mock_sleep.assert_not_called() + + @pytest.mark.asyncio + async def test_503_retries_then_raises(self, tmp_path): + """A 5xx is retried up to max_retries, sleeping between attempts.""" + import httpx + from tools.vision_tools import _download_image + + mock_client = self._make_client_raising_status(503) + with ( + patch("tools.vision_tools.httpx.AsyncClient", return_value=mock_client), + patch("tools.vision_tools.check_website_access", return_value=None), + patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep, + pytest.raises(httpx.HTTPStatusError), + ): + await _download_image( + "https://example.com/flaky.jpg", tmp_path / "y.jpg", max_retries=3 + ) + # All three attempts used, two backoff sleeps between them. + assert mock_client.get.await_count == 3 + assert mock_sleep.await_count == 2 diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 986f9dab984..23a0508fed1 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -127,6 +127,30 @@ def _detect_image_mime_type(image_path: Path) -> Optional[str]: return None +def _is_retryable_download_error(error: Exception) -> bool: + """Return True only for transient image-download failures worth retrying. + + Non-retryable (fail-fast): + - httpx.HTTPStatusError with a 4xx status other than 429 (404/403/410/...): + the resource is missing or forbidden; retrying can't change that. + - PermissionError: blocked by website policy / SSRF guard. + - ValueError: image too large or blocked redirect — deterministic. + + Retryable (transient): + - httpx 429 (rate limited) and 5xx (server-side) errors. + - Connection/timeout/transport errors (httpx.TransportError) and any + other unclassified exception, which may be a flaky network blip. + """ + if isinstance(error, (PermissionError, ValueError)): + return False + if isinstance(error, httpx.HTTPStatusError): + status = error.response.status_code + if 400 <= status < 500 and status != 429: + return False + return True + return True + + async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path: """ Download an image from a URL to a local destination (async) with retry logic. @@ -210,24 +234,32 @@ async def _download_image(image_url: str, destination: Path, max_retries: int = return destination except Exception as e: last_error = e - if attempt < max_retries - 1: - wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s - logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50]) - logger.warning("Retrying in %ss...", wait_time) - await asyncio.sleep(wait_time) - else: + # Error-class-aware retry: only retry transient failures. A 4xx + # client error (404/403/410, etc.) will never succeed on retry — + # the resource isn't there or we're not allowed — so burning 3 + # attempts with 2s/4s/8s backoff just inflates latency. 429 (rate + # limit) and 5xx remain retryable. PermissionError (policy block) + # and ValueError (too-large / SSRF redirect) are also terminal. + if not _is_retryable_download_error(e) or attempt >= max_retries - 1: logger.error( - "Image download failed after %s attempts: %s", - max_retries, + "Image download failed after %s attempt(s): %s", + attempt + 1, str(e)[:100], exc_info=True, ) - - if last_error is None: - raise RuntimeError( - f"_download_image exited retry loop without attempting (max_retries={max_retries})" - ) - raise last_error + raise + wait_time = 2 ** (attempt + 1) # 2s, 4s, 8s + logger.warning("Image download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50]) + logger.warning("Retrying in %ss...", wait_time) + await asyncio.sleep(wait_time) + + # The loop always returns on success or re-raises on the final/non-retryable + # attempt, so reaching here means max_retries was non-positive. + if last_error is not None: + raise last_error + raise RuntimeError( + f"_download_image exited retry loop without attempting (max_retries={max_retries})" + ) def _determine_mime_type(image_path: Path) -> str: