fix(vision): cap embedded image size before it wedges a session (#35732)

Resize vision tool-result images down to a 4 MB embed cap at load time,
not just at the 20 MB hard ceiling. A 5-20 MB image previously sailed
through the native fast path and got baked into conversation history,
where Anthropic's 5 MB per-image base64 limit rejected every subsequent
turn with a 400 — and because history is immutable, retries could never
clear it, permanently wedging the session.

Also harden the reactive shrink-recovery: it now returns False (don't
retry) when any oversized image part can't be brought under target, so
the single retry isn't burned re-sending a payload that will fail
identically. Previously it returned True after shrinking *any* part,
even when the actual oversized culprit survived.
This commit is contained in:
Teknium 2026-05-31 00:12:09 -07:00 committed by GitHub
parent d4e7b2fc19
commit 0ffbcbbe7d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 134 additions and 4 deletions

View file

@ -273,3 +273,51 @@ class TestShrinkImagePartsHelper:
assert agent._try_shrink_image_parts_in_messages(msgs) is False
# Original URL still in place, not replaced by the bigger one.
assert msgs[0]["content"][0]["image_url"]["url"] == oversized_url
def test_mixed_one_shrinkable_one_not_returns_false(self, monkeypatch):
    """Regression for the wedged-session incident (May 2026).

    A single tool message carries two oversized image parts: one that can
    be shrunk and one that cannot. The helper must return False in that
    case, because retrying would re-send the surviving oversized payload
    and fail identically, burning the single retry on a no-op. The
    original bug returned True after shrinking *any* part, which is what
    permanently wedged a session whose history held a 12 MB tool-result
    image alongside a freshly-loaded shrinkable one.
    """
    agent = _make_agent()
    shrinkable_url = _big_png_data_url(5000)
    stuck_url = _big_png_data_url(6000)
    tiny_url = "data:image/jpeg;base64," + "C" * 500

    # Sources above this many bytes are treated as "cannot be reduced".
    stuck_threshold = 5500 * 1024

    def fake_resize(path, *args, **kwargs):
        # Stand-in for _resize_image_for_vision. The temp file written by
        # the helper contains the decoded bytes, so the two inputs are told
        # apart by file size: the 6000 KB source stays "big" and is echoed
        # back oversized, anything else comes back as the small payload.
        try:
            nbytes = path.stat().st_size
        except Exception:
            nbytes = 0
        return stuck_url if nbytes > stuck_threshold else tiny_url

    monkeypatch.setattr(
        "tools.vision_tools._resize_image_for_vision",
        fake_resize,
        raising=False,
    )

    image_parts = [
        {"type": "image_url", "image_url": {"url": shrinkable_url}},
        {"type": "image_url", "image_url": {"url": stuck_url}},
    ]
    msgs = [{"role": "tool", "content": image_parts}]

    # One part shrank, one survived oversized -> the helper must NOT ask
    # for a retry.
    should_retry = agent._try_shrink_image_parts_in_messages(msgs)
    assert should_retry is False

    first_part, second_part = msgs[0]["content"]
    # The shrinkable part was still re-encoded (mutated in place).
    assert first_part["image_url"]["url"] == tiny_url
    # The unshrinkable part is left as-is (caller surfaces original error).
    assert second_part["image_url"]["url"] == stuck_url

View file

@ -139,6 +139,44 @@ class TestVisionAnalyzeNative:
assert isinstance(result, dict)
assert result.get("_multimodal") is True
def test_oversized_image_resized_under_embed_cap(self, tmp_path):
    """Regression for the wedged-session incident (May 2026).

    A vision tool-result image is baked into conversation history and
    re-sent on every subsequent turn. Anthropic rejects any single base64
    image over 5 MB with a 400, and because history is immutable the bad
    bytes can't be cleared by retrying, so the session is permanently
    wedged. The native fast path must therefore resize down to the embed
    cap (well under 5 MB) BEFORE embedding, not just at the 20 MB hard
    ceiling.

    Skipped when Pillow isn't installed, since the proactive resize is a
    no-op in that case.
    """
    import pytest

    try:
        from PIL import Image
    except ImportError:
        pytest.skip("Pillow not installed — proactive resize is a no-op")
    from tools.vision_tools import _EMBED_TARGET_BYTES

    # Noisy PNG that base64-encodes to well over 5 MB (noise barely
    # compresses, so the file on disk stays large).
    big = tmp_path / "big.png"
    noise_image = Image.effect_noise((2600, 2600), 80).convert("RGB")
    noise_image.save(big, format="PNG")

    # base64 inflates the payload by roughly 4/3; make sure the fixture
    # really exceeds the provider limit before exercising the fast path.
    approx_base64_len = big.stat().st_size * 4 // 3
    assert approx_base64_len > 5 * 1024 * 1024, "test image not big enough"

    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(
        _vision_analyze_native(str(big), "describe")
    )
    assert isinstance(result, dict)
    assert result.get("_multimodal") is True

    # Pull the first embedded image URL out of the multimodal content.
    embedded_urls = (
        part["image_url"]["url"]
        for part in result["content"]
        if part.get("type") == "image_url"
    )
    url = next(embedded_urls)

    assert len(url) <= _EMBED_TARGET_BYTES, (
        f"embedded image {len(url) / 1024 / 1024:.1f} MB exceeds embed cap "
        f"{_EMBED_TARGET_BYTES / 1024 / 1024:.0f} MB — would wedge sessions on Anthropic"
    )
# ─── _handle_vision_analyze fast-path gating ─────────────────────────────────