diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 5c7d299f0a4..89bb4ceb55a 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -712,33 +712,58 @@ def try_shrink_image_parts_in_messages( # actually brought under the target. unshrinkable_oversized = 0 - def _shrink_data_url(url: str) -> Optional[str]: - """Return a smaller data URL, or None if shrink can't help.""" - if not isinstance(url, str) or not url.startswith("data:"): + def _decode_pixels(data_url: str) -> Optional[tuple]: + """Return ``(width, height)`` of a base64 data URL, or None on failure. + + Soft-depends on Pillow; returns None (caller falls back to a + bytes-only check) if Pillow is missing or the payload is corrupt. + """ + try: + import base64 as _b64_dim + import io as _io_dim + header_d, _, data_d = data_url.partition(",") + if not data_d or not data_url.startswith("data:"): + return None + from PIL import Image as _PILImage + with _PILImage.open(_io_dim.BytesIO(_b64_dim.b64decode(data_d))) as _img: + return _img.size + except Exception: return None - # Check both byte size AND pixel dimensions. + def _shrink_data_url(url: str) -> tuple: + """Return ``(resized_url, unshrinkable)`` for a data URL. + + ``resized_url`` is a smaller/dimension-correct data URL, or None when + no rewrite was applied. ``unshrinkable`` is True only when the image + exceeded a constraint (byte-size or dimensions) and the resize failed + to satisfy *that same* constraint — so the caller knows retrying is + pointless even if a different image in the request shrank. + """ + if not isinstance(url, str) or not url.startswith("data:"): + return None, False + + # Determine which constraint is binding. The accept/reject gate below + # MUST be checked against the same axis that triggered the shrink: a + # downscaled screenshot PNG routinely re-encodes to *more* bytes than + # the original (PNG compression is non-monotonic in image size — a + # smaller raster with LANCZOS resampling noise compresses worse than a + # larger smooth one). Rejecting a pixel-correct downscale purely + # because its bytes grew permanently wedges sessions on the Anthropic + # many-image 2000px path (#48013). needs_shrink = len(url) > target_bytes # over byte budget + triggered_by = "bytes" if needs_shrink else None if not needs_shrink: - # Even if bytes are fine, check pixel dimensions against the - # provider's reported per-side cap. A screenshot can be tiny in - # bytes yet too large in pixels. - try: - import base64 as _b64_dim - header_d, _, data_d = url.partition(",") - if not data_d: - return None - raw_d = _b64_dim.b64decode(data_d) - from PIL import Image as _PILImage - import io as _io_dim - with _PILImage.open(_io_dim.BytesIO(raw_d)) as _img: - if max(_img.size) <= max_dimension: - return None # both bytes and pixels are fine - needs_shrink = True # pixels exceed limit, force shrink - except Exception: - # If we can't check dimensions (Pillow unavailable, corrupt - # image, etc.), fall back to byte-only check. - return None + # Bytes are fine — check pixel dimensions against the provider's + # reported per-side cap. A screenshot can be tiny in bytes yet + # too large in pixels. + dims = _decode_pixels(url) + if dims is None: + # Pillow missing or corrupt data — fall back to byte-only. + return None, False + if max(dims) <= max_dimension: + return None, False # both bytes and pixels are within limits + needs_shrink = True + triggered_by = "dimension" try: header, _, data = url.partition(",") @@ -770,13 +795,45 @@ def try_shrink_image_parts_in_messages( Path(tmp.name).unlink(missing_ok=True) except Exception: pass - if not resized or len(resized) >= len(url): - # Shrink didn't help (or made it bigger — corrupt input?). - return None - return resized + if not resized: + # Resize returned nothing — Pillow couldn't help. + return None, True + if triggered_by == "bytes": + # Byte budget is the binding constraint — bytes must shrink. + if len(resized) >= len(url): + return None, True # re-encode made it bigger + # The per-side dimension cap is ALSO an active provider + # constraint on this request (the caller passes the parsed cap + # to both this helper and the resizer). _resize_image_for_vision + # returns a best-effort, possibly-over-cap blob when it + # exhausts its halving budget — it freezes the long side once + # the short side hits its 64px floor, so a very-high-aspect + # image can stay over the cap even after bytes shrank. If the + # output is still over the cap, retrying would re-400 on + # dimensions; treat it as unshrinkable. (Skip when dims can't + # be decoded — preserves historical byte-only behaviour.) + new_dims = _decode_pixels(resized) + if new_dims is not None and max(new_dims) > max_dimension: + return None, True + return resized, False + # triggered_by == "dimension": the per-side cap is binding. The + # re-encode may have grown in bytes; accept it as long as it is now + # within the dimension cap. Verify the new dimensions when we can. + new_dims = _decode_pixels(resized) + if new_dims is not None: + if max(new_dims) <= max_dimension: + return resized, False + # Still over the per-side cap — the resize didn't satisfy it. + return None, True + # Couldn't verify the re-encode's dimensions (corrupt output or + # Pillow gone mid-call). Fall back to the historical "bytes must + # shrink" gate so we never accept an unverifiable, byte-larger blob. + if len(resized) >= len(url): + return None, True + return resized, False except Exception as exc: logger.warning("image-shrink recovery: re-encode failed — %s", exc) - return None + return None, triggered_by is not None for msg in api_messages: if not isinstance(msg, dict): @@ -795,20 +852,18 @@ def try_shrink_image_parts_in_messages( # OpenAI Responses: {"image_url": "data:..."} if isinstance(image_value, dict): url = image_value.get("url", "") - resized = _shrink_data_url(url) + resized, unshrinkable = _shrink_data_url(url) if resized: image_value["url"] = resized changed_count += 1 - elif isinstance(url, str) and url.startswith("data:") \ - and len(url) > target_bytes: + elif unshrinkable: unshrinkable_oversized += 1 elif isinstance(image_value, str): - resized = _shrink_data_url(image_value) + resized, unshrinkable = _shrink_data_url(image_value) if resized: part["image_url"] = resized changed_count += 1 - elif image_value.startswith("data:") \ - and len(image_value) > target_bytes: + elif unshrinkable: unshrinkable_oversized += 1 if changed_count: diff --git a/tests/run_agent/repro_48013_image_shrink_brick.py b/tests/run_agent/repro_48013_image_shrink_brick.py new file mode 100644 index 00000000000..ee099f48dcf --- /dev/null +++ b/tests/run_agent/repro_48013_image_shrink_brick.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +"""Runnable proof for issue #48013 — image-dimension 400 session brick. + +Before the fix, ``agent.conversation_compression.try_shrink_image_parts_in_messages`` +silently discarded a *pixel-correct* downscale whenever the re-encoded PNG was +larger in bytes than the original (the common case for downscaled Retina +screenshots). The image was left at its original oversized dimensions, the +provider re-rejected it on retry, and the session wedged forever on the +Anthropic many-image 2000px path. + +This script reproduces the exact scenario with REAL Pillow (no mocks): it +synthesizes screenshot-like PNGs at the dimensions from the issue's table — +images that are small in bytes (under the 4 MB budget) but over the 2000px +per-side cap — and runs the real recovery helper. It asserts every image is +brought under the cap and that the helper reports success. + +Run directly to see a human-readable report: + + python tests/run_agent/repro_48013_image_shrink_brick.py + +Or as a pytest smoke test (skipped automatically when Pillow is absent): + + scripts/run_tests.sh tests/run_agent/repro_48013_image_shrink_brick.py +""" + +from __future__ import annotations + +import base64 +import io +import sys +from pathlib import Path + +import pytest + +# Make the repo root importable when run as a plain script. +_REPO_ROOT = Path(__file__).resolve().parents[2] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +PIL = pytest.importorskip("PIL", reason="Pillow required for the real-resize proof") +from PIL import Image, ImageDraw # noqa: E402 + +from agent.conversation_compression import ( # noqa: E402 + try_shrink_image_parts_in_messages, +) + +# The many-image per-side cap Anthropic reported in the wild (issue #48013). +MANY_IMAGE_CAP = 2000 +BYTE_BUDGET = 4 * 1024 * 1024 + +# Dimensions straight from the issue's per-image table. The "REJECTED" rows +# are the ones that bricked: tall/large screenshots whose downscale re-encodes +# to MORE PNG bytes than the original. +CASES = [ + (2344, 778), # wide — shrank even before the fix + (2374, 1144), # wide — shrank even before the fix + (2097, 1476), # REJECTED before fix + (2247, 1544), # REJECTED before fix + (2263, 1644), # REJECTED before fix +] + + +def _make_screenshot_png(width: int, height: int) -> bytes: + """A screenshot-like PNG: mostly flat UI regions so it compresses small. + + Flat regions keep the byte size well under the 4 MB budget, forcing the + DIMENSION path (not the byte path) — exactly the code that bricked. The + downscale of such an image re-encodes to a comparable-or-larger PNG, which + is what the old byte gate wrongly rejected. + """ + img = Image.new("RGB", (width, height), (245, 245, 247)) + draw = ImageDraw.Draw(img) + for y in range(0, height, 40): + shade = 255 - (y // 40) % 6 * 4 + draw.rectangle([20, y + 5, width - 20, y + 30], fill=(shade, 250, 250)) + for x in range(0, width, 160): + draw.rectangle([x, 0, x + 2, height], fill=(220, 220, 225)) + draw.text((40, 40), "Some UI text " * 30, fill=(20, 20, 20)) + buf = io.BytesIO() + img.save(buf, format="PNG", optimize=False) + return buf.getvalue() + + +def _data_url(raw: bytes) -> str: + return "data:image/png;base64," + base64.b64encode(raw).decode("ascii") + + +def _decode_dims(data_url: str) -> tuple[int, int]: + payload = data_url.partition(",")[2] + with Image.open(io.BytesIO(base64.b64decode(payload))) as img: + return img.size + + +def run_proof(verbose: bool = False) -> list[dict]: + """Run the recovery against every case; return per-case results.""" + results: list[dict] = [] + for width, height in CASES: + raw = _make_screenshot_png(width, height) + url = _data_url(raw) + # Sanity: this case must be UNDER the byte budget and OVER the pixel cap, + # i.e. it exercises the dimension path that bricked. + under_byte_budget = len(url) <= BYTE_BUDGET + over_pixel_cap = max(width, height) > MANY_IMAGE_CAP + + msgs = [{ + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": url}}], + }] + changed = try_shrink_image_parts_in_messages( + msgs, max_dimension=MANY_IMAGE_CAP, + ) + out_url = msgs[0]["content"][0]["image_url"]["url"] + out_dims = _decode_dims(out_url) + + result = { + "orig": (width, height), + "orig_bytes": len(raw), + "under_byte_budget": under_byte_budget, + "over_pixel_cap": over_pixel_cap, + "changed": changed, + "result_dims": out_dims, + "under_cap_after": max(out_dims) <= MANY_IMAGE_CAP, + } + results.append(result) + if verbose: + status = "OK" if result["under_cap_after"] else "BRICK" + print( + f" {width}x{height} ({len(raw)//1024:>3} KB)" + f" -> changed={changed!s:>5}" + f" result={out_dims[0]}x{out_dims[1]}" + f" [{status}]" + ) + return results + + +def test_issue_48013_dimension_shrink_does_not_brick(): + """Every dimension-oversized screenshot must be brought under the cap.""" + results = run_proof() + assert results, "no cases ran" + for r in results: + # Precondition: we really are on the dimension path. + assert r["under_byte_budget"], ( + f"{r['orig']} must be under the byte budget to exercise the bug" + ) + assert r["over_pixel_cap"], f"{r['orig']} must exceed the pixel cap" + # The fix: image lands under the cap and the helper reports success. + assert r["under_cap_after"], ( + f"BRICK: {r['orig']} left at {r['result_dims']} " + f"(> {MANY_IMAGE_CAP}px) — the shrink recovery discarded a " + f"pixel-correct downscale (#48013)" + ) + assert r["changed"] is True, ( + f"{r['orig']} shrank but helper reported no progress — caller " + f"would surface the original error and burn the one-shot retry" + ) + + +def main() -> int: + print("Issue #48013 proof — image-dimension shrink must not brick sessions") + print(f"(many-image per-side cap = {MANY_IMAGE_CAP}px, byte budget = " + f"{BYTE_BUDGET // (1024 * 1024)} MB)\n") + results = run_proof(verbose=True) + bricked = [r for r in results if not r["under_cap_after"]] + no_progress = [r for r in results if r["under_cap_after"] and not r["changed"]] + print() + if bricked: + print(f"FAIL: {len(bricked)} image(s) still over the pixel cap (BRICK).") + return 1 + if no_progress: + print(f"FAIL: {len(no_progress)} image(s) shrank but helper reported " + f"no progress (would burn the retry).") + return 1 + print(f"PASS: all {len(results)} dimension-oversized screenshots brought " + f"under {MANY_IMAGE_CAP}px and reported as progress.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/run_agent/test_image_shrink_recovery.py b/tests/run_agent/test_image_shrink_recovery.py index 240546ea14c..24f8b7e242d 100644 --- a/tests/run_agent/test_image_shrink_recovery.py +++ b/tests/run_agent/test_image_shrink_recovery.py @@ -108,11 +108,36 @@ def _big_png_data_url(size_kb: int) -> str: return "data:image/png;base64," + base64.b64encode(raw).decode("ascii") -def _install_fake_pillow(monkeypatch, size: tuple[int, int]) -> None: - """Install the tiny subset of Pillow used by the shrink preflight.""" +def _install_fake_pillow( + monkeypatch, + size: tuple[int, int], + *, + shrunk_size: tuple[int, int] | None = None, + sizes: list[tuple[int, int]] | None = None, +) -> None: + """Install the tiny subset of Pillow used by the shrink preflight. + + The shrink helper decodes pixel dimensions twice for the dimension path: + once on the *original* data URL (to decide it's oversized) and once on the + *re-encoded* result (to confirm the downscale landed under the cap). To + model that honestly, ``_FakeImage`` can return a sequence of sizes across + successive ``open()`` calls: + + * ``sizes=[...]`` — explicit per-call size list (clamped to last). + * ``shrunk_size=(w, h)`` — shorthand for ``[size, shrunk_size]``: first + decode is the oversized original, second is the in-cap re-encode. + * neither — every decode returns ``size`` (legacy behaviour). + """ + call_count = {"n": 0} + target_sizes = sizes or [ + size, + shrunk_size if shrunk_size is not None else size, + ] + class _FakeImage: def __init__(self): - self.size = size + self.size = target_sizes[min(call_count["n"], len(target_sizes) - 1)] + call_count["n"] += 1 def __enter__(self): return self @@ -203,9 +228,10 @@ class TestShrinkImagePartsHelper: assert msgs[0]["content"][1]["image_url"]["url"] == shrunk def test_many_image_dimension_limit_rewritten(self, monkeypatch): - """A 2000px many-image rejection must shrink images below 8000px.""" + """A 2000px many-image rejection must shrink images below the cap.""" agent = _make_agent() - _install_fake_pillow(monkeypatch, (2501, 100)) + # Original decodes oversized (2501px); the re-encode decodes in-cap. + _install_fake_pillow(monkeypatch, (2501, 100), shrunk_size=(1500, 60)) oversized_for_many = _big_png_data_url(100) shrunk = "data:image/jpeg;base64," + "M" * 1000 seen = {} @@ -392,3 +418,200 @@ class TestShrinkImagePartsHelper: assert msgs[0]["content"][0]["image_url"]["url"] == small # The unshrinkable one is left as-is (caller surfaces original error). assert msgs[0]["content"][1]["image_url"]["url"] == unshrinkable + + # ------------------------------------------------------------------ + # #48013: the dimension path must accept a pixel-correct downscale even + # when the re-encoded PNG grew in bytes. Before the fix, the byte gate + # (`len(resized) >= len(url)`) discarded the dimension-correct result and + # left the image oversized, bricking the session on the Anthropic + # many-image 2000px path. + # ------------------------------------------------------------------ + + def test_dimension_shrink_with_byte_growth_accepted(self, monkeypatch): + """A dimension-driven shrink is accepted even if its bytes grow. + + Regression for #48013. The original (2501px, under the 4 MB byte + budget) is oversized on pixels only. The re-encode lands at 1500px + (in-cap) but is *larger in bytes* — the historical byte gate would + reject it. The fix keys the accept gate on the binding constraint + (dimensions), so the pixel-correct result is kept. + """ + agent = _make_agent() + _install_fake_pillow(monkeypatch, (2501, 100), shrunk_size=(1500, 60)) + original_url = _big_png_data_url(100) # ~100 KB → well under 4 MB + # A *byte-larger* re-encode (the brick trigger): 200 KB payload. + dimensionally_shrunk = "data:image/png;base64," + "G" * 200 * 1024 + seen = {} + + def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None): + seen["max_dimension"] = max_dimension + return dimensionally_shrunk + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + _fake_resize, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": original_url}}, + ], + }] + # The re-encode is byte-LARGER than the original — proves the byte gate + # is no longer the rejection driver on the dimension path. + assert len(dimensionally_shrunk) > len(original_url) + assert agent._try_shrink_image_parts_in_messages( + msgs, max_dimension=2000, + ) is True + assert seen["max_dimension"] == 2000 + assert msgs[0]["content"][0]["image_url"]["url"] == dimensionally_shrunk + + def test_dimension_shrink_failure_still_blocks_retry(self, monkeypatch): + """A dimension-oversized image that stays oversized is unshrinkable. + + If the re-encode is *still* over the per-side cap, the helper must + report no progress (return False) so the one-shot retry isn't burned + re-sending a payload the provider already rejected. + """ + agent = _make_agent() + # Both decodes report oversized: original and re-encode are 2501px. + _install_fake_pillow(monkeypatch, (2501, 100)) + original_url = _big_png_data_url(100) + still_oversized = "data:image/png;base64," + "H" * 120 * 1024 + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: still_oversized, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": original_url}}, + ], + }] + assert agent._try_shrink_image_parts_in_messages( + msgs, max_dimension=2000, + ) is False + # Original left untouched — caller surfaces the provider's 400. + assert msgs[0]["content"][0]["image_url"]["url"] == original_url + + def test_mixed_dimension_partial_progress_returns_false(self, monkeypatch): + """Partial dimension-path progress must not falsely burn the retry. + + Two dimension-oversized images: the first re-encodes in-cap, the + second stays oversized. Even though one part changed, an oversized + image survives, so retrying would 400 again — the helper must report + False. (Mirrors the byte-path + ``test_mixed_one_shrinkable_one_not_returns_false`` invariant for the + pixel axis.) + """ + agent = _make_agent() + # Decode order: img1 orig (2501) -> img1 re-encode (1500, in-cap) -> + # img2 orig (2501) -> img2 re-encode (2501, still over). + _install_fake_pillow( + monkeypatch, + (2501, 100), + sizes=[(2501, 100), (1500, 60), (2501, 100), (2501, 100)], + ) + first = _big_png_data_url(100) + second = _big_png_data_url(90) + calls = {"n": 0} + + def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None): + calls["n"] += 1 + if calls["n"] == 1: + return "data:image/png;base64," + "G" * 200 * 1024 # in-cap + return "data:image/png;base64," + "H" * 120 * 1024 # still over + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + _fake_resize, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": first}}, + {"type": "image_url", "image_url": {"url": second}}, + ], + }] + assert agent._try_shrink_image_parts_in_messages( + msgs, max_dimension=2000, + ) is False + + def test_byte_oversized_but_pixel_oversized_after_shrink_blocks_retry(self, monkeypatch): + """Bytes-triggered shrink must ALSO honour the active per-side cap. + + Adversarial-review regression (#48013, round 2): an image over BOTH the + 4 MB byte budget AND the per-side pixel cap can be byte-shrunk yet stay + over the cap (``_resize_image_for_vision`` returns a best-effort blob + when it exhausts its halving budget on a very-high-aspect image). The + byte-path accept gate originally checked only ``len(resized) < len(url)`` + and reported success, so the caller retried and the provider re-rejected + on dimensions — re-bricking the session. The fix re-checks the pixel + cap on the byte path too; a still-over-cap result must be unshrinkable. + """ + agent = _make_agent() + # On the BYTE path, _decode_pixels is called once — on the RESIZED blob. + # Script that single decode to report still-over-cap dims (2560 > 2000). + _install_fake_pillow(monkeypatch, (2560, 64), sizes=[(2560, 64)]) + # Over the 4 MB byte budget so the BYTE path is taken (triggered_by="bytes"). + oversized_url = _big_png_data_url(5000) # ~5 MB raw → ~6.7 MB b64 + # Byte-SMALLER re-encode, but its decoded dims are still over the cap. + byte_smaller_still_over = "data:image/png;base64," + "K" * 1000 + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: byte_smaller_still_over, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": oversized_url}}, + ], + }] + # Bytes shrank, but the per-side cap is still violated → no real + # progress; the helper must NOT report success (would burn the retry). + assert len(byte_smaller_still_over) < len(oversized_url) + assert agent._try_shrink_image_parts_in_messages( + msgs, max_dimension=2000, + ) is False + # Original left in place — caller surfaces the provider's 400. + assert msgs[0]["content"][0]["image_url"]["url"] == oversized_url + + def test_byte_oversized_with_no_dim_cap_accepts_byte_shrink(self, monkeypatch): + """Bytes path with the default 8000px cap still accepts a byte shrink. + + Guards the fix above against over-reach: when no tight dimension cap is + active (default 8000px) and the byte-shrunk re-encode is comfortably + within it, the byte path must keep accepting on byte-shrinkage alone. + """ + agent = _make_agent() + # Byte path → single _decode_pixels call on the resized blob; report + # in-cap dims so the byte-shrink is accepted under the default 8000 cap. + _install_fake_pillow(monkeypatch, (1250, 50), sizes=[(1250, 50)]) + oversized_url = _big_png_data_url(5000) + shrunk = "data:image/jpeg;base64," + "L" * 1000 + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + lambda *a, **kw: shrunk, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": oversized_url}}, + ], + }] + # Default cap (8000) — no explicit max_dimension passed. + assert agent._try_shrink_image_parts_in_messages(msgs) is True + assert msgs[0]["content"][0]["image_url"]["url"] == shrunk