fix(agent): trigger preflight compression on few-but-huge sessions (#27405)

The preflight-compression gate only ran the (expensive) token estimate when the message COUNT exceeded protect_first_n + protect_last_n + 1. A session with a handful of very large messages never tripped the count condition, so compression was never attempted and the turn eventually hit a hard context-overflow error. Add _should_run_preflight_estimate() with OR semantics: run the estimate when either the message count exceeds the protected ranges (the historical gate) OR a cheap char-based estimate already crosses the configured threshold. The downstream estimate_request_tokens_rough() stays authoritative — this is only a hint that decides whether to pay for the full estimate. Salvaged from #27435 by @texhy (authorship preserved). Re-applied on current main: the preflight gate moved from conversation_loop.py to turn_context.py since the PR was opened, so the helper + gate are placed there; the test imports the real MINIMUM_CONTEXT_LENGTH instead of a hardcoded literal. Closes #27405.
2026-06-28 11:32:22 +00:00 · 2026-06-25 01:10:13 +05:30 · 2026-06-25 01:10:13 +05:30 · aacc6bb0a8
commit aacc6bb0a8
parent b13e2fd694
3 changed files with 150 additions and 5 deletions
--- a/agent/turn_context.py
+++ b/agent/turn_context.py
@ -29,7 +29,10 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional

 from agent.iteration_budget import IterationBudget
-from agent.model_metadata import estimate_request_tokens_rough
+from agent.model_metadata import (
+    estimate_messages_tokens_rough,
+    estimate_request_tokens_rough,
+)

 logger = logging.getLogger(__name__)

@ -57,6 +60,34 @@ def _compression_made_progress(
    return orig_tokens > 0 and new_tokens < orig_tokens * 0.95


+def _should_run_preflight_estimate(
+    messages: List[Dict[str, Any]],
+    protect_first_n: int,
+    protect_last_n: int,
+    threshold_tokens: int,
+) -> bool:
+    """Cheap gate for the (expensive) full preflight token estimate.
+
+    Returns ``True`` when either:
+      (a) message count exceeds the protected ranges (the historical gate), or
+      (b) a cheap char-based estimate already crosses the configured threshold
+          — the few-but-huge case from issue #27405 that the count-only gate
+          would silently skip (a handful of very large messages never trips
+          the count condition, so compression was never attempted and the
+          turn hit a hard context-overflow error).
+
+    Branch (b) uses ``estimate_messages_tokens_rough`` (the shared char-based
+    estimator) so a single large base64 image isn't mistaken for ~250K tokens.
+    It intentionally undercounts vs. the full request estimate — it omits the
+    system prompt and tool schemas — because it is only a *hint* deciding
+    whether to pay for the authoritative ``estimate_request_tokens_rough``,
+    which (together with ``should_compress``) makes the real decision.
+    """
+    if len(messages) > protect_first_n + protect_last_n + 1:
+        return True
+    return estimate_messages_tokens_rough(messages) >= threshold_tokens
+
+
@dataclass
 class TurnContext:
    """Values produced by the turn prologue and consumed by the turn loop."""
@ -289,10 +320,14 @@ def build_turn_context(
        )

    # ── Preflight context compression ──
-    if (
-        agent.compression_enabled
-        and len(messages) > agent.context_compressor.protect_first_n
-                            + agent.context_compressor.protect_last_n + 1
+    # Gate the (expensive) full token estimate behind a cheap pre-check.
+    # See ``_should_run_preflight_estimate`` for the OR semantics that fix
+    # issue #27405 (a few very large messages slipping past the count gate).
+    if agent.compression_enabled and _should_run_preflight_estimate(
+        messages,
+        agent.context_compressor.protect_first_n,
+        agent.context_compressor.protect_last_n,
+        agent.context_compressor.threshold_tokens,
    ):
        _preflight_tokens = estimate_request_tokens_rough(
            messages,
--- a/scripts/release.py
+++ b/scripts/release.py
@ -64,6 +64,7 @@ AUTHOR_MAP = {
    "rayjun0412@gmail.com": "rayjun",  # cron model.default salvage co-author (#43952)
    "96944678+sweetcornna@users.noreply.github.com": "sweetcornna",  # cron ticker-liveness salvage co-author (#33849)
    "izumi0uu@gmail.com": "izumi0uu",  # PR #49544 salvage (native rich reply echo; #49534)
+    "dev@pixlmedia.no": "texhy",  # PR #27435 salvage (few-but-huge preflight compression gate; #27405)
    "w31rdm4ch1n3z@protonmail.com": "w31rdm4ch1nZ",
    "xtpeeps@gmail.com": "x7peeps",
    "ahmad@madsgency.com": "ahmadashfq",
--- a/tests/agent/test_preflight_compression_gate.py
+++ b/tests/agent/test_preflight_compression_gate.py
@ -0,0 +1,109 @@
+"""Regression tests for issue #27405.
+
+The preflight compression gate must trigger when *either* the message
+count exceeds the protected ranges OR the cheap char-based token
+estimate already crosses the configured threshold. Pre-fix, only the
+message-count condition was checked, so a session with a small number
+of huge messages would silently skip compression and eventually hit a
+hard context-overflow error.
+"""
+
+from agent.turn_context import _should_run_preflight_estimate
+
+
+# Protected-range counts mirror the compressor defaults. THRESHOLD_TOKENS is an
+# arbitrary test threshold passed explicitly into the helper — it is NOT the
+# live runtime threshold (which is max(0.5*window, MINIMUM_CONTEXT_LENGTH) per
+# model); the helper takes the threshold as a parameter so the tests are
+# self-contained and independent of model metadata.
+PROTECT_FIRST_N = 3
+PROTECT_LAST_N = 20
+THRESHOLD_TOKENS = 64_000
+
+
+def _msg(content: str) -> dict:
+    return {"role": "user", "content": content}
+
+
+def test_few_messages_huge_content_triggers_gate():
+    """The bug from #27405: 8 messages with one massive content blob."""
+    # ~280K chars in one message ~= 70K tokens at 4 chars/token.
+    big = "x" * 280_000
+    messages = [_msg("hi")] * 7 + [_msg(big)]
+    assert len(messages) <= PROTECT_FIRST_N + PROTECT_LAST_N + 1  # would fail old gate
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is True
+
+
+def test_few_messages_small_content_does_not_trigger():
+    """Regression guard: tiny sessions should not pay the estimator cost."""
+    messages = [_msg("hello world")] * 8
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is False
+
+
+def test_many_small_messages_still_triggers_via_count():
+    """The historical path: > protect_first + protect_last + 1 messages."""
+    messages = [_msg("ok")] * (PROTECT_FIRST_N + PROTECT_LAST_N + 2)  # 25
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is True
+
+
+def test_content_above_threshold_triggers():
+    """A single message comfortably above the threshold trips branch (b)."""
+    # ~threshold*4 chars => ~threshold tokens; +1000 tokens of margin so the
+    # test doesn't depend on per-message dict-wrapping overhead in the
+    # shared estimator's (chars+3)//4 rounding.
+    messages = [_msg("x" * ((THRESHOLD_TOKENS + 1000) * 4))]
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is True
+
+
+def test_content_below_threshold_does_not_trigger():
+    """A single message comfortably below the threshold (and few messages)
+    must not trigger — the estimator stays under and the count gate is not
+    tripped."""
+    messages = [_msg("x" * ((THRESHOLD_TOKENS - 1000) * 4))]
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is False
+
+
+def test_message_with_none_content_is_treated_as_empty():
+    """Assistant turns mid-tool-call carry content=None -- must not crash."""
+    messages = [{"role": "assistant", "content": None}] * 5
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is False
+
+
+def test_message_with_list_content_counts_text_parts():
+    """Multimodal content lists: the shared estimator digs into text parts.
+
+    estimate_messages_tokens_rough walks list content (rather than str()-ing
+    the whole list), so a huge text part is counted by its real length and an
+    image part is counted at a flat per-image cost — not its base64 length.
+    """
+    parts = [{"type": "text", "text": "x" * 300_000}]
+    messages = [{"role": "user", "content": parts}]
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is True
+
+
+def test_large_base64_image_does_not_falsely_trip_gate():
+    """Regression for the inline-estimator bug: a single ~1MB base64 image
+    must NOT be mistaken for ~250K tokens. The shared estimator counts images
+    at a flat per-image cost, so one screenshot in a tiny session stays below
+    the threshold and the gate does not fire on content size alone.
+    """
+    big_b64 = "A" * 1_000_000  # ~1MB base64 payload
+    parts = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{big_b64}"}}]
+    messages = [{"role": "user", "content": parts}]
+    assert _should_run_preflight_estimate(
+        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
+    ) is False