fix(agent): trigger preflight compression on few-but-huge sessions (#27405)

The preflight-compression gate only ran the (expensive) token estimate when
the message COUNT exceeded protect_first_n + protect_last_n + 1. A session
with a handful of very large messages never tripped the count condition, so
compression was never attempted and the turn eventually hit a hard
context-overflow error.

Add _should_run_preflight_estimate() with OR semantics: run the estimate when
either the message count exceeds the protected ranges (the historical gate)
OR a cheap char-based estimate already crosses the configured threshold. The
downstream estimate_request_tokens_rough() stays authoritative — this is only
a hint that decides whether to pay for the full estimate.

Salvaged from #27435 by @texhy (authorship preserved). Re-applied on current
main: the preflight gate moved from conversation_loop.py to turn_context.py
since the PR was opened, so the helper + gate are placed there; the test
imports the real MINIMUM_CONTEXT_LENGTH instead of a hardcoded literal.

Closes #27405.
This commit is contained in:
texhy 2026-06-25 01:10:13 +05:30 committed by kshitijk4poor
parent b13e2fd694
commit aacc6bb0a8
3 changed files with 150 additions and 5 deletions

View file

@ -29,7 +29,10 @@ from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from agent.iteration_budget import IterationBudget
from agent.model_metadata import estimate_request_tokens_rough
from agent.model_metadata import (
estimate_messages_tokens_rough,
estimate_request_tokens_rough,
)
logger = logging.getLogger(__name__)
@ -57,6 +60,34 @@ def _compression_made_progress(
return orig_tokens > 0 and new_tokens < orig_tokens * 0.95
def _should_run_preflight_estimate(
messages: List[Dict[str, Any]],
protect_first_n: int,
protect_last_n: int,
threshold_tokens: int,
) -> bool:
"""Cheap gate for the (expensive) full preflight token estimate.
Returns ``True`` when either:
(a) message count exceeds the protected ranges (the historical gate), or
(b) a cheap char-based estimate already crosses the configured threshold
the few-but-huge case from issue #27405 that the count-only gate
would silently skip (a handful of very large messages never trips
the count condition, so compression was never attempted and the
turn hit a hard context-overflow error).
Branch (b) uses ``estimate_messages_tokens_rough`` (the shared char-based
estimator) so a single large base64 image isn't mistaken for ~250K tokens.
It intentionally undercounts vs. the full request estimate it omits the
system prompt and tool schemas because it is only a *hint* deciding
whether to pay for the authoritative ``estimate_request_tokens_rough``,
which (together with ``should_compress``) makes the real decision.
"""
if len(messages) > protect_first_n + protect_last_n + 1:
return True
return estimate_messages_tokens_rough(messages) >= threshold_tokens
@dataclass
class TurnContext:
"""Values produced by the turn prologue and consumed by the turn loop."""
@ -289,10 +320,14 @@ def build_turn_context(
)
# ── Preflight context compression ──
if (
agent.compression_enabled
and len(messages) > agent.context_compressor.protect_first_n
+ agent.context_compressor.protect_last_n + 1
# Gate the (expensive) full token estimate behind a cheap pre-check.
# See ``_should_run_preflight_estimate`` for the OR semantics that fix
# issue #27405 (a few very large messages slipping past the count gate).
if agent.compression_enabled and _should_run_preflight_estimate(
messages,
agent.context_compressor.protect_first_n,
agent.context_compressor.protect_last_n,
agent.context_compressor.threshold_tokens,
):
_preflight_tokens = estimate_request_tokens_rough(
messages,

View file

@ -64,6 +64,7 @@ AUTHOR_MAP = {
"rayjun0412@gmail.com": "rayjun", # cron model.default salvage co-author (#43952)
"96944678+sweetcornna@users.noreply.github.com": "sweetcornna", # cron ticker-liveness salvage co-author (#33849)
"izumi0uu@gmail.com": "izumi0uu", # PR #49544 salvage (native rich reply echo; #49534)
"dev@pixlmedia.no": "texhy", # PR #27435 salvage (few-but-huge preflight compression gate; #27405)
"w31rdm4ch1n3z@protonmail.com": "w31rdm4ch1nZ",
"xtpeeps@gmail.com": "x7peeps",
"ahmad@madsgency.com": "ahmadashfq",

View file

@ -0,0 +1,109 @@
"""Regression tests for issue #27405.
The preflight compression gate must trigger when *either* the message
count exceeds the protected ranges OR the cheap char-based token
estimate already crosses the configured threshold. Pre-fix, only the
message-count condition was checked, so a session with a small number
of huge messages would silently skip compression and eventually hit a
hard context-overflow error.
"""
from agent.turn_context import _should_run_preflight_estimate
# Protected-range counts mirror the compressor defaults. THRESHOLD_TOKENS is an
# arbitrary test threshold passed explicitly into the helper — it is NOT the
# live runtime threshold (which is max(0.5*window, MINIMUM_CONTEXT_LENGTH) per
# model); the helper takes the threshold as a parameter so the tests are
# self-contained and independent of model metadata.
PROTECT_FIRST_N = 3
PROTECT_LAST_N = 20
THRESHOLD_TOKENS = 64_000
def _msg(content: str) -> dict:
return {"role": "user", "content": content}
def test_few_messages_huge_content_triggers_gate():
"""The bug from #27405: 8 messages with one massive content blob."""
# ~280K chars in one message ~= 70K tokens at 4 chars/token.
big = "x" * 280_000
messages = [_msg("hi")] * 7 + [_msg(big)]
assert len(messages) <= PROTECT_FIRST_N + PROTECT_LAST_N + 1 # would fail old gate
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is True
def test_few_messages_small_content_does_not_trigger():
"""Regression guard: tiny sessions should not pay the estimator cost."""
messages = [_msg("hello world")] * 8
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is False
def test_many_small_messages_still_triggers_via_count():
"""The historical path: > protect_first + protect_last + 1 messages."""
messages = [_msg("ok")] * (PROTECT_FIRST_N + PROTECT_LAST_N + 2) # 25
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is True
def test_content_above_threshold_triggers():
"""A single message comfortably above the threshold trips branch (b)."""
# ~threshold*4 chars => ~threshold tokens; +1000 tokens of margin so the
# test doesn't depend on per-message dict-wrapping overhead in the
# shared estimator's (chars+3)//4 rounding.
messages = [_msg("x" * ((THRESHOLD_TOKENS + 1000) * 4))]
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is True
def test_content_below_threshold_does_not_trigger():
"""A single message comfortably below the threshold (and few messages)
must not trigger the estimator stays under and the count gate is not
tripped."""
messages = [_msg("x" * ((THRESHOLD_TOKENS - 1000) * 4))]
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is False
def test_message_with_none_content_is_treated_as_empty():
"""Assistant turns mid-tool-call carry content=None -- must not crash."""
messages = [{"role": "assistant", "content": None}] * 5
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is False
def test_message_with_list_content_counts_text_parts():
"""Multimodal content lists: the shared estimator digs into text parts.
estimate_messages_tokens_rough walks list content (rather than str()-ing
the whole list), so a huge text part is counted by its real length and an
image part is counted at a flat per-image cost not its base64 length.
"""
parts = [{"type": "text", "text": "x" * 300_000}]
messages = [{"role": "user", "content": parts}]
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is True
def test_large_base64_image_does_not_falsely_trip_gate():
"""Regression for the inline-estimator bug: a single ~1MB base64 image
must NOT be mistaken for ~250K tokens. The shared estimator counts images
at a flat per-image cost, so one screenshot in a tiny session stays below
the threshold and the gate does not fire on content size alone.
"""
big_b64 = "A" * 1_000_000 # ~1MB base64 payload
parts = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{big_b64}"}}]
messages = [{"role": "user", "content": parts}]
assert _should_run_preflight_estimate(
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
) is False