diff --git a/agent/turn_context.py b/agent/turn_context.py index 368b8f33c34..8b81a45bd22 100644 --- a/agent/turn_context.py +++ b/agent/turn_context.py @@ -29,7 +29,10 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional from agent.iteration_budget import IterationBudget -from agent.model_metadata import estimate_request_tokens_rough +from agent.model_metadata import ( + estimate_messages_tokens_rough, + estimate_request_tokens_rough, +) logger = logging.getLogger(__name__) @@ -57,6 +60,34 @@ def _compression_made_progress( return orig_tokens > 0 and new_tokens < orig_tokens * 0.95 +def _should_run_preflight_estimate( + messages: List[Dict[str, Any]], + protect_first_n: int, + protect_last_n: int, + threshold_tokens: int, +) -> bool: + """Cheap gate for the (expensive) full preflight token estimate. + + Returns ``True`` when either: + (a) message count exceeds the protected ranges (the historical gate), or + (b) a cheap char-based estimate already crosses the configured threshold + — the few-but-huge case from issue #27405 that the count-only gate + would silently skip (a handful of very large messages never trips + the count condition, so compression was never attempted and the + turn hit a hard context-overflow error). + + Branch (b) uses ``estimate_messages_tokens_rough`` (the shared char-based + estimator) so a single large base64 image isn't mistaken for ~250K tokens. + It intentionally undercounts vs. the full request estimate — it omits the + system prompt and tool schemas — because it is only a *hint* deciding + whether to pay for the authoritative ``estimate_request_tokens_rough``, + which (together with ``should_compress``) makes the real decision. + """ + if len(messages) > protect_first_n + protect_last_n + 1: + return True + return estimate_messages_tokens_rough(messages) >= threshold_tokens + + @dataclass class TurnContext: """Values produced by the turn prologue and consumed by the turn loop.""" @@ -289,10 +320,14 @@ def build_turn_context( ) # ── Preflight context compression ── - if ( - agent.compression_enabled - and len(messages) > agent.context_compressor.protect_first_n - + agent.context_compressor.protect_last_n + 1 + # Gate the (expensive) full token estimate behind a cheap pre-check. + # See ``_should_run_preflight_estimate`` for the OR semantics that fix + # issue #27405 (a few very large messages slipping past the count gate). + if agent.compression_enabled and _should_run_preflight_estimate( + messages, + agent.context_compressor.protect_first_n, + agent.context_compressor.protect_last_n, + agent.context_compressor.threshold_tokens, ): _preflight_tokens = estimate_request_tokens_rough( messages, diff --git a/scripts/release.py b/scripts/release.py index d0ee8bf0cf9..68d9e44528b 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -64,6 +64,7 @@ AUTHOR_MAP = { "rayjun0412@gmail.com": "rayjun", # cron model.default salvage co-author (#43952) "96944678+sweetcornna@users.noreply.github.com": "sweetcornna", # cron ticker-liveness salvage co-author (#33849) "izumi0uu@gmail.com": "izumi0uu", # PR #49544 salvage (native rich reply echo; #49534) + "dev@pixlmedia.no": "texhy", # PR #27435 salvage (few-but-huge preflight compression gate; #27405) "w31rdm4ch1n3z@protonmail.com": "w31rdm4ch1nZ", "xtpeeps@gmail.com": "x7peeps", "ahmad@madsgency.com": "ahmadashfq", diff --git a/tests/agent/test_preflight_compression_gate.py b/tests/agent/test_preflight_compression_gate.py new file mode 100644 index 00000000000..727a728e8bb --- /dev/null +++ b/tests/agent/test_preflight_compression_gate.py @@ -0,0 +1,109 @@ +"""Regression tests for issue #27405. + +The preflight compression gate must trigger when *either* the message +count exceeds the protected ranges OR the cheap char-based token +estimate already crosses the configured threshold. Pre-fix, only the +message-count condition was checked, so a session with a small number +of huge messages would silently skip compression and eventually hit a +hard context-overflow error. +""" + +from agent.turn_context import _should_run_preflight_estimate + + +# Protected-range counts mirror the compressor defaults. THRESHOLD_TOKENS is an +# arbitrary test threshold passed explicitly into the helper — it is NOT the +# live runtime threshold (which is max(0.5*window, MINIMUM_CONTEXT_LENGTH) per +# model); the helper takes the threshold as a parameter so the tests are +# self-contained and independent of model metadata. +PROTECT_FIRST_N = 3 +PROTECT_LAST_N = 20 +THRESHOLD_TOKENS = 64_000 + + +def _msg(content: str) -> dict: + return {"role": "user", "content": content} + + +def test_few_messages_huge_content_triggers_gate(): + """The bug from #27405: 8 messages with one massive content blob.""" + # ~280K chars in one message ~= 70K tokens at 4 chars/token. + big = "x" * 280_000 + messages = [_msg("hi")] * 7 + [_msg(big)] + assert len(messages) <= PROTECT_FIRST_N + PROTECT_LAST_N + 1 # would fail old gate + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is True + + +def test_few_messages_small_content_does_not_trigger(): + """Regression guard: tiny sessions should not pay the estimator cost.""" + messages = [_msg("hello world")] * 8 + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is False + + +def test_many_small_messages_still_triggers_via_count(): + """The historical path: > protect_first + protect_last + 1 messages.""" + messages = [_msg("ok")] * (PROTECT_FIRST_N + PROTECT_LAST_N + 2) # 25 + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is True + + +def test_content_above_threshold_triggers(): + """A single message comfortably above the threshold trips branch (b).""" + # ~threshold*4 chars => ~threshold tokens; +1000 tokens of margin so the + # test doesn't depend on per-message dict-wrapping overhead in the + # shared estimator's (chars+3)//4 rounding. + messages = [_msg("x" * ((THRESHOLD_TOKENS + 1000) * 4))] + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is True + + +def test_content_below_threshold_does_not_trigger(): + """A single message comfortably below the threshold (and few messages) + must not trigger — the estimator stays under and the count gate is not + tripped.""" + messages = [_msg("x" * ((THRESHOLD_TOKENS - 1000) * 4))] + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is False + + +def test_message_with_none_content_is_treated_as_empty(): + """Assistant turns mid-tool-call carry content=None -- must not crash.""" + messages = [{"role": "assistant", "content": None}] * 5 + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is False + + +def test_message_with_list_content_counts_text_parts(): + """Multimodal content lists: the shared estimator digs into text parts. + + estimate_messages_tokens_rough walks list content (rather than str()-ing + the whole list), so a huge text part is counted by its real length and an + image part is counted at a flat per-image cost — not its base64 length. + """ + parts = [{"type": "text", "text": "x" * 300_000}] + messages = [{"role": "user", "content": parts}] + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is True + + +def test_large_base64_image_does_not_falsely_trip_gate(): + """Regression for the inline-estimator bug: a single ~1MB base64 image + must NOT be mistaken for ~250K tokens. The shared estimator counts images + at a flat per-image cost, so one screenshot in a tiny session stays below + the threshold and the gate does not fire on content size alone. + """ + big_b64 = "A" * 1_000_000 # ~1MB base64 payload + parts = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{big_b64}"}}] + messages = [{"role": "user", "content": parts}] + assert _should_run_preflight_estimate( + messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS + ) is False