mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-28 11:32:22 +00:00
fix(agent): trigger preflight compression on few-but-huge sessions (#27405)
The preflight-compression gate only ran the (expensive) token estimate when the message COUNT exceeded protect_first_n + protect_last_n + 1. A session with a handful of very large messages never tripped the count condition, so compression was never attempted and the turn eventually hit a hard context-overflow error. Add _should_run_preflight_estimate() with OR semantics: run the estimate when either the message count exceeds the protected ranges (the historical gate) OR a cheap char-based estimate already crosses the configured threshold. The downstream estimate_request_tokens_rough() stays authoritative — this is only a hint that decides whether to pay for the full estimate. Salvaged from #27435 by @texhy (authorship preserved). Re-applied on current main: the preflight gate moved from conversation_loop.py to turn_context.py since the PR was opened, so the helper + gate are placed there; the test imports the real MINIMUM_CONTEXT_LENGTH instead of a hardcoded literal. Closes #27405.
This commit is contained in:
parent
b13e2fd694
commit
aacc6bb0a8
3 changed files with 150 additions and 5 deletions
|
|
@ -29,7 +29,10 @@ from dataclasses import dataclass
|
|||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.iteration_budget import IterationBudget
|
||||
from agent.model_metadata import estimate_request_tokens_rough
|
||||
from agent.model_metadata import (
|
||||
estimate_messages_tokens_rough,
|
||||
estimate_request_tokens_rough,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -57,6 +60,34 @@ def _compression_made_progress(
|
|||
return orig_tokens > 0 and new_tokens < orig_tokens * 0.95
|
||||
|
||||
|
||||
def _should_run_preflight_estimate(
|
||||
messages: List[Dict[str, Any]],
|
||||
protect_first_n: int,
|
||||
protect_last_n: int,
|
||||
threshold_tokens: int,
|
||||
) -> bool:
|
||||
"""Cheap gate for the (expensive) full preflight token estimate.
|
||||
|
||||
Returns ``True`` when either:
|
||||
(a) message count exceeds the protected ranges (the historical gate), or
|
||||
(b) a cheap char-based estimate already crosses the configured threshold
|
||||
— the few-but-huge case from issue #27405 that the count-only gate
|
||||
would silently skip (a handful of very large messages never trips
|
||||
the count condition, so compression was never attempted and the
|
||||
turn hit a hard context-overflow error).
|
||||
|
||||
Branch (b) uses ``estimate_messages_tokens_rough`` (the shared char-based
|
||||
estimator) so a single large base64 image isn't mistaken for ~250K tokens.
|
||||
It intentionally undercounts vs. the full request estimate — it omits the
|
||||
system prompt and tool schemas — because it is only a *hint* deciding
|
||||
whether to pay for the authoritative ``estimate_request_tokens_rough``,
|
||||
which (together with ``should_compress``) makes the real decision.
|
||||
"""
|
||||
if len(messages) > protect_first_n + protect_last_n + 1:
|
||||
return True
|
||||
return estimate_messages_tokens_rough(messages) >= threshold_tokens
|
||||
|
||||
|
||||
@dataclass
|
||||
class TurnContext:
|
||||
"""Values produced by the turn prologue and consumed by the turn loop."""
|
||||
|
|
@ -289,10 +320,14 @@ def build_turn_context(
|
|||
)
|
||||
|
||||
# ── Preflight context compression ──
|
||||
if (
|
||||
agent.compression_enabled
|
||||
and len(messages) > agent.context_compressor.protect_first_n
|
||||
+ agent.context_compressor.protect_last_n + 1
|
||||
# Gate the (expensive) full token estimate behind a cheap pre-check.
|
||||
# See ``_should_run_preflight_estimate`` for the OR semantics that fix
|
||||
# issue #27405 (a few very large messages slipping past the count gate).
|
||||
if agent.compression_enabled and _should_run_preflight_estimate(
|
||||
messages,
|
||||
agent.context_compressor.protect_first_n,
|
||||
agent.context_compressor.protect_last_n,
|
||||
agent.context_compressor.threshold_tokens,
|
||||
):
|
||||
_preflight_tokens = estimate_request_tokens_rough(
|
||||
messages,
|
||||
|
|
|
|||
|
|
@ -64,6 +64,7 @@ AUTHOR_MAP = {
|
|||
"rayjun0412@gmail.com": "rayjun", # cron model.default salvage co-author (#43952)
|
||||
"96944678+sweetcornna@users.noreply.github.com": "sweetcornna", # cron ticker-liveness salvage co-author (#33849)
|
||||
"izumi0uu@gmail.com": "izumi0uu", # PR #49544 salvage (native rich reply echo; #49534)
|
||||
"dev@pixlmedia.no": "texhy", # PR #27435 salvage (few-but-huge preflight compression gate; #27405)
|
||||
"w31rdm4ch1n3z@protonmail.com": "w31rdm4ch1nZ",
|
||||
"xtpeeps@gmail.com": "x7peeps",
|
||||
"ahmad@madsgency.com": "ahmadashfq",
|
||||
|
|
|
|||
109
tests/agent/test_preflight_compression_gate.py
Normal file
109
tests/agent/test_preflight_compression_gate.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
"""Regression tests for issue #27405.
|
||||
|
||||
The preflight compression gate must trigger when *either* the message
|
||||
count exceeds the protected ranges OR the cheap char-based token
|
||||
estimate already crosses the configured threshold. Pre-fix, only the
|
||||
message-count condition was checked, so a session with a small number
|
||||
of huge messages would silently skip compression and eventually hit a
|
||||
hard context-overflow error.
|
||||
"""
|
||||
|
||||
from agent.turn_context import _should_run_preflight_estimate
|
||||
|
||||
|
||||
# Protected-range counts mirror the compressor defaults. THRESHOLD_TOKENS is an
|
||||
# arbitrary test threshold passed explicitly into the helper — it is NOT the
|
||||
# live runtime threshold (which is max(0.5*window, MINIMUM_CONTEXT_LENGTH) per
|
||||
# model); the helper takes the threshold as a parameter so the tests are
|
||||
# self-contained and independent of model metadata.
|
||||
PROTECT_FIRST_N = 3
|
||||
PROTECT_LAST_N = 20
|
||||
THRESHOLD_TOKENS = 64_000
|
||||
|
||||
|
||||
def _msg(content: str) -> dict:
|
||||
return {"role": "user", "content": content}
|
||||
|
||||
|
||||
def test_few_messages_huge_content_triggers_gate():
|
||||
"""The bug from #27405: 8 messages with one massive content blob."""
|
||||
# ~280K chars in one message ~= 70K tokens at 4 chars/token.
|
||||
big = "x" * 280_000
|
||||
messages = [_msg("hi")] * 7 + [_msg(big)]
|
||||
assert len(messages) <= PROTECT_FIRST_N + PROTECT_LAST_N + 1 # would fail old gate
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is True
|
||||
|
||||
|
||||
def test_few_messages_small_content_does_not_trigger():
|
||||
"""Regression guard: tiny sessions should not pay the estimator cost."""
|
||||
messages = [_msg("hello world")] * 8
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is False
|
||||
|
||||
|
||||
def test_many_small_messages_still_triggers_via_count():
|
||||
"""The historical path: > protect_first + protect_last + 1 messages."""
|
||||
messages = [_msg("ok")] * (PROTECT_FIRST_N + PROTECT_LAST_N + 2) # 25
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is True
|
||||
|
||||
|
||||
def test_content_above_threshold_triggers():
|
||||
"""A single message comfortably above the threshold trips branch (b)."""
|
||||
# ~threshold*4 chars => ~threshold tokens; +1000 tokens of margin so the
|
||||
# test doesn't depend on per-message dict-wrapping overhead in the
|
||||
# shared estimator's (chars+3)//4 rounding.
|
||||
messages = [_msg("x" * ((THRESHOLD_TOKENS + 1000) * 4))]
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is True
|
||||
|
||||
|
||||
def test_content_below_threshold_does_not_trigger():
|
||||
"""A single message comfortably below the threshold (and few messages)
|
||||
must not trigger — the estimator stays under and the count gate is not
|
||||
tripped."""
|
||||
messages = [_msg("x" * ((THRESHOLD_TOKENS - 1000) * 4))]
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is False
|
||||
|
||||
|
||||
def test_message_with_none_content_is_treated_as_empty():
|
||||
"""Assistant turns mid-tool-call carry content=None -- must not crash."""
|
||||
messages = [{"role": "assistant", "content": None}] * 5
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is False
|
||||
|
||||
|
||||
def test_message_with_list_content_counts_text_parts():
|
||||
"""Multimodal content lists: the shared estimator digs into text parts.
|
||||
|
||||
estimate_messages_tokens_rough walks list content (rather than str()-ing
|
||||
the whole list), so a huge text part is counted by its real length and an
|
||||
image part is counted at a flat per-image cost — not its base64 length.
|
||||
"""
|
||||
parts = [{"type": "text", "text": "x" * 300_000}]
|
||||
messages = [{"role": "user", "content": parts}]
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is True
|
||||
|
||||
|
||||
def test_large_base64_image_does_not_falsely_trip_gate():
|
||||
"""Regression for the inline-estimator bug: a single ~1MB base64 image
|
||||
must NOT be mistaken for ~250K tokens. The shared estimator counts images
|
||||
at a flat per-image cost, so one screenshot in a tiny session stays below
|
||||
the threshold and the gate does not fire on content size alone.
|
||||
"""
|
||||
big_b64 = "A" * 1_000_000 # ~1MB base64 payload
|
||||
parts = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{big_b64}"}}]
|
||||
messages = [{"role": "user", "content": parts}]
|
||||
assert _should_run_preflight_estimate(
|
||||
messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
|
||||
) is False
|
||||
Loading…
Add table
Add a link
Reference in a new issue