hermes-agent/tests/agent/test_preflight_compression_gate.py
texhy aacc6bb0a8 fix(agent): trigger preflight compression on few-but-huge sessions (#27405)
The preflight-compression gate only ran the (expensive) token estimate when
the message COUNT exceeded protect_first_n + protect_last_n + 1. A session
with a handful of very large messages never tripped the count condition, so
compression was never attempted and the turn eventually hit a hard
context-overflow error.

Add _should_run_preflight_estimate() with OR semantics: run the estimate when
either the message count exceeds the protected ranges (the historical gate)
OR a cheap char-based estimate already crosses the configured threshold. The
downstream estimate_request_tokens_rough() stays authoritative — this is only
a hint that decides whether to pay for the full estimate.

Salvaged from #27435 by @texhy (authorship preserved). Re-applied on current
main: the preflight gate moved from conversation_loop.py to turn_context.py
since the PR was opened, so the helper + gate are placed there; the test
imports the real MINIMUM_CONTEXT_LENGTH instead of a hardcoded literal.

Closes #27405.
2026-06-25 01:20:23 +05:30

109 lines
4.5 KiB
Python

"""Regression tests for issue #27405.
The preflight compression gate must trigger when *either* the message
count exceeds the protected ranges OR the cheap char-based token
estimate already crosses the configured threshold. Pre-fix, only the
message-count condition was checked, so a session with a small number
of huge messages would silently skip compression and eventually hit a
hard context-overflow error.
"""
from agent.turn_context import _should_run_preflight_estimate
# Protected-range counts mirror the compressor defaults. THRESHOLD_TOKENS is an
# arbitrary test threshold passed explicitly into the helper — it is NOT the
# live runtime threshold (which is max(0.5*window, MINIMUM_CONTEXT_LENGTH) per
# model); the helper takes the threshold as a parameter so the tests are
# self-contained and independent of model metadata.
#
# With these values the message-count boundary exercised by the tests is
# PROTECT_FIRST_N + PROTECT_LAST_N + 1 == 24 messages.
PROTECT_FIRST_N: int = 3
PROTECT_LAST_N: int = 20
THRESHOLD_TOKENS: int = 64_000
def _msg(content: str) -> dict:
return {"role": "user", "content": content}
def test_few_messages_huge_content_triggers_gate():
    """The bug from #27405: 8 messages with one massive content blob."""
    # ~280K chars in one message ~= 70K tokens at 4 chars/token.
    big = "x" * 280_000
    messages = [_msg("hi")] * 7 + [_msg(big)]
    # Sanity check: the message count alone would fail the old gate, so a True
    # result below can only come from the content-size branch.
    assert len(messages) <= PROTECT_FIRST_N + PROTECT_LAST_N + 1
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
def test_few_messages_small_content_does_not_trigger():
    """Regression guard: tiny sessions should not pay the estimator cost."""
    # 8 short messages: below the count boundary and far below the threshold.
    messages = [_msg("hello world")] * 8
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
def test_many_small_messages_still_triggers_via_count():
    """The historical path: > protect_first + protect_last + 1 messages."""
    # 3 + 20 + 2 == 25 tiny messages: one past the count boundary, so the
    # gate must fire on count alone.
    messages = [_msg("ok")] * (PROTECT_FIRST_N + PROTECT_LAST_N + 2)
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
def test_content_above_threshold_triggers():
    """A single message comfortably above the threshold trips branch (b)."""
    # ~threshold*4 chars => ~threshold tokens; +1000 tokens of margin so the
    # test doesn't depend on per-message dict-wrapping overhead in the
    # shared estimator's (chars+3)//4 rounding.
    messages = [_msg("x" * ((THRESHOLD_TOKENS + 1000) * 4))]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
def test_content_below_threshold_does_not_trigger():
    """A single message comfortably below the threshold must not trigger.

    With only one message the count gate is not tripped either, so the
    helper is expected to return False.
    """
    # 1000 tokens (4000 chars) of margin below the threshold.
    messages = [_msg("x" * ((THRESHOLD_TOKENS - 1000) * 4))]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
def test_message_with_none_content_is_treated_as_empty():
    """Assistant turns mid-tool-call carry content=None -- must not crash."""
    # Five content-less messages: few in number and carrying no text, so the
    # helper should return False rather than raise.
    messages = [{"role": "assistant", "content": None}] * 5
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
def test_message_with_list_content_counts_text_parts():
    """Multimodal content lists: the shared estimator digs into text parts.

    estimate_messages_tokens_rough walks list content (rather than str()-ing
    the whole list), so a huge text part is counted by its real length and an
    image part is counted at a flat per-image cost — not its base64 length.
    """
    # 300K chars ~= 75K tokens at 4 chars/token, above THRESHOLD_TOKENS.
    parts = [{"type": "text", "text": "x" * 300_000}]
    messages = [{"role": "user", "content": parts}]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
def test_large_base64_image_does_not_falsely_trip_gate():
    """Regression for the inline-estimator bug: a single ~1MB base64 image
    must NOT be mistaken for ~250K tokens. The shared estimator counts images
    at a flat per-image cost, so one screenshot in a tiny session stays below
    the threshold and the gate does not fire on content size alone.
    """
    big_b64 = "A" * 1_000_000  # ~1MB base64 payload
    parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{big_b64}"},
        }
    ]
    messages = [{"role": "user", "content": parts}]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False