mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
The preflight-compression gate only ran the (expensive) token estimate when the message COUNT exceeded protect_first_n + protect_last_n + 1. A session with a handful of very large messages never tripped the count condition, so compression was never attempted and the turn eventually hit a hard context-overflow error. Add _should_run_preflight_estimate() with OR semantics: run the estimate when either the message count exceeds the protected ranges (the historical gate) OR a cheap char-based estimate already crosses the configured threshold. The downstream estimate_request_tokens_rough() stays authoritative — this is only a hint that decides whether to pay for the full estimate. Salvaged from #27435 by @texhy (authorship preserved). Re-applied on current main: the preflight gate moved from conversation_loop.py to turn_context.py since the PR was opened, so the helper + gate are placed there; the test imports the real MINIMUM_CONTEXT_LENGTH instead of a hardcoded literal. Closes #27405.
109 lines
4.5 KiB
Python
109 lines
4.5 KiB
Python
"""Regression tests for issue #27405.

The preflight compression gate must trigger when *either* the message
count exceeds the protected ranges OR the cheap char-based token
estimate already crosses the configured threshold. Pre-fix, only the
message-count condition was checked, so a session with a small number
of huge messages would silently skip compression and eventually hit a
hard context-overflow error.
"""
|
|
|
|
from agent.turn_context import _should_run_preflight_estimate
|
|
|
|
|
|
# Inputs shared by every test in this module.
#
# The protected-range counts mirror the compressor defaults. THRESHOLD_TOKENS
# is an arbitrary test threshold passed explicitly into the helper -- it is NOT
# the live runtime threshold (which is
# max(0.5 * window, MINIMUM_CONTEXT_LENGTH) per model). The helper takes the
# threshold as a parameter, so the tests are self-contained and independent of
# model metadata.
PROTECT_FIRST_N = 3
PROTECT_LAST_N = 20
THRESHOLD_TOKENS = 64_000
|
|
|
|
|
|
def _msg(content: str) -> dict:
    """Return a minimal user-role chat message whose content is *content*."""
    return {"role": "user", "content": content}
|
|
|
|
|
|
def test_few_messages_huge_content_triggers_gate():
    """The bug from #27405: 8 messages with one massive content blob."""
    # ~280K chars in one message ~= 70K tokens at 4 chars/token.
    big = "x" * 280_000
    messages = [_msg("hi")] * 7 + [_msg(big)]
    # Precondition: the message count alone would NOT have passed the old,
    # count-only gate, so a True result below must come from content size.
    assert len(messages) <= PROTECT_FIRST_N + PROTECT_LAST_N + 1
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
|
|
|
|
|
|
def test_few_messages_small_content_does_not_trigger():
    """Regression guard: tiny sessions should not pay the estimator cost."""
    # Eight short messages: under the count gate and far under the threshold.
    messages = [_msg("hello world")] * 8
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
|
|
|
|
|
|
def test_many_small_messages_still_triggers_via_count():
    """The historical path: > protect_first + protect_last + 1 messages."""
    # 3 + 20 + 2 = 25 messages, one more than the count gate allows, while
    # each message is tiny so the content-size branch cannot be the cause.
    messages = [_msg("ok")] * (PROTECT_FIRST_N + PROTECT_LAST_N + 2)  # 25
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
|
|
|
|
|
|
def test_content_above_threshold_triggers():
    """A single message comfortably above the threshold trips branch (b)."""
    # ~threshold*4 chars => ~threshold tokens; +1000 tokens of margin so the
    # test doesn't depend on per-message dict-wrapping overhead in the
    # shared estimator's (chars+3)//4 rounding.
    messages = [_msg("x" * ((THRESHOLD_TOKENS + 1000) * 4))]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
|
|
|
|
|
|
def test_content_below_threshold_does_not_trigger():
    """A single message comfortably below the threshold (and few messages)
    must not trigger — the estimator stays under and the count gate is not
    tripped."""
    # Mirror of the above-threshold case: 1000 tokens of margin on the low
    # side (at ~4 chars/token) so the result is not sensitive to rounding.
    messages = [_msg("x" * ((THRESHOLD_TOKENS - 1000) * 4))]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
|
|
|
|
|
|
def test_message_with_none_content_is_treated_as_empty():
    """Assistant turns mid-tool-call carry content=None -- must not crash."""
    # Five content-less messages: below the count gate, and the expected
    # False result means None content must not push the estimate over.
    messages = [{"role": "assistant", "content": None}] * 5
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
|
|
|
|
|
|
def test_message_with_list_content_counts_text_parts():
    """Multimodal content lists: the shared estimator digs into text parts.

    estimate_messages_tokens_rough walks list content (rather than str()-ing
    the whole list), so a huge text part is counted by its real length and an
    image part is counted at a flat per-image cost — not its base64 length.
    """
    # One text part of 300K chars (~75K tokens at 4 chars/token) in a single
    # message: only the content-size branch can produce True here.
    parts = [{"type": "text", "text": "x" * 300_000}]
    messages = [{"role": "user", "content": parts}]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is True
|
|
|
|
|
|
def test_large_base64_image_does_not_falsely_trip_gate():
    """Regression for the inline-estimator bug: a single ~1MB base64 image
    must NOT be mistaken for ~250K tokens. The shared estimator counts images
    at a flat per-image cost, so one screenshot in a tiny session stays below
    the threshold and the gate does not fire on content size alone.
    """
    big_b64 = "A" * 1_000_000  # ~1MB base64 payload
    parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{big_b64}"},
        }
    ]
    messages = [{"role": "user", "content": parts}]
    assert _should_run_preflight_estimate(
        messages, PROTECT_FIRST_N, PROTECT_LAST_N, THRESHOLD_TOKENS
    ) is False
|