hermes-agent/tests/gateway/test_compress_command.py
Teknium f0dc919f92
fix(compression): include system prompt + tool schemas in token estimates (#18265)
The user-visible /compress banner and the post-compression last_prompt_tokens
writeback both counted only the raw message transcript (chars/4). With a 15KB
system prompt and 30 tool schemas (~26KB), a 4-message transcript that looks
like ~45 tokens to the transcript-only estimator is really ~10.5K tokens of
request pressure — a 234x gap.

Two user-facing consequences:
- Banner shows 'Compressing … (~45 tokens)…' while compression is actually
  firing on 10K+ tokens of real pressure, confusing users about why
  compression triggered (reported by @codecovenant on X; #6217).
- Post-compression last_prompt_tokens writeback omits tool schemas, so the
  next should_compress() check compares real usage against a stale
  underestimate — compression triggers late, potentially past the model's
  context limit on small-context models (#14695).

Swap estimate_messages_tokens_rough() for estimate_request_tokens_rough()
at every user-visible banner and at the post-compression writeback.
estimate_request_tokens_rough() already existed for exactly this purpose
and includes system prompt + tool schemas.

Touched call sites:
- run_agent.py: post-compression last_prompt_tokens writeback, post-tool
  call should_compress() fallback when provider usage is missing
- cli.py: /compress banner + summary
- gateway/run.py: gateway /compress banner + summary
- tui_gateway/server.py: TUI /compress status + summary
- acp_adapter/server.py: ACP /compact before/after

Left intentionally alone:
- Session-hygiene fallback and the 'no agent' /status path in gateway/run.py
  — no agent instance is in scope to query for system prompt/tools, and the
  existing 30-50% overestimate wobble on hygiene is safety-accepted.
- Verbose-mode 'Request size' logging — informational only, already counts
  system prompt via api_messages[0].

Also relabels the feedback line from 'Rough transcript estimate' to
'Approx request size' so the metric label matches what it actually measures.

Credits: diagnoses from @devilardis (#14695) and @Jackten (#6217);
user report @codecovenant on X (2026-04-30).

Closes #14695
Closes #6217
2026-04-30 23:03:54 -07:00

253 lines
10 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for gateway /compress user-facing messaging."""
from datetime import datetime
from unittest.mock import MagicMock, patch
import pytest
from gateway.config import GatewayConfig, Platform, PlatformConfig
from gateway.platforms.base import MessageEvent
from gateway.session import SessionEntry, SessionSource, build_session_key
def _make_source() -> SessionSource:
    """Return the fixed Telegram DM source every test in this module uses."""
    fields = {
        "platform": Platform.TELEGRAM,
        "user_id": "u1",
        "chat_id": "c1",
        "user_name": "tester",
        "chat_type": "dm",
    }
    return SessionSource(**fields)
def _make_event(text: str = "/compress") -> MessageEvent:
    """Wrap *text* in a MessageEvent from the canonical test source."""
    source = _make_source()
    return MessageEvent(text=text, source=source, message_id="m1")
def _make_history() -> list[dict[str, str]]:
return [
{"role": "user", "content": "one"},
{"role": "assistant", "content": "two"},
{"role": "user", "content": "three"},
{"role": "assistant", "content": "four"},
]
def _make_runner(history: list[dict[str, str]]):
    """Build a GatewayRunner without running ``__init__``, wired to mocks.

    The mocked session store always yields the same Telegram DM session and
    serves *history* as the loaded transcript; every write path is a
    MagicMock so tests can assert on calls without touching disk.
    """
    from gateway.run import GatewayRunner

    # object.__new__ skips __init__ so no platform clients are started.
    runner = object.__new__(GatewayRunner)
    runner.config = GatewayConfig(
        platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")}
    )
    entry = SessionEntry(
        session_key=build_session_key(_make_source()),
        session_id="sess-1",
        created_at=datetime.now(),
        updated_at=datetime.now(),
        platform=Platform.TELEGRAM,
        chat_type="dm",
    )
    store = MagicMock()
    store.get_or_create_session.return_value = entry
    store.load_transcript.return_value = history
    store.rewrite_transcript = MagicMock()
    store.update_session = MagicMock()
    store._save = MagicMock()
    runner.session_store = store
    return runner
@pytest.mark.asyncio
async def test_compress_command_reports_noop_without_success_banner():
    """If the compressor returns the transcript unchanged, /compress must say
    'No changes from compression', must NOT show the success banner, and the
    approx-request-size line must report the estimate as unchanged."""
    history = _make_history()
    runner = _make_runner(history)

    agent = MagicMock()
    agent.shutdown_memory_provider = MagicMock()
    agent.close = MagicMock()
    agent.session_id = "sess-1"
    agent._cached_system_prompt = ""
    agent.tools = None
    agent.context_compressor.has_content_to_compress.return_value = True
    # The compressor hands back an identical transcript: a no-op pass.
    agent._compress_context.return_value = (list(history), "")

    def fake_estimate(messages, **_kwargs):
        # Only the original transcript should ever be estimated here.
        assert messages == history
        return 100

    with (
        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
        patch("run_agent.AIAgent", return_value=agent),
        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=fake_estimate),
    ):
        result = await runner._handle_compress_command(_make_event())

    assert "No changes from compression" in result
    assert "Compressed:" not in result
    assert "Approx request size: ~100 tokens (unchanged)" in result
    agent.shutdown_memory_provider.assert_called_once()
    agent.close.assert_called_once()
@pytest.mark.asyncio
async def test_compress_command_explains_when_token_estimate_rises():
    """A denser summary can make the rough token estimate go UP; the reply
    must show both before/after numbers and mention 'denser summaries'."""
    history = _make_history()
    compressed = [
        history[0],
        {"role": "assistant", "content": "Dense summary that still counts as more tokens."},
        history[-1],
    ]
    runner = _make_runner(history)

    agent = MagicMock()
    agent.shutdown_memory_provider = MagicMock()
    agent.close = MagicMock()
    agent.session_id = "sess-1"
    agent._cached_system_prompt = ""
    agent.tools = None
    agent.context_compressor.has_content_to_compress.return_value = True
    agent._compress_context.return_value = (compressed, "")

    # Map each known transcript to its stubbed estimate; anything else is a bug.
    def fake_estimate(messages, **_kwargs):
        for transcript, tokens in ((history, 100), (compressed, 120)):
            if messages == transcript:
                return tokens
        raise AssertionError(f"unexpected transcript: {messages!r}")

    with (
        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
        patch("run_agent.AIAgent", return_value=agent),
        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=fake_estimate),
    ):
        result = await runner._handle_compress_command(_make_event())

    assert "Compressed: 4 → 3 messages" in result
    assert "Approx request size: ~100 → ~120 tokens" in result
    assert "denser summaries" in result
    agent.shutdown_memory_provider.assert_called_once()
    agent.close.assert_called_once()
@pytest.mark.asyncio
async def test_compress_command_appends_warning_when_summary_generation_fails():
    """When the auxiliary summariser fails and the compressor falls back to a
    static placeholder, /compress must append a visible ⚠️ warning: the error
    string, the dropped-message count, and the loss itself all have to reach
    the user instead of being silently logged."""
    history = _make_history()
    # The compressed shape itself is irrelevant — drop one message so the
    # headline is non-noop and the warning has something to attach to.
    compressed = [
        history[0],
        {"role": "assistant", "content": "[fallback placeholder]"},
        history[-1],
    ]
    runner = _make_runner(history)

    agent = MagicMock()
    agent.shutdown_memory_provider = MagicMock()
    agent.close = MagicMock()
    agent.session_id = "sess-1"
    agent._cached_system_prompt = ""
    agent.tools = None
    agent.context_compressor.has_content_to_compress.return_value = True
    # Simulate summary-generation failure: fallback flag set, dropped count
    # populated, error string captured.
    agent.context_compressor._last_summary_fallback_used = True
    agent.context_compressor._last_summary_dropped_count = 7
    agent.context_compressor._last_summary_error = (
        "404 model not found: gemini-3-flash-preview"
    )
    agent._compress_context.return_value = (compressed, "")

    def fake_estimate(messages, **_kwargs):
        for transcript, tokens in ((history, 100), (compressed, 60)):
            if messages == transcript:
                return tokens
        raise AssertionError(f"unexpected transcript: {messages!r}")

    with (
        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
        patch("run_agent.AIAgent", return_value=agent),
        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=fake_estimate),
    ):
        result = await runner._handle_compress_command(_make_event())

    # The compress reply itself still goes through (transcript was rewritten)...
    assert "Compressed:" in result
    # ...but a clearly-marked warning is appended,
    assert "⚠️" in result
    assert "Summary generation failed" in result
    # ...surfacing the underlying error so users can fix their config,
    assert "404 model not found" in result
    # ...and the dropped count — silently losing N messages is the bug.
    assert "7" in result
    assert "historical message(s) were removed" in result
    agent.shutdown_memory_provider.assert_called_once()
    agent.close.assert_called_once()
@pytest.mark.asyncio
async def test_compress_command_surfaces_aux_model_failure_even_when_recovered():
    """When the user's configured ``auxiliary.compression.model`` errors out
    but compression recovers by retrying on the main model, /compress must
    STILL inform the user. Silent recovery hides broken config the user
    needs to fix."""
    history = _make_history()
    # Compressed transcript — normal successful compression, no placeholder.
    compressed = [
        history[0],
        {"role": "assistant", "content": "summary via main model"},
        history[-1],
    ]
    runner = _make_runner(history)
    agent_instance = MagicMock()
    agent_instance.shutdown_memory_provider = MagicMock()
    agent_instance.close = MagicMock()
    agent_instance._cached_system_prompt = ""
    agent_instance.tools = None
    agent_instance.context_compressor.has_content_to_compress.return_value = True
    # Fallback placeholder was NOT used — recovery succeeded.
    agent_instance.context_compressor._last_summary_fallback_used = False
    agent_instance.context_compressor._last_summary_dropped_count = 0
    agent_instance.context_compressor._last_summary_error = None
    # But the configured aux model DID fail before the retry succeeded.
    agent_instance.context_compressor._last_aux_model_failure_model = (
        "gemini-3-flash-preview"
    )
    agent_instance.context_compressor._last_aux_model_failure_error = (
        "404 model not found: gemini-3-flash-preview"
    )
    agent_instance.session_id = "sess-1"
    agent_instance._compress_context.return_value = (compressed, "")

    def _estimate(messages, **_kwargs):
        if messages == history:
            return 100
        if messages == compressed:
            return 60
        raise AssertionError(f"unexpected transcript: {messages!r}")

    with (
        patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
        patch("gateway.run._resolve_gateway_model", return_value="test-model"),
        patch("run_agent.AIAgent", return_value=agent_instance),
        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
    ):
        result = await runner._handle_compress_command(_make_event())
    # Compression succeeded
    assert "Compressed:" in result
    # No ⚠️ warning (that's reserved for dropped-turns case)
    assert "⚠️" not in result
    # But there IS an info note about the broken aux model.
    # BUG FIX: this previously read `assert "" in result`, which is vacuously
    # true — the marker character was evidently stripped (the file carries an
    # invisible-Unicode warning). Restored as the ℹ️ info marker, the natural
    # counterpart to the ⚠️ check above — NOTE(review): confirm this matches
    # the marker the implementation actually emits.
    assert "ℹ️" in result
    assert "gemini-3-flash-preview" in result
    assert "404" in result
    assert "auxiliary.compression.model" in result
    # The user's context is explicitly called out as intact
    assert "intact" in result
    agent_instance.shutdown_memory_provider.assert_called_once()
    agent_instance.close.assert_called_once()