mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Production fixes: - voice_mode.py: add is_recording property to AudioRecorder (parity with TermuxAudioRecorder) - cronjob_tools.py: add sms example to deliver description Test fixes: - test_real_interrupt_subagent: add missing _execution_thread_id (fixes 19 cascading failures from leaked _build_system_prompt patch) - test_anthropic_error_handling: add _FakeMessages, override _interruptible_streaming_api_call (6 fixes) - test_ctx_halving_fix: add missing request_overrides attribute (4 fixes) - test_context_token_tracking: set _disable_streaming=True for non-streaming test path (4 fixes) - test_dict_tool_call_args: set _disable_streaming=True (1 fix) - test_provider_parity: add model='gpt-4o' for AIGateway tests to meet 64K minimum context (4 fixes) - test_session_race_guard: add user_id to SessionSource (5 fixes) - test_restart_drain/helpers: add user_id to SessionSource (2 fixes) - test_telegram_photo_interrupts: add user_id to SessionSource - test_interrupt: target thread_id for per-thread interrupt system (2 fixes) - test_zombie_process_cleanup: rewrite with object.__new__ for refactored GatewayRunner.stop() (1 fix) - test_browser_camofox_state: update config version 15->17 (1 fix) - test_trajectory_compressor_async: widen lookback window 10->20 for line-shifted AsyncOpenAI (1 fix) - test_voice_mode: fixed by production is_recording addition (5 fixes) - test_voice_cli_integration: add _attached_images to CLI stub (2 fixes) - test_hermes_logging: explicit propagation/level reset for cross-test pollution defense (1 fix) - test_run_agent: add base_url for OpenRouter detection tests (2 fixes) Deleted: - test_inline_think_blocks_reasoning_only_accepted: tested unimplemented inline <think> handling
321 lines
13 KiB
Python
321 lines
13 KiB
Python
"""Tests for the context-halving bugfix.
|
|
|
|
Background
|
|
----------
|
|
When the API returns "max_tokens too large given prompt" (input is fine,
|
|
but input_tokens + requested max_tokens > context_window), the old code
|
|
incorrectly halved context_length via get_next_probe_tier().
|
|
|
|
The fix introduces:
|
|
* parse_available_output_tokens_from_error() — detects this specific
|
|
error class and returns the available output token budget.
|
|
* _ephemeral_max_output_tokens on AIAgent — a one-shot override that
|
|
caps the output for one retry without touching context_length.
|
|
|
|
Naming note
|
|
-----------
|
|
max_tokens = OUTPUT token cap (a single response).
|
|
context_length = TOTAL context window (input + output combined).
|
|
These are different and the old code conflated them; the fix keeps them
|
|
separate.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
from unittest.mock import MagicMock, patch, PropertyMock
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_available_output_tokens_from_error — unit tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestParseAvailableOutputTokens:
    """Exercise ``parse_available_output_tokens_from_error`` on literal strings.

    The first group of cases expects the parser to return the integer that
    follows ``available_tokens`` / ``available tokens`` in the message; the
    second group expects ``None`` for messages that are not output-cap errors.
    """

    def _parse(self, msg):
        """Feed *msg* to the parser under test and hand back its result.

        The parser is imported inside this helper rather than at module level.
        """
        from agent.model_metadata import (
            parse_available_output_tokens_from_error as parse_output_budget,
        )

        return parse_output_budget(msg)

    # ── Positive cases: a token budget is extracted ──────────────────────

    def test_anthropic_canonical_format(self):
        """Full Anthropic wording: the trailing available_tokens value is returned."""
        error_text = (
            "max_tokens: 32768 > context_window: 200000 "
            "- input_tokens: 190000 = available_tokens: 10000"
        )
        extracted = self._parse(error_text)
        assert extracted == 10_000

    def test_anthropic_format_large_numbers(self):
        """Same wording with a six-digit max_tokens request."""
        error_text = (
            "max_tokens: 128000 > context_window: 200000 "
            "- input_tokens: 180000 = available_tokens: 20000"
        )
        extracted = self._parse(error_text)
        assert extracted == 20_000

    def test_available_tokens_variant_spacing(self):
        """A space before the colon after ``available_tokens`` is tolerated."""
        error_text = "max_tokens: 32768 > 200000 available_tokens : 5000"
        extracted = self._parse(error_text)
        assert extracted == 5_000

    def test_available_tokens_natural_language(self):
        """The underscore-free phrasing ``available tokens: N`` is recognised."""
        error_text = "max_tokens must be at most 10000 given your prompt (available tokens: 10000)"
        extracted = self._parse(error_text)
        assert extracted == 10_000

    def test_single_token_available(self):
        """Boundary case: the message reports exactly one available token."""
        error_text = "max_tokens: 9999 > context_window: 10000 - input_tokens: 9999 = available_tokens: 1"
        extracted = self._parse(error_text)
        assert extracted == 1

    # ── Negative cases: the parser must return None ──────────────────────

    def test_prompt_too_long_is_not_output_cap_error(self):
        """'prompt is too long' is an input-side error and must not match."""
        error_text = "prompt is too long: 205000 tokens > 200000 maximum"
        outcome = self._parse(error_text)
        assert outcome is None

    def test_generic_context_window_exceeded(self):
        """A context-window message lacking available_tokens must not match."""
        error_text = "context window exceeded: maximum is 32768 tokens"
        outcome = self._parse(error_text)
        assert outcome is None

    def test_context_length_exceeded(self):
        """A ``context_length_exceeded`` style message must not match."""
        error_text = "context_length_exceeded: prompt has 131073 tokens, limit is 131072"
        outcome = self._parse(error_text)
        assert outcome is None

    def test_no_max_tokens_keyword(self):
        """An unrelated error (bad API key) must not match."""
        error_text = "invalid_api_key: the API key is invalid"
        outcome = self._parse(error_text)
        assert outcome is None

    def test_empty_string(self):
        """An empty message must not match."""
        outcome = self._parse("")
        assert outcome is None

    def test_rate_limit_error(self):
        """A rate-limit message must not match."""
        error_text = "rate_limit_error: too many requests per minute"
        outcome = self._parse(error_text)
        assert outcome is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# build_anthropic_kwargs — output cap clamping
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestBuildAnthropicKwargsClamping:
    """Check how ``build_anthropic_kwargs`` picks ``max_tokens``.

    The cases below expect the output cap to be reduced only when it would
    not fit inside ``context_length``; when the cap already fits (or no
    ``context_length`` is given) the value must pass through untouched.
    """

    def _build(self, model, max_tokens=None, context_length=None):
        """Call ``build_anthropic_kwargs`` with a one-message conversation.

        Only *model*, *max_tokens* and *context_length* vary between tests;
        ``tools`` and ``reasoning_config`` are always passed as ``None``.
        Returns whatever ``build_anthropic_kwargs`` returns.
        """
        from agent.anthropic_adapter import build_anthropic_kwargs

        call_args = {
            "model": model,
            "messages": [{"role": "user", "content": "hi"}],
            "tools": None,
            "max_tokens": max_tokens,
            "reasoning_config": None,
            "context_length": context_length,
        }
        return build_anthropic_kwargs(**call_args)

    def test_no_clamping_when_output_ceiling_fits_in_window(self):
        """With a 200K window, Opus 4.6 is expected to keep a 128K output cap."""
        built = self._build("claude-opus-4-6", context_length=200_000)
        assert built["max_tokens"] == 128_000

    def test_clamping_fires_for_tiny_custom_window(self):
        """An 8K window is expected to pull the output cap down to 7999."""
        built = self._build("claude-opus-4-6", context_length=8_000)
        assert built["max_tokens"] == 7_999

    def test_explicit_max_tokens_respected_when_within_window(self):
        """An explicit cap that fits inside the window is passed through as-is."""
        built = self._build(
            "claude-opus-4-6",
            max_tokens=4096,
            context_length=200_000,
        )
        assert built["max_tokens"] == 4096

    def test_explicit_max_tokens_clamped_when_exceeds_window(self):
        """An explicit cap larger than a 16K window is expected to become 15999."""
        built = self._build(
            "claude-opus-4-6",
            max_tokens=32_768,
            context_length=16_000,
        )
        assert built["max_tokens"] == 15_999

    def test_no_context_length_uses_native_ceiling(self):
        """With no context_length, Sonnet 4.6 is expected to get a 64K cap."""
        built = self._build("claude-sonnet-4-6")
        assert built["max_tokens"] == 64_000
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Ephemeral max_tokens mechanism — _build_api_kwargs
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEphemeralMaxOutputTokens:
    """Check the one-shot ``_ephemeral_max_output_tokens`` override.

    ``_build_api_kwargs`` is expected to honour the override on the call
    that follows it being set, reset it to ``None``, and use the regular
    ``max_tokens`` path on later calls.
    """

    def _make_agent(self):
        """Build a bare ``AIAgent`` carrying only the attributes these tests set.

        ``object.__new__`` is used so that ``AIAgent.__init__`` never runs;
        the compressor and the Anthropic message helpers are ``MagicMock``
        stand-ins.
        """
        from run_agent import AIAgent

        stub = object.__new__(AIAgent)

        # Plain-value attributes, assigned in a fixed order.
        plain_attrs = {
            "api_mode": "anthropic_messages",
            "model": "claude-opus-4-6",
            "tools": [],
            "max_tokens": None,
            "reasoning_config": None,
            "_is_anthropic_oauth": False,
            "_ephemeral_max_output_tokens": None,
        }
        for attr_name, attr_value in plain_attrs.items():
            setattr(stub, attr_name, attr_value)

        # Compressor stand-in exposing a 200K context window.
        stub.context_compressor = MagicMock(context_length=200_000)

        # Replace the message-preparation helpers with canned responses.
        canned_messages = [{"role": "user", "content": "hi"}]
        stub._prepare_anthropic_messages_for_api = MagicMock(
            return_value=canned_messages
        )
        stub._anthropic_preserve_dots = MagicMock(return_value=False)
        stub.request_overrides = {}
        return stub

    def test_ephemeral_override_is_used_on_first_call(self):
        """A pending override wins over ``self.max_tokens`` on the next call."""
        stub = self._make_agent()
        stub._ephemeral_max_output_tokens = 5_000

        api_kwargs = stub._build_api_kwargs([{"role": "user", "content": "hi"}])

        assert api_kwargs["max_tokens"] == 5_000

    def test_ephemeral_override_is_consumed_after_one_call(self):
        """One call to ``_build_api_kwargs`` resets the override to ``None``."""
        stub = self._make_agent()
        stub._ephemeral_max_output_tokens = 5_000

        stub._build_api_kwargs([{"role": "user", "content": "hi"}])

        assert stub._ephemeral_max_output_tokens is None

    def test_subsequent_call_uses_self_max_tokens(self):
        """The call after the override was consumed uses the normal path."""
        stub = self._make_agent()
        stub._ephemeral_max_output_tokens = 5_000
        # None here is expected to resolve to the 128K ceiling for Opus 4.6.
        stub.max_tokens = None

        # First call spends the override; its result is irrelevant here.
        stub._build_api_kwargs([{"role": "user", "content": "hi"}])
        second_kwargs = stub._build_api_kwargs([{"role": "user", "content": "hi"}])

        assert second_kwargs["max_tokens"] == 128_000

    def test_no_ephemeral_uses_self_max_tokens_directly(self):
        """With no override pending, ``self.max_tokens`` is what gets sent."""
        stub = self._make_agent()
        stub.max_tokens = 8_192

        api_kwargs = stub._build_api_kwargs([{"role": "user", "content": "hi"}])

        assert api_kwargs["max_tokens"] == 8_192
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Integration: error handler does NOT halve context_length for output-cap errors
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestContextNotHalvedOnOutputCapError:
    """Output-cap errors must set the ephemeral cap, not shrink context_length.

    When the API reports 'max_tokens too large given prompt', the expected
    reaction is to set ``_ephemeral_max_output_tokens`` and leave
    ``context_compressor.context_length`` alone.

    NOTE(review): these tests replay the arithmetic by hand (parse the error,
    subtract a margin, assign the attribute); they do not invoke the error
    handler in run_agent.py itself.
    """

    # Margin subtracted from the parsed budget before it is used as the cap.
    # Presumably mirrors the value used by the handler in run_agent.py —
    # keep the two in sync.
    _SAFETY_MARGIN = 64

    @classmethod
    def _safe_output_cap(cls, available_out):
        """Return *available_out* minus the safety margin, never below 1."""
        return max(1, available_out - cls._SAFETY_MARGIN)

    def _make_agent_with_compressor(self, context_length=200_000):
        """Build a bare ``AIAgent`` with a spec'd ``ContextCompressor`` mock.

        ``object.__new__`` skips ``AIAgent.__init__``; only the attributes
        assigned below exist on the returned object.

        Args:
            context_length: value stored on the mocked compressor's
                ``context_length`` attribute.
        """
        from run_agent import AIAgent
        from agent.context_compressor import ContextCompressor

        agent = object.__new__(AIAgent)
        agent.api_mode = "anthropic_messages"
        agent.model = "claude-opus-4-6"
        agent.base_url = "https://api.anthropic.com"
        agent.tools = []
        agent.max_tokens = None
        agent.reasoning_config = None
        agent._is_anthropic_oauth = False
        agent._ephemeral_max_output_tokens = None
        agent.log_prefix = ""
        agent.quiet_mode = True
        agent.verbose_logging = False

        compressor = MagicMock(spec=ContextCompressor)
        compressor.context_length = context_length
        compressor.threshold_percent = 0.75
        agent.context_compressor = compressor

        agent._prepare_anthropic_messages_for_api = MagicMock(
            return_value=[{"role": "user", "content": "hi"}]
        )
        agent._anthropic_preserve_dots = MagicMock(return_value=False)
        agent._vprint = MagicMock()
        agent.request_overrides = {}
        return agent

    def test_output_cap_error_sets_ephemeral_not_context_length(self):
        """On 'max_tokens too large', the ephemeral cap is set and
        compressor.context_length keeps its original value."""
        from agent.model_metadata import parse_available_output_tokens_from_error

        error_msg = (
            "max_tokens: 128000 > context_window: 200000 "
            "- input_tokens: 180000 = available_tokens: 20000"
        )

        # Simulate the handler logic from run_agent.py
        agent = self._make_agent_with_compressor(context_length=200_000)
        old_ctx = agent.context_compressor.context_length

        available_out = parse_available_output_tokens_from_error(error_msg)
        assert available_out == 20_000, "parser must detect the error"

        # The fix: set ephemeral, skip context_length modification
        agent._ephemeral_max_output_tokens = self._safe_output_cap(available_out)

        # context_length must be untouched
        assert agent.context_compressor.context_length == old_ctx
        assert agent._ephemeral_max_output_tokens == 19_936

    def test_prompt_too_long_still_triggers_probe_tier(self):
        """Genuine prompt-too-long errors must still use get_next_probe_tier."""
        from agent.model_metadata import parse_available_output_tokens_from_error
        from agent.model_metadata import get_next_probe_tier

        error_msg = "prompt is too long: 205000 tokens > 200000 maximum"

        available_out = parse_available_output_tokens_from_error(error_msg)
        assert available_out is None, "prompt-too-long must not be caught by output-cap parser"

        # The old halving path is still used for this class of error
        new_ctx = get_next_probe_tier(200_000)
        assert new_ctx == 128_000

    def test_output_cap_error_safety_margin(self):
        """The ephemeral value sits 64 tokens below the parsed available_out."""
        from agent.model_metadata import parse_available_output_tokens_from_error

        error_msg = (
            "max_tokens: 32768 > context_window: 200000 "
            "- input_tokens: 190000 = available_tokens: 10000"
        )
        available_out = parse_available_output_tokens_from_error(error_msg)
        safe_out = self._safe_output_cap(available_out)
        assert safe_out == 9_936

    def test_safety_margin_never_goes_below_one(self):
        """When available_out is very small, safe_out must be at least 1."""
        from agent.model_metadata import parse_available_output_tokens_from_error

        # Fixture numbers are self-consistent: 200000 - 199999 = 1.
        error_msg = (
            "max_tokens: 10 > context_window: 200000 "
            "- input_tokens: 199999 = available_tokens: 1"
        )
        available_out = parse_available_output_tokens_from_error(error_msg)
        safe_out = self._safe_output_cap(available_out)
        assert safe_out == 1
|