mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-05 07:41:39 +00:00
fix: stop probe stepdown without provider context limit
This commit is contained in:
parent
5cbc3fbdcc
commit
7a3c38d0b7
3 changed files with 97 additions and 39 deletions
|
|
@ -49,9 +49,8 @@ from agent.model_metadata import (
|
|||
MINIMUM_CONTEXT_LENGTH,
|
||||
estimate_messages_tokens_rough,
|
||||
estimate_request_tokens_rough,
|
||||
get_next_probe_tier,
|
||||
get_context_length_from_provider_error,
|
||||
parse_available_output_tokens_from_error,
|
||||
parse_context_limit_from_error,
|
||||
save_context_length,
|
||||
)
|
||||
from agent.nous_rate_guard import (
|
||||
|
|
@ -2900,9 +2899,13 @@ def run_conversation(
|
|||
restart_with_compressed_messages = True
|
||||
break
|
||||
|
||||
# Error is about the INPUT being too large — reduce context_length.
|
||||
# Try to parse the actual limit from the error message
|
||||
parsed_limit = parse_context_limit_from_error(error_msg)
|
||||
# Error is about the INPUT being too large. Only reduce
|
||||
# context_length when the provider explicitly reports the
|
||||
# real lower limit. If the provider only says "input
|
||||
# exceeds the context window", keep the configured window
|
||||
# and try compression; guessing probe tiers can incorrectly
|
||||
# turn a user-configured 1M window into 256K/128K/64K.
|
||||
new_ctx = get_context_length_from_provider_error(error_msg, old_ctx)
|
||||
_provider_lower = (getattr(agent, "provider", "") or "").lower()
|
||||
_base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower()
|
||||
is_minimax_provider = (
|
||||
|
|
@ -2914,23 +2917,12 @@ def run_conversation(
|
|||
)
|
||||
minimax_delta_only_overflow = (
|
||||
is_minimax_provider
|
||||
and parsed_limit is None
|
||||
and new_ctx is None
|
||||
and "context window exceeds limit (" in error_msg
|
||||
)
|
||||
if parsed_limit and parsed_limit < old_ctx:
|
||||
new_ctx = parsed_limit
|
||||
agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
|
||||
elif minimax_delta_only_overflow:
|
||||
new_ctx = old_ctx
|
||||
agent._buffer_vprint(
|
||||
f"Provider reported overflow amount only; "
|
||||
f"keeping context_length at {old_ctx:,} tokens and compressing."
|
||||
)
|
||||
else:
|
||||
# Step down to the next probe tier
|
||||
new_ctx = get_next_probe_tier(old_ctx)
|
||||
|
||||
if new_ctx and new_ctx < old_ctx:
|
||||
if new_ctx is not None:
|
||||
agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
|
||||
compressor.update_model(
|
||||
model=agent.model,
|
||||
context_length=new_ctx,
|
||||
|
|
@ -2940,20 +2932,22 @@ def run_conversation(
|
|||
api_mode=agent.api_mode,
|
||||
)
|
||||
# Context probing flags — only set on built-in
|
||||
# compressor (plugin engines manage their own).
|
||||
# compressor (plugin engines manage their own). This
|
||||
# value came from the provider, so it is safe to cache.
|
||||
if hasattr(compressor, "_context_probed"):
|
||||
compressor._context_probed = True
|
||||
# Only persist limits parsed from the provider's
|
||||
# error message (a real number). Guessed fallback
|
||||
# tiers from get_next_probe_tier() should stay
|
||||
# in-memory only — persisting them pollutes the
|
||||
# cache with wrong values.
|
||||
compressor._context_probe_persistable = bool(
|
||||
parsed_limit and parsed_limit == new_ctx
|
||||
)
|
||||
agent._buffer_vprint(f"⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens")
|
||||
compressor._context_probe_persistable = True
|
||||
agent._buffer_vprint(f"⚠️ Context length exceeded — using provider limit: {old_ctx:,} → {new_ctx:,} tokens")
|
||||
elif minimax_delta_only_overflow:
|
||||
agent._buffer_vprint(
|
||||
f"Provider reported overflow amount only; "
|
||||
f"keeping context_length at {old_ctx:,} tokens and compressing."
|
||||
)
|
||||
else:
|
||||
agent._buffer_vprint(f"⚠️ Context length exceeded at minimum tier — attempting compression...")
|
||||
agent._buffer_vprint(
|
||||
f"⚠️ Context length exceeded, but provider did not report a max context length; "
|
||||
f"keeping context_length at {old_ctx:,} tokens and compressing."
|
||||
)
|
||||
|
||||
compression_attempts += 1
|
||||
if compression_attempts > max_compression_attempts:
|
||||
|
|
|
|||
|
|
@ -913,12 +913,33 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
|
|||
return None
|
||||
|
||||
|
||||
def get_context_length_from_provider_error(
    error_msg: str,
    current_context_length: int,
) -> Optional[int]:
    """Extract a smaller context limit from a provider error, if it states one.

    Overflow recovery is not allowed to make up a model window size. When a
    provider merely reports that the input is larger than the context window
    and gives no concrete maximum, the caller is expected to leave the
    configured context length alone and rely on compression, instead of
    walking down guessed probe tiers (1M → 256K → 128K → ...).

    Args:
        error_msg: Error text received from the provider.
        current_context_length: Context length currently in effect.

    Returns:
        The limit parsed out of ``error_msg`` when one is found and it is
        strictly below ``current_context_length``; ``None`` in every other
        case (nothing parsed, or the parsed value is not lower).
    """
    reported = parse_context_limit_from_error(error_msg)
    # Only a concrete number that actually shrinks the window is usable.
    if reported is not None and reported < current_context_length:
        return reported
    return None
|
||||
|
||||
|
||||
def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
|
||||
"""Detect an "output cap too large" error and return how many output tokens are available.
|
||||
|
||||
Background — two distinct context errors exist:
|
||||
1. "Prompt too long" — the INPUT itself exceeds the context window.
|
||||
Fix: compress history and/or halve context_length.
|
||||
Fix: compress history, and only reduce context_length if the
|
||||
provider explicitly reports the actual lower limit.
|
||||
2. "max_tokens too large" — input is fine, but input + requested_output > window.
|
||||
Fix: reduce max_tokens (the output cap) for this call.
|
||||
Do NOT touch context_length — the window hasn't shrunk.
|
||||
|
|
|
|||
|
|
@ -11,6 +11,9 @@ The fix introduces:
|
|||
error class and returns the available output token budget.
|
||||
* _ephemeral_max_output_tokens on AIAgent — a one-shot override that
|
||||
caps the output for one retry without touching context_length.
|
||||
* get_context_length_from_provider_error() — accepts only concrete
|
||||
provider-reported lower context limits and refuses guessed probe-tier
|
||||
step-downs when the provider gives no maximum.
|
||||
|
||||
Naming note
|
||||
-----------
|
||||
|
|
@ -75,7 +78,7 @@ class TestParseAvailableOutputTokens:
|
|||
# ── Should NOT detect (returns None) ─────────────────────────────────
|
||||
|
||||
def test_prompt_too_long_is_not_output_cap_error(self):
|
||||
"""'prompt is too long' errors must NOT be caught — they need context halving."""
|
||||
"""'prompt is too long' errors must NOT be caught — they need context-overflow recovery."""
|
||||
msg = "prompt is too long: 205000 tokens > 200000 maximum"
|
||||
assert self._parse(msg) is None
|
||||
|
||||
|
|
@ -101,6 +104,49 @@ class TestParseAvailableOutputTokens:
|
|||
assert self._parse(msg) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Context-overflow recovery — only trust provider-reported limits
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestContextOverflowLimitSelection:
    """Overflow recovery may only shrink the window to a provider-stated size.

    A provider can answer with nothing more than "input exceeds the context
    window", giving Hermes no actual maximum. Compressing the conversation is
    acceptable in that situation; silently probe-stepping a user-configured 1M
    window down to 256K/128K/64K/etc. is not.
    """

    def test_generic_overflow_without_provider_limit_keeps_context_length(self):
        """A message with no number yields no replacement context length."""
        from agent.model_metadata import (
            get_context_length_from_provider_error,
            get_next_probe_tier,
            parse_context_limit_from_error,
        )

        configured_window = 1_000_000
        overflow_text = (
            "Your input exceeds the context window of this model. "
            "Please adjust your input and try again."
        )

        # Nothing parseable in the message ...
        assert parse_context_limit_from_error(overflow_text) is None
        # ... the probe-tier helper would still offer a guessed tier ...
        assert get_next_probe_tier(configured_window) == 256_000
        # ... but the provider-error helper must not return any limit.
        assert get_context_length_from_provider_error(overflow_text, configured_window) is None

    def test_explicit_provider_limit_still_selects_that_limit(self):
        """A concrete maximum below the current window is returned as-is."""
        from agent.model_metadata import get_context_length_from_provider_error

        provider_text = "prompt is too long: 300000 tokens > 272000 maximum"
        selected = get_context_length_from_provider_error(provider_text, 1_000_000)

        assert selected == 272_000

    def test_reported_limit_not_lower_than_current_is_ignored(self):
        """A reported maximum above the current window yields ``None``."""
        from agent.model_metadata import get_context_length_from_provider_error

        provider_text = "maximum context length is 1000000 tokens"
        selected = get_context_length_from_provider_error(provider_text, 272_000)

        assert selected is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_anthropic_kwargs — output cap clamping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -282,19 +328,16 @@ class TestContextNotHalvedOnOutputCapError:
|
|||
assert agent.context_compressor.context_length == old_ctx
|
||||
assert agent._ephemeral_max_output_tokens == 19_936
|
||||
|
||||
def test_prompt_too_long_still_triggers_probe_tier(self):
|
||||
"""Genuine prompt-too-long errors must still use get_next_probe_tier."""
|
||||
def test_prompt_too_long_with_explicit_limit_uses_provider_limit(self):
|
||||
"""Prompt-too-long errors only change context_length when they report a concrete limit."""
|
||||
from agent.model_metadata import get_context_length_from_provider_error
|
||||
from agent.model_metadata import parse_available_output_tokens_from_error
|
||||
from agent.model_metadata import get_next_probe_tier
|
||||
|
||||
error_msg = "prompt is too long: 205000 tokens > 200000 maximum"
|
||||
|
||||
available_out = parse_available_output_tokens_from_error(error_msg)
|
||||
assert available_out is None, "prompt-too-long must not be caught by output-cap parser"
|
||||
|
||||
# The old halving path is still used for this class of error
|
||||
new_ctx = get_next_probe_tier(200_000)
|
||||
assert new_ctx == 128_000
|
||||
assert get_context_length_from_provider_error(error_msg, 1_000_000) == 200_000
|
||||
|
||||
def test_output_cap_error_safety_margin(self):
|
||||
"""The ephemeral value includes a 64-token safety margin below available_out."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue