fix: stop probe stepdown without provider context limit

This commit is contained in:
yanghd 2026-05-28 12:57:50 +08:00 committed by Teknium
parent 5cbc3fbdcc
commit 7a3c38d0b7
3 changed files with 97 additions and 39 deletions

View file

@ -49,9 +49,8 @@ from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
estimate_messages_tokens_rough,
estimate_request_tokens_rough,
get_next_probe_tier,
get_context_length_from_provider_error,
parse_available_output_tokens_from_error,
parse_context_limit_from_error,
save_context_length,
)
from agent.nous_rate_guard import (
@ -2900,9 +2899,13 @@ def run_conversation(
restart_with_compressed_messages = True
break
# Error is about the INPUT being too large — reduce context_length.
# Try to parse the actual limit from the error message
parsed_limit = parse_context_limit_from_error(error_msg)
# Error is about the INPUT being too large. Only reduce
# context_length when the provider explicitly reports the
# real lower limit. If the provider only says "input
# exceeds the context window", keep the configured window
# and try compression; guessing probe tiers can incorrectly
# turn a user-configured 1M window into 256K/128K/64K.
new_ctx = get_context_length_from_provider_error(error_msg, old_ctx)
_provider_lower = (getattr(agent, "provider", "") or "").lower()
_base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower()
is_minimax_provider = (
@ -2914,23 +2917,12 @@ def run_conversation(
)
minimax_delta_only_overflow = (
is_minimax_provider
and parsed_limit is None
and new_ctx is None
and "context window exceeds limit (" in error_msg
)
if parsed_limit and parsed_limit < old_ctx:
new_ctx = parsed_limit
agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
elif minimax_delta_only_overflow:
new_ctx = old_ctx
agent._buffer_vprint(
f"Provider reported overflow amount only; "
f"keeping context_length at {old_ctx:,} tokens and compressing."
)
else:
# Step down to the next probe tier
new_ctx = get_next_probe_tier(old_ctx)
if new_ctx and new_ctx < old_ctx:
if new_ctx is not None:
agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
compressor.update_model(
model=agent.model,
context_length=new_ctx,
@ -2940,20 +2932,22 @@ def run_conversation(
api_mode=agent.api_mode,
)
# Context probing flags — only set on built-in
# compressor (plugin engines manage their own).
# compressor (plugin engines manage their own). This
# value came from the provider, so it is safe to cache.
if hasattr(compressor, "_context_probed"):
compressor._context_probed = True
# Only persist limits parsed from the provider's
# error message (a real number). Guessed fallback
# tiers from get_next_probe_tier() should stay
# in-memory only — persisting them pollutes the
# cache with wrong values.
compressor._context_probe_persistable = bool(
parsed_limit and parsed_limit == new_ctx
)
agent._buffer_vprint(f"⚠️ Context length exceeded — stepping down: {old_ctx:,}{new_ctx:,} tokens")
compressor._context_probe_persistable = True
agent._buffer_vprint(f"⚠️ Context length exceeded — using provider limit: {old_ctx:,}{new_ctx:,} tokens")
elif minimax_delta_only_overflow:
agent._buffer_vprint(
f"Provider reported overflow amount only; "
f"keeping context_length at {old_ctx:,} tokens and compressing."
)
else:
agent._buffer_vprint(f"⚠️ Context length exceeded at minimum tier — attempting compression...")
agent._buffer_vprint(
f"⚠️ Context length exceeded, but provider did not report a max context length; "
f"keeping context_length at {old_ctx:,} tokens and compressing."
)
compression_attempts += 1
if compression_attempts > max_compression_attempts:

View file

@ -913,12 +913,33 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
return None
def get_context_length_from_provider_error(
    error_msg: str,
    current_context_length: int,
) -> Optional[int]:
    """Extract a smaller, provider-stated context limit from an error message.

    The message is handed to ``parse_context_limit_from_error``. Its result
    is passed through only when it is strictly below
    ``current_context_length``; in every other case ``None`` is returned.

    Overflow recovery must never make up a window size. When a provider
    merely says the input is too large without stating the real maximum,
    the caller is expected to keep the configured context length and rely
    on compression instead of walking down guessed probe tiers
    (1M -> 256K -> 128K -> ...).

    Args:
        error_msg: Raw error text received from the provider.
        current_context_length: Context length currently in effect.

    Returns:
        The provider-reported limit when it is lower than
        ``current_context_length``; ``None`` when no limit could be parsed
        or when the parsed limit would not shrink the window.
    """
    reported_limit = parse_context_limit_from_error(error_msg)
    # One guard covers both rejection cases: nothing parsed, or a limit
    # that is not actually lower than what is already configured.
    if reported_limit is not None and reported_limit < current_context_length:
        return reported_limit
    return None
def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
"""Detect an "output cap too large" error and return how many output tokens are available.
Background — two distinct context errors exist:
1. "Prompt too long" — the INPUT itself exceeds the context window.
Fix: compress history and/or halve context_length.
Fix: compress history, and only reduce context_length if the
provider explicitly reports the actual lower limit.
2. "max_tokens too large" — input is fine, but input + requested_output > window.
Fix: reduce max_tokens (the output cap) for this call.
Do NOT touch context_length — the window hasn't shrunk.

View file

@ -11,6 +11,9 @@ The fix introduces:
error class and returns the available output token budget.
* _ephemeral_max_output_tokens on AIAgent — a one-shot override that
caps the output for one retry without touching context_length.
* get_context_length_from_provider_error() accepts only concrete
provider-reported lower context limits and refuses guessed probe-tier
step-downs when the provider gives no maximum.
Naming note
-----------
@ -75,7 +78,7 @@ class TestParseAvailableOutputTokens:
# ── Should NOT detect (returns None) ─────────────────────────────────
def test_prompt_too_long_is_not_output_cap_error(self):
"""'prompt is too long' errors must NOT be caught — they need context halving."""
"""'prompt is too long' errors must NOT be caught — they need context-overflow recovery."""
msg = "prompt is too long: 205000 tokens > 200000 maximum"
assert self._parse(msg) is None
@ -101,6 +104,49 @@ class TestParseAvailableOutputTokens:
assert self._parse(msg) is None
# ---------------------------------------------------------------------------
# Context-overflow recovery — only trust provider-reported limits
# ---------------------------------------------------------------------------
class TestContextOverflowLimitSelection:
    """Overflow recovery may only adopt a limit the provider actually stated.

    Certain providers respond with nothing more than "input exceeds the
    context window" and never reveal the true maximum. Compressing the
    conversation is acceptable in that situation, but quietly stepping a
    user-configured 1M window down through 256K/128K/64K/etc. is not.
    """

    def test_generic_overflow_without_provider_limit_keeps_context_length(self):
        # Imported inside the test so a missing symbol fails this test only.
        from agent.model_metadata import (
            get_context_length_from_provider_error,
            get_next_probe_tier,
            parse_context_limit_from_error,
        )

        configured_window = 1_000_000
        provider_message = (
            "Your input exceeds the context window of this model. "
            "Please adjust your input and try again."
        )

        # The message carries no number, so the parser finds no limit.
        assert parse_context_limit_from_error(provider_message) is None
        # The probe-tier helper would have guessed 256K for this window...
        assert get_next_probe_tier(configured_window) == 256_000
        # ...while the provider-error selector declines to pick any value.
        assert (
            get_context_length_from_provider_error(provider_message, configured_window)
            is None
        )

    def test_explicit_provider_limit_still_selects_that_limit(self):
        from agent.model_metadata import get_context_length_from_provider_error

        provider_message = "prompt is too long: 300000 tokens > 272000 maximum"

        selected = get_context_length_from_provider_error(provider_message, 1_000_000)
        assert selected == 272_000

    def test_reported_limit_not_lower_than_current_is_ignored(self):
        from agent.model_metadata import get_context_length_from_provider_error

        provider_message = "maximum context length is 1000000 tokens"

        # A reported limit above the current window must not be adopted.
        selected = get_context_length_from_provider_error(provider_message, 272_000)
        assert selected is None
# ---------------------------------------------------------------------------
# build_anthropic_kwargs — output cap clamping
# ---------------------------------------------------------------------------
@ -282,19 +328,16 @@ class TestContextNotHalvedOnOutputCapError:
assert agent.context_compressor.context_length == old_ctx
assert agent._ephemeral_max_output_tokens == 19_936
def test_prompt_too_long_still_triggers_probe_tier(self):
"""Genuine prompt-too-long errors must still use get_next_probe_tier."""
def test_prompt_too_long_with_explicit_limit_uses_provider_limit(self):
"""Prompt-too-long errors only change context_length when they report a concrete limit."""
from agent.model_metadata import get_context_length_from_provider_error
from agent.model_metadata import parse_available_output_tokens_from_error
from agent.model_metadata import get_next_probe_tier
error_msg = "prompt is too long: 205000 tokens > 200000 maximum"
available_out = parse_available_output_tokens_from_error(error_msg)
assert available_out is None, "prompt-too-long must not be caught by output-cap parser"
# The old halving path is still used for this class of error
new_ctx = get_next_probe_tier(200_000)
assert new_ctx == 128_000
assert get_context_length_from_provider_error(error_msg, 1_000_000) == 200_000
def test_output_cap_error_safety_margin(self):
"""The ephemeral value includes a 64-token safety margin below available_out."""