From 7a3c38d0b724dac729b1a295c1612a26634279af Mon Sep 17 00:00:00 2001 From: yanghd Date: Thu, 28 May 2026 12:57:50 +0800 Subject: [PATCH] fix: stop probe stepdown without provider context limit --- agent/conversation_loop.py | 54 ++++++++++++++------------------ agent/model_metadata.py | 23 +++++++++++++- tests/test_ctx_halving_fix.py | 59 ++++++++++++++++++++++++++++++----- 3 files changed, 97 insertions(+), 39 deletions(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 9d78918c267..7e7ee26431f 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -49,9 +49,8 @@ from agent.model_metadata import ( MINIMUM_CONTEXT_LENGTH, estimate_messages_tokens_rough, estimate_request_tokens_rough, - get_next_probe_tier, + get_context_length_from_provider_error, parse_available_output_tokens_from_error, - parse_context_limit_from_error, save_context_length, ) from agent.nous_rate_guard import ( @@ -2900,9 +2899,13 @@ def run_conversation( restart_with_compressed_messages = True break - # Error is about the INPUT being too large — reduce context_length. - # Try to parse the actual limit from the error message - parsed_limit = parse_context_limit_from_error(error_msg) + # Error is about the INPUT being too large. Only reduce + # context_length when the provider explicitly reports the + # real lower limit. If the provider only says "input + # exceeds the context window", keep the configured window + # and try compression; guessing probe tiers can incorrectly + # turn a user-configured 1M window into 256K/128K/64K. + new_ctx = get_context_length_from_provider_error(error_msg, old_ctx) _provider_lower = (getattr(agent, "provider", "") or "").lower() _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower() is_minimax_provider = ( @@ -2914,23 +2917,12 @@ def run_conversation( ) minimax_delta_only_overflow = ( is_minimax_provider - and parsed_limit is None + and new_ctx is None and "context window exceeds limit (" in error_msg ) - if parsed_limit and parsed_limit < old_ctx: - new_ctx = parsed_limit - agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})") - elif minimax_delta_only_overflow: - new_ctx = old_ctx - agent._buffer_vprint( - f"Provider reported overflow amount only; " - f"keeping context_length at {old_ctx:,} tokens and compressing." - ) - else: - # Step down to the next probe tier - new_ctx = get_next_probe_tier(old_ctx) - if new_ctx and new_ctx < old_ctx: + if new_ctx is not None: + agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})") compressor.update_model( model=agent.model, context_length=new_ctx, @@ -2940,20 +2932,22 @@ def run_conversation( api_mode=agent.api_mode, ) # Context probing flags — only set on built-in - # compressor (plugin engines manage their own). + # compressor (plugin engines manage their own). This + # value came from the provider, so it is safe to cache. if hasattr(compressor, "_context_probed"): compressor._context_probed = True - # Only persist limits parsed from the provider's - # error message (a real number). Guessed fallback - # tiers from get_next_probe_tier() should stay - # in-memory only — persisting them pollutes the - # cache with wrong values. - compressor._context_probe_persistable = bool( - parsed_limit and parsed_limit == new_ctx - ) - agent._buffer_vprint(f"⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens") + compressor._context_probe_persistable = True + agent._buffer_vprint(f"⚠️ Context length exceeded — using provider limit: {old_ctx:,} → {new_ctx:,} tokens") + elif minimax_delta_only_overflow: + agent._buffer_vprint( + f"Provider reported overflow amount only; " + f"keeping context_length at {old_ctx:,} tokens and compressing." + ) else: - agent._buffer_vprint(f"⚠️ Context length exceeded at minimum tier — attempting compression...") + agent._buffer_vprint( + f"⚠️ Context length exceeded, but provider did not report a max context length; " + f"keeping context_length at {old_ctx:,} tokens and compressing." + ) compression_attempts += 1 if compression_attempts > max_compression_attempts: diff --git a/agent/model_metadata.py b/agent/model_metadata.py index c77dcff1ace..a2d9b2daa3d 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -913,12 +913,33 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]: return None +def get_context_length_from_provider_error( + error_msg: str, + current_context_length: int, +) -> Optional[int]: + """Return a provider-reported lower context limit, if one is present. + + Context-overflow recovery must not invent a new model window size. Some + providers only say that the input exceeds the context window without + reporting the actual maximum. In that case callers should keep the + configured context length and try compression only, rather than stepping + down through guessed probe tiers (1M → 256K → 128K → ...). + """ + parsed_limit = parse_context_limit_from_error(error_msg) + if parsed_limit is None: + return None + if parsed_limit < current_context_length: + return parsed_limit + return None + + def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]: """Detect an "output cap too large" error and return how many output tokens are available. Background — two distinct context errors exist: 1. "Prompt too long" — the INPUT itself exceeds the context window. - Fix: compress history and/or halve context_length. + Fix: compress history, and only reduce context_length if the + provider explicitly reports the actual lower limit. 2. "max_tokens too large" — input is fine, but input + requested_output > window. Fix: reduce max_tokens (the output cap) for this call. Do NOT touch context_length — the window hasn't shrunk. diff --git a/tests/test_ctx_halving_fix.py b/tests/test_ctx_halving_fix.py index 0dd3ca4e7eb..bf81ffbae9b 100644 --- a/tests/test_ctx_halving_fix.py +++ b/tests/test_ctx_halving_fix.py @@ -11,6 +11,9 @@ The fix introduces: error class and returns the available output token budget. * _ephemeral_max_output_tokens on AIAgent — a one-shot override that caps the output for one retry without touching context_length. + * get_context_length_from_provider_error() — accepts only concrete + provider-reported lower context limits and refuses guessed probe-tier + step-downs when the provider gives no maximum. Naming note ----------- @@ -75,7 +78,7 @@ class TestParseAvailableOutputTokens: # ── Should NOT detect (returns None) ───────────────────────────────── def test_prompt_too_long_is_not_output_cap_error(self): - """'prompt is too long' errors must NOT be caught — they need context halving.""" + """'prompt is too long' errors must NOT be caught — they need context-overflow recovery.""" msg = "prompt is too long: 205000 tokens > 200000 maximum" assert self._parse(msg) is None @@ -101,6 +104,49 @@ class TestParseAvailableOutputTokens: assert self._parse(msg) is None +# --------------------------------------------------------------------------- +# Context-overflow recovery — only trust provider-reported limits +# --------------------------------------------------------------------------- + +class TestContextOverflowLimitSelection: + """Context-overflow recovery must not invent a lower window size. + + Some providers only say "input exceeds the context window" without telling + Hermes what the actual maximum is. In that case we may compress the + conversation, but must not silently probe-step from a user-configured 1M + window down to 256K/128K/64K/etc. + """ + + def test_generic_overflow_without_provider_limit_keeps_context_length(self): + from agent.model_metadata import get_context_length_from_provider_error + from agent.model_metadata import get_next_probe_tier + from agent.model_metadata import parse_context_limit_from_error + + old_ctx = 1_000_000 + error_msg = ( + "Your input exceeds the context window of this model. " + "Please adjust your input and try again." + ) + + assert parse_context_limit_from_error(error_msg) is None + assert get_next_probe_tier(old_ctx) == 256_000 + assert get_context_length_from_provider_error(error_msg, old_ctx) is None + + def test_explicit_provider_limit_still_selects_that_limit(self): + from agent.model_metadata import get_context_length_from_provider_error + + error_msg = "prompt is too long: 300000 tokens > 272000 maximum" + + assert get_context_length_from_provider_error(error_msg, 1_000_000) == 272_000 + + def test_reported_limit_not_lower_than_current_is_ignored(self): + from agent.model_metadata import get_context_length_from_provider_error + + error_msg = "maximum context length is 1000000 tokens" + + assert get_context_length_from_provider_error(error_msg, 272_000) is None + + # --------------------------------------------------------------------------- # build_anthropic_kwargs — output cap clamping # --------------------------------------------------------------------------- @@ -282,19 +328,16 @@ class TestContextNotHalvedOnOutputCapError: assert agent.context_compressor.context_length == old_ctx assert agent._ephemeral_max_output_tokens == 19_936 - def test_prompt_too_long_still_triggers_probe_tier(self): - """Genuine prompt-too-long errors must still use get_next_probe_tier.""" + def test_prompt_too_long_with_explicit_limit_uses_provider_limit(self): + """Prompt-too-long errors only change context_length when they report a concrete limit.""" + from agent.model_metadata import get_context_length_from_provider_error from agent.model_metadata import parse_available_output_tokens_from_error - from agent.model_metadata import get_next_probe_tier error_msg = "prompt is too long: 205000 tokens > 200000 maximum" available_out = parse_available_output_tokens_from_error(error_msg) assert available_out is None, "prompt-too-long must not be caught by output-cap parser" - - # The old halving path is still used for this class of error - new_ctx = get_next_probe_tier(200_000) - assert new_ctx == 128_000 + assert get_context_length_from_provider_error(error_msg, 1_000_000) == 200_000 def test_output_cap_error_safety_margin(self): """The ephemeral value includes a 64-token safety margin below available_out."""