fix: stop probe stepdown without provider context limit

2026-06-05 07:41:39 +00:00 · 2026-05-28 12:57:50 +08:00 · 2026-05-28 12:57:50 +08:00 · 7a3c38d0b7
commit 7a3c38d0b7
parent 5cbc3fbdcc
3 changed files with 97 additions and 39 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -49,9 +49,8 @@ from agent.model_metadata import (
    MINIMUM_CONTEXT_LENGTH,
    estimate_messages_tokens_rough,
    estimate_request_tokens_rough,
-    get_next_probe_tier,
+    get_context_length_from_provider_error,
    parse_available_output_tokens_from_error,
-    parse_context_limit_from_error,
    save_context_length,
 )
 from agent.nous_rate_guard import (
@@ -2900,9 +2899,13 @@ def run_conversation(
                        restart_with_compressed_messages = True
                        break

-                    # Error is about the INPUT being too large — reduce context_length.
-                    # Try to parse the actual limit from the error message
-                    parsed_limit = parse_context_limit_from_error(error_msg)
+                    # Error is about the INPUT being too large.  Only reduce
+                    # context_length when the provider explicitly reports the
+                    # real lower limit.  If the provider only says "input
+                    # exceeds the context window", keep the configured window
+                    # and try compression; guessing probe tiers can incorrectly
+                    # turn a user-configured 1M window into 256K/128K/64K.
+                    new_ctx = get_context_length_from_provider_error(error_msg, old_ctx)
                    _provider_lower = (getattr(agent, "provider", "") or "").lower()
                    _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower()
                    is_minimax_provider = (
@@ -2914,23 +2917,12 @@ def run_conversation(
                    )
                    minimax_delta_only_overflow = (
                        is_minimax_provider
-                        and parsed_limit is None
+                        and new_ctx is None
                        and "context window exceeds limit (" in error_msg
                    )
-                    if parsed_limit and parsed_limit < old_ctx:
-                        new_ctx = parsed_limit
-                        agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
-                    elif minimax_delta_only_overflow:
-                        new_ctx = old_ctx
-                        agent._buffer_vprint(
-                            f"Provider reported overflow amount only; "
-                            f"keeping context_length at {old_ctx:,} tokens and compressing."
-                        )
-                    else:
-                        # Step down to the next probe tier
-                        new_ctx = get_next_probe_tier(old_ctx)

-                    if new_ctx and new_ctx < old_ctx:
+                    if new_ctx is not None:
+                        agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
                        compressor.update_model(
                            model=agent.model,
                            context_length=new_ctx,
@@ -2940,20 +2932,22 @@ def run_conversation(
                            api_mode=agent.api_mode,
                        )
                        # Context probing flags — only set on built-in
-                        # compressor (plugin engines manage their own).
+                        # compressor (plugin engines manage their own).  This
+                        # value came from the provider, so it is safe to cache.
                        if hasattr(compressor, "_context_probed"):
                            compressor._context_probed = True
-                            # Only persist limits parsed from the provider's
-                            # error message (a real number).  Guessed fallback
-                            # tiers from get_next_probe_tier() should stay
-                            # in-memory only — persisting them pollutes the
-                            # cache with wrong values.
-                            compressor._context_probe_persistable = bool(
-                                parsed_limit and parsed_limit == new_ctx
-                            )
-                        agent._buffer_vprint(f"⚠️  Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens")
+                            compressor._context_probe_persistable = True
+                        agent._buffer_vprint(f"⚠️  Context length exceeded — using provider limit: {old_ctx:,} → {new_ctx:,} tokens")
+                    elif minimax_delta_only_overflow:
+                        agent._buffer_vprint(
+                            f"Provider reported overflow amount only; "
+                            f"keeping context_length at {old_ctx:,} tokens and compressing."
+                        )
                    else:
-                        agent._buffer_vprint(f"⚠️  Context length exceeded at minimum tier — attempting compression...")
+                        agent._buffer_vprint(
+                            f"⚠️  Context length exceeded, but provider did not report a max context length; "
+                            f"keeping context_length at {old_ctx:,} tokens and compressing."
+                        )

                    compression_attempts += 1
                    if compression_attempts > max_compression_attempts:
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -913,12 +913,33 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    return None


+def get_context_length_from_provider_error(
+    error_msg: str,
+    current_context_length: int,
+) -> Optional[int]:
+    """Return a provider-reported lower context limit, if one is present.
+
+    Context-overflow recovery must not invent a new model window size.  Some
+    providers only say that the input exceeds the context window without
+    reporting the actual maximum.  In that case callers should keep the
+    configured context length and try compression only, rather than stepping
+    down through guessed probe tiers (1M → 256K → 128K → ...).
+    """
+    parsed_limit = parse_context_limit_from_error(error_msg)
+    if parsed_limit is None:
+        return None
+    if parsed_limit < current_context_length:
+        return parsed_limit
+    return None
+
+
 def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
    """Detect an "output cap too large" error and return how many output tokens are available.

    Background — two distinct context errors exist:
      1. "Prompt too long"  — the INPUT itself exceeds the context window.
-           Fix: compress history and/or halve context_length.
+           Fix: compress history, and only reduce context_length if the
+           provider explicitly reports the actual lower limit.
      2. "max_tokens too large" — input is fine, but input + requested_output > window.
           Fix: reduce max_tokens (the output cap) for this call.
           Do NOT touch context_length — the window hasn't shrunk.
--- a/tests/test_ctx_halving_fix.py
+++ b/tests/test_ctx_halving_fix.py
@@ -11,6 +11,9 @@ The fix introduces:
    error class and returns the available output token budget.
  * _ephemeral_max_output_tokens on AIAgent — a one-shot override that
    caps the output for one retry without touching context_length.
+  * get_context_length_from_provider_error() — accepts only concrete
+    provider-reported lower context limits and refuses guessed probe-tier
+    step-downs when the provider gives no maximum.

 Naming note
 -----------
@@ -75,7 +78,7 @@ class TestParseAvailableOutputTokens:
    # ── Should NOT detect (returns None) ─────────────────────────────────

    def test_prompt_too_long_is_not_output_cap_error(self):
-        """'prompt is too long' errors must NOT be caught — they need context halving."""
+        """'prompt is too long' errors must NOT be caught — they need context-overflow recovery."""
        msg = "prompt is too long: 205000 tokens > 200000 maximum"
        assert self._parse(msg) is None

@@ -101,6 +104,49 @@ class TestParseAvailableOutputTokens:
        assert self._parse(msg) is None


+# ---------------------------------------------------------------------------
+# Context-overflow recovery — only trust provider-reported limits
+# ---------------------------------------------------------------------------
+
+class TestContextOverflowLimitSelection:
+    """Context-overflow recovery must not invent a lower window size.
+
+    Some providers only say "input exceeds the context window" without telling
+    Hermes what the actual maximum is.  In that case we may compress the
+    conversation, but must not silently probe-step from a user-configured 1M
+    window down to 256K/128K/64K/etc.
+    """
+
+    def test_generic_overflow_without_provider_limit_keeps_context_length(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+        from agent.model_metadata import get_next_probe_tier
+        from agent.model_metadata import parse_context_limit_from_error
+
+        old_ctx = 1_000_000
+        error_msg = (
+            "Your input exceeds the context window of this model. "
+            "Please adjust your input and try again."
+        )
+
+        assert parse_context_limit_from_error(error_msg) is None
+        assert get_next_probe_tier(old_ctx) == 256_000
+        assert get_context_length_from_provider_error(error_msg, old_ctx) is None
+
+    def test_explicit_provider_limit_still_selects_that_limit(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+
+        error_msg = "prompt is too long: 300000 tokens > 272000 maximum"
+
+        assert get_context_length_from_provider_error(error_msg, 1_000_000) == 272_000
+
+    def test_reported_limit_not_lower_than_current_is_ignored(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+
+        error_msg = "maximum context length is 1000000 tokens"
+
+        assert get_context_length_from_provider_error(error_msg, 272_000) is None
+
+
 # ---------------------------------------------------------------------------
 # build_anthropic_kwargs — output cap clamping
 # ---------------------------------------------------------------------------
@@ -282,19 +328,16 @@ class TestContextNotHalvedOnOutputCapError:
        assert agent.context_compressor.context_length == old_ctx
        assert agent._ephemeral_max_output_tokens == 19_936

-    def test_prompt_too_long_still_triggers_probe_tier(self):
-        """Genuine prompt-too-long errors must still use get_next_probe_tier."""
+    def test_prompt_too_long_with_explicit_limit_uses_provider_limit(self):
+        """Prompt-too-long errors only change context_length when they report a concrete limit."""
+        from agent.model_metadata import get_context_length_from_provider_error
        from agent.model_metadata import parse_available_output_tokens_from_error
-        from agent.model_metadata import get_next_probe_tier

        error_msg = "prompt is too long: 205000 tokens > 200000 maximum"

        available_out = parse_available_output_tokens_from_error(error_msg)
        assert available_out is None, "prompt-too-long must not be caught by output-cap parser"
-
-        # The old halving path is still used for this class of error
-        new_ctx = get_next_probe_tier(200_000)
-        assert new_ctx == 128_000
+        assert get_context_length_from_provider_error(error_msg, 1_000_000) == 200_000

    def test_output_cap_error_safety_margin(self):
        """The ephemeral value includes a 64-token safety margin below available_out."""