From 7a3c38d0b724dac729b1a295c1612a26634279af Mon Sep 17 00:00:00 2001
From: yanghd <yanghongda@jackyun.com>
Date: Thu, 28 May 2026 12:57:50 +0800
Subject: [PATCH] fix: stop probe stepdown without provider context limit

---
 agent/conversation_loop.py    | 54 ++++++++++++++------------------
 agent/model_metadata.py       | 23 +++++++++++++-
 tests/test_ctx_halving_fix.py | 59 ++++++++++++++++++++++++++++++-----
 3 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index 9d78918c267..7e7ee26431f 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -49,9 +49,8 @@ from agent.model_metadata import (
     MINIMUM_CONTEXT_LENGTH,
     estimate_messages_tokens_rough,
     estimate_request_tokens_rough,
-    get_next_probe_tier,
+    get_context_length_from_provider_error,
     parse_available_output_tokens_from_error,
-    parse_context_limit_from_error,
     save_context_length,
 )
 from agent.nous_rate_guard import (
@@ -2900,9 +2899,13 @@ def run_conversation(
                         restart_with_compressed_messages = True
                         break
 
-                    # Error is about the INPUT being too large — reduce context_length.
-                    # Try to parse the actual limit from the error message
-                    parsed_limit = parse_context_limit_from_error(error_msg)
+                    # Error is about the INPUT being too large.  Only reduce
+                    # context_length when the provider explicitly reports the
+                    # real lower limit.  If the provider only says "input
+                    # exceeds the context window", keep the configured window
+                    # and try compression; guessing probe tiers can incorrectly
+                    # turn a user-configured 1M window into 256K/128K/64K.
+                    new_ctx = get_context_length_from_provider_error(error_msg, old_ctx)
                     _provider_lower = (getattr(agent, "provider", "") or "").lower()
                     _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower()
                     is_minimax_provider = (
@@ -2914,23 +2917,12 @@ def run_conversation(
                     )
                     minimax_delta_only_overflow = (
                         is_minimax_provider
-                        and parsed_limit is None
+                        and new_ctx is None
                         and "context window exceeds limit (" in error_msg
                     )
-                    if parsed_limit and parsed_limit < old_ctx:
-                        new_ctx = parsed_limit
-                        agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
-                    elif minimax_delta_only_overflow:
-                        new_ctx = old_ctx
-                        agent._buffer_vprint(
-                            f"Provider reported overflow amount only; "
-                            f"keeping context_length at {old_ctx:,} tokens and compressing."
-                        )
-                    else:
-                        # Step down to the next probe tier
-                        new_ctx = get_next_probe_tier(old_ctx)
 
-                    if new_ctx and new_ctx < old_ctx:
+                    if new_ctx is not None:
+                        agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
                         compressor.update_model(
                             model=agent.model,
                             context_length=new_ctx,
@@ -2940,20 +2932,22 @@ def run_conversation(
                             api_mode=agent.api_mode,
                         )
                         # Context probing flags — only set on built-in
-                        # compressor (plugin engines manage their own).
+                        # compressor (plugin engines manage their own).  This
+                        # value came from the provider, so it is safe to cache.
                         if hasattr(compressor, "_context_probed"):
                             compressor._context_probed = True
-                            # Only persist limits parsed from the provider's
-                            # error message (a real number).  Guessed fallback
-                            # tiers from get_next_probe_tier() should stay
-                            # in-memory only — persisting them pollutes the
-                            # cache with wrong values.
-                            compressor._context_probe_persistable = bool(
-                                parsed_limit and parsed_limit == new_ctx
-                            )
-                        agent._buffer_vprint(f"⚠️  Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens")
+                            compressor._context_probe_persistable = True
+                        agent._buffer_vprint(f"⚠️  Context length exceeded — using provider limit: {old_ctx:,} → {new_ctx:,} tokens")
+                    elif minimax_delta_only_overflow:
+                        agent._buffer_vprint(
+                            f"Provider reported overflow amount only; "
+                            f"keeping context_length at {old_ctx:,} tokens and compressing."
+                        )
                     else:
-                        agent._buffer_vprint(f"⚠️  Context length exceeded at minimum tier — attempting compression...")
+                        agent._buffer_vprint(
+                            f"⚠️  Context length exceeded, but provider did not report a max context length; "
+                            f"keeping context_length at {old_ctx:,} tokens and compressing."
+                        )
 
                     compression_attempts += 1
                     if compression_attempts > max_compression_attempts:
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index c77dcff1ace..a2d9b2daa3d 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -913,12 +913,33 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
     return None
 
 
+def get_context_length_from_provider_error(
+    error_msg: str,
+    current_context_length: int,
+) -> Optional[int]:
+    """Return a provider-reported lower context limit, if one is present.
+
+    Context-overflow recovery must not invent a new model window size.  Returns
+    None when no limit can be parsed from the error, or the parsed limit is not
+    strictly below ``current_context_length``.  Callers should then keep the
+    configured context length and try compression only, rather than stepping
+    down through guessed probe tiers (1M → 256K → 128K → ...).
+    """
+    parsed_limit = parse_context_limit_from_error(error_msg)
+    if parsed_limit is None:
+        return None
+    if parsed_limit < current_context_length:
+        return parsed_limit
+    return None
+
+
 def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
     """Detect an "output cap too large" error and return how many output tokens are available.
 
     Background — two distinct context errors exist:
       1. "Prompt too long"  — the INPUT itself exceeds the context window.
-           Fix: compress history and/or halve context_length.
+           Fix: compress history, and only reduce context_length if the
+           provider explicitly reports the actual lower limit.
       2. "max_tokens too large" — input is fine, but input + requested_output > window.
            Fix: reduce max_tokens (the output cap) for this call.
            Do NOT touch context_length — the window hasn't shrunk.
diff --git a/tests/test_ctx_halving_fix.py b/tests/test_ctx_halving_fix.py
index 0dd3ca4e7eb..bf81ffbae9b 100644
--- a/tests/test_ctx_halving_fix.py
+++ b/tests/test_ctx_halving_fix.py
@@ -11,6 +11,9 @@ The fix introduces:
     error class and returns the available output token budget.
   * _ephemeral_max_output_tokens on AIAgent — a one-shot override that
     caps the output for one retry without touching context_length.
+  * get_context_length_from_provider_error() — accepts only concrete
+    provider-reported lower context limits and refuses guessed probe-tier
+    step-downs when the provider gives no maximum.
 
 Naming note
 -----------
@@ -75,7 +78,7 @@ class TestParseAvailableOutputTokens:
     # ── Should NOT detect (returns None) ─────────────────────────────────
 
     def test_prompt_too_long_is_not_output_cap_error(self):
-        """'prompt is too long' errors must NOT be caught — they need context halving."""
+        """'prompt is too long' errors must NOT be caught — they need context-overflow recovery."""
         msg = "prompt is too long: 205000 tokens > 200000 maximum"
         assert self._parse(msg) is None
 
@@ -101,6 +104,49 @@ class TestParseAvailableOutputTokens:
         assert self._parse(msg) is None
 
 
+# ---------------------------------------------------------------------------
+# Context-overflow recovery — only trust provider-reported limits
+# ---------------------------------------------------------------------------
+
+class TestContextOverflowLimitSelection:
+    """Context-overflow recovery must not invent a lower window size.
+
+    Some providers only say "input exceeds the context window" without telling
+    Hermes what the actual maximum is.  In that case we may compress the
+    conversation, but must not silently probe-step from a user-configured 1M
+    window down to 256K/128K/64K/etc.
+    """
+
+    def test_generic_overflow_without_provider_limit_keeps_context_length(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+        from agent.model_metadata import get_next_probe_tier
+        from agent.model_metadata import parse_context_limit_from_error
+
+        old_ctx = 1_000_000
+        error_msg = (
+            "Your input exceeds the context window of this model. "
+            "Please adjust your input and try again."
+        )
+
+        assert parse_context_limit_from_error(error_msg) is None
+        assert get_next_probe_tier(old_ctx) == 256_000
+        assert get_context_length_from_provider_error(error_msg, old_ctx) is None
+
+    def test_explicit_provider_limit_still_selects_that_limit(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+
+        error_msg = "prompt is too long: 300000 tokens > 272000 maximum"
+
+        assert get_context_length_from_provider_error(error_msg, 1_000_000) == 272_000
+
+    def test_reported_limit_not_lower_than_current_is_ignored(self):
+        from agent.model_metadata import get_context_length_from_provider_error
+
+        error_msg = "maximum context length is 1000000 tokens"
+
+        assert get_context_length_from_provider_error(error_msg, 272_000) is None
+
+
 # ---------------------------------------------------------------------------
 # build_anthropic_kwargs — output cap clamping
 # ---------------------------------------------------------------------------
@@ -282,19 +328,16 @@ class TestContextNotHalvedOnOutputCapError:
         assert agent.context_compressor.context_length == old_ctx
         assert agent._ephemeral_max_output_tokens == 19_936
 
-    def test_prompt_too_long_still_triggers_probe_tier(self):
-        """Genuine prompt-too-long errors must still use get_next_probe_tier."""
+    def test_prompt_too_long_with_explicit_limit_uses_provider_limit(self):
+        """Prompt-too-long errors only change context_length when they report a concrete limit."""
+        from agent.model_metadata import get_context_length_from_provider_error
         from agent.model_metadata import parse_available_output_tokens_from_error
-        from agent.model_metadata import get_next_probe_tier
 
         error_msg = "prompt is too long: 205000 tokens > 200000 maximum"
 
         available_out = parse_available_output_tokens_from_error(error_msg)
         assert available_out is None, "prompt-too-long must not be caught by output-cap parser"
-
-        # The old halving path is still used for this class of error
-        new_ctx = get_next_probe_tier(200_000)
-        assert new_ctx == 128_000
+        assert get_context_length_from_provider_error(error_msg, 1_000_000) == 200_000
 
     def test_output_cap_error_safety_margin(self):
         """The ephemeral value includes a 64-token safety margin below available_out."""