From 3a74b752174bc4f36039c128d0552b05e4146367 Mon Sep 17 00:00:00 2001
From: xxxigm <tuancanhnguyen706@gmail.com>
Date: Tue, 9 Jun 2026 16:51:07 +0700
Subject: [PATCH] fix(agent): recover from char-based output-cap overflow
 (#42741)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LM Studio / llama.cpp-style servers report the context window in tokens
but the prompt size in characters, e.g. "maximum context length is 65536
tokens. However, you requested 65536 output tokens and your prompt
contains 77409 characters". When a provider profile's default_max_tokens
equals the model's context window, the very first request asks for the
whole window as output and the server returns a hard HTTP 400 — even on a
trivial "hi".

parse_available_output_tokens_from_error did not recognise this phrasing,
so the overflow was misrouted to the prompt-too-long/compression path
(which can't help when the input already fits) instead of the output-cap
reduction + retry path. Detect the "requested N output tokens" form,
estimate the input from the character count (~3 chars/token, conservative
so the retried cap stays inside the window), and return the available
output budget so the existing retry logic shrinks max_tokens and succeeds.
---
 agent/model_metadata.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 531e9ae8459..25f60a0d961 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -968,6 +968,16 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
         # OpenRouter/Nous phrasing of the same condition.
         "in the output" in error_lower
         and "maximum context length" in error_lower
+    ) or (
+        # LM Studio / llama.cpp / some OpenAI-compatible servers:
+        #   "This model's maximum context length is 65536 tokens. However, you
+        #    requested 65536 output tokens and your prompt contains 77409
+        #    characters ..."
+        # The "requested N output tokens" phrasing means the OUTPUT cap is the
+        # problem (the input itself fits) — reduce max_tokens, don't compress.
+        "maximum context length" in error_lower
+        and "requested" in error_lower
+        and "output tokens" in error_lower
     )
     if not is_output_cap_error:
         return None
@@ -999,6 +1009,22 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
         if _available >= 1:
             return _available
 
+    # LM Studio / llama.cpp style: context window is reported in tokens but the
+    # prompt size is reported in CHARACTERS, e.g.
+    #   "maximum context length is 65536 tokens ... your prompt contains 77409
+    #    characters ...".
+    # Estimate the input tokens conservatively (~3 chars/token, which
+    # over-reserves the input so the retried output cap stays safely inside the
+    # window) and leave the remainder of the window for output.
+    _m_ctx_tok = re.search(r'maximum context length is (\d+)\s*token', error_lower)
+    _m_chars = re.search(r'prompt contains (\d+)\s*character', error_lower)
+    if _m_ctx_tok and _m_chars:
+        _ctx = int(_m_ctx_tok.group(1))
+        _est_input = (int(_m_chars.group(1)) + 2) // 3
+        _available = _ctx - _est_input
+        if _available >= 1:
+            return _available
+
     return None