fix(agent): recover from char-based output-cap overflow (#42741)

LM Studio / llama.cpp-style servers report the context window in tokens but the prompt size in characters, e.g. "maximum context length is 65536 tokens. However, you requested 65536 output tokens and your prompt contains 77409 characters". When a provider profile's default_max_tokens equals the model's context window, the very first request asks for the whole window as output and the server returns a hard HTTP 400 — even on a trivial "hi". parse_available_output_tokens_from_error did not recognise this phrasing, so the overflow was misrouted to the prompt-too-long/compression path (which can't help when the input already fits) instead of the output-cap reduction + retry path. Detect the "requested N output tokens" form, estimate the input from the character count (~3 chars/token, conservative so the retried cap stays inside the window), and return the available output budget so the existing retry logic shrinks max_tokens and succeeds.
2026-07-26 17:38:36 +00:00 · 2026-06-09 16:51:07 +07:00 · 2026-06-09 16:51:07 +07:00 · 3a74b75217
commit 3a74b75217
parent 24a934295f
1 changed files with 26 additions and 0 deletions
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -968,6 +968,16 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
        # OpenRouter/Nous phrasing of the same condition.
        "in the output" in error_lower
        and "maximum context length" in error_lower
+    ) or (
+        # LM Studio / llama.cpp / some OpenAI-compatible servers:
+        #   "This model's maximum context length is 65536 tokens. However, you
+        #    requested 65536 output tokens and your prompt contains 77409
+        #    characters ..."
+        # The "requested N output tokens" phrasing means the OUTPUT cap is the
+        # problem (the input itself fits) — reduce max_tokens, don't compress.
+        "maximum context length" in error_lower
+        and "requested" in error_lower
+        and "output tokens" in error_lower
    )
    if not is_output_cap_error:
        return None
@@ -999,6 +1009,22 @@ def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
        if _available >= 1:
            return _available

+    # LM Studio / llama.cpp style: context window is reported in tokens but the
+    # prompt size is reported in CHARACTERS, e.g.
+    #   "maximum context length is 65536 tokens ... your prompt contains 77409
+    #    characters ...".
+    # Estimate the input tokens conservatively (~3 chars/token, which
+    # over-reserves the input so the retried output cap stays safely inside the
+    # window) and leave the remainder of the window for output.
+    _m_ctx_tok = re.search(r'maximum context length is (\d+)\s*token', error_lower)
+    _m_chars = re.search(r'prompt contains (\d+)\s*character', error_lower)
+    if _m_ctx_tok and _m_chars:
+        _ctx = int(_m_ctx_tok.group(1))
+        _est_input = (int(_m_chars.group(1)) + 2) // 3
+        _available = _ctx - _est_input
+        if _available >= 1:
+            return _available
+
    return None