From 8cf6b3da9d157bfced382cf139a9613eff90c006 Mon Sep 17 00:00:00 2001
From: teknium1 <127238744+teknium1@users.noreply.github.com>
Date: Thu, 28 May 2026 20:35:04 -0700
Subject: [PATCH] fix(opencode-go): cap mimo-v2.5-pro max_tokens at 131072
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The opencode-go relay defaults max_tokens to 262144 when none is sent,
but Xiaomi mimo-v2.5-pro only supports 131072 completion tokens — every
request 400s with "max_tokens is too large: 262144" before the agent
can do anything.

Add a get_max_tokens(model) hook on ProviderProfile (default returns
default_max_tokens) so profiles fronting multiple upstreams can vary
the cap per-model. Wire chat_completions transport through the hook.
Override on OpenCodeGoProfile with mimo-v2.5-pro=131072.

Only mimo-v2.5-pro is capped — other opencode-go models (kimi, glm,
qwen, minimax, other mimo variants) unchanged.
---
 agent/transports/chat_completions.py             |  8 ++++++--
 plugins/model-providers/opencode-zen/__init__.py | 15 +++++++++++++++
 providers/base.py                                | 14 ++++++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/agent/transports/chat_completions.py b/agent/transports/chat_completions.py
index 96997afca43..0582ef1ef00 100644
--- a/agent/transports/chat_completions.py
+++ b/agent/transports/chat_completions.py
@@ -476,13 +476,17 @@ class ChatCompletionsTransport(ProviderTransport):
         ephemeral = params.get("ephemeral_max_output_tokens")
         user_max = params.get("max_tokens")
         anthropic_max = params.get("anthropic_max_output")
+        # Per-model default cap — profiles override get_max_tokens() when
+        # they front several backends with different completion-token limits
+        # (e.g. opencode-go: mimo-v2.5-pro = 131072).
+        profile_max = profile.get_max_tokens(model)
 
         if ephemeral is not None and max_tokens_fn:
             api_kwargs.update(max_tokens_fn(ephemeral))
         elif user_max is not None and max_tokens_fn:
             api_kwargs.update(max_tokens_fn(user_max))
-        elif profile.default_max_tokens and max_tokens_fn:
-            api_kwargs.update(max_tokens_fn(profile.default_max_tokens))
+        elif profile_max and max_tokens_fn:
+            api_kwargs.update(max_tokens_fn(profile_max))
         elif anthropic_max is not None:
             api_kwargs["max_tokens"] = anthropic_max
 
diff --git a/plugins/model-providers/opencode-zen/__init__.py b/plugins/model-providers/opencode-zen/__init__.py
index 385741f09a1..a8c72cdc25c 100644
--- a/plugins/model-providers/opencode-zen/__init__.py
+++ b/plugins/model-providers/opencode-zen/__init__.py
@@ -34,6 +34,21 @@ def _is_deepseek_thinking_model(model: str | None) -> bool:
 class OpenCodeGoProfile(ProviderProfile):
     """OpenCode Go - model-specific reasoning controls."""
 
+    # Per-model completion-token cap. The opencode-go relay's default is
+    # too large for mimo-v2.5-pro — it sends max_tokens=262144 but Xiaomi
+    # only supports 131072 completion tokens and 400s the request.
+    # Setting an explicit cap here prevents the relay default from being
+    # applied. Keys must match the output of _flat_model_name(model).
+    _MODEL_MAX_TOKENS: dict[str, int] = {
+        "mimo-v2.5-pro": 131072,
+    }
+
+    def get_max_tokens(self, model: str | None) -> int | None:
+        cap = self._MODEL_MAX_TOKENS.get(_flat_model_name(model))
+        if cap is not None:
+            return cap
+        return self.default_max_tokens
+
     def build_api_kwargs_extras(
         self, *, reasoning_config: dict | None = None, model: str | None = None, **context
     ) -> tuple[dict[str, Any], dict[str, Any]]:
diff --git a/providers/base.py b/providers/base.py
index fa6765d103c..01023ff55c2 100644
--- a/providers/base.py
+++ b/providers/base.py
@@ -129,6 +129,20 @@ class ProviderProfile:
         """
         return {}, {}
 
+    def get_max_tokens(self, model: str | None) -> int | None:
+        """Return the default max_tokens cap for *model*.
+
+        Overrideable hook for providers that need per-model output caps —
+        e.g. a relay that fronts several upstream backends, each with a
+        different completion-token limit. The transport calls this when
+        the user hasn't set an explicit max_tokens.
+
+        Default: return self.default_max_tokens (the static profile field),
+        ignoring the model name. Override in a subclass to vary the cap
+        per-model.
+        """
+        return self.default_max_tokens
+
     def fetch_models(
         self,
         *,