diff --git a/agent/transports/chat_completions.py b/agent/transports/chat_completions.py index 96997afca43..0582ef1ef00 100644 --- a/agent/transports/chat_completions.py +++ b/agent/transports/chat_completions.py @@ -476,13 +476,17 @@ class ChatCompletionsTransport(ProviderTransport): ephemeral = params.get("ephemeral_max_output_tokens") user_max = params.get("max_tokens") anthropic_max = params.get("anthropic_max_output") + # Per-model default cap — profiles override get_max_tokens() when + # they front several backends with different completion-token limits + # (e.g. opencode-go: mimo-v2.5-pro = 131072). + profile_max = profile.get_max_tokens(model) if ephemeral is not None and max_tokens_fn: api_kwargs.update(max_tokens_fn(ephemeral)) elif user_max is not None and max_tokens_fn: api_kwargs.update(max_tokens_fn(user_max)) - elif profile.default_max_tokens and max_tokens_fn: - api_kwargs.update(max_tokens_fn(profile.default_max_tokens)) + elif profile_max and max_tokens_fn: + api_kwargs.update(max_tokens_fn(profile_max)) elif anthropic_max is not None: api_kwargs["max_tokens"] = anthropic_max diff --git a/plugins/model-providers/opencode-zen/__init__.py b/plugins/model-providers/opencode-zen/__init__.py index 385741f09a1..a8c72cdc25c 100644 --- a/plugins/model-providers/opencode-zen/__init__.py +++ b/plugins/model-providers/opencode-zen/__init__.py @@ -34,6 +34,21 @@ def _is_deepseek_thinking_model(model: str | None) -> bool: class OpenCodeGoProfile(ProviderProfile): """OpenCode Go - model-specific reasoning controls.""" + # Per-model completion-token cap. The opencode-go relay's default is + # too large for mimo-v2.5-pro — the relay sends max_tokens=262144 but Xiaomi + # only supports 131072 completion tokens and 400s the request. + # Setting an explicit cap here prevents the relay default from being + # applied. Keys must match the output of _flat_model_name(model). 
+ _MODEL_MAX_TOKENS: dict[str, int] = { + "mimo-v2.5-pro": 131072, + } + + def get_max_tokens(self, model: str | None) -> int | None: + cap = self._MODEL_MAX_TOKENS.get(_flat_model_name(model)) + if cap is not None: + return cap + return self.default_max_tokens + def build_api_kwargs_extras( self, *, reasoning_config: dict | None = None, model: str | None = None, **context ) -> tuple[dict[str, Any], dict[str, Any]]: diff --git a/providers/base.py b/providers/base.py index fa6765d103c..01023ff55c2 100644 --- a/providers/base.py +++ b/providers/base.py @@ -129,6 +129,20 @@ class ProviderProfile: """ return {}, {} + def get_max_tokens(self, model: str | None) -> int | None: + """Return the default max_tokens cap for *model*. + + Overridable hook for providers that need per-model output caps — + e.g. a relay that fronts several upstream backends, each with a + different completion-token limit. The transport uses the result only + when no ephemeral or user-supplied max_tokens is set. + + Default: return self.default_max_tokens (the static profile field), + ignoring the model name. Override in a subclass to vary the cap + per-model. + """ + return self.default_max_tokens + def fetch_models( self, *,