mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
fix(opencode-go): cap mimo-v2.5-pro max_tokens at 131072
The opencode-go relay defaults max_tokens to 262144 when none is sent, but Xiaomi mimo-v2.5-pro only supports 131072 completion tokens — every request 400s with "max_tokens is too large: 262144" before the agent can do anything. Add a get_max_tokens(model) hook on ProviderProfile (default returns default_max_tokens) so profiles fronting multiple upstreams can vary the cap per-model. Wire chat_completions transport through the hook. Override on OpenCodeGoProfile with mimo-v2.5-pro=131072. Only mimo-v2.5-pro is capped — other opencode-go models (kimi, glm, qwen, minimax, other mimo variants) unchanged.
This commit is contained in:
parent
bfecfabd0f
commit
8cf6b3da9d
3 changed files with 35 additions and 2 deletions
|
|
@ -476,13 +476,17 @@ class ChatCompletionsTransport(ProviderTransport):
|
|||
ephemeral = params.get("ephemeral_max_output_tokens")
|
||||
user_max = params.get("max_tokens")
|
||||
anthropic_max = params.get("anthropic_max_output")
|
||||
# Per-model default cap — profiles override get_max_tokens() when
|
||||
# they front several backends with different completion-token limits
|
||||
# (e.g. opencode-go: mimo-v2.5-pro = 131072).
|
||||
profile_max = profile.get_max_tokens(model)
|
||||
|
||||
if ephemeral is not None and max_tokens_fn:
|
||||
api_kwargs.update(max_tokens_fn(ephemeral))
|
||||
elif user_max is not None and max_tokens_fn:
|
||||
api_kwargs.update(max_tokens_fn(user_max))
|
||||
elif profile.default_max_tokens and max_tokens_fn:
|
||||
api_kwargs.update(max_tokens_fn(profile.default_max_tokens))
|
||||
elif profile_max and max_tokens_fn:
|
||||
api_kwargs.update(max_tokens_fn(profile_max))
|
||||
elif anthropic_max is not None:
|
||||
api_kwargs["max_tokens"] = anthropic_max
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,21 @@ def _is_deepseek_thinking_model(model: str | None) -> bool:
|
|||
class OpenCodeGoProfile(ProviderProfile):
|
||||
"""OpenCode Go - model-specific reasoning controls."""
|
||||
|
||||
# Per-model completion-token cap. The opencode-go relay's default is
|
||||
# too large for mimo-v2.5-pro — it sends max_tokens=262144 but Xiaomi
|
||||
# only supports 131072 completion tokens and 400s the request.
|
||||
# Setting an explicit cap here prevents the relay default from being
|
||||
# applied. Keys are normalized via _flat_model_name().
|
||||
_MODEL_MAX_TOKENS: dict[str, int] = {
|
||||
"mimo-v2.5-pro": 131072,
|
||||
}
|
||||
|
||||
def get_max_tokens(self, model: str | None) -> int | None:
    """Return the completion-token cap to use for *model*.

    The model name is passed through ``_flat_model_name()`` and looked up
    in ``_MODEL_MAX_TOKENS``. When the table has no entry for it, the
    profile-wide ``default_max_tokens`` is returned instead.
    """
    override = self._MODEL_MAX_TOKENS.get(_flat_model_name(model))
    return self.default_max_tokens if override is None else override
|
||||
|
||||
def build_api_kwargs_extras(
|
||||
self, *, reasoning_config: dict | None = None, model: str | None = None, **context
|
||||
) -> tuple[dict[str, Any], dict[str, Any]]:
|
||||
|
|
|
|||
|
|
@ -129,6 +129,20 @@ class ProviderProfile:
|
|||
"""
|
||||
return {}, {}
|
||||
|
||||
def get_max_tokens(self, model: str | None) -> int | None:
|
||||
"""Return the default max_tokens cap for *model*.
|
||||
|
||||
Overrideable hook for providers that need per-model output caps —
|
||||
e.g. a relay that fronts several upstream backends, each with a
|
||||
different completion-token limit. The transport calls this when
|
||||
the user hasn't set an explicit max_tokens.
|
||||
|
||||
Default: return self.default_max_tokens (the static profile field),
|
||||
ignoring the model name. Override in a subclass to vary the cap
|
||||
per-model.
|
||||
"""
|
||||
return self.default_max_tokens
|
||||
|
||||
def fetch_models(
|
||||
self,
|
||||
*,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue