Mirror of https://github.com/NousResearch/hermes-agent.git (synced 2026-05-16 04:22:36 +00:00)
fix(vision): Z.AI vision model compatibility — endpoint routing and max_tokens handling
Z.AI (Zhipu GLM) vision models (glm-4v-flash, glm-4v-plus, etc.) have two
compatibility issues when used through the Anthropic-compatible endpoint:

1. **Error 1210 — max_tokens rejected on multimodal calls**: Z.AI rejects
   the max_tokens parameter on vision-model requests with error code 1210
   ("API 调用参数有误", i.e. "invalid API call parameters"). The error string
   does not contain "max_tokens", so the existing unsupported-parameter
   retry logic never fires.

2. **Wrong endpoint inheritance**: When the main runtime provider uses Z.AI's
   Anthropic-compatible endpoint (open.bigmodel.cn/api/anthropic), the vision
   client inherits that endpoint. But Z.AI's Anthropic wire cannot handle
   image content properly — models either silently fail ("I can't see the
   image") or the request is rejected over max_tokens.
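A minimal sketch of the retry shape described in (1). The names here are illustrative, not part of this commit — the real logic lives in call_llm / async_call_llm in the diff below. The point is that error 1210 has to be matched by code and host, because the message text never mentions max_tokens:

```python
# Illustrative sketch only; real implementation is in call_llm below.
def _call_with_zai_1210_retry(client, **kwargs):
    try:
        return client.chat.completions.create(**kwargs)
    except Exception as err:
        err_str = str(err)
        # Match by error code and host — the message never says "max_tokens".
        is_zai_1210 = (
            "1210" in err_str
            and "bigmodel" in str(getattr(client, "base_url", ""))
        )
        if not is_zai_1210:
            raise
        # Z.AI rejected the parameter set; retry without any token cap.
        kwargs.pop("max_tokens", None)
        kwargs.pop("max_completion_tokens", None)
        return client.chat.completions.create(**kwargs)
```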
Changes:
- resolve_vision_provider_client(): force Z.AI vision to use OpenAI-compatible
endpoint (open.bigmodel.cn/api/paas/v4) instead of inheriting Anthropic wire
- _build_call_kwargs(): skip max_tokens for Z.AI vision models (names containing "4v", "5v", or "-v")
- _AnthropicCompletionsAdapter: support _skip_zai_max_tokens flag
- _to_openai_base_url(): rewrite Z.AI Anthropic URLs to OpenAI-compatible path
- call_llm() retry: detect Z.AI error 1210 and strip max_tokens before retry
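For reference, the endpoint pairs involved. The URLs are taken from the diff below; the constant names are illustrative, not part of the commit:

```python
# Z.AI's two wire formats live at different paths (and one alternate host).
# Constant names are illustrative; URLs are the ones used in the diff.
ZAI_ANTHROPIC_WIRE  = "https://open.bigmodel.cn/api/anthropic"  # rejects max_tokens on vision calls
ZAI_OPENAI_WIRE     = "https://open.bigmodel.cn/api/paas/v4"    # handles images and max_tokens
ZAI_OPENAI_WIRE_ALT = "https://api.z.ai/api/paas/v4"            # alternate OpenAI-compatible host
```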
parent fa582749e1
commit 6ea4a6a740
1 changed file with 73 additions and 4 deletions
@@ -455,6 +455,12 @@ def _to_openai_base_url(base_url: str) -> str:
     """
     url = str(base_url or "").strip().rstrip("/")
     if url.endswith("/anthropic"):
+        # ZAI (open.bigmodel.cn) uses /api/anthropic for Anthropic wire
+        # but /api/paas/v4 for OpenAI wire — the generic /v1 rewrite is wrong.
+        if "open.bigmodel.cn" in url or "bigmodel" in url:
+            rewritten = url[: -len("/anthropic")] + "/paas/v4"
+            logger.debug("Auxiliary client: rewrote ZAI base URL %s → %s", url, rewritten)
+            return rewritten
         rewritten = url[: -len("/anthropic")] + "/v1"
         logger.debug("Auxiliary client: rewrote base URL %s → %s", url, rewritten)
         return rewritten
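A quick sanity check of the rewrite above, with expected outputs derived from the hunk (the second case shows the generic fallback is untouched):

```python
# Expected rewrites (derived from the code above):
assert _to_openai_base_url("https://open.bigmodel.cn/api/anthropic") == (
    "https://open.bigmodel.cn/api/paas/v4")  # ZAI-specific /paas/v4 path
assert _to_openai_base_url("https://example.com/anthropic") == (
    "https://example.com/v1")                # generic /v1 rewrite, unchanged
```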
@@ -828,6 +834,13 @@ class _AnthropicCompletionsAdapter:
         model = kwargs.get("model", self._model)
         tools = kwargs.get("tools")
         tool_choice = kwargs.get("tool_choice")
-        max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000
+        # ZAI's Anthropic-compatible endpoint rejects max_tokens on vision
+        # models (glm-4v-flash etc.) with error code 1210. When the caller
+        # signals this by setting _skip_zai_max_tokens in kwargs, omit it.
+        _skip_mt = kwargs.pop("_skip_zai_max_tokens", False)
+        if _skip_mt:
+            max_tokens = None
+        else:
+            max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000
         temperature = kwargs.get("temperature")
 
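How the flag might be consumed, assuming the adapter exposes an OpenAI-style create() surface (the retry code further down suggests it does). The producer of `_skip_zai_max_tokens` is not shown in this hunk, so this call site is hypothetical:

```python
# Hypothetical call — the underscore-prefixed kwarg is popped by the
# adapter above and never forwarded to the Z.AI API:
response = adapter.chat.completions.create(
    model="glm-4v-flash",
    messages=messages,
    _skip_zai_max_tokens=True,  # adapter omits max_tokens entirely
)
```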
@@ -2835,6 +2848,33 @@ def resolve_vision_provider_client(
         )
         return _finalize(requested, sync_client, default_model)
 
+    # ZAI vision models must use the OpenAI-compatible endpoint, not the
+    # Anthropic-compatible one (which may be the main-runtime default).
+    # The Anthropic wire rejects max_tokens on multimodal calls (error 1210),
+    # while the OpenAI wire handles it correctly.
+    if requested == "zai" and not resolved_base_url:
+        zai_openai_urls = [
+            "https://open.bigmodel.cn/api/paas/v4",
+            "https://api.z.ai/api/paas/v4",
+        ]
+        for _zai_url in zai_openai_urls:
+            client, final_model = _get_cached_client(
+                requested, resolved_model, async_mode,
+                base_url=_zai_url,
+                api_key=resolved_api_key or None,
+                api_mode="chat_completions",
+                is_vision=True,
+            )
+            if client is not None:
+                return _finalize(requested, client, final_model)
+        # Fallback: try without explicit base_url (old behavior)
+        client, final_model = _get_cached_client(requested, resolved_model, async_mode,
+                                                 api_mode=resolved_api_mode,
+                                                 is_vision=True)
+        if client is None:
+            return requested, None, None
+        return requested, client, final_model
+
     client, final_model = _get_cached_client(requested, resolved_model, async_mode,
                                              api_mode=resolved_api_mode,
                                              is_vision=True)
@@ -3394,7 +3434,16 @@ def _build_call_kwargs(
     if max_tokens is not None:
         # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
         # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
-        if provider == "custom":
+        # ZAI vision models (glm-4v-flash, glm-4v-plus, etc.) reject max_tokens with
+        # error code 1210 ("API 调用参数有误") on multimodal requests — skip it.
+        _model_lower = (model or "").lower()
+        _skip_max_tokens = (
+            provider == "zai"
+            and ("4v" in _model_lower or "5v" in _model_lower or "-v" in _model_lower)
+        )
+        if _skip_max_tokens:
+            pass  # ZAI vision models do not accept max_tokens
+        elif provider == "custom":
             custom_base = base_url or _current_custom_base_url()
             if base_url_hostname(custom_base) == "api.openai.com":
                 kwargs["max_completion_tokens"] = max_tokens
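A quick check of the model-name heuristic above. It is plain substring matching, not a suffix check, so a name like glm-4.5v also matches via "5v":

```python
# Illustrative check of the vision-model heuristic (substring match):
for name in ("glm-4v-flash", "glm-4v-plus", "glm-4.5v", "glm-4-flash"):
    m = name.lower()
    print(name, "4v" in m or "5v" in m or "-v" in m)
# glm-4v-flash True / glm-4v-plus True / glm-4.5v True / glm-4-flash False
```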
@@ -3625,13 +3674,23 @@ def call_llm(
             kwargs = retry_kwargs
 
         err_str = str(first_err)
+        # ZAI vision models (glm-4v-flash etc.) return error code 1210
+        # ("API 调用参数有误") when max_tokens is passed on multimodal
+        # calls. The error message does NOT contain "max_tokens" so the
+        # generic retry below never fires. Detect the ZAI-specific error
+        # and strip max_tokens before retrying.
+        _is_zai_param_error = (
+            "1210" in err_str
+            and "bigmodel" in str(getattr(client, "base_url", ""))
+        )
         if max_tokens is not None and (
             "max_tokens" in err_str
             or "unsupported_parameter" in err_str
             or _is_unsupported_parameter_error(first_err, "max_tokens")
+            or _is_zai_param_error
         ):
             kwargs.pop("max_tokens", None)
-            kwargs["max_completion_tokens"] = max_tokens
+            kwargs.pop("max_completion_tokens", None)
             try:
                 return _validate_llm_response(
                     client.chat.completions.create(**kwargs), task)
@@ -3931,13 +3990,23 @@ async def async_call_llm(
             kwargs = retry_kwargs
 
         err_str = str(first_err)
+        # ZAI vision models (glm-4v-flash etc.) return error code 1210
+        # ("API 调用参数有误") when max_tokens is passed on multimodal
+        # calls. The error message does NOT contain "max_tokens" so the
+        # generic retry below never fires. Detect the ZAI-specific error
+        # and strip max_tokens before retrying.
+        _is_zai_param_error = (
+            "1210" in err_str
+            and "bigmodel" in str(getattr(client, "base_url", ""))
+        )
         if max_tokens is not None and (
             "max_tokens" in err_str
             or "unsupported_parameter" in err_str
             or _is_unsupported_parameter_error(first_err, "max_tokens")
+            or _is_zai_param_error
         ):
             kwargs.pop("max_tokens", None)
-            kwargs["max_completion_tokens"] = max_tokens
+            kwargs.pop("max_completion_tokens", None)
             try:
                 return _validate_llm_response(
                     await client.chat.completions.create(**kwargs), task)