fix(vision): Z.AI vision model compatibility — endpoint routing and max_tokens handling

Z.AI (智谱 GLM) vision models (glm-4v-flash, glm-4v-plus, etc.) have two
compatibility issues when used through the Anthropic-compatible endpoint:

1. **Error 1210 — max_tokens rejected on multimodal calls**: Z.AI rejects
   the max_tokens parameter for vision model requests with error code 1210
   ("API 调用参数有误"). The error string does not contain "max_tokens",
   so the existing unsupported-parameter retry logic never fires.

2. **Wrong endpoint inheritance**: When the main runtime provider uses Z.AI's
   Anthropic-compatible endpoint (open.bigmodel.cn/api/anthropic), the vision
   client inherits this endpoint. But Z.AI's Anthropic wire cannot properly
   handle image content — models silently fail ("I can't see the image") or
   reject max_tokens.
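
A rough illustration of why the generic check misses this case (the exact SDK
error string is an assumption; the 1210 code and its Chinese message come from
the description above):

```python
# Approximate shape of the error surfaced for a Z.AI multimodal call that
# includes max_tokens (wording illustrative).
err_str = 'Error code: 400 - {"error": {"code": "1210", "message": "API 调用参数有误"}}'

assert "max_tokens" not in err_str   # generic unsupported-parameter retry never fires
assert "1210" in err_str             # the new Z.AI-specific check does
```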

Changes:
- resolve_vision_provider_client(): force Z.AI vision to use OpenAI-compatible
  endpoint (open.bigmodel.cn/api/paas/v4) instead of inheriting Anthropic wire
- _build_call_kwargs(): skip max_tokens for Z.AI vision models (names containing 4v/5v/-v)
- _AnthropicCompletionsAdapter: support _skip_zai_max_tokens flag
- _to_openai_base_url(): rewrite Z.AI Anthropic URLs to OpenAI-compatible path
- call_llm() / async_call_llm() retry: detect Z.AI error 1210 and strip max_tokens before retrying

leo.gong, 2026-05-03 17:12:13 -03:00 (committed by Teknium)
parent fa582749e1
commit 6ea4a6a740

@@ -455,6 +455,12 @@ def _to_openai_base_url(base_url: str) -> str:
    """
    url = str(base_url or "").strip().rstrip("/")
    if url.endswith("/anthropic"):
        # ZAI (open.bigmodel.cn) uses /api/anthropic for Anthropic wire
        # but /api/paas/v4 for OpenAI wire — the generic /v1 rewrite is wrong.
        if "open.bigmodel.cn" in url or "bigmodel" in url:
            rewritten = url[: -len("/anthropic")] + "/paas/v4"
            logger.debug("Auxiliary client: rewrote ZAI base URL %s -> %s", url, rewritten)
            return rewritten
        rewritten = url[: -len("/anthropic")] + "/v1"
        logger.debug("Auxiliary client: rewrote base URL %s -> %s", url, rewritten)
        return rewritten
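
A quick sanity check of the rewrite above (a sketch; the second URL is a made-up
non-ZAI example showing the generic /v1 branch is unchanged):

```python
assert _to_openai_base_url("https://open.bigmodel.cn/api/anthropic") == (
    "https://open.bigmodel.cn/api/paas/v4"
)
# Non-ZAI Anthropic-style URLs still fall through to the generic rewrite.
assert _to_openai_base_url("https://gateway.example.com/anthropic") == (
    "https://gateway.example.com/v1"
)
```
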
@@ -828,7 +834,14 @@ class _AnthropicCompletionsAdapter:
        model = kwargs.get("model", self._model)
        tools = kwargs.get("tools")
        tool_choice = kwargs.get("tool_choice")
        max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000
        # ZAI's Anthropic-compatible endpoint rejects max_tokens on vision
        # models (glm-4v-flash etc.) with error code 1210. When the caller
        # signals this by setting _skip_zai_max_tokens in kwargs, omit it.
        _skip_mt = kwargs.pop("_skip_zai_max_tokens", False)
        if _skip_mt:
            max_tokens = None
        else:
            max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000
        temperature = kwargs.get("temperature")
        normalized_tool_choice = None
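
A hypothetical caller-side sketch of the new flag; the adapter method name
("create") and the surrounding wiring are assumptions, only the
_skip_zai_max_tokens kwarg itself comes from the diff:

```python
# Mark a ZAI vision request so the adapter omits max_tokens entirely rather
# than defaulting to 2000.
response = adapter.create(
    model="glm-4v-flash",
    messages=messages,            # multimodal content including an image part
    _skip_zai_max_tokens=True,    # popped by the adapter; max_tokens becomes None
)
```
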
@@ -2835,6 +2848,33 @@ def resolve_vision_provider_client(
        )
        return _finalize(requested, sync_client, default_model)
    # ZAI vision models must use the OpenAI-compatible endpoint, not the
    # Anthropic-compatible one (which may be the main-runtime default).
    # The Anthropic wire rejects max_tokens on multimodal calls (error 1210),
    # while the OpenAI wire handles it correctly.
    if requested == "zai" and not resolved_base_url:
        zai_openai_urls = [
            "https://open.bigmodel.cn/api/paas/v4",
            "https://api.z.ai/api/paas/v4",
        ]
        for _zai_url in zai_openai_urls:
            client, final_model = _get_cached_client(
                requested, resolved_model, async_mode,
                base_url=_zai_url,
                api_key=resolved_api_key or None,
                api_mode="chat_completions",
                is_vision=True,
            )
            if client is not None:
                return _finalize(requested, client, final_model)
        # Fallback: try without explicit base_url (old behavior)
        client, final_model = _get_cached_client(requested, resolved_model, async_mode,
                                                 api_mode=resolved_api_mode,
                                                 is_vision=True)
        if client is None:
            return requested, None, None
        return requested, client, final_model
    client, final_model = _get_cached_client(requested, resolved_model, async_mode,
                                             api_mode=resolved_api_mode,
                                             is_vision=True)
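
A hedged usage sketch of the new routing; the function's keyword names are not
shown in this hunk, so the ones below are assumptions:

```python
provider, client, model = resolve_vision_provider_client(
    provider="zai",          # hypothetical keyword names
    model="glm-4v-flash",
    async_mode=False,
)
# With no explicit base_url configured, the client now targets
# open.bigmodel.cn/api/paas/v4 (or api.z.ai/api/paas/v4) instead of the
# inherited .../api/anthropic endpoint.
```
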
@@ -3394,7 +3434,16 @@ def _build_call_kwargs(
    if max_tokens is not None:
        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
        if provider == "custom":
        # ZAI vision models (glm-4v-flash, glm-4v-plus, etc.) reject max_tokens with
        # error code 1210 ("API 调用参数有误") on multimodal requests — skip it.
        _model_lower = (model or "").lower()
        _skip_max_tokens = (
            provider == "zai"
            and ("4v" in _model_lower or "5v" in _model_lower or "-v" in _model_lower)
        )
        if _skip_max_tokens:
            pass  # ZAI vision models do not accept max_tokens
        elif provider == "custom":
            custom_base = base_url or _current_custom_base_url()
            if base_url_hostname(custom_base) == "api.openai.com":
                kwargs["max_completion_tokens"] = max_tokens
@@ -3625,13 +3674,23 @@ def call_llm(
        kwargs = retry_kwargs
        err_str = str(first_err)
        # ZAI vision models (glm-4v-flash etc.) return error code 1210
        # ("API 调用参数有误") when max_tokens is passed on multimodal
        # calls. The error message does NOT contain "max_tokens" so the
        # generic retry below never fires. Detect the ZAI-specific error
        # and strip max_tokens before retrying.
        _is_zai_param_error = (
            "1210" in err_str
            and "bigmodel" in str(getattr(client, "base_url", ""))
        )
        if max_tokens is not None and (
            "max_tokens" in err_str
            or "unsupported_parameter" in err_str
            or _is_unsupported_parameter_error(first_err, "max_tokens")
            or _is_zai_param_error
        ):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            kwargs.pop("max_completion_tokens", None)
            try:
                return _validate_llm_response(
                    client.chat.completions.create(**kwargs), task)
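
Roughly what the retried kwargs look like once the 1210 branch trips (values
are illustrative; the net effect is that both token caps are dropped):

```python
kwargs = {"model": "glm-4v-flash", "messages": [...], "max_tokens": 2000}
kwargs.pop("max_tokens", None)
kwargs.pop("max_completion_tokens", None)
assert "max_tokens" not in kwargs and "max_completion_tokens" not in kwargs
# second attempt: client.chat.completions.create(**kwargs)
```
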
@@ -3931,13 +3990,23 @@ async def async_call_llm(
        kwargs = retry_kwargs
        err_str = str(first_err)
        # ZAI vision models (glm-4v-flash etc.) return error code 1210
        # ("API 调用参数有误") when max_tokens is passed on multimodal
        # calls. The error message does NOT contain "max_tokens" so the
        # generic retry below never fires. Detect the ZAI-specific error
        # and strip max_tokens before retrying.
        _is_zai_param_error = (
            "1210" in err_str
            and "bigmodel" in str(getattr(client, "base_url", ""))
        )
        if max_tokens is not None and (
            "max_tokens" in err_str
            or "unsupported_parameter" in err_str
            or _is_unsupported_parameter_error(first_err, "max_tokens")
            or _is_zai_param_error
        ):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            kwargs.pop("max_completion_tokens", None)
            try:
                return _validate_llm_response(
                    await client.chat.completions.create(**kwargs), task)