fix(xai): OAuth Responses native web_search, incomplete guard, grok-composer context

- model_metadata: grok-composer-2.5-fast → 262144 (OAuth slug not in /v1/models)
- codex transport: inject native {"type":"web_search"} for is_xai_responses; drop client web_search to avoid duplicate-name 400s
- codex adapter: do not treat in-progress server-side *_call items as incomplete
- tests: adapter, transport build_kwargs, model_metadata, oauth recovery
2026-07-31 19:16:29 +00:00 · 2026-06-11 11:06:01 -04:00 · 2026-06-11 11:06:01 -04:00 · 6f89e17a33
commit 6f89e17a33
parent 4b7a186003
7 changed files with 310 additions and 2 deletions
--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@ -262,6 +262,26 @@ def _responses_tools(tools: Optional[List[Dict[str, Any]]] = None) -> Optional[L
    return converted or None


+# Provider-executed built-in tool *declaration* types accepted on the
+# Responses ``tools`` array.  These are declared by ``type`` alone (no
+# client-side name/parameters schema) and run server-side — the provider
+# owns the implementation and reports progress via the matching ``*_call``
+# output items.  Hermes injects xAI's native ``web_search`` for the xAI
+# transport (see agent/transports/codex.py); the rest are listed so the
+# preflight validator passes them through rather than rejecting them as
+# "unsupported type".  Loosely parallels the ``*_call`` item-type set in
+# _normalize_codex_response, which additionally exempts ``mcp_call``.
+_RESPONSES_BUILTIN_TOOL_TYPES = {
+    "web_search",
+    "web_search_preview",
+    "file_search",
+    "code_interpreter",
+    "image_generation",
+    "computer_use_preview",
+    "local_shell",
+}
+
+
 # ---------------------------------------------------------------------------
 # Message format conversion
 # ---------------------------------------------------------------------------
@ -802,7 +822,22 @@ def _preflight_codex_api_kwargs(
        for idx, tool in enumerate(tools):
            if not isinstance(tool, dict):
                raise ValueError(f"Codex Responses tools[{idx}] must be an object.")
-            if tool.get("type") != "function":
+
+            tool_type = tool.get("type")
+
+            # Provider-executed built-in tools (xAI native web_search, code
+            # interpreter, etc.) are declared by ``type`` alone and carry no
+            # ``name``/``parameters`` schema — the provider owns the
+            # implementation.  Pass them through verbatim instead of forcing
+            # them through the function-tool validation below (which would
+            # otherwise reject them with "unsupported type").  See
+            # agent/transports/codex.py for where xAI's native web_search is
+            # injected.
+            if tool_type in _RESPONSES_BUILTIN_TOOL_TYPES:
+                normalized_tools.append(dict(tool))
+                continue
+
+            if tool_type != "function":
                raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.")

            name = tool.get("name")
@ -1086,6 +1121,33 @@ def _normalize_codex_response(
    saw_final_answer_phase = False
    saw_reasoning_item = False

+    # Server-side built-in tool calls (xAI's native web_search, code
+    # interpreter, etc.) are executed by the provider and reported as
+    # discrete ``*_call`` output items.  xAI's /v1/responses surface
+    # (e.g. grok-composer-2.5-fast on SuperGrok OAuth) routinely leaves
+    # these items at ``status="in_progress"`` even when the overall
+    # ``response.status == "completed"`` — the search ran to completion
+    # server-side, the per-item status simply isn't reconciled.  These
+    # are NOT a signal that the model's turn is unfinished, so they must
+    # not flip ``has_incomplete_items``.  Only the response-level status
+    # and genuine model output items (message/reasoning/function_call)
+    # govern the incomplete verdict.  Without this guard, any turn where
+    # grok-composer invokes server-side search is misclassified as
+    # ``finish_reason="incomplete"`` and burns 3 fruitless continuation
+    # retries before failing with "Codex response remained incomplete
+    # after 3 continuation attempts".  Client-side function/custom tool
+    # calls keep their own in_progress handling below (they are skipped,
+    # not awaited).
+    _SERVER_SIDE_TOOL_CALL_TYPES = {
+        "web_search_call",
+        "file_search_call",
+        "code_interpreter_call",
+        "image_generation_call",
+        "computer_call",
+        "local_shell_call",
+        "mcp_call",
+    }
+
    for item in output:
        item_type = getattr(item, "type", None)
        item_status = getattr(item, "status", None)
@ -1094,7 +1156,10 @@ def _normalize_codex_response(
        else:
            item_status = None

-        if item_status in {"queued", "in_progress", "incomplete"}:
+        if (
+            item_status in {"queued", "in_progress", "incomplete"}
+            and item_type not in _SERVER_SIDE_TOOL_CALL_TYPES
+        ):
            has_incomplete_items = True
            saw_streaming_or_item_incomplete = True

--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -277,6 +277,8 @@ DEFAULT_CONTEXT_LENGTHS = {
    # matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
-    "grok-composer": 200000,    # grok-composer-2.5-fast (Grok Build CLI)
    "grok-build": 256000,       # grok-build-0.1
+    # OAuth-only slug; absent from GET /v1/models. Live /v1/responses probe
+    # (2026-03) enforces ~262144 tokens total (input+output), not 200000/131k.
+    "grok-composer": 262144,    # grok-composer-2.5-fast (replaces 200000 entry)
    "grok-code-fast": 256000,   # grok-code-fast-1
    "grok-2-vision": 8192,      # grok-2-vision, -1212, -latest
    "grok-4-fast": 2000000,     # grok-4-fast-(non-)reasoning, also matches -reasoning
--- a/agent/transports/codex.py
+++ b/agent/transports/codex.py
@ -128,6 +128,49 @@ class ResponsesApiTransport(ProviderTransport):
        reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort)

        response_tools = _responses_tools(tools)
+
+        # xAI server-side web search.
+        #
+        # grok models on xAI's /v1/responses surface (notably
+        # grok-composer-2.5-fast on SuperGrok OAuth) have a *native*,
+        # server-executed web search.  When the model is handed a
+        # client-side function literally named ``web_search``, it routes
+        # the intent to that native engine — but because the tool is
+        # declared as a plain ``function`` rather than xAI's first-class
+        # ``{"type": "web_search"}`` built-in, the server-side search is
+        # dispatched but never reconciled: the response streams reasoning
+        # + ``web_search_call`` progress items, the searches never reach
+        # ``status="completed"`` in the assembled output, no final
+        # message is emitted, and ``_normalize_codex_response`` correctly
+        # sees reasoning-with-no-answer and reports ``incomplete``.  The
+        # turn then burns 3 continuation retries and fails with "Codex
+        # response remained incomplete after 3 continuation attempts".
+        # Verified live against grok-composer-2.5-fast (2026-06).
+        #
+        # Fix: declare xAI's native ``web_search`` built-in so the search
+        # actually runs to completion server-side and the model streams a
+        # real answer.  The Responses API rejects two tools sharing the
+        # name ``web_search`` (HTTP 400 "Duplicate tool names"), so we
+        # drop the client-side ``web_search`` function for the xAI path
+        # and let the native tool satisfy it.  All other client-side
+        # tools (read_file, terminal, web_extract, MCP tools, …) are
+        # untouched and continue to dispatch through Hermes's agent loop.
+        #
+        # NOTE: this routes ``web_search`` to Grok's native search engine
+        # for xAI sessions instead of Hermes's configured web provider
+        # (Tavily/etc.), and those results bypass Hermes's tool-trace /
+        # citation plumbing (they arrive baked into the model's answer
+        # rather than as a tool result the loop observes).  Scoped to
+        # ``is_xai_responses`` deliberately; narrow to specific models if
+        # a future grok variant should keep the client-side function.
+        if is_xai_responses:
+            filtered = [
+                t for t in (response_tools or [])
+                if not (isinstance(t, dict) and t.get("name") == "web_search")
+            ]
+            filtered.append({"type": "web_search"})
+            response_tools = filtered
+
        # ``tools`` MUST be omitted entirely when there are no functions to
        # expose: the openai SDK's ``responses.stream()`` / ``responses.parse()``
        # eagerly call ``_make_tools(tools)`` which does ``for tool in tools``
--- a/tests/agent/test_codex_responses_adapter.py
+++ b/tests/agent/test_codex_responses_adapter.py
@ -5,6 +5,7 @@ import pytest
 from agent.codex_responses_adapter import (
    _format_responses_error,
    _normalize_codex_response,
+    _preflight_codex_api_kwargs,
 )


@ -68,6 +69,115 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
    assert assistant_message.codex_reasoning_items is None


+# ---------------------------------------------------------------------------
+# Server-side built-in tool calls (xAI native web_search, code interpreter,
+# etc.) come back as discrete ``*_call`` output items that xAI's
+# /v1/responses surface routinely leaves at ``status="in_progress"`` even
+# when the overall ``response.status == "completed"``.  These must NOT mark
+# the turn incomplete — otherwise grok-composer-2.5-fast research queries
+# (which invoke server-side web_search) get misclassified as
+# ``finish_reason="incomplete"`` and burn 3 fruitless continuation retries
+# before failing with "Codex response remained incomplete after 3
+# continuation attempts".  Observed live against grok-composer-2.5-fast on
+# SuperGrok OAuth (2026-06).
+# ---------------------------------------------------------------------------
+
+
+def test_normalize_codex_response_ignores_in_progress_server_side_tool_calls():
+    """A completed response with a final message + lingering in_progress
+    server-side web_search_call items resolves to 'stop', not 'incomplete'."""
+    response = SimpleNamespace(
+        status="completed",
+        incomplete_details=None,
+        output=[
+            SimpleNamespace(
+                type="reasoning",
+                id="rs_1",
+                encrypted_content="opaque",
+                summary=[SimpleNamespace(text="researching blades")],
+            ),
+            SimpleNamespace(
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[SimpleNamespace(
+                    type="output_text",
+                    text="Milwaukee M18 blade 49-16-2734, ~$30 OEM.",
+                )],
+            ),
+            SimpleNamespace(type="web_search_call", status="in_progress"),
+            SimpleNamespace(type="web_search_call", status="in_progress"),
+            SimpleNamespace(type="web_search_call", status="in_progress"),
+        ],
+    )
+
+    assistant_message, finish_reason = _normalize_codex_response(response)
+
+    assert finish_reason == "stop"
+    assert assistant_message.content == "Milwaukee M18 blade 49-16-2734, ~$30 OEM."
+
+
+def test_normalize_codex_response_in_progress_message_still_incomplete():
+    """Guard scope: an in_progress *message* item (genuine model output that
+    is still streaming) must still mark the turn incomplete — only
+    server-side ``*_call`` items are exempted."""
+    response = SimpleNamespace(
+        status="completed",
+        incomplete_details=None,
+        output=[
+            SimpleNamespace(
+                type="message",
+                role="assistant",
+                status="in_progress",
+                content=[SimpleNamespace(type="output_text", text="partial...")],
+            ),
+        ],
+    )
+
+    _assistant_message, finish_reason = _normalize_codex_response(response)
+
+    assert finish_reason == "incomplete"
+
+
+# ---------------------------------------------------------------------------
+# _preflight_codex_api_kwargs — built-in (provider-executed) tools must pass
+# through validation.  Regression guard for the xAI native web_search
+# injection: the preflight validator previously rejected any tool whose
+# ``type != "function"`` with "unsupported type", which would 400 every xAI
+# turn once the native web_search tool is declared.
+# ---------------------------------------------------------------------------
+
+
+def test_preflight_passes_native_web_search_tool_through():
+    kwargs = {
+        "model": "grok-composer-2.5-fast",
+        "instructions": "You are helpful.",
+        "input": [{"role": "user", "content": [{"type": "input_text", "text": "hi"}]}],
+        "store": False,
+        "tools": [
+            {"type": "function", "name": "read_file", "description": "Read.",
+             "parameters": {"type": "object", "properties": {}}},
+            {"type": "web_search"},
+        ],
+    }
+    out = _preflight_codex_api_kwargs(kwargs, allow_stream=True)
+    tools = out["tools"]
+    assert {"type": "web_search"} in tools
+    assert any(t.get("type") == "function" and t.get("name") == "read_file" for t in tools)
+
+
+def test_preflight_still_rejects_unknown_tool_type():
+    kwargs = {
+        "model": "grok-composer-2.5-fast",
+        "instructions": "You are helpful.",
+        "input": [{"role": "user", "content": [{"type": "input_text", "text": "hi"}]}],
+        "store": False,
+        "tools": [{"type": "totally_made_up_tool"}],
+    }
+    with pytest.raises(ValueError, match="unsupported type"):
+        _preflight_codex_api_kwargs(kwargs, allow_stream=True)
+
+
 # ---------------------------------------------------------------------------
 # _format_responses_error — adapted from anomalyco/opencode#28757.
 # Provider failures should surface BOTH the code (rate_limit_exceeded /
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -142,6 +142,7 @@ class TestDefaultContextLengths:
                ("grok-4", 256000),
                ("grok-4-0709", 256000),
                ("grok-build-0.1", 256000),
+                ("grok-composer-2.5-fast", 262144),
                ("grok-code-fast-1", 256000),
                ("grok-3", 131072),
                ("grok-3-mini", 131072),
--- a/tests/agent/transports/test_codex_transport.py
+++ b/tests/agent/transports/test_codex_transport.py
@ -263,6 +263,72 @@ class TestCodexBuildKwargs:
        # full history.
        assert "reasoning.encrypted_content" in kw.get("include", [])

+    def test_xai_injects_native_web_search_tool(self, transport):
+        """xAI path declares xAI's native server-side web_search built-in so
+        grok server-side search runs to completion (otherwise the turn stalls
+        as reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail).
+        """
+        messages = [{"role": "user", "content": "Find current prices."}]
+        kw = transport.build_kwargs(
+            model="grok-composer-2.5-fast", messages=messages,
+            tools=[{"type": "function", "function": {
+                "name": "read_file", "description": "Read a file.",
+                "parameters": {"type": "object",
+                               "properties": {"path": {"type": "string"}}}}}],
+            is_xai_responses=True,
+        )
+        tool_types = [t.get("type") for t in kw.get("tools", [])]
+        assert "web_search" in tool_types, kw.get("tools")
+        # Non-conflicting client-side tools are preserved.
+        names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"]
+        assert "read_file" in names
+
+    def test_xai_drops_clientside_web_search_to_avoid_duplicate(self, transport):
+        """When the client registers its own 'web_search' function, the xAI
+        path must drop it and rely on the native built-in — otherwise xAI
+        returns HTTP 400 'Duplicate tool names: web_search'."""
+        messages = [{"role": "user", "content": "Search the web."}]
+        kw = transport.build_kwargs(
+            model="grok-composer-2.5-fast", messages=messages,
+            tools=[{"type": "function", "function": {
+                "name": "web_search", "description": "Search the web.",
+                "parameters": {"type": "object",
+                               "properties": {"query": {"type": "string"}}}}}],
+            is_xai_responses=True,
+        )
+        tools = kw.get("tools", [])
+        # Exactly one tool named/typed web_search, and it is the native built-in.
+        web_search_entries = [
+            t for t in tools
+            if t.get("name") == "web_search" or t.get("type") == "web_search"
+        ]
+        assert len(web_search_entries) == 1
+        assert web_search_entries[0] == {"type": "web_search"}
+        # No client-side function form of web_search survives.
+        assert not any(
+            t.get("type") == "function" and t.get("name") == "web_search"
+            for t in tools
+        )
+
+    def test_non_xai_path_does_not_inject_native_web_search(self, transport):
+        """Native web_search injection is scoped to xAI — Codex/GitHub paths
+        keep the client-side web_search function untouched."""
+        messages = [{"role": "user", "content": "Search."}]
+        kw = transport.build_kwargs(
+            model="gpt-5.4", messages=messages,
+            tools=[{"type": "function", "function": {
+                "name": "web_search", "description": "Search the web.",
+                "parameters": {"type": "object",
+                               "properties": {"query": {"type": "string"}}}}}],
+            is_xai_responses=False,
+        )
+        tools = kw.get("tools", [])
+        assert not any(t.get("type") == "web_search" for t in tools)
+        assert any(
+            t.get("type") == "function" and t.get("name") == "web_search"
+            for t in tools
+        )
+
    def test_xai_reasoning_disabled_no_reasoning_key(self, transport):
        messages = [{"role": "user", "content": "Hi"}]
        kw = transport.build_kwargs(
--- a/tests/run_agent/test_codex_xai_oauth_recovery.py
+++ b/tests/run_agent/test_codex_xai_oauth_recovery.py
@ -949,6 +949,26 @@ def test_grok_4_still_resolves_to_256k():
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000


+def test_grok_composer_context_length_is_262k():
+    """grok-composer-2.5-fast is OAuth-only and missing from /v1/models.
+
+    The earlier ``grok-composer`` entry (200000) — like the generic ``grok``
+    131k catch-all — under-reported the ~262k enforced on /v1/responses.
+    """
+    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS
+
+    assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 262_144
+    slug = "grok-composer-2.5-fast"
+    matched_key = max(
+        (k for k in DEFAULT_CONTEXT_LENGTHS if k in slug.lower()),
+        key=len,
+    )
+    assert matched_key == "grok-composer", (
+        f"Expected longest-first match on grok-composer for {slug}, got {matched_key}"
+    )
+    assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 262_144
+
+
 # ---------------------------------------------------------------------------
 # Cross-issuer reasoning replay guard
 #