fix(xai): scope native web_search to swap-only + reconcile composer ctx to 200k

Salvage corrections on top of @XVVH's #44341:

- Make native web_search injection a 1:1 swap for an already-present client web_search function, NOT an additive grant. The original unconditionally appended {"type":"web_search"} on every is_xai_responses turn with any tools, force-enabling Grok server-side search even when the user never enabled the web toolset (bypassing Hermes web-provider config + tool-trace plumbing). The injection is now gated on a client web_search actually being present.
- Reconcile grok-composer context to 200000 (merged in #47908) rather than 262144: 200k is xAI's published usable context window for Composer 2.5, whereas 262144 is the /v1/responses input+output budget.
- Update tests to match the scoped behavior and add a no-web-toolset guard test.
- Add an AUTHOR_MAP entry for the #44341 salvage.

The incomplete-guard (server-side *_call items at in_progress no longer flip has_incomplete_items) and the preflight built-in-tool allowlist are kept as-is.
2026-07-31 19:16:29 +00:00 · 2026-06-17 16:44:12 -07:00 · 2026-06-17 16:44:12 -07:00 · c5eb64b9f7
commit c5eb64b9f7
parent 6f89e17a33
6 changed files with 89 additions and 38 deletions
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -275,11 +275,12 @@ DEFAULT_CONTEXT_LENGTHS = {
    # via a custom provider. Values sourced from models.dev (2026-04).
    # Keys use substring matching (longest-first), so e.g. "grok-4.20"
    # matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
+    # OAuth-only slug; absent from GET /v1/models. xAI publishes a 200k
+    # usable context window for Composer 2.5 on Grok Build (SuperGrok /
+    # Premium+); /v1/responses additionally enforces a ~262144 input+output
+    # budget, but the usable context (what we track here) is 200k.
    "grok-composer": 200000,    # grok-composer-2.5-fast (Grok Build CLI)
    "grok-build": 256000,       # grok-build-0.1
-    # OAuth-only slug; absent from GET /v1/models. Live /v1/responses probe
-    # (2026-03) enforces ~262144 tokens total (input+output), not 131k.
-    "grok-composer": 262144,    # grok-composer-2.5-fast
    "grok-code-fast": 256000,   # grok-code-fast-1
    "grok-2-vision": 8192,      # grok-2-vision, -1212, -latest
    "grok-4-fast": 2000000,     # grok-4-fast-(non-)reasoning, also matches -reasoning
--- a/agent/transports/codex.py
+++ b/agent/transports/codex.py
@ -147,29 +147,45 @@ class ResponsesApiTransport(ProviderTransport):
        # response remained incomplete after 3 continuation attempts".
        # Verified live against grok-composer-2.5-fast (2026-06).
        #
-        # Fix: declare xAI's native ``web_search`` built-in so the search
-        # actually runs to completion server-side and the model streams a
-        # real answer.  The Responses API rejects two tools sharing the
-        # name ``web_search`` (HTTP 400 "Duplicate tool names"), so we
-        # drop the client-side ``web_search`` function for the xAI path
-        # and let the native tool satisfy it.  All other client-side
-        # tools (read_file, terminal, web_extract, MCP tools, …) are
-        # untouched and continue to dispatch through Hermes's agent loop.
+        # Fix: when the agent HAS a client-side ``web_search`` function (i.e.
+        # the user enabled the web toolset), declare xAI's native
+        # ``web_search`` built-in instead so the search actually runs to
+        # completion server-side and the model streams a real answer.  The
+        # Responses API rejects two tools sharing the name ``web_search``
+        # (HTTP 400 "Duplicate tool names"), so we drop the client-side
+        # ``web_search`` function for the xAI path and let the native tool
+        # satisfy it.  All other client-side tools (read_file, terminal,
+        # web_extract, MCP tools, …) are untouched and continue to dispatch
+        # through Hermes's agent loop.
        #
-        # NOTE: this routes ``web_search`` to Grok's native search engine
-        # for xAI sessions instead of Hermes's configured web provider
-        # (Tavily/etc.), and those results bypass Hermes's tool-trace /
-        # citation plumbing (they arrive baked into the model's answer
-        # rather than as a tool result the loop observes).  Scoped to
-        # ``is_xai_responses`` deliberately; narrow to specific models if
-        # a future grok variant should keep the client-side function.
-        if is_xai_responses:
-            filtered = [
-                t for t in (response_tools or [])
-                if not (isinstance(t, dict) and t.get("name") == "web_search")
-            ]
-            filtered.append({"type": "web_search"})
-            response_tools = filtered
+        # Scope: we ONLY swap in the native built-in when the client
+        # ``web_search`` was actually present.  We do NOT force-enable Grok
+        # server-side search on turns where the user never had web enabled —
+        # that would silently route around Hermes's web-provider config and
+        # tool-trace/citation plumbing for every xai-oauth turn.  The swap is
+        # a 1:1 replacement of an already-requested capability, not an
+        # additive grant.
+        #
+        # NOTE: for the swapped case this routes ``web_search`` to Grok's
+        # native search engine for xAI sessions instead of Hermes's
+        # configured web provider (Tavily/etc.), and those results bypass
+        # Hermes's tool-trace / citation plumbing (they arrive baked into the
+        # model's answer rather than as a tool result the loop observes).
+        # Scoped to ``is_xai_responses`` deliberately; narrow to specific
+        # models if a future grok variant should keep the client-side
+        # function.
+        if is_xai_responses and response_tools:
+            has_client_web_search = any(
+                isinstance(t, dict) and t.get("name") == "web_search"
+                for t in response_tools
+            )
+            if has_client_web_search:
+                filtered = [
+                    t for t in response_tools
+                    if not (isinstance(t, dict) and t.get("name") == "web_search")
+                ]
+                filtered.append({"type": "web_search"})
+                response_tools = filtered

        # ``tools`` MUST be omitted entirely when there are no functions to
        # expose: the openai SDK's ``responses.stream()`` / ``responses.parse()``
--- a/scripts/release.py
+++ b/scripts/release.py
@ -49,6 +49,7 @@ AUTHOR_MAP = {
    "zheng@omegasys.eu": "omegazheng",
    "220877172+james47kjv@users.noreply.github.com": "james47kjv",
    "yuhanglin@YuhangdeMac-mini.local": "1960697431",
+    "admin@fent.quest": "XVVH",
    "despitemeguru@gmail.com": "definitelynotguru",
    "chaslui@outlook.com": "ChasLui",
    "rio.jeong@thebytesize.ai": "rio-jeong",
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -142,7 +142,7 @@ class TestDefaultContextLengths:
                ("grok-4", 256000),
                ("grok-4-0709", 256000),
                ("grok-build-0.1", 256000),
-                ("grok-composer-2.5-fast", 262144),
+                ("grok-composer-2.5-fast", 200000),
                ("grok-code-fast-1", 256000),
                ("grok-3", 131072),
                ("grok-3-mini", 131072),
--- a/tests/agent/transports/test_codex_transport.py
+++ b/tests/agent/transports/test_codex_transport.py
@ -263,12 +263,43 @@ class TestCodexBuildKwargs:
        # full history.
        assert "reasoning.encrypted_content" in kw.get("include", [])

-    def test_xai_injects_native_web_search_tool(self, transport):
-        """xAI path declares xAI's native server-side web_search built-in so
-        grok server-side search runs to completion (otherwise the turn stalls
-        as reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail).
+    def test_xai_injects_native_web_search_when_client_web_search_present(self, transport):
+        """xAI path swaps a client-side ``web_search`` function for xAI's
+        native server-side ``web_search`` built-in so grok server-side search
+        runs to completion (otherwise the turn stalls as
+        reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail).
+        Non-conflicting client tools are preserved.
        """
        messages = [{"role": "user", "content": "Find current prices."}]
+        kw = transport.build_kwargs(
+            model="grok-composer-2.5-fast", messages=messages,
+            tools=[
+                {"type": "function", "function": {
+                    "name": "read_file", "description": "Read a file.",
+                    "parameters": {"type": "object",
+                                   "properties": {"path": {"type": "string"}}}}},
+                {"type": "function", "function": {
+                    "name": "web_search", "description": "Search the web.",
+                    "parameters": {"type": "object",
+                                   "properties": {"query": {"type": "string"}}}}},
+            ],
+            is_xai_responses=True,
+        )
+        tool_types = [t.get("type") for t in kw.get("tools", [])]
+        assert "web_search" in tool_types, kw.get("tools")
+        # Non-conflicting client-side tools are preserved.
+        names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"]
+        assert "read_file" in names
+
+    def test_xai_does_not_inject_native_web_search_without_client_web_search(self, transport):
+        """The native ``web_search`` built-in is a 1:1 swap for an
+        already-requested client ``web_search`` — NOT an additive grant.  A
+        turn whose toolset has no ``web_search`` (user never enabled the web
+        toolset) must not get Grok server-side search force-injected, which
+        would silently bypass Hermes's web-provider config and tool-trace
+        plumbing for every xai-oauth turn.
+        """
+        messages = [{"role": "user", "content": "Read this file."}]
        kw = transport.build_kwargs(
            model="grok-composer-2.5-fast", messages=messages,
            tools=[{"type": "function", "function": {
@ -277,10 +308,9 @@ class TestCodexBuildKwargs:
                               "properties": {"path": {"type": "string"}}}}}],
            is_xai_responses=True,
        )
-        tool_types = [t.get("type") for t in kw.get("tools", [])]
-        assert "web_search" in tool_types, kw.get("tools")
-        # Non-conflicting client-side tools are preserved.
-        names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"]
+        tools = kw.get("tools", [])
+        assert not any(t.get("type") == "web_search" for t in tools), tools
+        names = [t.get("name") for t in tools if t.get("type") == "function"]
        assert "read_file" in names

    def test_xai_drops_clientside_web_search_to_avoid_duplicate(self, transport):
--- a/tests/run_agent/test_codex_xai_oauth_recovery.py
+++ b/tests/run_agent/test_codex_xai_oauth_recovery.py
@ -949,15 +949,18 @@ def test_grok_4_still_resolves_to_256k():
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000


-def test_grok_composer_context_length_is_262k():
+def test_grok_composer_context_length_is_200k():
    """grok-composer-2.5-fast is OAuth-only and missing from /v1/models.

    Without a specific entry it fell through to the generic ``grok`` 131k
-    catch-all, under-reporting ~262k enforced on /v1/responses.
+    catch-all.  xAI publishes a 200k usable context window for Composer 2.5
+    on Grok Build (SuperGrok / Premium+); /v1/responses additionally caps
+    the input+output budget at ~262144, but the usable context (what we
+    track) is 200k.
    """
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

-    assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 262_144
+    assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 200_000
    slug = "grok-composer-2.5-fast"
    matched_key = max(
        (k for k in DEFAULT_CONTEXT_LENGTHS if k in slug.lower()),
@ -966,7 +969,7 @@ def test_grok_composer_context_length_is_262k():
    assert matched_key == "grok-composer", (
        f"Expected longest-first match on grok-composer for {slug}, got {matched_key}"
    )
-    assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 262_144
+    assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 200_000


 # ---------------------------------------------------------------------------