From 6f89e17a33edbfffb93c2c0ab6b8dafc21e296b2 Mon Sep 17 00:00:00 2001 From: XVVH Date: Thu, 11 Jun 2026 11:06:01 -0400 Subject: [PATCH] fix(xai): OAuth Responses native web_search, incomplete guard, grok-composer context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - model_metadata: grok-composer-2.5-fast → 262144 (OAuth slug not in /v1/models) - codex transport: inject native {"type":"web_search"} for is_xai_responses; drop client web_search to avoid duplicate-name 400s - codex adapter: do not treat in-progress server-side *_call items as incomplete - tests: adapter, transport build_kwargs, model_metadata, oauth recovery --- agent/codex_responses_adapter.py | 69 ++++++++++- agent/model_metadata.py | 3 + agent/transports/codex.py | 43 +++++++ tests/agent/test_codex_responses_adapter.py | 110 ++++++++++++++++++ tests/agent/test_model_metadata.py | 1 + .../agent/transports/test_codex_transport.py | 66 +++++++++++ .../test_codex_xai_oauth_recovery.py | 20 ++++ 7 files changed, 310 insertions(+), 2 deletions(-) diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py index b8479141db1..e9b6ace9b85 100644 --- a/agent/codex_responses_adapter.py +++ b/agent/codex_responses_adapter.py @@ -262,6 +262,26 @@ def _responses_tools(tools: Optional[List[Dict[str, Any]]] = None) -> Optional[L return converted or None +# Provider-executed built-in tool *declaration* types accepted on the +# Responses ``tools`` array. These are declared by ``type`` alone (no +# client-side name/parameters schema) and run server-side — the provider +# owns the implementation and reports progress via the matching ``*_call`` +# output items. Hermes injects xAI's native ``web_search`` for the xAI +# transport (see agent/transports/codex.py); the rest are listed so the +# preflight validator passes them through rather than rejecting them as +# "unsupported type". 
Mirrors the ``*_call`` item-type set used in +# _normalize_codex_response. +_RESPONSES_BUILTIN_TOOL_TYPES = { + "web_search", + "web_search_preview", + "file_search", + "code_interpreter", + "image_generation", + "computer_use_preview", + "local_shell", +} + + # --------------------------------------------------------------------------- # Message format conversion # --------------------------------------------------------------------------- @@ -802,7 +822,22 @@ def _preflight_codex_api_kwargs( for idx, tool in enumerate(tools): if not isinstance(tool, dict): raise ValueError(f"Codex Responses tools[{idx}] must be an object.") - if tool.get("type") != "function": + + tool_type = tool.get("type") + + # Provider-executed built-in tools (xAI native web_search, code + # interpreter, etc.) are declared by ``type`` alone and carry no + # ``name``/``parameters`` schema — the provider owns the + # implementation. Pass them through verbatim instead of forcing + # them through the function-tool validation below (which would + # otherwise reject them with "unsupported type"). See + # agent/transports/codex.py for where xAI's native web_search is + # injected. + if tool_type in _RESPONSES_BUILTIN_TOOL_TYPES: + normalized_tools.append(dict(tool)) + continue + + if tool_type != "function": raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.") name = tool.get("name") @@ -1086,6 +1121,33 @@ def _normalize_codex_response( saw_final_answer_phase = False saw_reasoning_item = False + # Server-side built-in tool calls (xAI's native web_search, code + # interpreter, etc.) are executed by the provider and reported as + # discrete ``*_call`` output items. xAI's /v1/responses surface + # (e.g. grok-composer-2.5-fast on SuperGrok OAuth) routinely leaves + # these items at ``status="in_progress"`` even when the overall + # ``response.status == "completed"`` — the search ran to completion + # server-side, the per-item status simply isn't reconciled. 
These + # are NOT a signal that the model's turn is unfinished, so they must + # not flip ``has_incomplete_items``. Only the response-level status + # and genuine model output items (message/reasoning/function_call) + # govern the incomplete verdict. Without this guard, any turn where + # grok-composer invokes server-side search is misclassified as + # ``finish_reason="incomplete"`` and burns 3 fruitless continuation + # retries before failing with "Codex response remained incomplete + # after 3 continuation attempts". Client-side function/custom tool + # calls keep their own in_progress handling below (they are skipped, + # not awaited). + _SERVER_SIDE_TOOL_CALL_TYPES = { + "web_search_call", + "file_search_call", + "code_interpreter_call", + "image_generation_call", + "computer_call", + "local_shell_call", + "mcp_call", + } + for item in output: item_type = getattr(item, "type", None) item_status = getattr(item, "status", None) @@ -1094,7 +1156,10 @@ def _normalize_codex_response( else: item_status = None - if item_status in {"queued", "in_progress", "incomplete"}: + if ( + item_status in {"queued", "in_progress", "incomplete"} + and item_type not in _SERVER_SIDE_TOOL_CALL_TYPES + ): has_incomplete_items = True saw_streaming_or_item_incomplete = True diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 4d14826b9ef..aa99302b013 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -277,6 +277,8 @@ DEFAULT_CONTEXT_LENGTHS = { # matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309". - "grok-composer": 200000, # grok-composer-2.5-fast (Grok Build CLI) "grok-build": 256000, # grok-build-0.1 + # OAuth-only slug; absent from GET /v1/models. Live /v1/responses probe + # (2026-03) enforces ~262144 tokens total (input+output), not 131k.
+ "grok-composer": 262144, # grok-composer-2.5-fast "grok-code-fast": 256000, # grok-code-fast-1 "grok-2-vision": 8192, # grok-2-vision, -1212, -latest "grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning, also matches -reasoning diff --git a/agent/transports/codex.py b/agent/transports/codex.py index eaf6160ae1d..d98e8d50463 100644 --- a/agent/transports/codex.py +++ b/agent/transports/codex.py @@ -128,6 +128,49 @@ class ResponsesApiTransport(ProviderTransport): reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort) response_tools = _responses_tools(tools) + + # xAI server-side web search. + # + # grok models on xAI's /v1/responses surface (notably + # grok-composer-2.5-fast on SuperGrok OAuth) have a *native*, + # server-executed web search. When the model is handed a + # client-side function literally named ``web_search``, it routes + # the intent to that native engine — but because the tool is + # declared as a plain ``function`` rather than xAI's first-class + # ``{"type": "web_search"}`` built-in, the server-side search is + # dispatched but never reconciled: the response streams reasoning + # + ``web_search_call`` progress items, the searches never reach + # ``status="completed"`` in the assembled output, no final + # message is emitted, and ``_normalize_codex_response`` correctly + # sees reasoning-with-no-answer and reports ``incomplete``. The + # turn then burns 3 continuation retries and fails with "Codex + # response remained incomplete after 3 continuation attempts". + # Verified live against grok-composer-2.5-fast (2026-06). + # + # Fix: declare xAI's native ``web_search`` built-in so the search + # actually runs to completion server-side and the model streams a + # real answer. The Responses API rejects two tools sharing the + # name ``web_search`` (HTTP 400 "Duplicate tool names"), so we + # drop the client-side ``web_search`` function for the xAI path + # and let the native tool satisfy it. 
All other client-side + # tools (read_file, terminal, web_extract, MCP tools, …) are + # untouched and continue to dispatch through Hermes's agent loop. + # + # NOTE: this routes ``web_search`` to Grok's native search engine + # for xAI sessions instead of Hermes's configured web provider + # (Tavily/etc.), and those results bypass Hermes's tool-trace / + # citation plumbing (they arrive baked into the model's answer + # rather than as a tool result the loop observes). Scoped to + # ``is_xai_responses`` deliberately; narrow to specific models if + # a future grok variant should keep the client-side function. + if is_xai_responses: + filtered = [ + t for t in (response_tools or []) + if not (isinstance(t, dict) and t.get("name") == "web_search") + ] + filtered.append({"type": "web_search"}) + response_tools = filtered + # ``tools`` MUST be omitted entirely when there are no functions to # expose: the openai SDK's ``responses.stream()`` / ``responses.parse()`` # eagerly call ``_make_tools(tools)`` which does ``for tool in tools`` diff --git a/tests/agent/test_codex_responses_adapter.py b/tests/agent/test_codex_responses_adapter.py index db3316a0567..b8586dbeace 100644 --- a/tests/agent/test_codex_responses_adapter.py +++ b/tests/agent/test_codex_responses_adapter.py @@ -5,6 +5,7 @@ import pytest from agent.codex_responses_adapter import ( _format_responses_error, _normalize_codex_response, + _preflight_codex_api_kwargs, ) @@ -68,6 +69,115 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete(): assert assistant_message.codex_reasoning_items is None +# --------------------------------------------------------------------------- +# Server-side built-in tool calls (xAI native web_search, code interpreter, +# etc.) come back as discrete ``*_call`` output items that xAI's +# /v1/responses surface routinely leaves at ``status="in_progress"`` even +# when the overall ``response.status == "completed"``. 
These must NOT mark +# the turn incomplete — otherwise grok-composer-2.5-fast research queries +# (which invoke server-side web_search) get misclassified as +# ``finish_reason="incomplete"`` and burn 3 fruitless continuation retries +# before failing with "Codex response remained incomplete after 3 +# continuation attempts". Observed live against grok-composer-2.5-fast on +# SuperGrok OAuth (2026-06). +# --------------------------------------------------------------------------- + + +def test_normalize_codex_response_ignores_in_progress_server_side_tool_calls(): + """A completed response with a final message + lingering in_progress + server-side web_search_call items resolves to 'stop', not 'incomplete'.""" + response = SimpleNamespace( + status="completed", + incomplete_details=None, + output=[ + SimpleNamespace( + type="reasoning", + id="rs_1", + encrypted_content="opaque", + summary=[SimpleNamespace(text="researching blades")], + ), + SimpleNamespace( + type="message", + role="assistant", + status="completed", + content=[SimpleNamespace( + type="output_text", + text="Milwaukee M18 blade 49-16-2734, ~$30 OEM.", + )], + ), + SimpleNamespace(type="web_search_call", status="in_progress"), + SimpleNamespace(type="web_search_call", status="in_progress"), + SimpleNamespace(type="web_search_call", status="in_progress"), + ], + ) + + assistant_message, finish_reason = _normalize_codex_response(response) + + assert finish_reason == "stop" + assert assistant_message.content == "Milwaukee M18 blade 49-16-2734, ~$30 OEM." 
+ + +def test_normalize_codex_response_in_progress_message_still_incomplete(): + """Guard scope: an in_progress *message* item (genuine model output that + is still streaming) must still mark the turn incomplete — only + server-side ``*_call`` items are exempted.""" + response = SimpleNamespace( + status="completed", + incomplete_details=None, + output=[ + SimpleNamespace( + type="message", + role="assistant", + status="in_progress", + content=[SimpleNamespace(type="output_text", text="partial...")], + ), + ], + ) + + _assistant_message, finish_reason = _normalize_codex_response(response) + + assert finish_reason == "incomplete" + + +# --------------------------------------------------------------------------- +# _preflight_codex_api_kwargs — built-in (provider-executed) tools must pass +# through validation. Regression guard for the xAI native web_search +# injection: the preflight validator previously rejected any tool whose +# ``type != "function"`` with "unsupported type", which would 400 every xAI +# turn once the native web_search tool is declared. 
+# --------------------------------------------------------------------------- + + +def test_preflight_passes_native_web_search_tool_through(): + kwargs = { + "model": "grok-composer-2.5-fast", + "instructions": "You are helpful.", + "input": [{"role": "user", "content": [{"type": "input_text", "text": "hi"}]}], + "store": False, + "tools": [ + {"type": "function", "name": "read_file", "description": "Read.", + "parameters": {"type": "object", "properties": {}}}, + {"type": "web_search"}, + ], + } + out = _preflight_codex_api_kwargs(kwargs, allow_stream=True) + tools = out["tools"] + assert {"type": "web_search"} in tools + assert any(t.get("type") == "function" and t.get("name") == "read_file" for t in tools) + + +def test_preflight_still_rejects_unknown_tool_type(): + kwargs = { + "model": "grok-composer-2.5-fast", + "instructions": "You are helpful.", + "input": [{"role": "user", "content": [{"type": "input_text", "text": "hi"}]}], + "store": False, + "tools": [{"type": "totally_made_up_tool"}], + } + with pytest.raises(ValueError, match="unsupported type"): + _preflight_codex_api_kwargs(kwargs, allow_stream=True) + + # --------------------------------------------------------------------------- # _format_responses_error — adapted from anomalyco/opencode#28757. 
# Provider failures should surface BOTH the code (rate_limit_exceeded / diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py index b6c926f5a08..c1f239c2e47 100644 --- a/tests/agent/test_model_metadata.py +++ b/tests/agent/test_model_metadata.py @@ -142,6 +142,7 @@ class TestDefaultContextLengths: ("grok-4", 256000), ("grok-4-0709", 256000), ("grok-build-0.1", 256000), + ("grok-composer-2.5-fast", 262144), ("grok-code-fast-1", 256000), ("grok-3", 131072), ("grok-3-mini", 131072), diff --git a/tests/agent/transports/test_codex_transport.py b/tests/agent/transports/test_codex_transport.py index 9355a786964..906774ea503 100644 --- a/tests/agent/transports/test_codex_transport.py +++ b/tests/agent/transports/test_codex_transport.py @@ -263,6 +263,72 @@ class TestCodexBuildKwargs: # full history. assert "reasoning.encrypted_content" in kw.get("include", []) + def test_xai_injects_native_web_search_tool(self, transport): + """xAI path declares xAI's native server-side web_search built-in so + grok server-side search runs to completion (otherwise the turn stalls + as reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail). + """ + messages = [{"role": "user", "content": "Find current prices."}] + kw = transport.build_kwargs( + model="grok-composer-2.5-fast", messages=messages, + tools=[{"type": "function", "function": { + "name": "read_file", "description": "Read a file.", + "parameters": {"type": "object", + "properties": {"path": {"type": "string"}}}}}], + is_xai_responses=True, + ) + tool_types = [t.get("type") for t in kw.get("tools", [])] + assert "web_search" in tool_types, kw.get("tools") + # Non-conflicting client-side tools are preserved. 
+ names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"] + assert "read_file" in names + + def test_xai_drops_clientside_web_search_to_avoid_duplicate(self, transport): + """When the client registers its own 'web_search' function, the xAI + path must drop it and rely on the native built-in — otherwise xAI + returns HTTP 400 'Duplicate tool names: web_search'.""" + messages = [{"role": "user", "content": "Search the web."}] + kw = transport.build_kwargs( + model="grok-composer-2.5-fast", messages=messages, + tools=[{"type": "function", "function": { + "name": "web_search", "description": "Search the web.", + "parameters": {"type": "object", + "properties": {"query": {"type": "string"}}}}}], + is_xai_responses=True, + ) + tools = kw.get("tools", []) + # Exactly one tool named/typed web_search, and it is the native built-in. + web_search_entries = [ + t for t in tools + if t.get("name") == "web_search" or t.get("type") == "web_search" + ] + assert len(web_search_entries) == 1 + assert web_search_entries[0] == {"type": "web_search"} + # No client-side function form of web_search survives. 
+ assert not any( + t.get("type") == "function" and t.get("name") == "web_search" + for t in tools + ) + + def test_non_xai_path_does_not_inject_native_web_search(self, transport): + """Native web_search injection is scoped to xAI — Codex/GitHub paths + keep the client-side web_search function untouched.""" + messages = [{"role": "user", "content": "Search."}] + kw = transport.build_kwargs( + model="gpt-5.4", messages=messages, + tools=[{"type": "function", "function": { + "name": "web_search", "description": "Search the web.", + "parameters": {"type": "object", + "properties": {"query": {"type": "string"}}}}}], + is_xai_responses=False, + ) + tools = kw.get("tools", []) + assert not any(t.get("type") == "web_search" for t in tools) + assert any( + t.get("type") == "function" and t.get("name") == "web_search" + for t in tools + ) + def test_xai_reasoning_disabled_no_reasoning_key(self, transport): messages = [{"role": "user", "content": "Hi"}] kw = transport.build_kwargs( diff --git a/tests/run_agent/test_codex_xai_oauth_recovery.py b/tests/run_agent/test_codex_xai_oauth_recovery.py index 8db6c262693..20714f117a8 100644 --- a/tests/run_agent/test_codex_xai_oauth_recovery.py +++ b/tests/run_agent/test_codex_xai_oauth_recovery.py @@ -949,6 +949,26 @@ def test_grok_4_still_resolves_to_256k(): assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000 +def test_grok_composer_context_length_is_262k(): + """grok-composer-2.5-fast is OAuth-only and missing from /v1/models. + + The static table previously listed ``grok-composer`` at 200000, + under-reporting the ~262144 tokens enforced on /v1/responses.
+ """ + from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS + + assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 262_144 + slug = "grok-composer-2.5-fast" + matched_key = max( + (k for k in DEFAULT_CONTEXT_LENGTHS if k in slug.lower()), + key=len, + ) + assert matched_key == "grok-composer", ( + f"Expected longest-first match on grok-composer for {slug}, got {matched_key}" + ) + assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 262_144 + + # --------------------------------------------------------------------------- # Cross-issuer reasoning replay guard #