fix(xai): OAuth Responses native web_search, incomplete guard, grok-composer context

- model_metadata: grok-composer-2.5-fast → 262144 (OAuth slug not in /v1/models)
- codex transport: inject native {"type":"web_search"} for is_xai_responses;
  drop client web_search to avoid duplicate-name 400s
- codex adapter: do not treat in-progress server-side *_call items as incomplete
- tests: adapter, transport build_kwargs, model_metadata, oauth recovery
This commit is contained in:
XVVH 2026-06-11 11:06:01 -04:00 committed by Teknium
parent 4b7a186003
commit 6f89e17a33
7 changed files with 310 additions and 2 deletions

View file

@ -262,6 +262,26 @@ def _responses_tools(tools: Optional[List[Dict[str, Any]]] = None) -> Optional[L
return converted or None
# Built-in tool declarations that the Responses ``tools`` array accepts by
# ``type`` alone. They carry no client-side name/parameters schema because
# the provider executes them server-side and reports progress through the
# corresponding ``*_call`` output items (compare the ``*_call`` item types
# handled in _normalize_codex_response). Hermes itself only injects xAI's
# native ``web_search`` (xAI transport, see agent/transports/codex.py); the
# remaining entries are listed so the preflight validator lets them through
# instead of rejecting them as "unsupported type".
_RESPONSES_BUILTIN_TOOL_TYPES = {
    "code_interpreter",
    "computer_use_preview",
    "file_search",
    "image_generation",
    "local_shell",
    "web_search",
    "web_search_preview",
}
# ---------------------------------------------------------------------------
# Message format conversion
# ---------------------------------------------------------------------------
@ -802,7 +822,22 @@ def _preflight_codex_api_kwargs(
for idx, tool in enumerate(tools):
if not isinstance(tool, dict):
raise ValueError(f"Codex Responses tools[{idx}] must be an object.")
if tool.get("type") != "function":
tool_type = tool.get("type")
# Provider-executed built-in tools (xAI native web_search, code
# interpreter, etc.) are declared by ``type`` alone and carry no
# ``name``/``parameters`` schema — the provider owns the
# implementation. Pass them through verbatim instead of forcing
# them through the function-tool validation below (which would
# otherwise reject them with "unsupported type"). See
# agent/transports/codex.py for where xAI's native web_search is
# injected.
if tool_type in _RESPONSES_BUILTIN_TOOL_TYPES:
normalized_tools.append(dict(tool))
continue
if tool_type != "function":
raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.")
name = tool.get("name")
@ -1086,6 +1121,33 @@ def _normalize_codex_response(
saw_final_answer_phase = False
saw_reasoning_item = False
# Server-side built-in tool calls (xAI's native web_search, code
# interpreter, etc.) are executed by the provider and reported as
# discrete ``*_call`` output items. xAI's /v1/responses surface
# (e.g. grok-composer-2.5-fast on SuperGrok OAuth) routinely leaves
# these items at ``status="in_progress"`` even when the overall
# ``response.status == "completed"`` — the search ran to completion
# server-side, the per-item status simply isn't reconciled. These
# are NOT a signal that the model's turn is unfinished, so they must
# not flip ``has_incomplete_items``. Only the response-level status
# and genuine model output items (message/reasoning/function_call)
# govern the incomplete verdict. Without this guard, any turn where
# grok-composer invokes server-side search is misclassified as
# ``finish_reason="incomplete"`` and burns 3 fruitless continuation
# retries before failing with "Codex response remained incomplete
# after 3 continuation attempts". Client-side function/custom tool
# calls keep their own in_progress handling below (they are skipped,
# not awaited).
_SERVER_SIDE_TOOL_CALL_TYPES = {
"web_search_call",
"file_search_call",
"code_interpreter_call",
"image_generation_call",
"computer_call",
"local_shell_call",
"mcp_call",
}
for item in output:
item_type = getattr(item, "type", None)
item_status = getattr(item, "status", None)
@ -1094,7 +1156,10 @@ def _normalize_codex_response(
else:
item_status = None
if item_status in {"queued", "in_progress", "incomplete"}:
if (
item_status in {"queued", "in_progress", "incomplete"}
and item_type not in _SERVER_SIDE_TOOL_CALL_TYPES
):
has_incomplete_items = True
saw_streaming_or_item_incomplete = True

View file

@ -277,6 +277,9 @@ DEFAULT_CONTEXT_LENGTHS = {
# matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
"grok-composer": 200000, # grok-composer-2.5-fast (Grok Build CLI)
"grok-build": 256000, # grok-build-0.1
# OAuth-only slug; absent from GET /v1/models. Live /v1/responses probe
# (2026-03) enforces ~262144 tokens total (input+output), not 131k.
"grok-composer": 262144, # grok-composer-2.5-fast
"grok-code-fast": 256000, # grok-code-fast-1
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning, also matches -reasoning

View file

@ -128,6 +128,49 @@ class ResponsesApiTransport(ProviderTransport):
reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort)
response_tools = _responses_tools(tools)
# xAI server-side web search.
#
# grok models on xAI's /v1/responses surface (notably
# grok-composer-2.5-fast on SuperGrok OAuth) have a *native*,
# server-executed web search. When the model is handed a
# client-side function literally named ``web_search``, it routes
# the intent to that native engine — but because the tool is
# declared as a plain ``function`` rather than xAI's first-class
# ``{"type": "web_search"}`` built-in, the server-side search is
# dispatched but never reconciled: the response streams reasoning
# + ``web_search_call`` progress items, the searches never reach
# ``status="completed"`` in the assembled output, no final
# message is emitted, and ``_normalize_codex_response`` correctly
# sees reasoning-with-no-answer and reports ``incomplete``. The
# turn then burns 3 continuation retries and fails with "Codex
# response remained incomplete after 3 continuation attempts".
# Verified live against grok-composer-2.5-fast (2026-06).
#
# Fix: declare xAI's native ``web_search`` built-in so the search
# actually runs to completion server-side and the model streams a
# real answer. The Responses API rejects two tools sharing the
# name ``web_search`` (HTTP 400 "Duplicate tool names"), so we
# drop the client-side ``web_search`` function for the xAI path
# and let the native tool satisfy it. All other client-side
# tools (read_file, terminal, web_extract, MCP tools, …) are
# untouched and continue to dispatch through Hermes's agent loop.
#
# NOTE: this routes ``web_search`` to Grok's native search engine
# for xAI sessions instead of Hermes's configured web provider
# (Tavily/etc.), and those results bypass Hermes's tool-trace /
# citation plumbing (they arrive baked into the model's answer
# rather than as a tool result the loop observes). Scoped to
# ``is_xai_responses`` deliberately; narrow to specific models if
# a future grok variant should keep the client-side function.
if is_xai_responses:
filtered = [
t for t in (response_tools or [])
if not (isinstance(t, dict) and t.get("name") == "web_search")
]
filtered.append({"type": "web_search"})
response_tools = filtered
# ``tools`` MUST be omitted entirely when there are no functions to
# expose: the openai SDK's ``responses.stream()`` / ``responses.parse()``
# eagerly call ``_make_tools(tools)`` which does ``for tool in tools``

View file

@ -5,6 +5,7 @@ import pytest
from agent.codex_responses_adapter import (
_format_responses_error,
_normalize_codex_response,
_preflight_codex_api_kwargs,
)
@ -68,6 +69,115 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
assert assistant_message.codex_reasoning_items is None
# ---------------------------------------------------------------------------
# Server-side built-in tool calls (xAI native web_search, code interpreter,
# etc.) come back as discrete ``*_call`` output items that xAI's
# /v1/responses surface routinely leaves at ``status="in_progress"`` even
# when the overall ``response.status == "completed"``. These must NOT mark
# the turn incomplete — otherwise grok-composer-2.5-fast research queries
# (which invoke server-side web_search) get misclassified as
# ``finish_reason="incomplete"`` and burn 3 fruitless continuation retries
# before failing with "Codex response remained incomplete after 3
# continuation attempts". Observed live against grok-composer-2.5-fast on
# SuperGrok OAuth (2026-06).
# ---------------------------------------------------------------------------
def test_normalize_codex_response_ignores_in_progress_server_side_tool_calls():
    """Lingering in_progress ``web_search_call`` items do not block 'stop'.

    The response is ``completed`` and contains a finished assistant message;
    the three server-side search items still marked ``in_progress`` must not
    turn the verdict into 'incomplete'.
    """
    answer_text = "Milwaukee M18 blade 49-16-2734, ~$30 OEM."

    reasoning_item = SimpleNamespace(
        type="reasoning",
        id="rs_1",
        encrypted_content="opaque",
        summary=[SimpleNamespace(text="researching blades")],
    )
    message_item = SimpleNamespace(
        type="message",
        role="assistant",
        status="completed",
        content=[SimpleNamespace(type="output_text", text=answer_text)],
    )
    # Three independent search items, each left un-reconciled by the server.
    pending_searches = [
        SimpleNamespace(type="web_search_call", status="in_progress")
        for _ in range(3)
    ]

    response = SimpleNamespace(
        status="completed",
        incomplete_details=None,
        output=[reasoning_item, message_item, *pending_searches],
    )

    normalized, verdict = _normalize_codex_response(response)

    assert verdict == "stop"
    assert normalized.content == answer_text
def test_normalize_codex_response_in_progress_message_still_incomplete():
    """Guard scope: an in_progress *message* still yields 'incomplete'.

    A message item is genuine model output that is still streaming, so it
    must keep marking the turn incomplete — only server-side ``*_call``
    items are exempted.
    """
    streaming_message = SimpleNamespace(
        type="message",
        role="assistant",
        status="in_progress",
        content=[SimpleNamespace(type="output_text", text="partial...")],
    )
    response = SimpleNamespace(
        status="completed",
        incomplete_details=None,
        output=[streaming_message],
    )

    _unused_message, verdict = _normalize_codex_response(response)

    assert verdict == "incomplete"
# ---------------------------------------------------------------------------
# _preflight_codex_api_kwargs — built-in (provider-executed) tools must pass
# through validation. Regression guard for the xAI native web_search
# injection: the preflight validator previously rejected any tool whose
# ``type != "function"`` with "unsupported type" (a local ValueError), which
# would fail every xAI turn once the native web_search tool is declared.
# ---------------------------------------------------------------------------
def test_preflight_passes_native_web_search_tool_through():
    """Preflight keeps a type-only ``web_search`` tool next to a function tool."""
    read_file_tool = {
        "type": "function",
        "name": "read_file",
        "description": "Read.",
        "parameters": {"type": "object", "properties": {}},
    }
    user_turn = {
        "role": "user",
        "content": [{"type": "input_text", "text": "hi"}],
    }
    request = {
        "model": "grok-composer-2.5-fast",
        "instructions": "You are helpful.",
        "input": [user_turn],
        "store": False,
        "tools": [read_file_tool, {"type": "web_search"}],
    }

    validated = _preflight_codex_api_kwargs(request, allow_stream=True)
    emitted = validated["tools"]

    def _is_read_file_function(entry):
        return entry.get("type") == "function" and entry.get("name") == "read_file"

    # The built-in declaration survives verbatim ...
    assert {"type": "web_search"} in emitted
    # ... and the ordinary function tool is still present.
    assert any(_is_read_file_function(entry) for entry in emitted)
def test_preflight_still_rejects_unknown_tool_type():
    """A tool type outside the built-in set still raises "unsupported type"."""
    user_turn = {
        "role": "user",
        "content": [{"type": "input_text", "text": "hi"}],
    }
    bogus_tool = {"type": "totally_made_up_tool"}
    request = {
        "model": "grok-composer-2.5-fast",
        "instructions": "You are helpful.",
        "input": [user_turn],
        "store": False,
        "tools": [bogus_tool],
    }

    with pytest.raises(ValueError, match="unsupported type"):
        _preflight_codex_api_kwargs(request, allow_stream=True)
# ---------------------------------------------------------------------------
# _format_responses_error — adapted from anomalyco/opencode#28757.
# Provider failures should surface BOTH the code (rate_limit_exceeded /

View file

@ -142,6 +142,7 @@ class TestDefaultContextLengths:
("grok-4", 256000),
("grok-4-0709", 256000),
("grok-build-0.1", 256000),
("grok-composer-2.5-fast", 262144),
("grok-code-fast-1", 256000),
("grok-3", 131072),
("grok-3-mini", 131072),

View file

@ -263,6 +263,72 @@ class TestCodexBuildKwargs:
# full history.
assert "reasoning.encrypted_content" in kw.get("include", [])
def test_xai_injects_native_web_search_tool(self, transport):
    """The xAI path adds the native ``web_search`` built-in declaration.

    Declaring the built-in lets grok's server-side search run to completion
    (otherwise the turn stalls as reasoning-with-no-answer -> false
    'incomplete' -> 3 retries -> fail). Unrelated client tools must survive.
    """
    read_file_tool = {
        "type": "function",
        "function": {
            "name": "read_file",
            "description": "Read a file.",
            "parameters": {
                "type": "object",
                "properties": {"path": {"type": "string"}},
            },
        },
    }

    kw = transport.build_kwargs(
        model="grok-composer-2.5-fast",
        messages=[{"role": "user", "content": "Find current prices."}],
        tools=[read_file_tool],
        is_xai_responses=True,
    )

    emitted = kw.get("tools", [])
    declared_types = [entry.get("type") for entry in emitted]
    assert "web_search" in declared_types, kw.get("tools")

    # Non-conflicting client-side tools are preserved.
    function_names = [
        entry.get("name") for entry in emitted if entry.get("type") == "function"
    ]
    assert "read_file" in function_names
def test_xai_drops_clientside_web_search_to_avoid_duplicate(self, transport):
    """A client ``web_search`` function is replaced by the native built-in.

    When the client registers its own 'web_search' function, the xAI path
    must drop it and rely on the native built-in — otherwise xAI returns
    HTTP 400 'Duplicate tool names: web_search'.
    """
    client_search_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web.",
            "parameters": {
                "type": "object",
                "properties": {"query": {"type": "string"}},
            },
        },
    }

    kw = transport.build_kwargs(
        model="grok-composer-2.5-fast",
        messages=[{"role": "user", "content": "Search the web."}],
        tools=[client_search_tool],
        is_xai_responses=True,
    )
    emitted = kw.get("tools", [])

    # Exactly one tool named/typed web_search, and it is the native built-in.
    search_like = [
        entry for entry in emitted
        if "web_search" in (entry.get("name"), entry.get("type"))
    ]
    assert len(search_like) == 1
    assert search_like[0] == {"type": "web_search"}

    # No client-side function form of web_search survives.
    assert all(
        not (entry.get("type") == "function" and entry.get("name") == "web_search")
        for entry in emitted
    )
def test_non_xai_path_does_not_inject_native_web_search(self, transport):
    """Without ``is_xai_responses`` the client ``web_search`` stays as-is.

    Native web_search injection is scoped to xAI — Codex/GitHub paths keep
    the client-side web_search function and gain no built-in declaration.
    """
    client_search_tool = {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Search the web.",
            "parameters": {
                "type": "object",
                "properties": {"query": {"type": "string"}},
            },
        },
    }

    kw = transport.build_kwargs(
        model="gpt-5.4",
        messages=[{"role": "user", "content": "Search."}],
        tools=[client_search_tool],
        is_xai_responses=False,
    )
    emitted = kw.get("tools", [])

    # No native built-in was added ...
    assert all(entry.get("type") != "web_search" for entry in emitted)
    # ... and the client-side function form is still there.
    assert any(
        entry.get("type") == "function" and entry.get("name") == "web_search"
        for entry in emitted
    )
def test_xai_reasoning_disabled_no_reasoning_key(self, transport):
messages = [{"role": "user", "content": "Hi"}]
kw = transport.build_kwargs(

View file

@ -949,6 +949,26 @@ def test_grok_4_still_resolves_to_256k():
assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000
def test_grok_composer_context_length_is_262k():
    """Pin the ``grok-composer`` table entry and its longest-key match.

    grok-composer-2.5-fast is OAuth-only and missing from /v1/models, so the
    static table is what supplies its context length; this checks both the
    raw entry and that the slug resolves to it by longest-substring match.
    """
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    expected_tokens = 262_144
    assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == expected_tokens

    slug = "grok-composer-2.5-fast"
    lowered = slug.lower()
    candidate_keys = [key for key in DEFAULT_CONTEXT_LENGTHS if key in lowered]
    matched_key = max(candidate_keys, key=len)

    assert matched_key == "grok-composer", (
        f"Expected longest-first match on grok-composer for {slug}, got {matched_key}"
    )
    assert DEFAULT_CONTEXT_LENGTHS[matched_key] == expected_tokens
# ---------------------------------------------------------------------------
# Cross-issuer reasoning replay guard
#