mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-18 09:51:59 +00:00
fix(xai): OAuth Responses native web_search, incomplete guard, grok-composer context
- model_metadata: grok-composer-2.5-fast → 262144 (OAuth slug not in /v1/models)
- codex transport: inject native {"type":"web_search"} for is_xai_responses;
drop client web_search to avoid duplicate-name 400s
- codex adapter: do not treat in-progress server-side *_call items as incomplete
- tests: adapter, transport build_kwargs, model_metadata, oauth recovery
This commit is contained in:
parent
4b7a186003
commit
6f89e17a33
7 changed files with 310 additions and 2 deletions
|
|
@ -262,6 +262,26 @@ def _responses_tools(tools: Optional[List[Dict[str, Any]]] = None) -> Optional[L
|
|||
return converted or None
|
||||
|
||||
|
||||
# Provider-executed built-in tool *declaration* types accepted on the
|
||||
# Responses ``tools`` array. These are declared by ``type`` alone (no
|
||||
# client-side name/parameters schema) and run server-side — the provider
|
||||
# owns the implementation and reports progress via the matching ``*_call``
|
||||
# output items. Hermes injects xAI's native ``web_search`` for the xAI
|
||||
# transport (see agent/transports/codex.py); the rest are listed so the
|
||||
# preflight validator passes them through rather than rejecting them as
|
||||
# "unsupported type". Loosely mirrors the ``*_call`` item-type set used in
|
||||
# _normalize_codex_response.
|
||||
_RESPONSES_BUILTIN_TOOL_TYPES = {
|
||||
"web_search",
|
||||
"web_search_preview",
|
||||
"file_search",
|
||||
"code_interpreter",
|
||||
"image_generation",
|
||||
"computer_use_preview",
|
||||
"local_shell",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Message format conversion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -802,7 +822,22 @@ def _preflight_codex_api_kwargs(
|
|||
for idx, tool in enumerate(tools):
|
||||
if not isinstance(tool, dict):
|
||||
raise ValueError(f"Codex Responses tools[{idx}] must be an object.")
|
||||
if tool.get("type") != "function":
|
||||
|
||||
tool_type = tool.get("type")
|
||||
|
||||
# Provider-executed built-in tools (xAI native web_search, code
|
||||
# interpreter, etc.) are declared by ``type`` alone and carry no
|
||||
# ``name``/``parameters`` schema — the provider owns the
|
||||
# implementation. Pass them through verbatim instead of forcing
|
||||
# them through the function-tool validation below (which would
|
||||
# otherwise reject them with "unsupported type"). See
|
||||
# agent/transports/codex.py for where xAI's native web_search is
|
||||
# injected.
|
||||
if tool_type in _RESPONSES_BUILTIN_TOOL_TYPES:
|
||||
normalized_tools.append(dict(tool))
|
||||
continue
|
||||
|
||||
if tool_type != "function":
|
||||
raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.")
|
||||
|
||||
name = tool.get("name")
|
||||
|
|
@ -1086,6 +1121,33 @@ def _normalize_codex_response(
|
|||
saw_final_answer_phase = False
|
||||
saw_reasoning_item = False
|
||||
|
||||
# Server-side built-in tool calls (xAI's native web_search, code
|
||||
# interpreter, etc.) are executed by the provider and reported as
|
||||
# discrete ``*_call`` output items. xAI's /v1/responses surface
|
||||
# (e.g. grok-composer-2.5-fast on SuperGrok OAuth) routinely leaves
|
||||
# these items at ``status="in_progress"`` even when the overall
|
||||
# ``response.status == "completed"`` — the search ran to completion
|
||||
# server-side, the per-item status simply isn't reconciled. These
|
||||
# are NOT a signal that the model's turn is unfinished, so they must
|
||||
# not flip ``has_incomplete_items``. Only the response-level status
|
||||
# and genuine model output items (message/reasoning/function_call)
|
||||
# govern the incomplete verdict. Without this guard, any turn where
|
||||
# grok-composer invokes server-side search is misclassified as
|
||||
# ``finish_reason="incomplete"`` and burns 3 fruitless continuation
|
||||
# retries before failing with "Codex response remained incomplete
|
||||
# after 3 continuation attempts". Client-side function/custom tool
|
||||
# calls keep their own in_progress handling below (they are skipped,
|
||||
# not awaited).
|
||||
_SERVER_SIDE_TOOL_CALL_TYPES = {
|
||||
"web_search_call",
|
||||
"file_search_call",
|
||||
"code_interpreter_call",
|
||||
"image_generation_call",
|
||||
"computer_call",
|
||||
"local_shell_call",
|
||||
"mcp_call",
|
||||
}
|
||||
|
||||
for item in output:
|
||||
item_type = getattr(item, "type", None)
|
||||
item_status = getattr(item, "status", None)
|
||||
|
|
@ -1094,7 +1156,10 @@ def _normalize_codex_response(
|
|||
else:
|
||||
item_status = None
|
||||
|
||||
if item_status in {"queued", "in_progress", "incomplete"}:
|
||||
if (
|
||||
item_status in {"queued", "in_progress", "incomplete"}
|
||||
and item_type not in _SERVER_SIDE_TOOL_CALL_TYPES
|
||||
):
|
||||
has_incomplete_items = True
|
||||
saw_streaming_or_item_incomplete = True
|
||||
|
||||
|
|
|
|||
|
|
@ -277,6 +277,9 @@ DEFAULT_CONTEXT_LENGTHS = {
|
|||
# matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
|
||||
"grok-composer": 200000, # grok-composer-2.5-fast (Grok Build CLI)
|
||||
"grok-build": 256000, # grok-build-0.1
|
||||
# OAuth-only slug; absent from GET /v1/models. Live /v1/responses probe
|
||||
# (2026-03) enforces ~262144 tokens total (input+output), not 131k.
|
||||
"grok-composer": 262144, # grok-composer-2.5-fast
|
||||
"grok-code-fast": 256000, # grok-code-fast-1
|
||||
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
|
||||
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning, also matches -reasoning
|
||||
|
|
|
|||
|
|
@ -128,6 +128,49 @@ class ResponsesApiTransport(ProviderTransport):
|
|||
reasoning_effort = _effort_clamp.get(reasoning_effort, reasoning_effort)
|
||||
|
||||
response_tools = _responses_tools(tools)
|
||||
|
||||
# xAI server-side web search.
|
||||
#
|
||||
# grok models on xAI's /v1/responses surface (notably
|
||||
# grok-composer-2.5-fast on SuperGrok OAuth) have a *native*,
|
||||
# server-executed web search. When the model is handed a
|
||||
# client-side function literally named ``web_search``, it routes
|
||||
# the intent to that native engine — but because the tool is
|
||||
# declared as a plain ``function`` rather than xAI's first-class
|
||||
# ``{"type": "web_search"}`` built-in, the server-side search is
|
||||
# dispatched but never reconciled: the response streams reasoning
|
||||
# + ``web_search_call`` progress items, the searches never reach
|
||||
# ``status="completed"`` in the assembled output, no final
|
||||
# message is emitted, and ``_normalize_codex_response`` correctly
|
||||
# sees reasoning-with-no-answer and reports ``incomplete``. The
|
||||
# turn then burns 3 continuation retries and fails with "Codex
|
||||
# response remained incomplete after 3 continuation attempts".
|
||||
# Verified live against grok-composer-2.5-fast (2026-06).
|
||||
#
|
||||
# Fix: declare xAI's native ``web_search`` built-in so the search
|
||||
# actually runs to completion server-side and the model streams a
|
||||
# real answer. The Responses API rejects two tools sharing the
|
||||
# name ``web_search`` (HTTP 400 "Duplicate tool names"), so we
|
||||
# drop the client-side ``web_search`` function for the xAI path
|
||||
# and let the native tool satisfy it. All other client-side
|
||||
# tools (read_file, terminal, web_extract, MCP tools, …) are
|
||||
# untouched and continue to dispatch through Hermes's agent loop.
|
||||
#
|
||||
# NOTE: this routes ``web_search`` to Grok's native search engine
|
||||
# for xAI sessions instead of Hermes's configured web provider
|
||||
# (Tavily/etc.), and those results bypass Hermes's tool-trace /
|
||||
# citation plumbing (they arrive baked into the model's answer
|
||||
# rather than as a tool result the loop observes). Scoped to
|
||||
# ``is_xai_responses`` deliberately; narrow to specific models if
|
||||
# a future grok variant should keep the client-side function.
|
||||
if is_xai_responses:
|
||||
filtered = [
|
||||
t for t in (response_tools or [])
|
||||
if not (isinstance(t, dict) and t.get("name") == "web_search")
|
||||
]
|
||||
filtered.append({"type": "web_search"})
|
||||
response_tools = filtered
|
||||
|
||||
# ``tools`` MUST be omitted entirely when there are no functions to
|
||||
# expose: the openai SDK's ``responses.stream()`` / ``responses.parse()``
|
||||
# eagerly call ``_make_tools(tools)`` which does ``for tool in tools``
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import pytest
|
|||
from agent.codex_responses_adapter import (
|
||||
_format_responses_error,
|
||||
_normalize_codex_response,
|
||||
_preflight_codex_api_kwargs,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -68,6 +69,115 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
|
|||
assert assistant_message.codex_reasoning_items is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Server-side built-in tool calls (xAI native web_search, code interpreter,
|
||||
# etc.) come back as discrete ``*_call`` output items that xAI's
|
||||
# /v1/responses surface routinely leaves at ``status="in_progress"`` even
|
||||
# when the overall ``response.status == "completed"``. These must NOT mark
|
||||
# the turn incomplete — otherwise grok-composer-2.5-fast research queries
|
||||
# (which invoke server-side web_search) get misclassified as
|
||||
# ``finish_reason="incomplete"`` and burn 3 fruitless continuation retries
|
||||
# before failing with "Codex response remained incomplete after 3
|
||||
# continuation attempts". Observed live against grok-composer-2.5-fast on
|
||||
# SuperGrok OAuth (2026-06).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_normalize_codex_response_ignores_in_progress_server_side_tool_calls():
|
||||
"""A completed response with a final message + lingering in_progress
|
||||
server-side web_search_call items resolves to 'stop', not 'incomplete'."""
|
||||
response = SimpleNamespace(
|
||||
status="completed",
|
||||
incomplete_details=None,
|
||||
output=[
|
||||
SimpleNamespace(
|
||||
type="reasoning",
|
||||
id="rs_1",
|
||||
encrypted_content="opaque",
|
||||
summary=[SimpleNamespace(text="researching blades")],
|
||||
),
|
||||
SimpleNamespace(
|
||||
type="message",
|
||||
role="assistant",
|
||||
status="completed",
|
||||
content=[SimpleNamespace(
|
||||
type="output_text",
|
||||
text="Milwaukee M18 blade 49-16-2734, ~$30 OEM.",
|
||||
)],
|
||||
),
|
||||
SimpleNamespace(type="web_search_call", status="in_progress"),
|
||||
SimpleNamespace(type="web_search_call", status="in_progress"),
|
||||
SimpleNamespace(type="web_search_call", status="in_progress"),
|
||||
],
|
||||
)
|
||||
|
||||
assistant_message, finish_reason = _normalize_codex_response(response)
|
||||
|
||||
assert finish_reason == "stop"
|
||||
assert assistant_message.content == "Milwaukee M18 blade 49-16-2734, ~$30 OEM."
|
||||
|
||||
|
||||
def test_normalize_codex_response_in_progress_message_still_incomplete():
|
||||
"""Guard scope: an in_progress *message* item (genuine model output that
|
||||
is still streaming) must still mark the turn incomplete — only
|
||||
server-side ``*_call`` items are exempted."""
|
||||
response = SimpleNamespace(
|
||||
status="completed",
|
||||
incomplete_details=None,
|
||||
output=[
|
||||
SimpleNamespace(
|
||||
type="message",
|
||||
role="assistant",
|
||||
status="in_progress",
|
||||
content=[SimpleNamespace(type="output_text", text="partial...")],
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
_assistant_message, finish_reason = _normalize_codex_response(response)
|
||||
|
||||
assert finish_reason == "incomplete"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _preflight_codex_api_kwargs — built-in (provider-executed) tools must pass
|
||||
# through validation. Regression guard for the xAI native web_search
|
||||
# injection: the preflight validator previously rejected any tool whose
|
||||
# ``type != "function"`` with "unsupported type", which would 400 every xAI
|
||||
# turn once the native web_search tool is declared.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_preflight_passes_native_web_search_tool_through():
|
||||
kwargs = {
|
||||
"model": "grok-composer-2.5-fast",
|
||||
"instructions": "You are helpful.",
|
||||
"input": [{"role": "user", "content": [{"type": "input_text", "text": "hi"}]}],
|
||||
"store": False,
|
||||
"tools": [
|
||||
{"type": "function", "name": "read_file", "description": "Read.",
|
||||
"parameters": {"type": "object", "properties": {}}},
|
||||
{"type": "web_search"},
|
||||
],
|
||||
}
|
||||
out = _preflight_codex_api_kwargs(kwargs, allow_stream=True)
|
||||
tools = out["tools"]
|
||||
assert {"type": "web_search"} in tools
|
||||
assert any(t.get("type") == "function" and t.get("name") == "read_file" for t in tools)
|
||||
|
||||
|
||||
def test_preflight_still_rejects_unknown_tool_type():
|
||||
kwargs = {
|
||||
"model": "grok-composer-2.5-fast",
|
||||
"instructions": "You are helpful.",
|
||||
"input": [{"role": "user", "content": [{"type": "input_text", "text": "hi"}]}],
|
||||
"store": False,
|
||||
"tools": [{"type": "totally_made_up_tool"}],
|
||||
}
|
||||
with pytest.raises(ValueError, match="unsupported type"):
|
||||
_preflight_codex_api_kwargs(kwargs, allow_stream=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _format_responses_error — adapted from anomalyco/opencode#28757.
|
||||
# Provider failures should surface BOTH the code (rate_limit_exceeded /
|
||||
|
|
|
|||
|
|
@ -142,6 +142,7 @@ class TestDefaultContextLengths:
|
|||
("grok-4", 256000),
|
||||
("grok-4-0709", 256000),
|
||||
("grok-build-0.1", 256000),
|
||||
("grok-composer-2.5-fast", 262144),
|
||||
("grok-code-fast-1", 256000),
|
||||
("grok-3", 131072),
|
||||
("grok-3-mini", 131072),
|
||||
|
|
|
|||
|
|
@ -263,6 +263,72 @@ class TestCodexBuildKwargs:
|
|||
# full history.
|
||||
assert "reasoning.encrypted_content" in kw.get("include", [])
|
||||
|
||||
def test_xai_injects_native_web_search_tool(self, transport):
|
||||
"""xAI path declares xAI's native server-side web_search built-in so
|
||||
grok server-side search runs to completion (otherwise the turn stalls
|
||||
as reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail).
|
||||
"""
|
||||
messages = [{"role": "user", "content": "Find current prices."}]
|
||||
kw = transport.build_kwargs(
|
||||
model="grok-composer-2.5-fast", messages=messages,
|
||||
tools=[{"type": "function", "function": {
|
||||
"name": "read_file", "description": "Read a file.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"path": {"type": "string"}}}}}],
|
||||
is_xai_responses=True,
|
||||
)
|
||||
tool_types = [t.get("type") for t in kw.get("tools", [])]
|
||||
assert "web_search" in tool_types, kw.get("tools")
|
||||
# Non-conflicting client-side tools are preserved.
|
||||
names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"]
|
||||
assert "read_file" in names
|
||||
|
||||
def test_xai_drops_clientside_web_search_to_avoid_duplicate(self, transport):
|
||||
"""When the client registers its own 'web_search' function, the xAI
|
||||
path must drop it and rely on the native built-in — otherwise xAI
|
||||
returns HTTP 400 'Duplicate tool names: web_search'."""
|
||||
messages = [{"role": "user", "content": "Search the web."}]
|
||||
kw = transport.build_kwargs(
|
||||
model="grok-composer-2.5-fast", messages=messages,
|
||||
tools=[{"type": "function", "function": {
|
||||
"name": "web_search", "description": "Search the web.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"query": {"type": "string"}}}}}],
|
||||
is_xai_responses=True,
|
||||
)
|
||||
tools = kw.get("tools", [])
|
||||
# Exactly one tool named/typed web_search, and it is the native built-in.
|
||||
web_search_entries = [
|
||||
t for t in tools
|
||||
if t.get("name") == "web_search" or t.get("type") == "web_search"
|
||||
]
|
||||
assert len(web_search_entries) == 1
|
||||
assert web_search_entries[0] == {"type": "web_search"}
|
||||
# No client-side function form of web_search survives.
|
||||
assert not any(
|
||||
t.get("type") == "function" and t.get("name") == "web_search"
|
||||
for t in tools
|
||||
)
|
||||
|
||||
def test_non_xai_path_does_not_inject_native_web_search(self, transport):
|
||||
"""Native web_search injection is scoped to xAI — Codex/GitHub paths
|
||||
keep the client-side web_search function untouched."""
|
||||
messages = [{"role": "user", "content": "Search."}]
|
||||
kw = transport.build_kwargs(
|
||||
model="gpt-5.4", messages=messages,
|
||||
tools=[{"type": "function", "function": {
|
||||
"name": "web_search", "description": "Search the web.",
|
||||
"parameters": {"type": "object",
|
||||
"properties": {"query": {"type": "string"}}}}}],
|
||||
is_xai_responses=False,
|
||||
)
|
||||
tools = kw.get("tools", [])
|
||||
assert not any(t.get("type") == "web_search" for t in tools)
|
||||
assert any(
|
||||
t.get("type") == "function" and t.get("name") == "web_search"
|
||||
for t in tools
|
||||
)
|
||||
|
||||
def test_xai_reasoning_disabled_no_reasoning_key(self, transport):
|
||||
messages = [{"role": "user", "content": "Hi"}]
|
||||
kw = transport.build_kwargs(
|
||||
|
|
|
|||
|
|
@ -949,6 +949,26 @@ def test_grok_4_still_resolves_to_256k():
|
|||
assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000
|
||||
|
||||
|
||||
def test_grok_composer_context_length_is_262k():
|
||||
"""grok-composer-2.5-fast is OAuth-only and missing from /v1/models.
|
||||
|
||||
Without a specific entry it fell through to the generic ``grok`` 131k
|
||||
catch-all, under-reporting the ~262k limit enforced on /v1/responses.
|
||||
"""
|
||||
from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS
|
||||
|
||||
assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 262_144
|
||||
slug = "grok-composer-2.5-fast"
|
||||
matched_key = max(
|
||||
(k for k in DEFAULT_CONTEXT_LENGTHS if k in slug.lower()),
|
||||
key=len,
|
||||
)
|
||||
assert matched_key == "grok-composer", (
|
||||
f"Expected longest-first match on grok-composer for {slug}, got {matched_key}"
|
||||
)
|
||||
assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 262_144
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-issuer reasoning replay guard
|
||||
#
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue