diff --git a/agent/model_metadata.py b/agent/model_metadata.py index aa99302b013..4493eae5f1f 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -275,11 +275,12 @@ DEFAULT_CONTEXT_LENGTHS = { # via a custom provider. Values sourced from models.dev (2026-04). # Keys use substring matching (longest-first), so e.g. "grok-4.20" # matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309". + # OAuth-only slug; absent from GET /v1/models. xAI publishes a 200k + # usable context window for Composer 2.5 on Grok Build (SuperGrok / + # Premium+); /v1/responses additionally enforces a ~262144 input+output + # budget, but the usable context (what we track here) is 200k. "grok-composer": 200000, # grok-composer-2.5-fast (Grok Build CLI) "grok-build": 256000, # grok-build-0.1 - # OAuth-only slug; absent from GET /v1/models. Live /v1/responses probe - # (2026-03) enforces ~262144 tokens total (input+output), not 131k. - "grok-composer": 262144, # grok-composer-2.5-fast "grok-code-fast": 256000, # grok-code-fast-1 "grok-2-vision": 8192, # grok-2-vision, -1212, -latest "grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning, also matches -reasoning diff --git a/agent/transports/codex.py b/agent/transports/codex.py index d98e8d50463..1ce449eeaa7 100644 --- a/agent/transports/codex.py +++ b/agent/transports/codex.py @@ -147,29 +147,45 @@ class ResponsesApiTransport(ProviderTransport): # response remained incomplete after 3 continuation attempts". # Verified live against grok-composer-2.5-fast (2026-06). # - # Fix: declare xAI's native ``web_search`` built-in so the search - # actually runs to completion server-side and the model streams a - # real answer. The Responses API rejects two tools sharing the - # name ``web_search`` (HTTP 400 "Duplicate tool names"), so we - # drop the client-side ``web_search`` function for the xAI path - # and let the native tool satisfy it. All other client-side - # tools (read_file, terminal, web_extract, MCP tools, …) are - # untouched and continue to dispatch through Hermes's agent loop. + # Fix: when the agent HAS a client-side ``web_search`` function (i.e. + # the user enabled the web toolset), declare xAI's native + # ``web_search`` built-in instead so the search actually runs to + # completion server-side and the model streams a real answer. The + # Responses API rejects two tools sharing the name ``web_search`` + # (HTTP 400 "Duplicate tool names"), so we drop the client-side + # ``web_search`` function for the xAI path and let the native tool + # satisfy it. All other client-side tools (read_file, terminal, + # web_extract, MCP tools, …) are untouched and continue to dispatch + # through Hermes's agent loop. # - # NOTE: this routes ``web_search`` to Grok's native search engine - # for xAI sessions instead of Hermes's configured web provider - # (Tavily/etc.), and those results bypass Hermes's tool-trace / - # citation plumbing (they arrive baked into the model's answer - # rather than as a tool result the loop observes). Scoped to - # ``is_xai_responses`` deliberately; narrow to specific models if - # a future grok variant should keep the client-side function. - if is_xai_responses: - filtered = [ - t for t in (response_tools or []) - if not (isinstance(t, dict) and t.get("name") == "web_search") - ] - filtered.append({"type": "web_search"}) - response_tools = filtered + # Scope: we ONLY swap in the native built-in when the client + # ``web_search`` was actually present. We do NOT force-enable Grok + # server-side search on turns where the user never had web enabled — + # that would silently route around Hermes's web-provider config and + # tool-trace/citation plumbing for every xai-oauth turn. The swap is + # a 1:1 replacement of an already-requested capability, not an + # additive grant. + # + # NOTE: for the swapped case this routes ``web_search`` to Grok's + # native search engine for xAI sessions instead of Hermes's + # configured web provider (Tavily/etc.), and those results bypass + # Hermes's tool-trace / citation plumbing (they arrive baked into the + # model's answer rather than as a tool result the loop observes). + # Scoped to ``is_xai_responses`` deliberately; narrow to specific + # models if a future grok variant should keep the client-side + # function. + if is_xai_responses and response_tools: + has_client_web_search = any( + isinstance(t, dict) and t.get("name") == "web_search" + for t in response_tools + ) + if has_client_web_search: + filtered = [ + t for t in response_tools + if not (isinstance(t, dict) and t.get("name") == "web_search") + ] + filtered.append({"type": "web_search"}) + response_tools = filtered # ``tools`` MUST be omitted entirely when there are no functions to # expose: the openai SDK's ``responses.stream()`` / ``responses.parse()`` diff --git a/scripts/release.py b/scripts/release.py index 2a74b301507..455c6a94d29 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -49,6 +49,7 @@ AUTHOR_MAP = { "zheng@omegasys.eu": "omegazheng", "220877172+james47kjv@users.noreply.github.com": "james47kjv", "yuhanglin@YuhangdeMac-mini.local": "1960697431", + "admin@fent.quest": "XVVH", "despitemeguru@gmail.com": "definitelynotguru", "chaslui@outlook.com": "ChasLui", "rio.jeong@thebytesize.ai": "rio-jeong", diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py index c1f239c2e47..ecde355d059 100644 --- a/tests/agent/test_model_metadata.py +++ b/tests/agent/test_model_metadata.py @@ -142,7 +142,7 @@ class TestDefaultContextLengths: ("grok-4", 256000), ("grok-4-0709", 256000), ("grok-build-0.1", 256000), - ("grok-composer-2.5-fast", 262144), + ("grok-composer-2.5-fast", 200000), ("grok-code-fast-1", 256000), ("grok-3", 131072), ("grok-3-mini", 131072), diff --git a/tests/agent/transports/test_codex_transport.py b/tests/agent/transports/test_codex_transport.py index 906774ea503..86b8c12692f 100644 --- a/tests/agent/transports/test_codex_transport.py +++ b/tests/agent/transports/test_codex_transport.py @@ -263,12 +263,43 @@ class TestCodexBuildKwargs: # full history. assert "reasoning.encrypted_content" in kw.get("include", []) - def test_xai_injects_native_web_search_tool(self, transport): - """xAI path declares xAI's native server-side web_search built-in so - grok server-side search runs to completion (otherwise the turn stalls - as reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail). + def test_xai_injects_native_web_search_when_client_web_search_present(self, transport): + """xAI path swaps a client-side ``web_search`` function for xAI's + native server-side ``web_search`` built-in so grok server-side search + runs to completion (otherwise the turn stalls as + reasoning-with-no-answer -> false 'incomplete' -> 3 retries -> fail). + Non-conflicting client tools are preserved. """ messages = [{"role": "user", "content": "Find current prices."}] + kw = transport.build_kwargs( + model="grok-composer-2.5-fast", messages=messages, + tools=[ + {"type": "function", "function": { + "name": "read_file", "description": "Read a file.", + "parameters": {"type": "object", + "properties": {"path": {"type": "string"}}}}}, + {"type": "function", "function": { + "name": "web_search", "description": "Search the web.", + "parameters": {"type": "object", + "properties": {"query": {"type": "string"}}}}}, + ], + is_xai_responses=True, + ) + tool_types = [t.get("type") for t in kw.get("tools", [])] + assert "web_search" in tool_types, kw.get("tools") + # Non-conflicting client-side tools are preserved. + names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"] + assert "read_file" in names + + def test_xai_does_not_inject_native_web_search_without_client_web_search(self, transport): + """The native ``web_search`` built-in is a 1:1 swap for an + already-requested client ``web_search`` — NOT an additive grant. A + turn whose toolset has no ``web_search`` (user never enabled the web + toolset) must not get Grok server-side search force-injected, which + would silently bypass Hermes's web-provider config and tool-trace + plumbing for every xai-oauth turn. + """ + messages = [{"role": "user", "content": "Read this file."}] kw = transport.build_kwargs( model="grok-composer-2.5-fast", messages=messages, tools=[{"type": "function", "function": { @@ -277,10 +308,9 @@ class TestCodexBuildKwargs: "properties": {"path": {"type": "string"}}}}}], is_xai_responses=True, ) - tool_types = [t.get("type") for t in kw.get("tools", [])] - assert "web_search" in tool_types, kw.get("tools") - # Non-conflicting client-side tools are preserved. - names = [t.get("name") for t in kw.get("tools", []) if t.get("type") == "function"] + tools = kw.get("tools", []) + assert not any(t.get("type") == "web_search" for t in tools), tools + names = [t.get("name") for t in tools if t.get("type") == "function"] assert "read_file" in names def test_xai_drops_clientside_web_search_to_avoid_duplicate(self, transport): diff --git a/tests/run_agent/test_codex_xai_oauth_recovery.py b/tests/run_agent/test_codex_xai_oauth_recovery.py index 20714f117a8..8a2ce564193 100644 --- a/tests/run_agent/test_codex_xai_oauth_recovery.py +++ b/tests/run_agent/test_codex_xai_oauth_recovery.py @@ -949,15 +949,18 @@ def test_grok_4_still_resolves_to_256k(): assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000 -def test_grok_composer_context_length_is_262k(): +def test_grok_composer_context_length_is_200k(): """grok-composer-2.5-fast is OAuth-only and missing from /v1/models. Without a specific entry it fell through to the generic ``grok`` 131k - catch-all, under-reporting ~262k enforced on /v1/responses. + catch-all. xAI publishes a 200k usable context window for Composer 2.5 + on Grok Build (SuperGrok / Premium+); /v1/responses additionally caps + the input+output budget at ~262144, but the usable context (what we + track) is 200k. """ from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS - assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 262_144 + assert DEFAULT_CONTEXT_LENGTHS["grok-composer"] == 200_000 slug = "grok-composer-2.5-fast" matched_key = max( (k for k in DEFAULT_CONTEXT_LENGTHS if k in slug.lower()), @@ -966,7 +969,7 @@ def test_grok_composer_context_length_is_262k(): assert matched_key == "grok-composer", ( f"Expected longest-first match on grok-composer for {slug}, got {matched_key}" ) - assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 262_144 + assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 200_000 # ---------------------------------------------------------------------------