mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-19 10:02:16 +00:00
- model_metadata: grok-composer-2.5-fast → 262144 (OAuth slug not in /v1/models)
- codex transport: inject native {"type":"web_search"} for is_xai_responses;
drop client web_search to avoid duplicate-name 400s
- codex adapter: do not treat in-progress server-side *_call items as incomplete
- tests: adapter, transport build_kwargs, model_metadata, oauth recovery
286 lines
11 KiB
Python
286 lines
11 KiB
Python
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
from agent.codex_responses_adapter import (
|
|
_format_responses_error,
|
|
_normalize_codex_response,
|
|
_preflight_codex_api_kwargs,
|
|
)
|
|
|
|
|
|
def test_normalize_codex_response_drops_transient_rs_tmp_reasoning_items():
    """Only the stable reasoning item is kept on the normalized message.

    The response carries one reasoning item whose id starts with ``rs_tmp_``
    and one with a regular ``rs_`` id, followed by a completed assistant
    message. The test asserts that the turn finishes with ``"stop"``, the
    message text is preserved, and ``codex_reasoning_items`` contains only
    the ``rs_456`` entry.
    """
    transient_reasoning = SimpleNamespace(
        type="reasoning",
        id="rs_tmp_123",
        encrypted_content="opaque-transient",
        summary=[],
    )
    stable_reasoning = SimpleNamespace(
        type="reasoning",
        id="rs_456",
        encrypted_content="opaque-stable",
        summary=[SimpleNamespace(text="stable summary")],
    )
    final_message = SimpleNamespace(
        type="message",
        role="assistant",
        status="completed",
        content=[SimpleNamespace(type="output_text", text="done")],
    )
    response = SimpleNamespace(
        status="completed",
        output=[transient_reasoning, stable_reasoning, final_message],
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    # The transient ``rs_tmp_123`` item must not appear in the replay list.
    expected_reasoning_items = [
        {
            "type": "reasoning",
            "encrypted_content": "opaque-stable",
            "id": "rs_456",
            "summary": [{"type": "summary_text", "text": "stable summary"}],
        }
    ]
    assert finish_reason == "stop"
    assert assistant_message.content == "done"
    assert assistant_message.codex_reasoning_items == expected_reasoning_items
|
|
|
|
|
|
def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
    """A response holding nothing but a reasoning item is reported incomplete.

    The single output item is a ``rs_tmp_`` reasoning entry with a summary.
    The test asserts an ``"incomplete"`` finish reason, empty content, the
    summary text surfaced as ``reasoning``, and no ``codex_reasoning_items``.
    """
    lone_reasoning = SimpleNamespace(
        type="reasoning",
        id="rs_tmp_789",
        encrypted_content="opaque-transient",
        summary=[SimpleNamespace(text="still thinking")],
    )
    response = SimpleNamespace(status="completed", output=[lone_reasoning])

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "incomplete"
    assert assistant_message.content == ""
    assert assistant_message.reasoning == "still thinking"
    assert assistant_message.codex_reasoning_items is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Server-side built-in tool calls (xAI native web_search, code interpreter,
# etc.) come back as discrete ``*_call`` output items that xAI's
# /v1/responses surface routinely leaves at ``status="in_progress"`` even
# when the overall ``response.status == "completed"``. These must NOT mark
# the turn incomplete — otherwise grok-composer-2.5-fast research queries
# (which invoke server-side web_search) get misclassified as
# ``finish_reason="incomplete"`` and burn 3 fruitless continuation retries
# before failing with "Codex response remained incomplete after 3
# continuation attempts". Observed live against grok-composer-2.5-fast on
# SuperGrok OAuth (2026-06).
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_normalize_codex_response_ignores_in_progress_server_side_tool_calls():
    """Lingering ``web_search_call`` items do not make the turn incomplete.

    The response is ``completed`` and contains a reasoning item, a completed
    assistant message, and three ``web_search_call`` items still marked
    ``in_progress``. The test asserts the result is ``"stop"`` with the
    message text intact.
    """
    answer_text = "Milwaukee M18 blade 49-16-2734, ~$30 OEM."
    reasoning_item = SimpleNamespace(
        type="reasoning",
        id="rs_1",
        encrypted_content="opaque",
        summary=[SimpleNamespace(text="researching blades")],
    )
    message_item = SimpleNamespace(
        type="message",
        role="assistant",
        status="completed",
        content=[SimpleNamespace(type="output_text", text=answer_text)],
    )
    # Three distinct server-side tool-call items, all left at in_progress.
    pending_searches = [
        SimpleNamespace(type="web_search_call", status="in_progress")
        for _ in range(3)
    ]
    response = SimpleNamespace(
        status="completed",
        incomplete_details=None,
        output=[reasoning_item, message_item, *pending_searches],
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "stop"
    assert assistant_message.content == "Milwaukee M18 blade 49-16-2734, ~$30 OEM."
|
|
|
|
|
|
def test_normalize_codex_response_in_progress_message_still_incomplete():
    """An ``in_progress`` *message* item still yields ``"incomplete"``.

    Scope guard for the server-side ``*_call`` exemption: a message item
    whose own status is ``in_progress`` must keep marking the turn
    incomplete even when the response status is ``completed``.
    """
    streaming_message = SimpleNamespace(
        type="message",
        role="assistant",
        status="in_progress",
        content=[SimpleNamespace(type="output_text", text="partial...")],
    )
    response = SimpleNamespace(
        status="completed",
        incomplete_details=None,
        output=[streaming_message],
    )

    # Only the finish reason matters here; the message itself is unused.
    _ignored_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "incomplete"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# _preflight_codex_api_kwargs — built-in (provider-executed) tools must pass
# through validation. Regression guard for the xAI native web_search
# injection: the preflight validator previously rejected any tool whose
# ``type != "function"`` with "unsupported type", which would 400 every xAI
# turn once the native web_search tool is declared.
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_preflight_passes_native_web_search_tool_through():
    """Preflight keeps both the function tool and the native web_search tool.

    Request kwargs declare a regular ``function`` tool plus a bare
    ``{"type": "web_search"}`` entry; the test asserts both are present in
    the ``tools`` list returned by ``_preflight_codex_api_kwargs``.
    """
    function_tool = {
        "type": "function",
        "name": "read_file",
        "description": "Read.",
        "parameters": {"type": "object", "properties": {}},
    }
    native_search_tool = {"type": "web_search"}
    user_turn = {"role": "user", "content": [{"type": "input_text", "text": "hi"}]}
    kwargs = {
        "model": "grok-composer-2.5-fast",
        "instructions": "You are helpful.",
        "input": [user_turn],
        "store": False,
        "tools": [function_tool, native_search_tool],
    }

    validated = _preflight_codex_api_kwargs(kwargs, allow_stream=True)
    tools = validated["tools"]

    assert {"type": "web_search"} in tools
    assert any(
        tool.get("type") == "function" and tool.get("name") == "read_file"
        for tool in tools
    )
|
|
|
|
|
|
def test_preflight_still_rejects_unknown_tool_type():
    """An unrecognized tool type still fails preflight validation.

    The test expects ``_preflight_codex_api_kwargs`` to raise ``ValueError``
    whose message matches ``"unsupported type"`` for a made-up tool type.
    """
    bogus_tool = {"type": "totally_made_up_tool"}
    user_turn = {"role": "user", "content": [{"type": "input_text", "text": "hi"}]}
    kwargs = {
        "model": "grok-composer-2.5-fast",
        "instructions": "You are helpful.",
        "input": [user_turn],
        "store": False,
        "tools": [bogus_tool],
    }

    with pytest.raises(ValueError, match="unsupported type"):
        _preflight_codex_api_kwargs(kwargs, allow_stream=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# _format_responses_error — adapted from anomalyco/opencode#28757.
# Provider failures should surface BOTH the code (rate_limit_exceeded /
# context_length_exceeded / internal_error / server_error) and the message,
# so consumers can tell rate limits apart from context-length failures and
# both apart from generic stream drops.
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_format_responses_error_combines_code_and_message():
    """A payload with both fields is rendered as ``"<code>: <message>"``."""
    payload = {"code": "rate_limit_exceeded", "message": "Slow down"}

    rendered = _format_responses_error(payload, "failed")

    assert rendered == "rate_limit_exceeded: Slow down"
|
|
|
|
|
|
def test_format_responses_error_message_only():
    """A payload without a code is rendered as the bare message."""
    payload = {"message": "Upstream model unavailable"}

    rendered = _format_responses_error(payload, "failed")

    assert rendered == "Upstream model unavailable"
|
|
|
|
|
|
def test_format_responses_error_code_only_when_message_empty():
    """A code paired with an empty message is rendered as the bare code."""
    # Some providers/proxies send a code alongside an empty message body.
    # The earlier fallback was ``str(error_obj)``, which leaked the dict dump
    # ``{'code': 'internal_error', 'message': ''}`` into chat output. The
    # code alone is the meaningful field, so that is what must be surfaced.
    payload = {"code": "internal_error", "message": ""}

    rendered = _format_responses_error(payload, "failed")

    assert rendered == "internal_error"
|
|
|
|
|
|
def test_format_responses_error_code_only_when_message_missing():
    """A payload with no ``message`` key is rendered as the bare code."""
    payload = {"code": "server_error"}

    rendered = _format_responses_error(payload, "failed")

    assert rendered == "server_error"
|
|
|
|
|
|
def test_format_responses_error_attribute_style_payload():
    """Attribute-style payloads are formatted like dict payloads."""
    # SDK objects carry ``code``/``message`` as attributes instead of dict
    # keys; the Responses SDK hands back SimpleNamespace-style objects on
    # ``response.failed``, so the helper has to accept both shapes.
    payload = SimpleNamespace(code="context_length_exceeded", message="too long")

    rendered = _format_responses_error(payload, "failed")

    assert rendered == "context_length_exceeded: too long"
|
|
|
|
|
|
def test_format_responses_error_falls_back_to_status_when_empty():
    """With no error payload, the text is derived from the response status."""
    expected_by_status = {
        "failed": "Responses API returned status 'failed'",
        "cancelled": "Responses API returned status 'cancelled'",
    }

    for status, expected_text in expected_by_status.items():
        assert _format_responses_error(None, status) == expected_text
|
|
|
|
|
|
def test_format_responses_error_stringifies_opaque_payload():
    """A payload that is neither a dict nor code/message-bearing is stringified."""
    # Last resort: the provider sent something with no code/message to read.
    # Its string form is surfaced rather than silently dropped, so that it
    # at least shows up in logs.
    opaque_payload = "opaque sentinel"

    rendered = _format_responses_error(opaque_payload, "failed")

    assert rendered == "opaque sentinel"
|
|
|
|
|
|
def test_format_responses_error_ignores_non_string_code_message():
    """Non-string ``code``/``message`` values yield a best-effort string."""
    # Defensive case: a malformed gateway may put numbers or objects in
    # these fields. The helper must not crash; here the numeric code is
    # expected back as text and the ``None`` message contributes nothing.
    payload = {"code": 500, "message": None}

    rendered = _format_responses_error(payload, "failed")

    assert rendered == "500"
|
|
|
|
|
|
def test_normalize_codex_response_failed_includes_code_in_error():
    """A ``failed`` response raises with both the error code and message.

    Regression guard: the raised text used to be the bare ``'Slow down'``
    message, which could not be told apart from a generic stream truncation.
    """
    # Keep ``output`` non-empty so the "no output items" guard is not hit
    # before the failed-status branch; real failed responses often do carry
    # a partial message item next to the error.
    partial_message = SimpleNamespace(
        type="message",
        role="assistant",
        status="incomplete",
        content=[SimpleNamespace(type="output_text", text="partial")],
    )
    response = SimpleNamespace(
        status="failed",
        output=[partial_message],
        error={"code": "rate_limit_exceeded", "message": "Slow down"},
    )

    with pytest.raises(RuntimeError, match=r"^rate_limit_exceeded: Slow down$"):
        _normalize_codex_response(response)
|
|
|
|
|
|
def test_normalize_codex_response_failed_with_message_only():
    """A ``failed`` response with only a message raises that message verbatim.

    Backwards-compatibility check for error payloads that carry no code.
    """
    partial_message = SimpleNamespace(
        type="message",
        role="assistant",
        status="incomplete",
        content=[SimpleNamespace(type="output_text", text="partial")],
    )
    response = SimpleNamespace(
        status="failed",
        output=[partial_message],
        error={"message": "model error"},
    )

    with pytest.raises(RuntimeError, match=r"^model error$"):
        _normalize_codex_response(response)
|