mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
* fix(codex): surface error code in Responses 'failed' status errors
When a Codex Responses turn ends with status=failed, the response carries
the failure details under `response.error` as
`{code, message, param, ...}`. The previous extractor pulled only
`message`, so users seeing a rate-limit failure got a bare "Slow down"
string indistinguishable from a generic stream truncation; an
internal_error with empty message degraded to a dict dump
("{'code': 'internal_error', 'message': ''}").
Extract a `_format_responses_error()` helper that:
- prefixes `code` when both code and message are present
(e.g. 'rate_limit_exceeded: Slow down')
- falls back to the bare `code` when message is empty
- accepts both dict and attribute-style payloads (SDK and JSON-RPC paths)
- preserves the prior status-only fallback when no error payload exists
Apply the same helper at the sibling site in
`codex_app_server_session.run_turn()` so codex-CLI subprocess turn
failures get the same treatment.
Tests:
- 8 new unit tests for `_format_responses_error` covering both shapes,
empty/missing fields, non-string fields, and the status-only fallback.
- 2 regression tests on `_normalize_codex_response` for failed status
with and without a code, asserting the exact RuntimeError message.
- All 3603 tests in tests/agent/ pass.
Adapted from anomalyco/opencode#28757.
* feat(prompt): universal task-completion guidance + local Python toolchain probe
Two cross-model failure modes get a single-line answer in the cached
system prompt. Both gated by config (default on), both add zero overhead
when not needed, both verified via real AIAgent prompt builds.
## What changed
`TASK_COMPLETION_GUIDANCE` — short prompt block applied to ALL models.
Targets two failure modes observed on a real Sarasota real-estate build
task: (1) Opus stopped after writing an 85-byte stub and gave a prose
response with finish_reason=stop on call #3 of 90; (2) DeepSeek pushed
through a PEP-668 wall, then returned fabricated listings instead of
admitting the blocker. Both behaviors are model-family-agnostic, so the
guidance lives outside the existing tool_use_enforcement gate (~192
tokens, paid once per session via prefix cache).
`tools/env_probe.py` — local Python toolchain probe. Detects
python3/pip/uv/PEP-668 state and emits ONE short line in the system
prompt when something is non-default. Emits NOTHING when the env is
clean (zero token cost for normal users). Skipped entirely for remote
terminal backends (docker/modal/ssh) — they have their own probe.
Example output on a broken environment (the actual case):
Python toolchain: python3=3.11.15 (no pip module),
python=missing (use python3), pip→python3.12 (mismatch),
PEP 668=yes (use venv or uv).
## Config
Both flags live under `agent.` in config.yaml, default True:
agent:
  task_completion_guidance: true  # universal "finish the job" block
  environment_probe: true         # local Python toolchain hints
Neither addition required a `_config_version` bump — deep-merge fills
defaults in for existing user configs.
## Validation
| Test surface | Result |
|---|---|
| tests/tools/test_env_probe.py | 10/10 pass (probe unit) |
| tests/run_agent/test_run_agent.py — new classes | 8/8 pass (integration) |
| TestToolUseEnforcementConfig | 17/17 pass (no regression) |
| TestBuildSystemPrompt | 9/9 pass (no regression) |
| TestInvalidateSystemPrompt | 2/2 pass (no regression) |
| tests/agent/test_prompt_builder.py | 124/124 pass (no regression) |
| tests/hermes_cli/ | 5662/5662 pass (config defaults) |
| E2E AIAgent build (broken env) | Both blocks present, 2,178 chars |
| E2E AIAgent build (clean env) | 771-char net overhead, env probe silent |
176 lines
6.5 KiB
Python
176 lines
6.5 KiB
Python
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
from agent.codex_responses_adapter import (
|
|
_format_responses_error,
|
|
_normalize_codex_response,
|
|
)
|
|
|
|
|
|
def test_normalize_codex_response_drops_transient_rs_tmp_reasoning_items():
    """Only the stable reasoning item survives normalization.

    The response carries one ``rs_tmp_*`` reasoning item, one ``rs_*``
    reasoning item and a completed assistant message. The normalized
    message must keep the text, finish with ``"stop"`` and list just the
    ``rs_456`` item in ``codex_reasoning_items``.
    """
    transient_reasoning = SimpleNamespace(
        type="reasoning",
        id="rs_tmp_123",
        encrypted_content="opaque-transient",
        summary=[],
    )
    stable_reasoning = SimpleNamespace(
        type="reasoning",
        id="rs_456",
        encrypted_content="opaque-stable",
        summary=[SimpleNamespace(text="stable summary")],
    )
    final_message = SimpleNamespace(
        type="message",
        role="assistant",
        status="completed",
        content=[SimpleNamespace(type="output_text", text="done")],
    )
    resp = SimpleNamespace(
        status="completed",
        output=[transient_reasoning, stable_reasoning, final_message],
    )

    message, reason = _normalize_codex_response(resp)

    # Expected replay payload: the transient item is absent entirely.
    expected_items = [
        {
            "type": "reasoning",
            "encrypted_content": "opaque-stable",
            "id": "rs_456",
            "summary": [{"type": "summary_text", "text": "stable summary"}],
        }
    ]
    assert reason == "stop"
    assert message.content == "done"
    assert message.codex_reasoning_items == expected_items
|
|
|
|
|
|
def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
    """A lone ``rs_tmp_*`` reasoning item yields an incomplete turn.

    With no message item in the output, the normalized result must report
    ``"incomplete"``, carry empty content, expose the summary text as
    ``reasoning`` and keep no reasoning items.
    """
    lone_reasoning = SimpleNamespace(
        type="reasoning",
        id="rs_tmp_789",
        encrypted_content="opaque-transient",
        summary=[SimpleNamespace(text="still thinking")],
    )
    resp = SimpleNamespace(status="completed", output=[lone_reasoning])

    message, reason = _normalize_codex_response(resp)

    assert reason == "incomplete"
    assert message.content == ""
    assert message.reasoning == "still thinking"
    assert message.codex_reasoning_items is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _format_responses_error — adapted from anomalyco/opencode#28757.
|
|
# Provider failures should surface BOTH the code (rate_limit_exceeded /
|
|
# context_length_exceeded / internal_error / server_error) and the message,
|
|
# so consumers can tell rate limits apart from context-length failures and
|
|
# both apart from generic stream drops.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_format_responses_error_combines_code_and_message():
    """When both fields are present the result is ``"<code>: <message>"``."""
    payload = {"code": "rate_limit_exceeded", "message": "Slow down"}
    rendered = _format_responses_error(payload, "failed")
    assert rendered == "rate_limit_exceeded: Slow down"
|
|
|
|
|
|
def test_format_responses_error_message_only():
    """A payload without a code is rendered as the bare message."""
    payload = {"message": "Upstream model unavailable"}
    rendered = _format_responses_error(payload, "failed")
    assert rendered == "Upstream model unavailable"
|
|
|
|
|
|
def test_format_responses_error_code_only_when_message_empty():
    """An empty message leaves just the code in the rendered error.

    Some providers/proxies send a code with an empty message body. The
    previous fallback was ``str(error_obj)``, which leaked a dict dump such
    as ``{'code': 'internal_error', 'message': ''}`` into chat output; the
    bare code is the meaningful field and is what must be surfaced.
    """
    payload = {"code": "internal_error", "message": ""}
    rendered = _format_responses_error(payload, "failed")
    assert rendered == "internal_error"
|
|
|
|
|
|
def test_format_responses_error_code_only_when_message_missing():
    """A payload with no ``message`` key is rendered as the bare code."""
    payload = {"code": "server_error"}
    rendered = _format_responses_error(payload, "failed")
    assert rendered == "server_error"
|
|
|
|
|
|
def test_format_responses_error_attribute_style_payload():
    """Attribute-style payloads are formatted the same way as dicts.

    SDK objects expose ``code``/``message`` as attributes rather than dict
    keys, so the helper has to accept both shapes; a ``SimpleNamespace``
    stands in for the SDK object here.
    """
    payload = SimpleNamespace(code="context_length_exceeded", message="too long")
    rendered = _format_responses_error(payload, "failed")
    assert rendered == "context_length_exceeded: too long"
|
|
|
|
|
|
def test_format_responses_error_falls_back_to_status_when_empty():
    """With no error payload the message is built from the status alone."""
    # status passed in -> exact message expected back
    expected_by_status = {
        "failed": "Responses API returned status 'failed'",
        "cancelled": "Responses API returned status 'cancelled'",
    }
    for status, expected in expected_by_status.items():
        assert _format_responses_error(None, status) == expected
|
|
|
|
|
|
def test_format_responses_error_stringifies_opaque_payload():
    """An opaque payload is surfaced as its plain text, not swallowed.

    Last resort: the provider sent something that is not a dict and has no
    ``code``/``message`` attributes. The helper must still return it (here
    the string comes back unchanged, without quotes) so that it is at least
    visible in logs.
    """
    opaque = "opaque sentinel"
    rendered = _format_responses_error(opaque, "failed")
    assert rendered == "opaque sentinel"
|
|
|
|
|
|
def test_format_responses_error_ignores_non_string_code_message():
    """Non-string fields produce a best-effort string instead of a crash.

    Defensive case for a malformed gateway that sends numbers/objects in
    these fields: the integer code must be rendered as ``"500"`` and the
    ``None`` message must contribute nothing.
    """
    payload = {"code": 500, "message": None}
    rendered = _format_responses_error(payload, "failed")
    assert rendered == "500"
|
|
|
|
|
|
def test_normalize_codex_response_failed_includes_code_in_error():
    """Regression: a ``failed`` response must raise with code and message.

    Previously only the message was surfaced, so a rate limit produced a
    bare 'Slow down' string that was indistinguishable from a generic
    stream truncation.
    """
    # Keep ``output`` non-empty so the "no output items" guard is not hit
    # before the failed-status branch. Real failed responses often DO carry
    # a partial message item alongside the error.
    partial_item = SimpleNamespace(
        type="message",
        role="assistant",
        status="incomplete",
        content=[SimpleNamespace(type="output_text", text="partial")],
    )
    error_payload = {"code": "rate_limit_exceeded", "message": "Slow down"}
    resp = SimpleNamespace(
        status="failed",
        output=[partial_item],
        error=error_payload,
    )

    with pytest.raises(RuntimeError, match=r"^rate_limit_exceeded: Slow down$"):
        _normalize_codex_response(resp)
|
|
|
|
|
|
def test_normalize_codex_response_failed_with_message_only():
    """Backwards-compat: without a code the message is raised verbatim."""
    partial_item = SimpleNamespace(
        type="message",
        role="assistant",
        status="incomplete",
        content=[SimpleNamespace(type="output_text", text="partial")],
    )
    error_payload = {"message": "model error"}
    resp = SimpleNamespace(
        status="failed",
        output=[partial_item],
        error=error_payload,
    )

    with pytest.raises(RuntimeError, match=r"^model error$"):
        _normalize_codex_response(resp)
|