mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
* fix(codex): surface error code in Responses 'failed' status errors
When a Codex Responses turn ends with status=failed, the response carries
the failure details under `response.error` as
`{code, message, param, ...}`. The previous extractor pulled only
`message`, so users seeing a rate-limit failure got a bare "Slow down"
string indistinguishable from a generic stream truncation; an
internal_error with empty message degraded to a dict dump
("{'code': 'internal_error', 'message': ''}").
Extract a `_format_responses_error()` helper that:
- prefixes `code` when both code and message are present
(e.g. 'rate_limit_exceeded: Slow down')
- falls back to the bare `code` when message is empty
- accepts both dict and attribute-style payloads (SDK and JSON-RPC paths)
- preserves the prior status-only fallback when no error payload exists
Apply the same helper at the sibling site in
`codex_app_server_session.run_turn()` so codex-CLI subprocess turn
failures get the same treatment.
Tests:
- 8 new unit tests for `_format_responses_error` covering both shapes,
empty/missing fields, non-string fields, and the status-only fallback.
- 2 regression tests on `_normalize_codex_response` for failed status
with and without a code, asserting the exact RuntimeError message.
- All 3603 tests in tests/agent/ pass.
Adapted from anomalyco/opencode#28757.
* feat(prompt): universal task-completion guidance + local Python toolchain probe
Two cross-model failure modes get a single-line answer in the cached
system prompt. Both gated by config (default on), both add zero overhead
when not needed, both verified via real AIAgent prompt builds.
## What changed
`TASK_COMPLETION_GUIDANCE` — short prompt block applied to ALL models.
Targets two failure modes observed on a real Sarasota real-estate build
task: (1) Opus stopped after writing an 85-byte stub and gave a prose
response with finish_reason=stop on call #3 of 90; (2) DeepSeek pushed
through a PEP-668 wall, then returned fabricated listings instead of
admitting the blocker. Both behaviors are model-family-agnostic, so the
guidance lives outside the existing tool_use_enforcement gate (~192
tokens, paid once per session via prefix cache).
`tools/env_probe.py` — local Python toolchain probe. Detects
python3/pip/uv/PEP-668 state and emits ONE short line in the system
prompt when something is non-default. Emits NOTHING when the env is
clean (zero token cost for normal users). Skipped entirely for remote
terminal backends (docker/modal/ssh) — they have their own probe.
Example output on a broken environment (the actual case):
Python toolchain: python3=3.11.15 (no pip module),
python=missing (use python3), pip→python3.12 (mismatch),
PEP 668=yes (use venv or uv).
## Config
Both flags live under `agent.` in config.yaml, default True:
agent:
  task_completion_guidance: true   # universal "finish the job" block
  environment_probe: true          # local Python toolchain hints
Neither addition required a `_config_version` bump — deep-merge fills
defaults in for existing user configs.
## Validation
| Test surface | Result |
|---|---|
| tests/tools/test_env_probe.py | 10/10 pass (probe unit) |
| tests/run_agent/test_run_agent.py — new classes | 8/8 pass (integration) |
| TestToolUseEnforcementConfig | 17/17 pass (no regression) |
| TestBuildSystemPrompt | 9/9 pass (no regression) |
| TestInvalidateSystemPrompt | 2/2 pass (no regression) |
| tests/agent/test_prompt_builder.py | 124/124 pass (no regression) |
| tests/hermes_cli/ | 5662/5662 pass (config defaults) |
| E2E AIAgent build (broken env) | Both blocks present, 2,178 chars |
| E2E AIAgent build (clean env) | 771-char net overhead, env probe silent |
157 lines
6.9 KiB
Python
157 lines
6.9 KiB
Python
"""Tests for tools/env_probe.py — local Python toolchain probe."""
|
|
|
|
import sys
|
|
|
|
import pytest
|
|
|
|
from tools import env_probe
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def reset_probe_cache(monkeypatch):
    """Give every test a clean probe cache and a local-backend environment.

    The remote-backend tests in this module show that setting
    ``TERMINAL_ENV`` to docker/modal/ssh makes the probe return ``""``.
    If that variable leaked in from the developer's shell or CI, the
    "silent" tests would pass vacuously and the "emits" tests would fail,
    so it is removed before each test. Tests that need a remote backend
    set it again through their own ``monkeypatch.setenv`` call; monkeypatch
    restores the original value on teardown.
    """
    monkeypatch.delenv("TERMINAL_ENV", raising=False)
    env_probe._reset_cache_for_tests()
    yield
    # Reset again so a cached result cannot leak into whichever test runs next.
    env_probe._reset_cache_for_tests()
|
|
|
|
|
|
class TestSilentWhenHealthy:
    """A healthy environment must yield an empty probe line — anything else
    is a token tax paid by every prompt of every user."""

    @staticmethod
    def _install_fakes(monkeypatch, *, python3_version, pep668, pip_version, which):
        """Replace the probe's detection helpers with canned answers.

        Only ``python3`` resolves to a version (``python`` is reported as
        absent) and the pip module is always reported as importable; the
        remaining answers are supplied by the caller.
        """

        def fake_version(b):
            if b == "python3":
                return python3_version
            return None

        monkeypatch.setattr(env_probe, "_python_version_of", fake_version)
        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: pep668)
        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: pip_version)
        monkeypatch.setattr(env_probe.shutil, "which", which)

    def test_clean_env_returns_empty(self, monkeypatch):
        """python3 with a pip module and no PEP 668 marker → silent."""
        self._install_fakes(
            monkeypatch,
            python3_version="3.13.3",
            pep668=False,
            pip_version="3.13",
            which=lambda name: None,
        )

        assert env_probe.get_environment_probe_line() == ""

    def test_pep668_with_uv_returns_empty(self, monkeypatch):
        """PEP 668 on its own is not worth reporting when uv is installed —
        the agent still has a working install path."""

        def which_only_uv(name):
            return "/usr/local/bin/uv" if name == "uv" else None

        self._install_fakes(
            monkeypatch,
            python3_version="3.12.4",
            pep668=True,
            pip_version="3.12",
            which=which_only_uv,
        )

        assert env_probe.get_environment_probe_line() == ""
|
|
|
|
|
|
class TestEmitsOnRealProblems:
    """Broken toolchains — the failure modes that motivated this feature —
    must produce a line the agent can act on."""

    def test_allen_scenario_python_version_mismatch(self, monkeypatch):
        """python3 is 3.11 without a pip module, the pip on PATH targets 3.12,
        PEP 668 is on and uv is absent — the exact scenario from the Sarasota
        real-estate task."""
        versions = {"python3": "3.11.15", "python": None}

        def fake_version(b):
            return versions.get(b)

        def fake_which(name):
            # Everything except uv resolves to a path under /usr/bin.
            if name == "uv":
                return None
            return "/usr/bin/" + name

        monkeypatch.setattr(env_probe, "_python_version_of", fake_version)
        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)
        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
        monkeypatch.setattr(env_probe.shutil, "which", fake_which)

        line = env_probe.get_environment_probe_line()

        # The probe must speak up ...
        assert line
        # ... on exactly one line, so the system prompt stays compact.
        assert "\n" not in line
        # It has to name the actual toolchain state.
        for fragment in ("3.11.15", "no pip module", "mismatch", "PEP 668"):
            assert fragment in line
        # And it has to point at a workable escape hatch.
        assert any(hint in line for hint in ("venv", "uv"))

    def test_missing_python3_is_named(self, monkeypatch):
        """A machine with no python3 at all must be reported as such."""
        fakes = (
            (env_probe, "_python_version_of", lambda b: None),
            (env_probe, "_has_pip_module", lambda b: False),
            (env_probe, "_detect_pep668", lambda b: False),
            (env_probe, "_pip_python_version", lambda: None),
            (env_probe.shutil, "which", lambda name: None),
        )
        for target, attribute, fake in fakes:
            monkeypatch.setattr(target, attribute, fake)

        line = env_probe.get_environment_probe_line()

        assert "python3=missing" in line

    def test_python_missing_but_python3_present(self, monkeypatch):
        """Typical Debian layout: only ``python3`` exists, so the agent must
        be told not to type ``python``."""

        def fake_version(b):
            if b == "python3":
                return "3.12.4"
            return None

        def fake_which(name):
            if name == "uv":
                return None
            return "/usr/bin/" + name

        monkeypatch.setattr(env_probe, "_python_version_of", fake_version)
        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
        monkeypatch.setattr(env_probe.shutil, "which", fake_which)

        line = env_probe.get_environment_probe_line()

        # `python=missing` is only reported once the probe is already off the
        # silent path; PEP 668 without uv is what takes it there in this
        # setup, so both signals are checked.
        assert "PEP 668" in line
        assert "python=missing" in line
|
|
|
|
|
|
class TestSkipsRemoteBackends:
    """Remote backends have their own probe; this one must stay out.

    Every test fakes a broken local toolchain before probing. Without that,
    a regression in the remote-backend skip could go unnoticed on a healthy
    host, because the local probe is itself silent when nothing is wrong.
    """

    @staticmethod
    def _break_local_env(monkeypatch):
        """Make the local toolchain look broken: no interpreter, no pip module."""
        monkeypatch.setattr(env_probe, "_python_version_of", lambda b: None)
        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)

    def _assert_backend_is_silent(self, monkeypatch, backend):
        """Select *backend* via ``TERMINAL_ENV`` and require an empty probe line."""
        monkeypatch.setenv("TERMINAL_ENV", backend)
        # Even with a broken local env, a remote backend must emit nothing.
        self._break_local_env(monkeypatch)
        assert env_probe.get_environment_probe_line() == ""

    def test_docker_returns_empty(self, monkeypatch):
        """TERMINAL_ENV=docker → the local probe stays silent."""
        self._assert_backend_is_silent(monkeypatch, "docker")

    def test_modal_returns_empty(self, monkeypatch):
        """TERMINAL_ENV=modal → the local probe stays silent."""
        self._assert_backend_is_silent(monkeypatch, "modal")

    def test_ssh_returns_empty(self, monkeypatch):
        """TERMINAL_ENV=ssh → the local probe stays silent."""
        self._assert_backend_is_silent(monkeypatch, "ssh")
|
|
|
|
|
|
class TestCaching:
    """Probing happens once per process; later calls reuse the first result
    for the lifetime of the agent."""

    def test_result_cached(self, monkeypatch):
        """Three calls to the probe must trigger only one round of detection."""
        probed_binaries = []

        def counting_version(b):
            # Record every interpreter lookup so repeat probing is detectable.
            probed_binaries.append(b)
            if b == "python3":
                return "3.12.4"
            return None

        module_fakes = {
            "_python_version_of": counting_version,
            "_has_pip_module": lambda b: True,
            "_detect_pep668": lambda b: False,
            "_pip_python_version": lambda: "3.12",
        }
        for attribute, fake in module_fakes.items():
            monkeypatch.setattr(env_probe, attribute, fake)
        monkeypatch.setattr(env_probe.shutil, "which", lambda name: None)

        for _ in range(3):
            env_probe.get_environment_probe_line()

        # The first invocation looks up two interpreters (python3 + python);
        # the following two invocations must not look up anything.
        assert len(probed_binaries) == 2
|
|
|
|
|
|
class TestRobustness:
    """Building the prompt must never fail because of the probe."""

    def test_subprocess_failure_returns_empty(self, monkeypatch):
        """With every subprocess call raising, the probe must not propagate
        the error and must still hand back a string."""

        def always_fails(*args, **kwargs):
            raise OSError("simulated")

        monkeypatch.setattr(env_probe.subprocess, "run", always_fails)

        # Must not raise. The exact text is deliberately left unchecked:
        # the only requirement verified here is that a string comes back.
        outcome = env_probe.get_environment_probe_line()
        assert isinstance(outcome, str)
|