feat(prompt): universal task-completion guidance + local Python toolchain probe (#34340)

* fix(codex): surface error code in Responses 'failed' status errors When a Codex Responses turn ends with status=failed, the response carries the failure details under `response.error` as `{code, message, param, ...}`. The previous extractor pulled only `message`, so users seeing a rate-limit failure got a bare "Slow down" string indistinguishable from a generic stream truncation; an internal_error with empty message degraded to a dict dump ("{'code': 'internal_error', 'message': ''}"). Extract a `_format_responses_error()` helper that: - prefixes `code` when both code and message are present (e.g. 'rate_limit_exceeded: Slow down') - falls back to the bare `code` when message is empty - accepts both dict and attribute-style payloads (SDK and JSON-RPC paths) - preserves the prior status-only fallback when no error payload exists Apply the same helper at the sibling site in `codex_app_server_session.run_turn()` so codex-CLI subprocess turn failures get the same treatment. Tests: - 8 new unit tests for `_format_responses_error` covering both shapes, empty/missing fields, non-string fields, and the status-only fallback. - 2 regression tests on `_normalize_codex_response` for failed status with and without a code, asserting the exact RuntimeError message. - All 3603 tests in tests/agent/ pass. Adapted from anomalyco/opencode#28757. * feat(prompt): universal task-completion guidance + local Python toolchain probe Two cross-model failure modes get a single-line answer in the cached system prompt. Both gated by config (default on), both add zero overhead when not needed, both verified via real AIAgent prompt builds. ## What changed `TASK_COMPLETION_GUIDANCE` — short prompt block applied to ALL models. Targets two failure modes observed on a real Sarasota real-estate build task: (1) Opus stopped after writing an 85-byte stub and gave a prose response with finish_reason=stop on call #3 of 90; (2) DeepSeek pushed through a PEP-668 wall, then returned fabricated listings instead of admitting the blocker. Both behaviors are model-family-agnostic, so the guidance lives outside the existing tool_use_enforcement gate (~192 tokens, paid once per session via prefix cache). `tools/env_probe.py` — local Python toolchain probe. Detects python3/pip/uv/PEP-668 state and emits ONE short line in the system prompt when something is non-default. Emits NOTHING when the env is clean (zero token cost for normal users). Skipped entirely for remote terminal backends (docker/modal/ssh) — they have their own probe. Example output on a broken environment (the actual case): Python toolchain: python3=3.11.15 (no pip module), python=missing (use python3), pip→python3.12 (mismatch), PEP 668=yes (use venv or uv). ## Config Both flags live under `agent.` in config.yaml, default True: agent: task_completion_guidance: true # universal "finish the job" block environment_probe: true # local Python toolchain hints Neither addition required a `_config_version` bump — deep-merge fills defaults in for existing user configs. ## Validation | Test surface | Result | |---|---| | tests/tools/test_env_probe.py | 10/10 pass (probe unit) | | tests/run_agent/test_run_agent.py — new classes | 8/8 pass (integration) | | TestToolUseEnforcementConfig | 17/17 pass (no regression) | | TestBuildSystemPrompt | 9/9 pass (no regression) | | TestInvalidateSystemPrompt | 2/2 pass (no regression) | | tests/agent/test_prompt_builder.py | 124/124 pass (no regression) | | tests/hermes_cli/ | 5662/5662 pass (config defaults) | | E2E AIAgent build (broken env) | Both blocks present, 2,178 chars | | E2E AIAgent build (clean env) | 771-char net overhead, env probe silent |
2026-06-08 08:11:38 +00:00 · 2026-05-28 22:26:09 -07:00 · 2026-05-28 22:26:09 -07:00 · a4d8f0f62a
commit a4d8f0f62a
parent 75d2c081c9
10 changed files with 819 additions and 6 deletions
--- a/tests/agent/test_codex_responses_adapter.py
+++ b/tests/agent/test_codex_responses_adapter.py
@ -1,6 +1,11 @@
 from types import SimpleNamespace

-from agent.codex_responses_adapter import _normalize_codex_response
+import pytest
+
+from agent.codex_responses_adapter import (
+    _format_responses_error,
+    _normalize_codex_response,
+)


 def test_normalize_codex_response_drops_transient_rs_tmp_reasoning_items():
@ -61,3 +66,111 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete():
    assert assistant_message.content == ""
    assert assistant_message.reasoning == "still thinking"
    assert assistant_message.codex_reasoning_items is None
+
+
+# ---------------------------------------------------------------------------
+# _format_responses_error — adapted from anomalyco/opencode#28757.
+# Provider failures should surface BOTH the code (rate_limit_exceeded /
+# context_length_exceeded / internal_error / server_error) and the message,
+# so consumers can tell rate limits apart from context-length failures and
+# both apart from generic stream drops.
+# ---------------------------------------------------------------------------
+
+
+def test_format_responses_error_combines_code_and_message():
+    err = {"code": "rate_limit_exceeded", "message": "Slow down"}
+    assert _format_responses_error(err, "failed") == "rate_limit_exceeded: Slow down"
+
+
+def test_format_responses_error_message_only():
+    err = {"message": "Upstream model unavailable"}
+    assert _format_responses_error(err, "failed") == "Upstream model unavailable"
+
+
+def test_format_responses_error_code_only_when_message_empty():
+    # Some providers/proxies emit a code with an empty message body. We
+    # used to fall back to ``str(error_obj)`` — a dict dump — which leaked
+    # ``{'code': 'internal_error', 'message': ''}`` into chat output. Now
+    # the bare code is surfaced, which is the meaningful field.
+    err = {"code": "internal_error", "message": ""}
+    assert _format_responses_error(err, "failed") == "internal_error"
+
+
+def test_format_responses_error_code_only_when_message_missing():
+    err = {"code": "server_error"}
+    assert _format_responses_error(err, "failed") == "server_error"
+
+
+def test_format_responses_error_attribute_style_payload():
+    # SDK objects expose ``code``/``message`` as attributes rather than dict
+    # keys. The helper must accept both shapes since the Responses SDK
+    # returns SimpleNamespace-style objects on ``response.failed``.
+    err = SimpleNamespace(code="context_length_exceeded", message="too long")
+    assert _format_responses_error(err, "failed") == "context_length_exceeded: too long"
+
+
+def test_format_responses_error_falls_back_to_status_when_empty():
+    assert (
+        _format_responses_error(None, "failed")
+        == "Responses API returned status 'failed'"
+    )
+    assert (
+        _format_responses_error(None, "cancelled")
+        == "Responses API returned status 'cancelled'"
+    )
+
+
+def test_format_responses_error_stringifies_opaque_payload():
+    # Last-resort: a provider sent something that isn't a dict and has no
+    # code/message attributes. Surface its repr rather than swallow it
+    # silently — at least it's visible in logs.
+    assert _format_responses_error("opaque sentinel", "failed") == "opaque sentinel"
+
+
+def test_format_responses_error_ignores_non_string_code_message():
+    # Defensive: a malformed gateway could send numbers/objects in these
+    # fields. We don't want to crash; we want a best-effort string.
+    err = {"code": 500, "message": None}
+    assert _format_responses_error(err, "failed") == "500"
+
+
+def test_normalize_codex_response_failed_includes_code_in_error():
+    """Regression: response_status == 'failed' should surface the error
+    code, not just the message. Used to leak a bare 'Slow down' string
+    that was indistinguishable from a generic stream truncation."""
+    # ``output`` non-empty so we don't trip the "no output items" guard
+    # before reaching the failed-status branch. Real failed responses
+    # often DO carry a partial message item alongside the error.
+    response = SimpleNamespace(
+        status="failed",
+        output=[
+            SimpleNamespace(
+                type="message",
+                role="assistant",
+                status="incomplete",
+                content=[SimpleNamespace(type="output_text", text="partial")],
+            ),
+        ],
+        error={"code": "rate_limit_exceeded", "message": "Slow down"},
+    )
+    with pytest.raises(RuntimeError, match=r"^rate_limit_exceeded: Slow down$"):
+        _normalize_codex_response(response)
+
+
+def test_normalize_codex_response_failed_with_message_only():
+    """Backwards-compat: a failed response with only a message field
+    (no code) should still surface that message verbatim."""
+    response = SimpleNamespace(
+        status="failed",
+        output=[
+            SimpleNamespace(
+                type="message",
+                role="assistant",
+                status="incomplete",
+                content=[SimpleNamespace(type="output_text", text="partial")],
+            ),
+        ],
+        error={"message": "model error"},
+    )
+    with pytest.raises(RuntimeError, match=r"^model error$"):
+        _normalize_codex_response(response)
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -1323,6 +1323,178 @@ class TestToolUseEnforcementConfig:
            assert TOOL_USE_ENFORCEMENT_GUIDANCE not in prompt


+class TestTaskCompletionGuidance:
+    """Tests for the universal task-completion / no-fabrication guidance
+    (config.yaml ``agent.task_completion_guidance``).
+
+    Unlike tool_use_enforcement, this block is model-family-agnostic — it
+    targets cross-model failure modes (stopping after a stub; fabricating
+    output when blocked) and should appear for every model by default."""
+
+    def _make_agent(self, model="anthropic/claude-opus-4.8",
+                    task_completion_guidance=True, **extra_cfg):
+        agent_cfg = {"task_completion_guidance": task_completion_guidance}
+        agent_cfg.update(extra_cfg)
+        with (
+            patch(
+                "run_agent.get_tool_definitions",
+                return_value=_make_tool_defs("terminal", "web_search"),
+            ),
+            patch("run_agent.check_toolset_requirements", return_value={}),
+            patch("run_agent.OpenAI"),
+            patch(
+                "hermes_cli.config.load_config",
+                return_value={"agent": agent_cfg},
+            ),
+        ):
+            a = AIAgent(
+                model=model,
+                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
+                quiet_mode=True,
+                skip_context_files=True,
+                skip_memory=True,
+            )
+            a.client = MagicMock()
+            return a
+
+    def test_default_injects_for_claude(self):
+        """The block must reach Claude by default — that's the
+        primary motivating model family."""
+        from agent.prompt_builder import TASK_COMPLETION_GUIDANCE
+        agent = self._make_agent(model="anthropic/claude-opus-4.8")
+        prompt = agent._build_system_prompt()
+        assert TASK_COMPLETION_GUIDANCE in prompt
+
+    def test_default_injects_for_deepseek(self):
+        """And for DeepSeek — the other model that failed the Sarasota
+        real-estate task by fabricating output."""
+        from agent.prompt_builder import TASK_COMPLETION_GUIDANCE
+        agent = self._make_agent(model="deepseek/deepseek-v4-flash")
+        prompt = agent._build_system_prompt()
+        assert TASK_COMPLETION_GUIDANCE in prompt
+
+    def test_default_injects_for_gpt(self):
+        """Also reaches model families that already get enforcement —
+        it's additive, not exclusive."""
+        from agent.prompt_builder import TASK_COMPLETION_GUIDANCE
+        agent = self._make_agent(model="openai/gpt-5.4")
+        prompt = agent._build_system_prompt()
+        assert TASK_COMPLETION_GUIDANCE in prompt
+
+    def test_false_disables(self):
+        from agent.prompt_builder import TASK_COMPLETION_GUIDANCE
+        agent = self._make_agent(
+            model="anthropic/claude-opus-4.8", task_completion_guidance=False
+        )
+        prompt = agent._build_system_prompt()
+        assert TASK_COMPLETION_GUIDANCE not in prompt
+
+    def test_no_tools_no_injection(self):
+        """Same gate as tool_use_enforcement — no tools means no guidance.
+        The guidance refers to ``tool calls`` and ``tool output``; without
+        tools it would be advice for a capability the agent doesn't have."""
+        from agent.prompt_builder import TASK_COMPLETION_GUIDANCE
+        with (
+            patch("run_agent.get_tool_definitions", return_value=[]),
+            patch("run_agent.check_toolset_requirements", return_value={}),
+            patch("run_agent.OpenAI"),
+            patch(
+                "hermes_cli.config.load_config",
+                return_value={"agent": {"task_completion_guidance": True}},
+            ),
+        ):
+            a = AIAgent(
+                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
+                quiet_mode=True,
+                skip_context_files=True,
+                skip_memory=True,
+                enabled_toolsets=[],
+            )
+            a.client = MagicMock()
+            assert TASK_COMPLETION_GUIDANCE not in a._build_system_prompt()
+
+
+class TestEnvironmentProbeIntegration:
+    """Tests for the local Python toolchain probe wiring (config.yaml
+    ``agent.environment_probe``).  The probe itself is unit-tested in
+    tests/tools/test_env_probe.py; this class confirms it lands in the
+    system prompt when enabled and stays out when disabled."""
+
+    def _make_agent(self, model="anthropic/claude-opus-4.8",
+                    environment_probe=True):
+        with (
+            patch(
+                "run_agent.get_tool_definitions",
+                return_value=_make_tool_defs("terminal"),
+            ),
+            patch("run_agent.check_toolset_requirements", return_value={}),
+            patch("run_agent.OpenAI"),
+            patch(
+                "hermes_cli.config.load_config",
+                return_value={"agent": {"environment_probe": environment_probe}},
+            ),
+        ):
+            a = AIAgent(
+                model=model,
+                api_key="test-key-1234567890",
+                base_url="https://openrouter.ai/api/v1",
+                quiet_mode=True,
+                skip_context_files=True,
+                skip_memory=True,
+            )
+            a.client = MagicMock()
+            return a
+
+    def test_probe_appears_when_problem_detected(self, monkeypatch):
+        """When the probe finds something off, the line lands in the prompt."""
+        from tools import env_probe
+        env_probe._reset_cache_for_tests()
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: {"python3": "3.11.15"}.get(b))
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
+        monkeypatch.setattr(env_probe.shutil, "which",
+                            lambda name: None if name == "uv" else "/usr/bin/" + name)
+
+        agent = self._make_agent(environment_probe=True)
+        prompt = agent._build_system_prompt()
+        assert "Python toolchain:" in prompt
+        assert "3.11.15" in prompt
+
+    def test_probe_silent_on_clean_env(self, monkeypatch):
+        """Clean environment → probe emits nothing → no line in prompt."""
+        from tools import env_probe
+        env_probe._reset_cache_for_tests()
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: "3.13.3" if b == "python3" else None)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.13")
+        monkeypatch.setattr(env_probe.shutil, "which", lambda name: None)
+
+        agent = self._make_agent(environment_probe=True)
+        prompt = agent._build_system_prompt()
+        assert "Python toolchain:" not in prompt
+
+    def test_probe_disabled_by_config(self, monkeypatch):
+        """Even with detectable problems, the probe stays out when disabled."""
+        from tools import env_probe
+        env_probe._reset_cache_for_tests()
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: {"python3": "3.11.15"}.get(b))
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
+        monkeypatch.setattr(env_probe.shutil, "which", lambda name: None)
+
+        agent = self._make_agent(environment_probe=False)
+        prompt = agent._build_system_prompt()
+        assert "Python toolchain:" not in prompt
+
+
 class TestInvalidateSystemPrompt:
    def test_clears_cache(self, agent):
        agent._cached_system_prompt = "cached value"
--- a/tests/tools/test_env_probe.py
+++ b/tests/tools/test_env_probe.py
@ -0,0 +1,157 @@
+"""Tests for tools/env_probe.py — local Python toolchain probe."""
+
+import sys
+
+import pytest
+
+from tools import env_probe
+
+
+@pytest.fixture(autouse=True)
+def reset_probe_cache():
+    """Each test starts with a clean cache."""
+    env_probe._reset_cache_for_tests()
+    yield
+    env_probe._reset_cache_for_tests()
+
+
+class TestSilentWhenHealthy:
+    """The probe must emit nothing when the environment is clean — otherwise
+    every prompt for every user pays an unnecessary token tax."""
+
+    def test_clean_env_returns_empty(self, monkeypatch):
+        """python3 + pip module + no PEP 668 → silent."""
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: "3.13.3" if b == "python3" else None)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.13")
+        monkeypatch.setattr(env_probe.shutil, "which", lambda name: None)
+        assert env_probe.get_environment_probe_line() == ""
+
+    def test_pep668_with_uv_returns_empty(self, monkeypatch):
+        """PEP 668 alone shouldn't trigger output if uv is installed —
+        agent has a viable install path."""
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: "3.12.4" if b == "python3" else None)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
+        monkeypatch.setattr(env_probe.shutil, "which",
+                            lambda name: "/usr/local/bin/uv" if name == "uv" else None)
+        assert env_probe.get_environment_probe_line() == ""
+
+
+class TestEmitsOnRealProblems:
+    """The probe must produce a usable line for the real failure modes
+    that drove this feature."""
+
+    def test_allen_scenario_python_version_mismatch(self, monkeypatch):
+        """python3 is 3.11 (no pip module), pip on PATH is 3.12, PEP 668 on,
+        no uv — the exact scenario from the Sarasota real-estate task."""
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: {"python3": "3.11.15", "python": None}.get(b))
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
+        monkeypatch.setattr(env_probe.shutil, "which",
+                            lambda name: None if name == "uv" else "/usr/bin/" + name)
+
+        line = env_probe.get_environment_probe_line()
+        assert line  # not silent
+        # Single line — must not blow up the system prompt.
+        assert "\n" not in line
+        # Names the real toolchain state
+        assert "3.11.15" in line
+        assert "no pip module" in line
+        assert "mismatch" in line
+        assert "PEP 668" in line
+        # Points at the right escape hatch
+        assert "venv" in line or "uv" in line
+
+    def test_missing_python3_is_named(self, monkeypatch):
+        """If python3 isn't installed at all, say so."""
+        monkeypatch.setattr(env_probe, "_python_version_of", lambda b: None)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: None)
+        monkeypatch.setattr(env_probe.shutil, "which", lambda name: None)
+
+        line = env_probe.get_environment_probe_line()
+        assert "python3=missing" in line
+
+    def test_python_missing_but_python3_present(self, monkeypatch):
+        """Common on Debian: only python3 exists, agent shouldn't type
+        `python`."""
+        monkeypatch.setattr(env_probe, "_python_version_of",
+                            lambda b: "3.12.4" if b == "python3" else None)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
+        monkeypatch.setattr(env_probe.shutil, "which",
+                            lambda name: None if name == "uv" else "/usr/bin/" + name)
+
+        line = env_probe.get_environment_probe_line()
+        # `python=missing` only matters in the non-silent path; PEP 668 (without
+        # uv) is what brings us off-silent here, so check both signals.
+        assert "PEP 668" in line
+        assert "python=missing" in line
+
+
+class TestSkipsRemoteBackends:
+    """Remote backends have their own probe; this one must stay out."""
+
+    def test_docker_returns_empty(self, monkeypatch):
+        monkeypatch.setenv("TERMINAL_ENV", "docker")
+        # Even with a broken local env, docker must emit nothing.
+        monkeypatch.setattr(env_probe, "_python_version_of", lambda b: None)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False)
+        assert env_probe.get_environment_probe_line() == ""
+
+    def test_modal_returns_empty(self, monkeypatch):
+        monkeypatch.setenv("TERMINAL_ENV", "modal")
+        assert env_probe.get_environment_probe_line() == ""
+
+    def test_ssh_returns_empty(self, monkeypatch):
+        monkeypatch.setenv("TERMINAL_ENV", "ssh")
+        assert env_probe.get_environment_probe_line() == ""
+
+
+class TestCaching:
+    """The probe runs once per process — the result is deterministic for
+    the lifetime of the agent."""
+
+    def test_result_cached(self, monkeypatch):
+        calls = []
+
+        def counting_version(b):
+            calls.append(b)
+            return "3.12.4" if b == "python3" else None
+
+        monkeypatch.setattr(env_probe, "_python_version_of", counting_version)
+        monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True)
+        monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False)
+        monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12")
+        monkeypatch.setattr(env_probe.shutil, "which", lambda name: None)
+
+        env_probe.get_environment_probe_line()
+        env_probe.get_environment_probe_line()
+        env_probe.get_environment_probe_line()
+
+        # Only the first call probes — caller-counting confirms it.
+        # Two calls (python3 + python) on first invocation, zero after.
+        assert len(calls) == 2
+
+
+class TestRobustness:
+    """The probe must NEVER crash the prompt build."""
+
+    def test_subprocess_failure_returns_empty(self, monkeypatch):
+        """If every subprocess fails, just stay silent."""
+        def boom(*a, **kw):
+            raise OSError("simulated")
+        monkeypatch.setattr(env_probe.subprocess, "run", boom)
+        # Should not raise, should just return ""
+        result = env_probe.get_environment_probe_line()
+        # Whatever the result is, it must be a string
+        assert isinstance(result, str)