diff --git a/agent/agent_init.py b/agent/agent_init.py index 79b5522a292..5897853c0d1 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1201,6 +1201,18 @@ def init_agent( _agent_section = {} agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto") + # Universal task-completion guidance toggle. Default True. Surfaced + # as a separate flag from tool_use_enforcement because the guidance + # applies to ALL models, not just the model families enforcement + # targets. + agent._task_completion_guidance = bool(_agent_section.get("task_completion_guidance", True)) + + # Local Python toolchain probe toggle. Default True. When False, + # the probe is skipped entirely (no subprocess calls, no system-prompt + # line). Useful for users on exotic setups where the probe heuristics + # are noisy. + agent._environment_probe = bool(_agent_section.get("environment_probe", True)) + # App-level API retry count (wraps each model API call). Default 3, # overridable via agent.api_max_retries in config.yaml. See #11616. try: diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py index 230a6e613b1..943131f5592 100644 --- a/agent/codex_responses_adapter.py +++ b/agent/codex_responses_adapter.py @@ -980,6 +980,48 @@ def _extract_responses_reasoning_text(item: Any) -> str: return "" +def _format_responses_error(error_obj: Any, response_status: str) -> str: + """Build a human-readable error string from a Responses ``response.error`` payload. + + The OpenAI Responses API carries failure details under ``response.error`` + on terminal ``response.failed`` events, in the shape + ``{"code": "rate_limit_exceeded", "message": "Slow down", "param": ...}``. + Earlier code only surfaced ``message``, which left users staring at bare + strings like ``"Slow down"`` while the failure mode (rate limit vs + context-length vs internal_error vs model-overloaded) was hidden in + ``code``. We now prefix ``code`` when both are present so consumers can + distinguish failure modes without parsing the bare message. + + Falls back to ``code`` alone when ``message`` is empty, and to a stable + default referencing the response status when no error payload is + available at all. Adapted from anomalyco/opencode#28757. + """ + # Pull code and message from either dict or attribute-style payloads. + code: Any = None + message: Any = None + if isinstance(error_obj, dict): + code = error_obj.get("code") + message = error_obj.get("message") + elif error_obj is not None: + code = getattr(error_obj, "code", None) + message = getattr(error_obj, "message", None) + + code_str = str(code).strip() if isinstance(code, str) else (str(code).strip() if code else "") + message_str = str(message).strip() if isinstance(message, str) else (str(message).strip() if message else "") + + if code_str and message_str: + return f"{code_str}: {message_str}" + if message_str: + return message_str + if code_str: + return code_str + if error_obj: + # Last-resort: stringify whatever the provider sent so it's at least + # visible in logs/UI rather than silently swallowed. + return str(error_obj) + return f"Responses API returned status '{response_status}'" + + # --------------------------------------------------------------------------- # Full response normalization # --------------------------------------------------------------------------- @@ -1023,10 +1065,7 @@ def _normalize_codex_response( if response_status in {"failed", "cancelled"}: error_obj = getattr(response, "error", None) - if isinstance(error_obj, dict): - error_msg = error_obj.get("message") or str(error_obj) - else: - error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'" + error_msg = _format_responses_error(error_obj, response_status) raise RuntimeError(error_msg) content_parts: List[str] = [] diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py index 365bcdc075f..059d16ceb1a 100644 --- a/agent/prompt_builder.py +++ b/agent/prompt_builder.py @@ -262,6 +262,37 @@ TOOL_USE_ENFORCEMENT_GUIDANCE = ( # Add new patterns here when a model family needs explicit steering. TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm", "qwen", "deepseek") +# Universal "finish the job" guidance — applied to ALL models, not gated +# by model family. Addresses two cross-model failure modes: +# 1. Stopping after a stub: writing a tiny file or running one command +# and then ending the turn with a description of the plan instead +# of the finished artifact. (Observed on Opus during a real +# Sarasota real-estate build task: 3 API calls, 85-byte file, +# one terminal command, finish_reason=stop.) +# 2. Fabricating output when a real path is blocked. When `pip` or a +# tool fails, some models will synthesize plausible-looking results +# (fake addresses, fake JSON, fake numbers) instead of reporting +# the blocker. (Observed on DeepSeek v4-flash on the same task: +# pushed through PEP-668 wall, then returned fabricated listings.) +# +# Short on purpose. This block is shipped to every user, every session, +# in the cached system prompt — token cost is paid once at install and +# then amortised across all sessions via prefix caching. Keep it tight. +TASK_COMPLETION_GUIDANCE = ( + "# Finishing the job\n" + "When the user asks you to build, run, or verify something, the deliverable is " + "a working artifact backed by real tool output — not a description of one. " + "Do not stop after writing a stub, a plan, or a single command. Keep working " + "until you have actually exercised the code or produced the requested result, " + "then report what real execution returned.\n" + "If a tool, install, or network call fails and blocks the real path, say so " + "directly and try an alternative (different package manager, different " + "approach, ask the user). NEVER substitute plausible-looking fabricated " + "output (made-up data, invented file contents, synthesised API responses) " + "for results you couldn't actually produce. Reporting a blocker honestly " + "is always better than inventing a result." +) + # OpenAI GPT/Codex-specific execution guidance. Addresses known failure modes # where GPT models abandon work on partial results, skip prerequisite lookups, # hallucinate instead of using tools, and declare "done" without verification. diff --git a/agent/system_prompt.py b/agent/system_prompt.py index 8fa4c191563..4a61a2ee045 100644 --- a/agent/system_prompt.py +++ b/agent/system_prompt.py @@ -37,6 +37,7 @@ from agent.prompt_builder import ( PLATFORM_HINTS, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE, + TASK_COMPLETION_GUIDANCE, TOOL_USE_ENFORCEMENT_GUIDANCE, TOOL_USE_ENFORCEMENT_MODELS, ) @@ -100,6 +101,15 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None) # Pointer to the hermes-agent skill + docs for user questions about Hermes itself. stable_parts.append(HERMES_AGENT_HELP_GUIDANCE) + # Universal task-completion / no-fabrication guidance. Applied to ALL + # models regardless of tool_use_enforcement gating — the failure modes + # this targets (stopping after a stub; fabricating output when a real + # path is blocked) are not model-family specific. Gated only by + # config.yaml ``agent.task_completion_guidance`` (default True) so + # users who want a leaner prompt can turn it off. + if getattr(agent, "_task_completion_guidance", True) and agent.valid_tool_names: + stable_parts.append(TASK_COMPLETION_GUIDANCE) + # Tool-aware behavioral guidance: only inject when the tools are loaded tool_guidance = [] if "memory" in agent.valid_tool_names: @@ -205,6 +215,23 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None) if _env_hints: stable_parts.append(_env_hints) + # Local Python toolchain probe — names python/pip/uv/PEP-668 state when + # something is non-default so the model can pick the right install + # strategy without discovering by failure. Emits a single line; emits + # NOTHING when the environment is clean (no token cost). Skipped + # entirely for remote terminal backends (the host's Python state is + # irrelevant when tools run inside docker/modal/ssh). Gated by + # config.yaml ``agent.environment_probe`` (default True). + if getattr(agent, "_environment_probe", True): + try: + from tools.env_probe import get_environment_probe_line + _probe_line = get_environment_probe_line() + if _probe_line: + stable_parts.append(_probe_line) + except Exception: + # Probe failure must never block prompt build. + pass + # Active-profile hint — names the Hermes profile the agent is running # under so it doesn't conflate ~/.hermes/skills/ (default profile) with # ~/.hermes/profiles//skills/ (this profile's). Deterministic diff --git a/agent/transports/codex_app_server_session.py b/agent/transports/codex_app_server_session.py index 74e164d64d9..60eb607084f 100644 --- a/agent/transports/codex_app_server_session.py +++ b/agent/transports/codex_app_server_session.py @@ -31,6 +31,7 @@ import time from dataclasses import dataclass, field from typing import Any, Callable, Optional +from agent.codex_responses_adapter import _format_responses_error from agent.redact import redact_sensitive_text from agent.transports.codex_app_server import ( CodexAppServerClient, @@ -581,7 +582,7 @@ class CodexAppServerSession: (note.get("params") or {}).get("turn") or {} ).get("error") if err_obj: - err_msg = err_obj.get("message") or str(err_obj) + err_msg = _format_responses_error(err_obj, str(turn_status)) # If the turn failed for an auth/refresh reason, # rewrite the error into a re-auth hint AND mark # the session for retirement. diff --git a/hermes_cli/config.py b/hermes_cli/config.py index c3c3cf61154..f0df9cd0345 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -669,6 +669,20 @@ DEFAULT_CONFIG = { # (force on/off for all models), or a list of model-name substrings # to match (e.g. ["gpt", "codex", "gemini", "qwen"]). "tool_use_enforcement": "auto", + # Universal "finish the job" guidance — short prompt block applied to + # all models that targets two cross-family failure modes: (1) stopping + # after a stub instead of finishing the artifact, (2) fabricating + # plausible-looking output when a real path is blocked. Costs ~80 + # tokens in the cached system prompt. Set False to disable globally. + "task_completion_guidance": True, + # Local-environment toolchain probe — surfaces Python/pip/uv/PEP-668 + # state in the system prompt when something non-default is detected + # (e.g. python3 has no pip module, pip→python version mismatch, PEP + # 668 enforcement without uv). Costs zero tokens when the env is + # clean (probe emits nothing). Skipped for remote terminal backends + # (docker/modal/ssh — they have their own probe). Set False to + # disable entirely. + "environment_probe": True, # Staged inactivity warning: send a warning to the user at this # threshold before escalating to a full timeout. The warning fires # once per run and does not interrupt the agent. 0 = disable warning. diff --git a/tests/agent/test_codex_responses_adapter.py b/tests/agent/test_codex_responses_adapter.py index 751348bc6da..db3316a0567 100644 --- a/tests/agent/test_codex_responses_adapter.py +++ b/tests/agent/test_codex_responses_adapter.py @@ -1,6 +1,11 @@ from types import SimpleNamespace -from agent.codex_responses_adapter import _normalize_codex_response +import pytest + +from agent.codex_responses_adapter import ( + _format_responses_error, + _normalize_codex_response, +) def test_normalize_codex_response_drops_transient_rs_tmp_reasoning_items(): @@ -61,3 +66,111 @@ def test_normalize_codex_response_treats_summary_only_reasoning_as_incomplete(): assert assistant_message.content == "" assert assistant_message.reasoning == "still thinking" assert assistant_message.codex_reasoning_items is None + + +# --------------------------------------------------------------------------- +# _format_responses_error — adapted from anomalyco/opencode#28757. +# Provider failures should surface BOTH the code (rate_limit_exceeded / +# context_length_exceeded / internal_error / server_error) and the message, +# so consumers can tell rate limits apart from context-length failures and +# both apart from generic stream drops. +# --------------------------------------------------------------------------- + + +def test_format_responses_error_combines_code_and_message(): + err = {"code": "rate_limit_exceeded", "message": "Slow down"} + assert _format_responses_error(err, "failed") == "rate_limit_exceeded: Slow down" + + +def test_format_responses_error_message_only(): + err = {"message": "Upstream model unavailable"} + assert _format_responses_error(err, "failed") == "Upstream model unavailable" + + +def test_format_responses_error_code_only_when_message_empty(): + # Some providers/proxies emit a code with an empty message body. We + # used to fall back to ``str(error_obj)`` — a dict dump — which leaked + # ``{'code': 'internal_error', 'message': ''}`` into chat output. Now + # the bare code is surfaced, which is the meaningful field. + err = {"code": "internal_error", "message": ""} + assert _format_responses_error(err, "failed") == "internal_error" + + +def test_format_responses_error_code_only_when_message_missing(): + err = {"code": "server_error"} + assert _format_responses_error(err, "failed") == "server_error" + + +def test_format_responses_error_attribute_style_payload(): + # SDK objects expose ``code``/``message`` as attributes rather than dict + # keys. The helper must accept both shapes since the Responses SDK + # returns SimpleNamespace-style objects on ``response.failed``. + err = SimpleNamespace(code="context_length_exceeded", message="too long") + assert _format_responses_error(err, "failed") == "context_length_exceeded: too long" + + +def test_format_responses_error_falls_back_to_status_when_empty(): + assert ( + _format_responses_error(None, "failed") + == "Responses API returned status 'failed'" + ) + assert ( + _format_responses_error(None, "cancelled") + == "Responses API returned status 'cancelled'" + ) + + +def test_format_responses_error_stringifies_opaque_payload(): + # Last-resort: a provider sent something that isn't a dict and has no + # code/message attributes. Surface its repr rather than swallow it + # silently — at least it's visible in logs. + assert _format_responses_error("opaque sentinel", "failed") == "opaque sentinel" + + +def test_format_responses_error_ignores_non_string_code_message(): + # Defensive: a malformed gateway could send numbers/objects in these + # fields. We don't want to crash; we want a best-effort string. + err = {"code": 500, "message": None} + assert _format_responses_error(err, "failed") == "500" + + +def test_normalize_codex_response_failed_includes_code_in_error(): + """Regression: response_status == 'failed' should surface the error + code, not just the message. Used to leak a bare 'Slow down' string + that was indistinguishable from a generic stream truncation.""" + # ``output`` non-empty so we don't trip the "no output items" guard + # before reaching the failed-status branch. Real failed responses + # often DO carry a partial message item alongside the error. + response = SimpleNamespace( + status="failed", + output=[ + SimpleNamespace( + type="message", + role="assistant", + status="incomplete", + content=[SimpleNamespace(type="output_text", text="partial")], + ), + ], + error={"code": "rate_limit_exceeded", "message": "Slow down"}, + ) + with pytest.raises(RuntimeError, match=r"^rate_limit_exceeded: Slow down$"): + _normalize_codex_response(response) + + +def test_normalize_codex_response_failed_with_message_only(): + """Backwards-compat: a failed response with only a message field + (no code) should still surface that message verbatim.""" + response = SimpleNamespace( + status="failed", + output=[ + SimpleNamespace( + type="message", + role="assistant", + status="incomplete", + content=[SimpleNamespace(type="output_text", text="partial")], + ), + ], + error={"message": "model error"}, + ) + with pytest.raises(RuntimeError, match=r"^model error$"): + _normalize_codex_response(response) diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 927ae9f1cb0..70d5abf42b5 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -1323,6 +1323,178 @@ class TestToolUseEnforcementConfig: assert TOOL_USE_ENFORCEMENT_GUIDANCE not in prompt +class TestTaskCompletionGuidance: + """Tests for the universal task-completion / no-fabrication guidance + (config.yaml ``agent.task_completion_guidance``). + + Unlike tool_use_enforcement, this block is model-family-agnostic — it + targets cross-model failure modes (stopping after a stub; fabricating + output when blocked) and should appear for every model by default.""" + + def _make_agent(self, model="anthropic/claude-opus-4.8", + task_completion_guidance=True, **extra_cfg): + agent_cfg = {"task_completion_guidance": task_completion_guidance} + agent_cfg.update(extra_cfg) + with ( + patch( + "run_agent.get_tool_definitions", + return_value=_make_tool_defs("terminal", "web_search"), + ), + patch("run_agent.check_toolset_requirements", return_value={}), + patch("run_agent.OpenAI"), + patch( + "hermes_cli.config.load_config", + return_value={"agent": agent_cfg}, + ), + ): + a = AIAgent( + model=model, + api_key="test-key-1234567890", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + a.client = MagicMock() + return a + + def test_default_injects_for_claude(self): + """The block must reach Claude by default — that's the + primary motivating model family.""" + from agent.prompt_builder import TASK_COMPLETION_GUIDANCE + agent = self._make_agent(model="anthropic/claude-opus-4.8") + prompt = agent._build_system_prompt() + assert TASK_COMPLETION_GUIDANCE in prompt + + def test_default_injects_for_deepseek(self): + """And for DeepSeek — the other model that failed the Sarasota + real-estate task by fabricating output.""" + from agent.prompt_builder import TASK_COMPLETION_GUIDANCE + agent = self._make_agent(model="deepseek/deepseek-v4-flash") + prompt = agent._build_system_prompt() + assert TASK_COMPLETION_GUIDANCE in prompt + + def test_default_injects_for_gpt(self): + """Also reaches model families that already get enforcement — + it's additive, not exclusive.""" + from agent.prompt_builder import TASK_COMPLETION_GUIDANCE + agent = self._make_agent(model="openai/gpt-5.4") + prompt = agent._build_system_prompt() + assert TASK_COMPLETION_GUIDANCE in prompt + + def test_false_disables(self): + from agent.prompt_builder import TASK_COMPLETION_GUIDANCE + agent = self._make_agent( + model="anthropic/claude-opus-4.8", task_completion_guidance=False + ) + prompt = agent._build_system_prompt() + assert TASK_COMPLETION_GUIDANCE not in prompt + + def test_no_tools_no_injection(self): + """Same gate as tool_use_enforcement — no tools means no guidance. + The guidance refers to ``tool calls`` and ``tool output``; without + tools it would be advice for a capability the agent doesn't have.""" + from agent.prompt_builder import TASK_COMPLETION_GUIDANCE + with ( + patch("run_agent.get_tool_definitions", return_value=[]), + patch("run_agent.check_toolset_requirements", return_value={}), + patch("run_agent.OpenAI"), + patch( + "hermes_cli.config.load_config", + return_value={"agent": {"task_completion_guidance": True}}, + ), + ): + a = AIAgent( + api_key="test-key-1234567890", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + enabled_toolsets=[], + ) + a.client = MagicMock() + assert TASK_COMPLETION_GUIDANCE not in a._build_system_prompt() + + +class TestEnvironmentProbeIntegration: + """Tests for the local Python toolchain probe wiring (config.yaml + ``agent.environment_probe``). The probe itself is unit-tested in + tests/tools/test_env_probe.py; this class confirms it lands in the + system prompt when enabled and stays out when disabled.""" + + def _make_agent(self, model="anthropic/claude-opus-4.8", + environment_probe=True): + with ( + patch( + "run_agent.get_tool_definitions", + return_value=_make_tool_defs("terminal"), + ), + patch("run_agent.check_toolset_requirements", return_value={}), + patch("run_agent.OpenAI"), + patch( + "hermes_cli.config.load_config", + return_value={"agent": {"environment_probe": environment_probe}}, + ), + ): + a = AIAgent( + model=model, + api_key="test-key-1234567890", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + a.client = MagicMock() + return a + + def test_probe_appears_when_problem_detected(self, monkeypatch): + """When the probe finds something off, the line lands in the prompt.""" + from tools import env_probe + env_probe._reset_cache_for_tests() + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: {"python3": "3.11.15"}.get(b)) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12") + monkeypatch.setattr(env_probe.shutil, "which", + lambda name: None if name == "uv" else "/usr/bin/" + name) + + agent = self._make_agent(environment_probe=True) + prompt = agent._build_system_prompt() + assert "Python toolchain:" in prompt + assert "3.11.15" in prompt + + def test_probe_silent_on_clean_env(self, monkeypatch): + """Clean environment → probe emits nothing → no line in prompt.""" + from tools import env_probe + env_probe._reset_cache_for_tests() + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: "3.13.3" if b == "python3" else None) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.13") + monkeypatch.setattr(env_probe.shutil, "which", lambda name: None) + + agent = self._make_agent(environment_probe=True) + prompt = agent._build_system_prompt() + assert "Python toolchain:" not in prompt + + def test_probe_disabled_by_config(self, monkeypatch): + """Even with detectable problems, the probe stays out when disabled.""" + from tools import env_probe + env_probe._reset_cache_for_tests() + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: {"python3": "3.11.15"}.get(b)) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12") + monkeypatch.setattr(env_probe.shutil, "which", lambda name: None) + + agent = self._make_agent(environment_probe=False) + prompt = agent._build_system_prompt() + assert "Python toolchain:" not in prompt + + class TestInvalidateSystemPrompt: def test_clears_cache(self, agent): agent._cached_system_prompt = "cached value" diff --git a/tests/tools/test_env_probe.py b/tests/tools/test_env_probe.py new file mode 100644 index 00000000000..a8ae89d8402 --- /dev/null +++ b/tests/tools/test_env_probe.py @@ -0,0 +1,157 @@ +"""Tests for tools/env_probe.py — local Python toolchain probe.""" + +import sys + +import pytest + +from tools import env_probe + + +@pytest.fixture(autouse=True) +def reset_probe_cache(): + """Each test starts with a clean cache.""" + env_probe._reset_cache_for_tests() + yield + env_probe._reset_cache_for_tests() + + +class TestSilentWhenHealthy: + """The probe must emit nothing when the environment is clean — otherwise + every prompt for every user pays an unnecessary token tax.""" + + def test_clean_env_returns_empty(self, monkeypatch): + """python3 + pip module + no PEP 668 → silent.""" + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: "3.13.3" if b == "python3" else None) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.13") + monkeypatch.setattr(env_probe.shutil, "which", lambda name: None) + assert env_probe.get_environment_probe_line() == "" + + def test_pep668_with_uv_returns_empty(self, monkeypatch): + """PEP 668 alone shouldn't trigger output if uv is installed — + agent has a viable install path.""" + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: "3.12.4" if b == "python3" else None) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12") + monkeypatch.setattr(env_probe.shutil, "which", + lambda name: "/usr/local/bin/uv" if name == "uv" else None) + assert env_probe.get_environment_probe_line() == "" + + +class TestEmitsOnRealProblems: + """The probe must produce a usable line for the real failure modes + that drove this feature.""" + + def test_allen_scenario_python_version_mismatch(self, monkeypatch): + """python3 is 3.11 (no pip module), pip on PATH is 3.12, PEP 668 on, + no uv — the exact scenario from the Sarasota real-estate task.""" + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: {"python3": "3.11.15", "python": None}.get(b)) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12") + monkeypatch.setattr(env_probe.shutil, "which", + lambda name: None if name == "uv" else "/usr/bin/" + name) + + line = env_probe.get_environment_probe_line() + assert line # not silent + # Single line — must not blow up the system prompt. + assert "\n" not in line + # Names the real toolchain state + assert "3.11.15" in line + assert "no pip module" in line + assert "mismatch" in line + assert "PEP 668" in line + # Points at the right escape hatch + assert "venv" in line or "uv" in line + + def test_missing_python3_is_named(self, monkeypatch): + """If python3 isn't installed at all, say so.""" + monkeypatch.setattr(env_probe, "_python_version_of", lambda b: None) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: None) + monkeypatch.setattr(env_probe.shutil, "which", lambda name: None) + + line = env_probe.get_environment_probe_line() + assert "python3=missing" in line + + def test_python_missing_but_python3_present(self, monkeypatch): + """Common on Debian: only python3 exists, agent shouldn't type + `python`.""" + monkeypatch.setattr(env_probe, "_python_version_of", + lambda b: "3.12.4" if b == "python3" else None) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: True) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12") + monkeypatch.setattr(env_probe.shutil, "which", + lambda name: None if name == "uv" else "/usr/bin/" + name) + + line = env_probe.get_environment_probe_line() + # `python=missing` only matters in the non-silent path; PEP 668 (without + # uv) is what brings us off-silent here, so check both signals. + assert "PEP 668" in line + assert "python=missing" in line + + +class TestSkipsRemoteBackends: + """Remote backends have their own probe; this one must stay out.""" + + def test_docker_returns_empty(self, monkeypatch): + monkeypatch.setenv("TERMINAL_ENV", "docker") + # Even with a broken local env, docker must emit nothing. + monkeypatch.setattr(env_probe, "_python_version_of", lambda b: None) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: False) + assert env_probe.get_environment_probe_line() == "" + + def test_modal_returns_empty(self, monkeypatch): + monkeypatch.setenv("TERMINAL_ENV", "modal") + assert env_probe.get_environment_probe_line() == "" + + def test_ssh_returns_empty(self, monkeypatch): + monkeypatch.setenv("TERMINAL_ENV", "ssh") + assert env_probe.get_environment_probe_line() == "" + + +class TestCaching: + """The probe runs once per process — the result is deterministic for + the lifetime of the agent.""" + + def test_result_cached(self, monkeypatch): + calls = [] + + def counting_version(b): + calls.append(b) + return "3.12.4" if b == "python3" else None + + monkeypatch.setattr(env_probe, "_python_version_of", counting_version) + monkeypatch.setattr(env_probe, "_has_pip_module", lambda b: True) + monkeypatch.setattr(env_probe, "_detect_pep668", lambda b: False) + monkeypatch.setattr(env_probe, "_pip_python_version", lambda: "3.12") + monkeypatch.setattr(env_probe.shutil, "which", lambda name: None) + + env_probe.get_environment_probe_line() + env_probe.get_environment_probe_line() + env_probe.get_environment_probe_line() + + # Only the first call probes — caller-counting confirms it. + # Two calls (python3 + python) on first invocation, zero after. + assert len(calls) == 2 + + +class TestRobustness: + """The probe must NEVER crash the prompt build.""" + + def test_subprocess_failure_returns_empty(self, monkeypatch): + """If every subprocess fails, just stay silent.""" + def boom(*a, **kw): + raise OSError("simulated") + monkeypatch.setattr(env_probe.subprocess, "run", boom) + # Should not raise, should just return "" + result = env_probe.get_environment_probe_line() + # Whatever the result is, it must be a string + assert isinstance(result, str) diff --git a/tools/env_probe.py b/tools/env_probe.py new file mode 100644 index 00000000000..dfb715a9871 --- /dev/null +++ b/tools/env_probe.py @@ -0,0 +1,247 @@ +"""Local-environment toolchain probe for the system prompt. + +When the terminal backend is local (the agent's tools run on the same +machine as Hermes itself), we surface a single deterministic line about +Python tooling state so models don't have to discover it by hitting +walls. Common failure modes this addresses: + +* Hermes ships under one Python (e.g. 3.11 in a bundled venv) while the + user's login shell has a different one (e.g. 3.12 system). ``pip`` + resolved from PATH may not match ``python3 -m pip``. +* The bundled-venv Python has no pip module installed → ``python3 -m + pip`` returns ``No module named pip``. +* The system Python is PEP-668 externally-managed → naive + ``pip install`` fails with ``error: externally-managed-environment``. + +The probe is cheap (a handful of subprocess calls, ~50ms total), +cached for the lifetime of the process, and emits **at most one +short line** when something non-default is detected. When the +environment looks normal (python3+pip both present and matched, no +PEP 668), it emits nothing — no token cost. + +Remote terminal backends (docker, modal, ssh, …) are skipped: the +host's Python state is irrelevant when tools run inside a sandbox. +The sandbox has its own existing probe (``_probe_remote_backend``) +in ``agent/prompt_builder.py``. + +Toggle via ``agent.environment_probe`` in config.yaml (default True). +""" + +from __future__ import annotations + +import logging +import os +import shutil +import subprocess +import sys +import threading +from typing import Optional + +logger = logging.getLogger(__name__) + +# Module-level cache. The probe result is deterministic for the +# lifetime of the process — Python install state doesn't change +# mid-session in any way that would matter for the system prompt. +_CACHE_LOCK = threading.Lock() +_CACHED_LINE: Optional[str] = None # None = not probed yet; "" = probed, nothing to say. + +# Remote backends — keep in sync with agent/prompt_builder.py:_REMOTE_TERMINAL_BACKENDS. +# Duplicated rather than imported to avoid a circular import (prompt_builder +# imports nothing from tools). +_REMOTE_BACKENDS = frozenset({ + "docker", "singularity", "modal", "daytona", "ssh", "managed_modal", +}) + + +def _run(cmd: list[str], timeout: float = 3.0) -> tuple[int, str, str]: + """Run a short subprocess. Returns (returncode, stdout, stderr). + + Failures (binary missing, timeout, OSError) return (-1, "", ""). + """ + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + return result.returncode, (result.stdout or "").strip(), (result.stderr or "").strip() + except FileNotFoundError: + return -1, "", "not found" + except subprocess.TimeoutExpired: + return -1, "", "timeout" + except OSError as exc: + return -1, "", f"oserror: {exc}" + + +def _python_version_of(binary: str) -> Optional[str]: + """Return a short version string like ``3.12.4`` for ``binary``, or None.""" + if not shutil.which(binary): + return None + rc, out, err = _run([binary, "-c", "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}')"]) + if rc == 0 and out: + return out + return None + + +def _has_pip_module(binary: str) -> bool: + """True if `` -m pip --version`` succeeds.""" + if not shutil.which(binary): + return False + rc, _out, _err = _run([binary, "-m", "pip", "--version"]) + return rc == 0 + + +def _detect_pep668(binary: str) -> bool: + """True when ````'s install location is PEP-668 externally-managed. + + Looks for ``EXTERNALLY-MANAGED`` next to the stdlib (the marker file + Debian/Ubuntu drop in to gate naive ``pip install``). + """ + if not shutil.which(binary): + return False + code = ( + "import sys, os;" + "stdlib = os.path.dirname(os.__file__);" + "marker = os.path.join(stdlib, 'EXTERNALLY-MANAGED');" + "print('yes' if os.path.exists(marker) else 'no')" + ) + rc, out, _err = _run([binary, "-c", code]) + return rc == 0 and out.strip() == "yes" + + +def _pip_python_version() -> Optional[str]: + """If ``pip`` is on PATH, return the Python version it's bound to. + + ``pip --version`` output looks like:: + + pip 24.0 from /usr/lib/python3/dist-packages/pip (python 3.12) + + Returns the parenthesised version (e.g. ``"3.12"``) or None. + """ + if not shutil.which("pip"): + return None + rc, out, _err = _run(["pip", "--version"]) + if rc != 0 or not out: + return None + # Parse trailing "(python X.Y)". + if "(python " in out and out.endswith(")"): + try: + tail = out.rsplit("(python ", 1)[1] + return tail[:-1].strip() + except (IndexError, AttributeError): + return None + return None + + +def _build_probe_line() -> str: + """Build the one-liner. Returns "" when nothing notable is detected. + + Emit only when SOMETHING is off — the goal is to save the model from + hitting an avoidable wall, not to narrate a healthy environment. + """ + # Bail out if a remote terminal backend is configured; the host's + # Python state isn't where the agent's tools run. + backend = (os.getenv("TERMINAL_ENV") or "local").strip().lower() + if backend in _REMOTE_BACKENDS: + return "" + + py3_ver = _python_version_of("python3") + py_ver = _python_version_of("python") # for systems with a `python` alias + py3_has_pip = _has_pip_module("python3") if py3_ver else False + pip_bound_to = _pip_python_version() + py3_pep668 = _detect_pep668("python3") if py3_ver else False + has_uv = shutil.which("uv") is not None + + # If python3 exists, has pip, has uv (or no PEP 668), and there's no + # version mismatch between `pip` and `python3` → environment is + # clean enough to stay silent. The model can discover details by + # running commands if it cares. + mismatch = bool(pip_bound_to and py3_ver and not py3_ver.startswith(pip_bound_to)) + silent_conditions = ( + py3_ver is not None + and py3_has_pip + and not mismatch + and (not py3_pep668 or has_uv) + ) + if silent_conditions: + return "" + + # Build a compact factual summary. Keep it ONE line so it doesn't + # dominate the prompt; the model is good at parsing dense info. + bits: list[str] = [] + if py3_ver: + py3_bit = f"python3={py3_ver}" + if not py3_has_pip: + py3_bit += " (no pip module)" + bits.append(py3_bit) + else: + bits.append("python3=missing") + + if py_ver and py_ver != py3_ver: + bits.append(f"python={py_ver}") + elif not py_ver and py3_ver: + # Common on Debian/Ubuntu — call it out so the model doesn't + # type `python` and hit "command not found". + bits.append("python=missing (use python3)") + + if pip_bound_to: + if mismatch: + bits.append(f"pip→python{pip_bound_to} (mismatch)") + elif not py3_has_pip: + # pip exists but `python3 -m pip` doesn't — the script + # works but the module path doesn't. + bits.append(f"pip→python{pip_bound_to}") + elif py3_has_pip: + # `pip` not on PATH but `python3 -m pip` works. + pass + else: + bits.append("pip=missing") + + if py3_pep668: + bits.append("PEP 668=yes (use venv or uv)") + + if has_uv: + bits.append("uv=installed") + + if not bits: + return "" + + return "Python toolchain: " + ", ".join(bits) + "." + + +def get_environment_probe_line(*, force_refresh: bool = False) -> str: + """Return the cached probe line (building it on first call). + + Returns "" when the environment is clean — the system prompt + assembler should drop the section in that case rather than + emit an empty heading. + + ``force_refresh`` is for tests; real callers should never need it. + """ + global _CACHED_LINE + if force_refresh: + with _CACHE_LOCK: + _CACHED_LINE = None + + if _CACHED_LINE is not None: + return _CACHED_LINE + + with _CACHE_LOCK: + if _CACHED_LINE is not None: # raced + return _CACHED_LINE + try: + line = _build_probe_line() + except Exception as exc: # never let probe failure block prompt build + logger.debug("env_probe failed: %s", exc) + line = "" + _CACHED_LINE = line + return line + + +def _reset_cache_for_tests() -> None: + """Test helper — clear the cache between probe scenarios.""" + global _CACHED_LINE + with _CACHE_LOCK: + _CACHED_LINE = None