diff --git a/tests/tools/test_notify_on_complete.py b/tests/tools/test_notify_on_complete.py index 64d198970cb..4a4ca37bd89 100644 --- a/tests/tools/test_notify_on_complete.py +++ b/tests/tools/test_notify_on_complete.py @@ -348,3 +348,158 @@ class TestCompletionConsumed: result = registry.poll("proc_running") assert result["status"] == "running" assert not registry.is_completion_consumed("proc_running") + + +# --------------------------------------------------------------------------- +# Silent-background-process hint +# +# background=True without notify_on_complete=True OR watch_patterns runs +# the process silently — the agent has no way to learn it finished short +# of calling process(action="poll") explicitly. The tool result must +# include a "hint" field that nudges the agent toward +# notify_on_complete=True for bounded tasks. May 2026 PR #31231 incident: +# bg CI poller exited green, agent never noticed, user had to surface it. +# --------------------------------------------------------------------------- + + +def _silent_bg_base_config(tmp_path): + return { + "env_type": "local", + "docker_image": "", + "singularity_image": "", + "modal_image": "", + "daytona_image": "", + "cwd": str(tmp_path), + "timeout": 30, + } + + +def _silent_bg_harness(monkeypatch, tmp_path): + """Common test fixture: patch enough of terminal_tool to spawn a fake + background process and capture the JSON result the agent sees.""" + import tools.terminal_tool as terminal_tool_module + from tools import process_registry as process_registry_module + from types import SimpleNamespace + + config = _silent_bg_base_config(tmp_path) + dummy_env = SimpleNamespace(env={}) + + def fake_spawn_local(**kwargs): + return SimpleNamespace( + id="proc_silent_test", + pid=4242, + notify_on_complete=False, + watcher_platform="", + watcher_chat_id="", + watcher_user_id="", + watcher_user_name="", + watcher_thread_id="", + watcher_message_id="", + watcher_interval=0, + ) + + monkeypatch.setattr(terminal_tool_module, "_get_env_config", lambda: config) + monkeypatch.setattr(terminal_tool_module, "_start_cleanup_thread", lambda: None) + monkeypatch.setattr(terminal_tool_module, "_check_all_guards", lambda *_args, **_kwargs: {"approved": True}) + monkeypatch.setattr(process_registry_module.process_registry, "spawn_local", fake_spawn_local) + monkeypatch.setitem(terminal_tool_module._active_environments, "default", dummy_env) + monkeypatch.setitem(terminal_tool_module._last_activity, "default", 0.0) + return terminal_tool_module + + +def test_background_without_notify_emits_silent_process_hint(monkeypatch, tmp_path): + """The footgun case (May 2026 PR #31231): bg=True alone runs silently + and the agent has no signal it finished. Tool must nudge.""" + tt = _silent_bg_harness(monkeypatch, tmp_path) + try: + result = json.loads( + tt.terminal_tool( + command="while true; do gh pr checks 999; sleep 30; done", + background=True, + ) + ) + finally: + tt._active_environments.pop("default", None) + tt._last_activity.pop("default", None) + + assert result["session_id"] == "proc_silent_test" + hint = result.get("hint", "") + assert hint, "Silent background process must include a hint field" + assert "notify_on_complete" in hint, ( + "Hint must name the corrective flag so the agent can self-correct" + ) + assert "silent" in hint.lower() or "no way to learn" in hint.lower(), ( + "Hint must explain the failure mode, not just suggest the fix" + ) + + +def test_background_with_notify_does_not_emit_hint(monkeypatch, tmp_path): + """The correct shape — bg+notify together — must not nag.""" + tt = _silent_bg_harness(monkeypatch, tmp_path) + try: + result = json.loads( + tt.terminal_tool( + command="pytest tests/", + background=True, + notify_on_complete=True, + ) + ) + finally: + tt._active_environments.pop("default", None) + tt._last_activity.pop("default", None) + + assert "hint" not in result, ( + f"Correct usage must not emit a hint, got: {result.get('hint')!r}" + ) + assert result.get("notify_on_complete") is True + + +def test_background_with_watch_patterns_does_not_emit_hint(monkeypatch, tmp_path): + """watch_patterns is the other legitimate non-silent shape — also no hint.""" + tt = _silent_bg_harness(monkeypatch, tmp_path) + try: + result = json.loads( + tt.terminal_tool( + command="uvicorn app:server --port 8080", + background=True, + watch_patterns=["Application startup complete"], + ) + ) + finally: + tt._active_environments.pop("default", None) + tt._last_activity.pop("default", None) + + assert "hint" not in result, ( + f"watch_patterns shape must not emit a silent-process hint, got: {result.get('hint')!r}" + ) + + +def test_foreground_command_does_not_emit_hint(monkeypatch, tmp_path): + """Hint only applies to background processes — foreground returns its + result synchronously and the agent always sees the outcome.""" + tt = _silent_bg_harness(monkeypatch, tmp_path) + + # Foreground path doesn't go through spawn_local. Patch the local-env + # exec method to short-circuit to a clean exit so the test doesn't + # actually shell out. + from types import SimpleNamespace + dummy_env = SimpleNamespace( + env={}, + execute=lambda *a, **kw: {"output": "done", "exit_code": 0, "error": None}, + ) + monkeypatch.setitem(tt._active_environments, "default", dummy_env) + + try: + result = json.loads( + tt.terminal_tool( + command="echo hello", + background=False, + ) + ) + finally: + tt._active_environments.pop("default", None) + tt._last_activity.pop("default", None) + + assert "hint" not in result, ( + f"Foreground commands must not emit the background-silence hint, got: {result.get('hint')!r}" + ) diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index 387e27881ad..f7a0e14bc88 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -904,9 +904,9 @@ Do NOT use echo/cat heredoc to create files — use write_file instead. Reserve terminal for: builds, installs, git, processes, scripts, network, package managers, and anything that needs a shell. Foreground (default): Commands return INSTANTLY when done, even if the timeout is high. Set timeout=300 for long builds/scripts — you'll still get the result in seconds if it's fast. Prefer foreground for short commands. -Background: Set background=true to get a session_id. Two patterns: - (1) Long-lived processes that never exit (servers, watchers). - (2) Long-running tasks with notify_on_complete=true — you can keep working on other things and the system auto-notifies you when the task finishes. Great for test suites, builds, deployments, or anything that takes more than a minute. +Background: Set background=true to get a session_id. Almost always pair with notify_on_complete=true — bg without notify runs SILENTLY and you have no way to learn it finished short of calling process(action='poll') yourself. Two legitimate uses: + (1) Long-lived processes that never exit (servers, watchers, daemons) — silent is correct, there's no exit to notify on. + (2) Long-running bounded tasks (tests, builds, deploys, CI pollers, batch jobs) — MUST set notify_on_complete=true. Without it you'll either forget to poll or sit blocked waiting for the user to surface the result. For servers/watchers, do NOT use shell-level background wrappers (nohup/disown/setsid/trailing '&') in foreground mode. Use background=true so Hermes can track lifecycle and output. After starting a server, verify readiness with a health check or log signal, then run tests in a separate terminal() call. Avoid blind sleep loops. Use process(action="poll") for progress checks, process(action="wait") to block until done. @@ -1959,6 +1959,32 @@ def terminal_tool( if pty_disabled_reason: result_data["pty_note"] = pty_disabled_reason + # Nudge: background=True without notify_on_complete=True OR + # watch_patterns is a silent process. The agent has NO way to + # learn it finished short of calling process(action="poll"/"wait") + # explicitly. That's correct only for genuine long-lived + # processes that never exit (servers, watchers). For every + # bounded task (tests, builds, CI pollers, deploys, batch + # jobs) the agent almost certainly wanted notification and + # forgot the flag. May 2026 PR #31231 incident: bg CI poller + # ran fine, exited green, agent never noticed — user had to + # surface the result. Cheap nudge here costs ~one read for + # server cases (false positive) and prevents silent + # blindness for bounded-task cases (false negative). + if background and not notify_on_complete and not watch_patterns: + result_data["hint"] = ( + "background=true without notify_on_complete=true means " + "this process runs SILENTLY — you will not be told when " + "it exits. If this is a bounded task (test suite, build, " + "CI poller, deploy, anything with a defined end), you " + "almost certainly wanted notify_on_complete=true so the " + "system pings you on exit. Re-launch with " + "notify_on_complete=true, or call process(action='poll') " + "/ process(action='wait') yourself to learn the outcome. " + "Only ignore this hint for genuine long-lived processes " + "that never exit (servers, watchers, daemons)." + ) + # Populate routing metadata on the session so that # watch-pattern and completion notifications can be # routed back to the correct chat/thread. @@ -2322,7 +2348,7 @@ TERMINAL_SCHEMA = { }, "background": { "type": "boolean", - "description": "Run the command in the background. Two patterns: (1) Long-lived processes that never exit (servers, watchers). (2) Long-running tasks paired with notify_on_complete=true — you can keep working and get notified when the task finishes. For short commands, prefer foreground with a generous timeout instead.", + "description": "Run the command in the background. Almost always pair with notify_on_complete=true — without it, the process runs silently and you'll have no way to learn it finished short of calling process(action='poll') yourself (easy to forget, leading to silent blindness on long jobs). Two legitimate patterns: (1) Long-lived processes that never exit (servers, watchers, daemons) — these stay silent because there's no exit to notify on. (2) Long-running bounded tasks (tests, builds, deploys, CI pollers, batch jobs) — these MUST set notify_on_complete=true. For short commands, prefer foreground with a generous timeout instead.", "default": False }, "timeout": {