From d97c3244739fb92b69e23a08cc309b12eb00fea8 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 23 May 2026 21:02:14 -0700
Subject: [PATCH] fix(terminal): warn at call time when background=true runs
 silently (#31289)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`terminal(background=true)` without `notify_on_complete=true` or
`watch_patterns` runs the process SILENTLY — the agent has no way
to learn it finished short of calling `process(action='poll')`
explicitly. That's correct for genuine long-lived processes (servers,
watchers, daemons) but is a footgun for every bounded task (tests,
builds, deploys, CI pollers, batch jobs), which is the vast majority
of background uses.

Hit on May 23, 2026 (PR #31231 incident): agent launched a CI-watch
loop with `background=true` only. The poller ran fine, exited green
6 minutes later, agent never noticed. User had to surface 'we are
green CI, you can merge.' Memory and skill docs said *what* to do
(poll in background) but not *how* to receive the result. The
`notify_on_complete=true` flag exists and works, but is easy to
forget when bg seems sufficient on its own.

Two changes here, mutually reinforcing:

1. Runtime nudge: tool result for `background=true` w/o notify or
   watch_patterns now includes a `hint` field explaining the silent-
   process failure mode and pointing at the corrective flag. Agent
   sees it on the same turn and self-corrects without needing the
   user to surface anything. Cost for legitimate server cases is one
   ignored read (~50 tokens); cost for forgot-notify cases is
   prevented blindness (potentially many turns, or a user nudge).
   False positives << false negatives.

2. Schema/description rewrite: top-level TERMINAL_TOOL_DESCRIPTION
   and the `background` field description now lead with 'Almost
   always pair with notify_on_complete=true' instead of presenting
   it as one of two equally-likely patterns. The two legitimate
   non-notify shapes (long-lived servers; watch_patterns mid-process
   signals) are still documented, but as the minority case.

Tests cover all four shapes: bg-only emits hint, bg+notify doesn't,
bg+watch_patterns doesn't, foreground doesn't. 4 new tests; full
suite of background/process tests stays green (160/160 across the
relevant 6 test files).
---
 tests/tools/test_notify_on_complete.py | 155 +++++++++++++++++++++++++
 tools/terminal_tool.py                 |  34 +++++-
 2 files changed, 185 insertions(+), 4 deletions(-)

diff --git a/tests/tools/test_notify_on_complete.py b/tests/tools/test_notify_on_complete.py
index 64d198970cb..4a4ca37bd89 100644
--- a/tests/tools/test_notify_on_complete.py
+++ b/tests/tools/test_notify_on_complete.py
@@ -348,3 +348,158 @@ class TestCompletionConsumed:
         result = registry.poll("proc_running")
         assert result["status"] == "running"
         assert not registry.is_completion_consumed("proc_running")
+
+
+# ---------------------------------------------------------------------------
+# Silent-background-process hint
+#
+# background=True without notify_on_complete=True OR watch_patterns runs
+# the process silently — the agent has no way to learn it finished short
+# of calling process(action="poll") explicitly. The tool result must
+# include a "hint" field that nudges the agent toward
+# notify_on_complete=True for bounded tasks. May 2026 PR #31231 incident:
+# bg CI poller exited green, agent never noticed, user had to surface it.
+# ---------------------------------------------------------------------------
+
+
+def _silent_bg_base_config(tmp_path):
+    return {
+        "env_type": "local",
+        "docker_image": "",
+        "singularity_image": "",
+        "modal_image": "",
+        "daytona_image": "",
+        "cwd": str(tmp_path),
+        "timeout": 30,
+    }
+
+
+def _silent_bg_harness(monkeypatch, tmp_path):
+    """Common test fixture: patch enough of terminal_tool to spawn a fake
+    background process and capture the JSON result the agent sees."""
+    import tools.terminal_tool as terminal_tool_module
+    from tools import process_registry as process_registry_module
+    from types import SimpleNamespace
+
+    config = _silent_bg_base_config(tmp_path)
+    dummy_env = SimpleNamespace(env={})
+
+    def fake_spawn_local(**kwargs):
+        return SimpleNamespace(
+            id="proc_silent_test",
+            pid=4242,
+            notify_on_complete=False,
+            watcher_platform="",
+            watcher_chat_id="",
+            watcher_user_id="",
+            watcher_user_name="",
+            watcher_thread_id="",
+            watcher_message_id="",
+            watcher_interval=0,
+        )
+
+    monkeypatch.setattr(terminal_tool_module, "_get_env_config", lambda: config)
+    monkeypatch.setattr(terminal_tool_module, "_start_cleanup_thread", lambda: None)
+    monkeypatch.setattr(terminal_tool_module, "_check_all_guards", lambda *_args, **_kwargs: {"approved": True})
+    monkeypatch.setattr(process_registry_module.process_registry, "spawn_local", fake_spawn_local)
+    monkeypatch.setitem(terminal_tool_module._active_environments, "default", dummy_env)
+    monkeypatch.setitem(terminal_tool_module._last_activity, "default", 0.0)
+    return terminal_tool_module
+
+
+def test_background_without_notify_emits_silent_process_hint(monkeypatch, tmp_path):
+    """The footgun case (May 2026 PR #31231): bg=True alone runs silently
+    and the agent has no signal it finished. Tool must nudge."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command="while true; do gh pr checks 999; sleep 30; done",
+                background=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    assert result["session_id"] == "proc_silent_test"
+    hint = result.get("hint", "")
+    assert hint, "Silent background process must include a hint field"
+    assert "notify_on_complete" in hint, (
+        "Hint must name the corrective flag so the agent can self-correct"
+    )
+    assert "silent" in hint.lower() or "no way to learn" in hint.lower(), (
+        "Hint must explain the failure mode, not just suggest the fix"
+    )
+
+
+def test_background_with_notify_does_not_emit_hint(monkeypatch, tmp_path):
+    """The correct shape — bg+notify together — must not nag."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command="pytest tests/",
+                background=True,
+                notify_on_complete=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    assert "hint" not in result, (
+        f"Correct usage must not emit a hint, got: {result.get('hint')!r}"
+    )
+    assert result.get("notify_on_complete") is True
+
+
+def test_background_with_watch_patterns_does_not_emit_hint(monkeypatch, tmp_path):
+    """watch_patterns is the other legitimate non-silent shape — also no hint."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command="uvicorn app:server --port 8080",
+                background=True,
+                watch_patterns=["Application startup complete"],
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    assert "hint" not in result, (
+        f"watch_patterns shape must not emit a silent-process hint, got: {result.get('hint')!r}"
+    )
+
+
+def test_foreground_command_does_not_emit_hint(monkeypatch, tmp_path):
+    """Hint only applies to background processes — foreground returns its
+    result synchronously and the agent always sees the outcome."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+
+    # Foreground path doesn't go through spawn_local. Patch the local-env
+    # exec method to short-circuit to a clean exit so the test doesn't
+    # actually shell out.
+    from types import SimpleNamespace
+    dummy_env = SimpleNamespace(
+        env={},
+        execute=lambda *a, **kw: {"output": "done", "exit_code": 0, "error": None},
+    )
+    monkeypatch.setitem(tt._active_environments, "default", dummy_env)
+
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command="echo hello",
+                background=False,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    assert "hint" not in result, (
+        f"Foreground commands must not emit the background-silence hint, got: {result.get('hint')!r}"
+    )
diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py
index 387e27881ad..f7a0e14bc88 100644
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@@ -904,9 +904,9 @@ Do NOT use echo/cat heredoc to create files — use write_file instead.
 Reserve terminal for: builds, installs, git, processes, scripts, network, package managers, and anything that needs a shell.
 
 Foreground (default): Commands return INSTANTLY when done, even if the timeout is high. Set timeout=300 for long builds/scripts — you'll still get the result in seconds if it's fast. Prefer foreground for short commands.
-Background: Set background=true to get a session_id. Two patterns:
-  (1) Long-lived processes that never exit (servers, watchers).
-  (2) Long-running tasks with notify_on_complete=true — you can keep working on other things and the system auto-notifies you when the task finishes. Great for test suites, builds, deployments, or anything that takes more than a minute.
+Background: Set background=true to get a session_id. Almost always pair with notify_on_complete=true — bg without notify runs SILENTLY and you have no way to learn it finished short of calling process(action='poll') yourself. Two legitimate uses:
+  (1) Long-lived processes that never exit (servers, watchers, daemons) — silent is correct, there's no exit to notify on.
+  (2) Long-running bounded tasks (tests, builds, deploys, CI pollers, batch jobs) — MUST set notify_on_complete=true. Without it you'll either forget to poll or sit blocked waiting for the user to surface the result.
 For servers/watchers, do NOT use shell-level background wrappers (nohup/disown/setsid/trailing '&') in foreground mode. Use background=true so Hermes can track lifecycle and output.
 After starting a server, verify readiness with a health check or log signal, then run tests in a separate terminal() call. Avoid blind sleep loops.
 Use process(action="poll") for progress checks, process(action="wait") to block until done.
@@ -1959,6 +1959,32 @@ def terminal_tool(
                 if pty_disabled_reason:
                     result_data["pty_note"] = pty_disabled_reason
 
+                # Nudge: background=True without notify_on_complete=True OR
+                # watch_patterns is a silent process. The agent has NO way to
+                # learn it finished short of calling process(action="poll"/"wait")
+                # explicitly. That's correct only for genuine long-lived
+                # processes that never exit (servers, watchers). For every
+                # bounded task (tests, builds, CI pollers, deploys, batch
+                # jobs) the agent almost certainly wanted notification and
+                # forgot the flag. May 2026 PR #31231 incident: bg CI poller
+                # ran fine, exited green, agent never noticed — user had to
+                # surface the result. Cheap nudge here costs ~one read for
+                # server cases (false positive) and prevents silent
+                # blindness for bounded-task cases (false negative).
+                if background and not notify_on_complete and not watch_patterns:
+                    result_data["hint"] = (
+                        "background=true without notify_on_complete=true means "
+                        "this process runs SILENTLY — you will not be told when "
+                        "it exits. If this is a bounded task (test suite, build, "
+                        "CI poller, deploy, anything with a defined end), you "
+                        "almost certainly wanted notify_on_complete=true so the "
+                        "system pings you on exit. Re-launch with "
+                        "notify_on_complete=true, or call process(action='poll') "
+                        "/ process(action='wait') yourself to learn the outcome. "
+                        "Only ignore this hint for genuine long-lived processes "
+                        "that never exit (servers, watchers, daemons)."
+                    )
+
                 # Populate routing metadata on the session so that
                 # watch-pattern and completion notifications can be
                 # routed back to the correct chat/thread.
@@ -2322,7 +2348,7 @@ TERMINAL_SCHEMA = {
             },
             "background": {
                 "type": "boolean",
-                "description": "Run the command in the background. Two patterns: (1) Long-lived processes that never exit (servers, watchers). (2) Long-running tasks paired with notify_on_complete=true — you can keep working and get notified when the task finishes. For short commands, prefer foreground with a generous timeout instead.",
+                "description": "Run the command in the background. Almost always pair with notify_on_complete=true — without it, the process runs silently and you'll have no way to learn it finished short of calling process(action='poll') yourself (easy to forget, leading to silent blindness on long jobs). Two legitimate patterns: (1) Long-lived processes that never exit (servers, watchers, daemons) — these stay silent because there's no exit to notify on. (2) Long-running bounded tasks (tests, builds, deploys, CI pollers, batch jobs) — these MUST set notify_on_complete=true. For short commands, prefer foreground with a generous timeout instead.",
                 "default": False
             },
             "timeout": {