mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
`terminal(background=true)` without `notify_on_complete=true` or `watch_patterns` runs the process SILENTLY — the agent has no way to learn it finished short of calling `process(action='poll')` explicitly. That's correct for genuine long-lived processes (servers, watchers, daemons) but is a footgun for every bounded task (tests, builds, deploys, CI pollers, batch jobs), which is the vast majority of background uses. Hit on May 23, 2026 (PR #31231 incident): agent launched a CI-watch loop with `background=true` only. The poller ran fine, exited green 6 minutes later, agent never noticed. User had to surface 'we are green CI, you can merge.' Memory and skill docs said *what* to do (poll in background) but not *how* to receive the result. The `notify_on_complete=true` flag exists and works, but is easy to forget when bg seems sufficient on its own. Two changes here, mutually reinforcing: 1. Runtime nudge: tool result for `background=true` w/o notify or watch_patterns now includes a `hint` field explaining the silent- process failure mode and pointing at the corrective flag. Agent sees it on the same turn and self-corrects without needing the user to surface anything. Cost for legitimate server cases is one ignored read (~50 tokens); cost for forgot-notify cases is prevented blindness (potentially many turns, or a user nudge). False positives << false negatives. 2. Schema/description rewrite: top-level TERMINAL_TOOL_DESCRIPTION and the `background` field description now lead with 'Almost always pair with notify_on_complete=true' instead of presenting it as one of two equally-likely patterns. The two legitimate non-notify shapes (long-lived servers; watch_patterns mid-process signals) are still documented, but as the minority case. Tests cover all four shapes: bg-only emits hint, bg+notify doesn't, bg+watch_patterns doesn't, foreground doesn't. 4 new tests; full suite of background/process tests stays green (160/160 across the relevant 6 test files).
505 lines
19 KiB
Python
505 lines
19 KiB
Python
"""Tests for notify_on_complete background process feature.
|
|
|
|
Covers:
|
|
- ProcessSession.notify_on_complete field
|
|
- ProcessRegistry.completion_queue population on _move_to_finished()
|
|
- Checkpoint persistence of notify_on_complete
|
|
- Terminal tool schema includes notify_on_complete
|
|
- Terminal tool handler passes notify_on_complete through
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import queue
|
|
import time
|
|
import pytest
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from tools.process_registry import (
|
|
ProcessRegistry,
|
|
ProcessSession,
|
|
)
|
|
|
|
|
|
@pytest.fixture()
|
|
def registry():
|
|
"""Create a fresh ProcessRegistry."""
|
|
return ProcessRegistry()
|
|
|
|
|
|
def _make_session(
|
|
sid="proc_test_notify",
|
|
command="echo hello",
|
|
task_id="t1",
|
|
exited=False,
|
|
exit_code=None,
|
|
output="",
|
|
notify_on_complete=False,
|
|
) -> ProcessSession:
|
|
s = ProcessSession(
|
|
id=sid,
|
|
command=command,
|
|
task_id=task_id,
|
|
started_at=time.time(),
|
|
exited=exited,
|
|
exit_code=exit_code,
|
|
output_buffer=output,
|
|
notify_on_complete=notify_on_complete,
|
|
)
|
|
return s
|
|
|
|
|
|
# =========================================================================
|
|
# ProcessSession field
|
|
# =========================================================================
|
|
|
|
class TestProcessSessionField:
|
|
def test_default_false(self):
|
|
s = ProcessSession(id="proc_1", command="echo hi")
|
|
assert s.notify_on_complete is False
|
|
|
|
def test_set_true(self):
|
|
s = ProcessSession(id="proc_1", command="echo hi", notify_on_complete=True)
|
|
assert s.notify_on_complete is True
|
|
|
|
|
|
# =========================================================================
|
|
# Completion queue
|
|
# =========================================================================
|
|
|
|
class TestCompletionQueue:
|
|
def test_queue_exists(self, registry):
|
|
assert hasattr(registry, "completion_queue")
|
|
assert registry.completion_queue.empty()
|
|
|
|
def test_move_to_finished_no_notify(self, registry):
|
|
"""Processes without notify_on_complete don't enqueue."""
|
|
s = _make_session(notify_on_complete=False, output="done")
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s)
|
|
assert registry.completion_queue.empty()
|
|
|
|
def test_move_to_finished_with_notify(self, registry):
|
|
"""Processes with notify_on_complete push to queue."""
|
|
s = _make_session(
|
|
notify_on_complete=True,
|
|
output="build succeeded",
|
|
exit_code=0,
|
|
)
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s)
|
|
|
|
assert not registry.completion_queue.empty()
|
|
completion = registry.completion_queue.get_nowait()
|
|
assert completion["session_id"] == s.id
|
|
assert completion["command"] == "echo hello"
|
|
assert completion["exit_code"] == 0
|
|
assert "build succeeded" in completion["output"]
|
|
|
|
def test_move_to_finished_nonzero_exit(self, registry):
|
|
"""Nonzero exit codes are captured correctly."""
|
|
s = _make_session(
|
|
notify_on_complete=True,
|
|
output="FAILED",
|
|
exit_code=1,
|
|
)
|
|
s.exited = True
|
|
s.exit_code = 1
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s)
|
|
|
|
completion = registry.completion_queue.get_nowait()
|
|
assert completion["exit_code"] == 1
|
|
assert "FAILED" in completion["output"]
|
|
|
|
def test_move_to_finished_idempotent_no_duplicate(self, registry):
|
|
"""Calling _move_to_finished twice must NOT enqueue two notifications.
|
|
|
|
Regression test: kill_process() and the reader thread can both call
|
|
_move_to_finished() for the same session, producing duplicate
|
|
[SYSTEM: Background process ...] messages.
|
|
"""
|
|
s = _make_session(notify_on_complete=True, output="done", exit_code=-15)
|
|
s.exited = True
|
|
s.exit_code = -15
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s) # first call — should enqueue
|
|
s.exit_code = 143 # reader thread updates exit code
|
|
registry._move_to_finished(s) # second call — should be no-op
|
|
|
|
assert registry.completion_queue.qsize() == 1
|
|
completion = registry.completion_queue.get_nowait()
|
|
assert completion["exit_code"] == -15 # from the first (kill) call
|
|
|
|
def test_output_truncated_to_2000(self, registry):
|
|
"""Long output is truncated to last 2000 chars."""
|
|
long_output = "x" * 5000
|
|
s = _make_session(
|
|
notify_on_complete=True,
|
|
output=long_output,
|
|
)
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s)
|
|
|
|
completion = registry.completion_queue.get_nowait()
|
|
assert len(completion["output"]) == 2000
|
|
|
|
def test_multiple_completions_queued(self, registry):
|
|
"""Multiple notify processes all push to the same queue."""
|
|
for i in range(3):
|
|
s = _make_session(
|
|
sid=f"proc_{i}",
|
|
notify_on_complete=True,
|
|
output=f"output_{i}",
|
|
)
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s)
|
|
|
|
completions = []
|
|
while not registry.completion_queue.empty():
|
|
completions.append(registry.completion_queue.get_nowait())
|
|
assert len(completions) == 3
|
|
ids = {c["session_id"] for c in completions}
|
|
assert ids == {"proc_0", "proc_1", "proc_2"}
|
|
|
|
|
|
# =========================================================================
|
|
# Checkpoint persistence
|
|
# =========================================================================
|
|
|
|
class TestCheckpointNotify:
|
|
def test_checkpoint_includes_notify(self, registry, tmp_path):
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
|
|
s = _make_session(notify_on_complete=True)
|
|
registry._running[s.id] = s
|
|
registry._write_checkpoint()
|
|
|
|
data = json.loads((tmp_path / "procs.json").read_text())
|
|
assert len(data) == 1
|
|
assert data[0]["notify_on_complete"] is True
|
|
|
|
def test_checkpoint_without_notify(self, registry, tmp_path):
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
|
|
s = _make_session(notify_on_complete=False)
|
|
registry._running[s.id] = s
|
|
registry._write_checkpoint()
|
|
|
|
data = json.loads((tmp_path / "procs.json").read_text())
|
|
assert data[0]["notify_on_complete"] is False
|
|
|
|
def test_recover_preserves_notify(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(),
|
|
"task_id": "t1",
|
|
"notify_on_complete": True,
|
|
}]))
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
s = registry.get("proc_live")
|
|
assert s.notify_on_complete is True
|
|
|
|
def test_recover_requeues_notify_watchers(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(),
|
|
"task_id": "t1",
|
|
"session_key": "sk1",
|
|
"watcher_platform": "telegram",
|
|
"watcher_chat_id": "123",
|
|
"watcher_user_id": "u123",
|
|
"watcher_user_name": "alice",
|
|
"watcher_thread_id": "42",
|
|
"watcher_interval": 5,
|
|
"notify_on_complete": True,
|
|
}]))
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
assert len(registry.pending_watchers) == 1
|
|
assert registry.pending_watchers[0]["notify_on_complete"] is True
|
|
assert registry.pending_watchers[0]["user_id"] == "u123"
|
|
assert registry.pending_watchers[0]["user_name"] == "alice"
|
|
|
|
def test_recover_defaults_false(self, registry, tmp_path):
|
|
"""Old checkpoint entries without the field default to False."""
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(),
|
|
"task_id": "t1",
|
|
}]))
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
s = registry.get("proc_live")
|
|
assert s.notify_on_complete is False
|
|
|
|
|
|
# =========================================================================
|
|
# Terminal tool schema
|
|
# =========================================================================
|
|
|
|
class TestTerminalSchema:
|
|
def test_schema_has_notify_on_complete(self):
|
|
from tools.terminal_tool import TERMINAL_SCHEMA
|
|
props = TERMINAL_SCHEMA["parameters"]["properties"]
|
|
assert "notify_on_complete" in props
|
|
assert props["notify_on_complete"]["type"] == "boolean"
|
|
assert props["notify_on_complete"]["default"] is False
|
|
|
|
def test_handler_passes_notify(self):
|
|
"""_handle_terminal passes notify_on_complete to terminal_tool."""
|
|
from tools.terminal_tool import _handle_terminal
|
|
with patch("tools.terminal_tool.terminal_tool", return_value='{"ok":true}') as mock_tt:
|
|
_handle_terminal(
|
|
{"command": "echo hi", "background": True, "notify_on_complete": True},
|
|
task_id="t1",
|
|
)
|
|
_, kwargs = mock_tt.call_args
|
|
assert kwargs["notify_on_complete"] is True
|
|
|
|
|
|
# =========================================================================
|
|
# Code execution blocked params
|
|
# =========================================================================
|
|
|
|
class TestCodeExecutionBlocked:
|
|
def test_notify_on_complete_blocked_in_sandbox(self):
|
|
from tools.code_execution_tool import _TERMINAL_BLOCKED_PARAMS
|
|
assert "notify_on_complete" in _TERMINAL_BLOCKED_PARAMS
|
|
|
|
|
|
# =========================================================================
|
|
# Completion consumed suppression
|
|
# =========================================================================
|
|
|
|
class TestCompletionConsumed:
|
|
"""Test that wait/poll/log suppress redundant completion notifications."""
|
|
|
|
def test_wait_marks_completion_consumed(self, registry):
|
|
"""wait() returning exited status marks session as consumed."""
|
|
s = _make_session(sid="proc_wait", notify_on_complete=True, output="done")
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._running[s.id] = s
|
|
with patch.object(registry, "_write_checkpoint"):
|
|
registry._move_to_finished(s)
|
|
|
|
# Notification is in the queue
|
|
assert not registry.completion_queue.empty()
|
|
assert not registry.is_completion_consumed("proc_wait")
|
|
|
|
# Agent calls wait() — gets the result directly
|
|
result = registry.wait("proc_wait", timeout=1)
|
|
assert result["status"] == "exited"
|
|
|
|
# Now the completion is marked as consumed
|
|
assert registry.is_completion_consumed("proc_wait")
|
|
|
|
def test_poll_marks_completion_consumed(self, registry):
|
|
"""poll() returning exited status marks session as consumed."""
|
|
s = _make_session(sid="proc_poll", notify_on_complete=True, output="done")
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._finished[s.id] = s
|
|
|
|
result = registry.poll("proc_poll")
|
|
assert result["status"] == "exited"
|
|
assert registry.is_completion_consumed("proc_poll")
|
|
|
|
def test_log_marks_completion_consumed(self, registry):
|
|
"""read_log() on exited session marks as consumed."""
|
|
s = _make_session(sid="proc_log", notify_on_complete=True, output="line1\nline2")
|
|
s.exited = True
|
|
s.exit_code = 0
|
|
registry._finished[s.id] = s
|
|
|
|
result = registry.read_log("proc_log")
|
|
assert result["status"] == "exited"
|
|
assert registry.is_completion_consumed("proc_log")
|
|
|
|
def test_running_process_not_consumed(self, registry):
|
|
"""poll() on a still-running process does not mark as consumed."""
|
|
s = _make_session(sid="proc_running", notify_on_complete=True, output="partial")
|
|
registry._running[s.id] = s
|
|
|
|
result = registry.poll("proc_running")
|
|
assert result["status"] == "running"
|
|
assert not registry.is_completion_consumed("proc_running")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Silent-background-process hint
|
|
#
|
|
# background=True without notify_on_complete=True OR watch_patterns runs
|
|
# the process silently — the agent has no way to learn it finished short
|
|
# of calling process(action="poll") explicitly. The tool result must
|
|
# include a "hint" field that nudges the agent toward
|
|
# notify_on_complete=True for bounded tasks. May 2026 PR #31231 incident:
|
|
# bg CI poller exited green, agent never noticed, user had to surface it.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _silent_bg_base_config(tmp_path):
|
|
return {
|
|
"env_type": "local",
|
|
"docker_image": "",
|
|
"singularity_image": "",
|
|
"modal_image": "",
|
|
"daytona_image": "",
|
|
"cwd": str(tmp_path),
|
|
"timeout": 30,
|
|
}
|
|
|
|
|
|
def _silent_bg_harness(monkeypatch, tmp_path):
|
|
"""Common test fixture: patch enough of terminal_tool to spawn a fake
|
|
background process and capture the JSON result the agent sees."""
|
|
import tools.terminal_tool as terminal_tool_module
|
|
from tools import process_registry as process_registry_module
|
|
from types import SimpleNamespace
|
|
|
|
config = _silent_bg_base_config(tmp_path)
|
|
dummy_env = SimpleNamespace(env={})
|
|
|
|
def fake_spawn_local(**kwargs):
|
|
return SimpleNamespace(
|
|
id="proc_silent_test",
|
|
pid=4242,
|
|
notify_on_complete=False,
|
|
watcher_platform="",
|
|
watcher_chat_id="",
|
|
watcher_user_id="",
|
|
watcher_user_name="",
|
|
watcher_thread_id="",
|
|
watcher_message_id="",
|
|
watcher_interval=0,
|
|
)
|
|
|
|
monkeypatch.setattr(terminal_tool_module, "_get_env_config", lambda: config)
|
|
monkeypatch.setattr(terminal_tool_module, "_start_cleanup_thread", lambda: None)
|
|
monkeypatch.setattr(terminal_tool_module, "_check_all_guards", lambda *_args, **_kwargs: {"approved": True})
|
|
monkeypatch.setattr(process_registry_module.process_registry, "spawn_local", fake_spawn_local)
|
|
monkeypatch.setitem(terminal_tool_module._active_environments, "default", dummy_env)
|
|
monkeypatch.setitem(terminal_tool_module._last_activity, "default", 0.0)
|
|
return terminal_tool_module
|
|
|
|
|
|
def test_background_without_notify_emits_silent_process_hint(monkeypatch, tmp_path):
|
|
"""The footgun case (May 2026 PR #31231): bg=True alone runs silently
|
|
and the agent has no signal it finished. Tool must nudge."""
|
|
tt = _silent_bg_harness(monkeypatch, tmp_path)
|
|
try:
|
|
result = json.loads(
|
|
tt.terminal_tool(
|
|
command="while true; do gh pr checks 999; sleep 30; done",
|
|
background=True,
|
|
)
|
|
)
|
|
finally:
|
|
tt._active_environments.pop("default", None)
|
|
tt._last_activity.pop("default", None)
|
|
|
|
assert result["session_id"] == "proc_silent_test"
|
|
hint = result.get("hint", "")
|
|
assert hint, "Silent background process must include a hint field"
|
|
assert "notify_on_complete" in hint, (
|
|
"Hint must name the corrective flag so the agent can self-correct"
|
|
)
|
|
assert "silent" in hint.lower() or "no way to learn" in hint.lower(), (
|
|
"Hint must explain the failure mode, not just suggest the fix"
|
|
)
|
|
|
|
|
|
def test_background_with_notify_does_not_emit_hint(monkeypatch, tmp_path):
|
|
"""The correct shape — bg+notify together — must not nag."""
|
|
tt = _silent_bg_harness(monkeypatch, tmp_path)
|
|
try:
|
|
result = json.loads(
|
|
tt.terminal_tool(
|
|
command="pytest tests/",
|
|
background=True,
|
|
notify_on_complete=True,
|
|
)
|
|
)
|
|
finally:
|
|
tt._active_environments.pop("default", None)
|
|
tt._last_activity.pop("default", None)
|
|
|
|
assert "hint" not in result, (
|
|
f"Correct usage must not emit a hint, got: {result.get('hint')!r}"
|
|
)
|
|
assert result.get("notify_on_complete") is True
|
|
|
|
|
|
def test_background_with_watch_patterns_does_not_emit_hint(monkeypatch, tmp_path):
|
|
"""watch_patterns is the other legitimate non-silent shape — also no hint."""
|
|
tt = _silent_bg_harness(monkeypatch, tmp_path)
|
|
try:
|
|
result = json.loads(
|
|
tt.terminal_tool(
|
|
command="uvicorn app:server --port 8080",
|
|
background=True,
|
|
watch_patterns=["Application startup complete"],
|
|
)
|
|
)
|
|
finally:
|
|
tt._active_environments.pop("default", None)
|
|
tt._last_activity.pop("default", None)
|
|
|
|
assert "hint" not in result, (
|
|
f"watch_patterns shape must not emit a silent-process hint, got: {result.get('hint')!r}"
|
|
)
|
|
|
|
|
|
def test_foreground_command_does_not_emit_hint(monkeypatch, tmp_path):
|
|
"""Hint only applies to background processes — foreground returns its
|
|
result synchronously and the agent always sees the outcome."""
|
|
tt = _silent_bg_harness(monkeypatch, tmp_path)
|
|
|
|
# Foreground path doesn't go through spawn_local. Patch the local-env
|
|
# exec method to short-circuit to a clean exit so the test doesn't
|
|
# actually shell out.
|
|
from types import SimpleNamespace
|
|
dummy_env = SimpleNamespace(
|
|
env={},
|
|
execute=lambda *a, **kw: {"output": "done", "exit_code": 0, "error": None},
|
|
)
|
|
monkeypatch.setitem(tt._active_environments, "default", dummy_env)
|
|
|
|
try:
|
|
result = json.loads(
|
|
tt.terminal_tool(
|
|
command="echo hello",
|
|
background=False,
|
|
)
|
|
)
|
|
finally:
|
|
tt._active_environments.pop("default", None)
|
|
tt._last_activity.pop("default", None)
|
|
|
|
assert "hint" not in result, (
|
|
f"Foreground commands must not emit the background-silence hint, got: {result.get('hint')!r}"
|
|
)
|