hermes-agent/tests/tools/test_process_registry.py
kshitijk4poor 66827f8947 chore: prune unused imports and duplicate import redefinitions
Remove unused imports (F401) and duplicate/shadowed import
redefinitions (F811) across the codebase using ruff's safe
autofixes. No behavioral changes -- imports only.

- ~1400 safe autofixes applied across 644 files (net -1072 lines)
- __init__.py re-exports preserved (excluded from F401 removal so
  public re-export surfaces stay intact)
- Re-exports that are imported or monkeypatched by tests but look
  unused in their defining module are kept with explicit # noqa:
  F401 (gateway/run.py load_dotenv; run_agent re-exports from
  agent.message_sanitization, agent.context_compressor,
  agent.retry_utils, agent.prompt_builder, agent.process_bootstrap,
  agent.codex_responses_adapter)
- Unsafe F841 (unused-variable) fixes deliberately skipped -- those
  can change behavior when the RHS has side effects
- ruff lints remain disabled in pyproject.toml (only PLW1514 is
  selected); this is a one-time cleanup, not a config change

Verification:
- python -m compileall: clean
- pytest --collect-only: all 27161 tests collect (zero import errors)
- core entry points import clean (run_agent, model_tools, cli,
  toolsets, hermes_state, batch_runner, gateway)
- static scan: every name any test imports directly from an edited
  module still resolves
2026-05-28 22:26:25 -07:00

1167 lines
41 KiB
Python

"""Tests for tools/process_registry.py — ProcessRegistry query methods, pruning, checkpoint."""
import json
import os
import signal
import subprocess
import sys
import time
import pytest
from unittest.mock import MagicMock, patch
from tools.environments.local import _HERMES_PROVIDER_ENV_FORCE_PREFIX
from tools.process_registry import (
ProcessRegistry,
ProcessSession,
FINISHED_TTL_SECONDS,
MAX_PROCESSES,
)
@pytest.fixture()
def registry():
"""Create a fresh ProcessRegistry."""
return ProcessRegistry()
def _make_session(
sid="proc_test123",
command="echo hello",
task_id="t1",
exited=False,
exit_code=None,
output="",
started_at=None,
) -> ProcessSession:
"""Helper to create a ProcessSession for testing."""
s = ProcessSession(
id=sid,
command=command,
task_id=task_id,
started_at=started_at or time.time(),
exited=exited,
exit_code=exit_code,
output_buffer=output,
)
return s
def _spawn_python_sleep(seconds: float) -> subprocess.Popen:
"""Spawn a portable short-lived Python sleep process."""
return subprocess.Popen(
[sys.executable, "-c", f"import time; time.sleep({seconds})"],
)
def _wait_until(predicate, timeout: float = 5.0, interval: float = 0.05) -> bool:
"""Poll a predicate until it returns truthy or the timeout elapses."""
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
if predicate():
return True
time.sleep(interval)
return False
# =========================================================================
# Get / Poll
# =========================================================================
class TestGetAndPoll:
def test_get_not_found(self, registry):
assert registry.get("nonexistent") is None
def test_get_running(self, registry):
s = _make_session()
registry._running[s.id] = s
assert registry.get(s.id) is s
def test_get_finished(self, registry):
s = _make_session(exited=True, exit_code=0)
registry._finished[s.id] = s
assert registry.get(s.id) is s
def test_poll_not_found(self, registry):
result = registry.poll("nonexistent")
assert result["status"] == "not_found"
def test_poll_running(self, registry):
s = _make_session(output="some output here")
registry._running[s.id] = s
result = registry.poll(s.id)
assert result["status"] == "running"
assert "some output" in result["output_preview"]
assert result["command"] == "echo hello"
def test_poll_exited(self, registry):
s = _make_session(exited=True, exit_code=0, output="done")
registry._finished[s.id] = s
result = registry.poll(s.id)
assert result["status"] == "exited"
assert result["exit_code"] == 0
# =========================================================================
# Orphaned-pipe reconciliation (issue #17327)
# =========================================================================
@pytest.mark.skipif(sys.platform == "win32", reason="POSIX-only: uses setsid/fcntl")
class TestOrphanedPipeReconciliation:
"""Regression tests for issue #17327.
`hermes update` in Feishu spawned a background subprocess that restarted
the gateway; the direct child exited quickly but a descendant daemon
held the stdout pipe open. `_reader_loop.finally` never ran, so
`session.exited` stayed False and the agent polled 74 times over 7
minutes, all returning `status: running`.
The fix is `_reconcile_local_exit()`: poll() and wait() now check the
direct `Popen.poll()` before trusting `session.exited`.
"""
def test_reconcile_flips_exited_when_direct_child_done(self, registry):
"""Direct child exited but reader thread is blocked on orphaned pipe."""
# Simulate the orphaned-pipe scenario: direct child exited, but a
# descendant holds stdout open so the reader never sees EOF.
# Approach: spawn `sh -c 'sleep 10 &'` with setsid — sh forks the
# sleep into a new session group, exits immediately, but sleep
# inherits the stdout pipe and keeps it open.
proc = subprocess.Popen(
["sh", "-c", "exec 1>&2; ( sleep 30 ) & disown; exit 0"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
preexec_fn=os.setsid,
)
s = _make_session(sid="proc_orphan_test")
s.process = proc
s.pid = proc.pid
registry._running[s.id] = s
# Wait for the direct child to exit. We don't start a reader thread,
# so session.exited stays False (mimicking the stuck-reader state).
assert _wait_until(lambda: proc.poll() is not None, timeout=5.0), (
"Direct child should exit quickly (sh exits, sleep descendant "
"holds the pipe open)"
)
# Before the fix: poll would return "running" forever.
# After the fix: poll reconciles against proc.poll() and flips.
assert s.exited is False # Precondition: reader hasn't updated it.
result = registry.poll(s.id)
assert result["status"] == "exited", (
f"Expected reconciled 'exited' status; got {result!r}. "
"This is issue #17327 — reader is blocked on orphaned pipe."
)
assert result["exit_code"] == 0
assert s.exited is True
assert s.id in registry._finished
assert s.id not in registry._running
# Clean up the orphaned descendant.
try:
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass
def test_reconcile_noop_when_child_still_running(self, registry):
"""Reconcile must NOT flip exited when the direct child is alive."""
proc = _spawn_python_sleep(5.0)
s = _make_session(sid="proc_running_test")
s.process = proc
s.pid = proc.pid
registry._running[s.id] = s
result = registry.poll(s.id)
assert result["status"] == "running"
assert s.exited is False
proc.kill()
proc.wait()
def test_reconcile_noop_on_already_exited(self, registry):
"""Reconcile is a no-op when session.exited is already True."""
s = _make_session(sid="proc_already_exited", exited=True, exit_code=7)
s.process = MagicMock()
s.process.poll = MagicMock(return_value=0) # Would say exit 0
registry._finished[s.id] = s
registry._reconcile_local_exit(s)
# Must not overwrite the existing exit_code with proc.poll()'s 0.
assert s.exit_code == 7
def test_reconcile_noop_on_no_process(self, registry):
"""Reconcile is a no-op for sessions without a local Popen (env/PTY)."""
s = _make_session(sid="proc_no_popen")
assert getattr(s, "process", None) is None
# Must not raise.
registry._reconcile_local_exit(s)
assert s.exited is False
def test_wait_returns_when_reader_blocked(self, registry):
"""wait() must also reconcile — not just poll()."""
proc = subprocess.Popen(
["sh", "-c", "( sleep 30 ) & disown; exit 0"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
preexec_fn=os.setsid,
)
s = _make_session(sid="proc_wait_orphan")
s.process = proc
s.pid = proc.pid
registry._running[s.id] = s
assert _wait_until(lambda: proc.poll() is not None, timeout=5.0)
start = time.monotonic()
result = registry.wait(s.id, timeout=10)
elapsed = time.monotonic() - start
assert result["status"] == "exited", result
assert elapsed < 5.0, (
f"wait() should return ~immediately via reconcile; took {elapsed:.1f}s"
)
try:
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass
# =========================================================================
# Read log
# =========================================================================
class TestReadLog:
def test_not_found(self, registry):
result = registry.read_log("nonexistent")
assert result["status"] == "not_found"
def test_read_full_log(self, registry):
lines = "\n".join([f"line {i}" for i in range(50)])
s = _make_session(output=lines)
registry._running[s.id] = s
result = registry.read_log(s.id)
assert result["total_lines"] == 50
def test_read_with_limit(self, registry):
lines = "\n".join([f"line {i}" for i in range(100)])
s = _make_session(output=lines)
registry._running[s.id] = s
result = registry.read_log(s.id, limit=10)
# Default: last 10 lines
assert "10 lines" in result["showing"]
def test_read_with_offset(self, registry):
lines = "\n".join([f"line {i}" for i in range(100)])
s = _make_session(output=lines)
registry._running[s.id] = s
result = registry.read_log(s.id, offset=10, limit=5)
assert "5 lines" in result["showing"]
# =========================================================================
# Stdin helpers
# =========================================================================
class TestStdinHelpers:
def test_close_stdin_not_found(self, registry):
result = registry.close_stdin("nonexistent")
assert result["status"] == "not_found"
def test_close_stdin_pipe_mode(self, registry):
proc = MagicMock()
proc.stdin = MagicMock()
s = _make_session()
s.process = proc
registry._running[s.id] = s
result = registry.close_stdin(s.id)
proc.stdin.close.assert_called_once()
assert result["status"] == "ok"
def test_close_stdin_pty_mode(self, registry):
pty = MagicMock()
s = _make_session()
s._pty = pty
registry._running[s.id] = s
result = registry.close_stdin(s.id)
pty.sendeof.assert_called_once()
assert result["status"] == "ok"
def test_close_stdin_allows_eof_driven_process_to_finish(self, registry, tmp_path):
"""PTY mode: writing data + sending EOF lets an EOF-driven child finish.
Background non-PTY mode used to expose subprocess stdin via a pipe,
but PR #214b95392 detached non-PTY stdin to DEVNULL to fix keyboard
lockout (#17959). For interactive stdin → PTY mode is now the only
supported path.
"""
session = registry.spawn_local(
'python3 -c "import sys; print(sys.stdin.read().strip())"',
cwd=str(tmp_path),
use_pty=True,
)
try:
time.sleep(0.5)
assert registry.submit_stdin(session.id, "hello")["status"] == "ok"
assert registry.close_stdin(session.id)["status"] == "ok"
deadline = time.time() + 5
while time.time() < deadline:
poll = registry.poll(session.id)
if poll["status"] == "exited":
assert poll["exit_code"] == 0
assert "hello" in poll["output_preview"]
return
time.sleep(0.2)
pytest.fail("process did not exit after stdin was closed")
finally:
registry.kill_process(session.id)
# =========================================================================
# List sessions
# =========================================================================
class TestListSessions:
def test_empty(self, registry):
assert registry.list_sessions() == []
def test_lists_running_and_finished(self, registry):
s1 = _make_session(sid="proc_1", task_id="t1")
s2 = _make_session(sid="proc_2", task_id="t1", exited=True, exit_code=0)
registry._running[s1.id] = s1
registry._finished[s2.id] = s2
result = registry.list_sessions()
assert len(result) == 2
def test_filter_by_task_id(self, registry):
s1 = _make_session(sid="proc_1", task_id="t1")
s2 = _make_session(sid="proc_2", task_id="t2")
registry._running[s1.id] = s1
registry._running[s2.id] = s2
result = registry.list_sessions(task_id="t1")
assert len(result) == 1
assert result[0]["session_id"] == "proc_1"
def test_list_entry_fields(self, registry):
s = _make_session(output="preview text")
registry._running[s.id] = s
entry = registry.list_sessions()[0]
assert "session_id" in entry
assert "command" in entry
assert "status" in entry
assert "pid" in entry
assert "output_preview" in entry
# =========================================================================
# Active process queries
# =========================================================================
class TestActiveQueries:
def test_has_active_processes(self, registry):
s = _make_session(task_id="t1")
registry._running[s.id] = s
assert registry.has_active_processes("t1") is True
assert registry.has_active_processes("t2") is False
def test_has_active_for_session(self, registry):
s = _make_session()
s.session_key = "gw_session_1"
registry._running[s.id] = s
assert registry.has_active_for_session("gw_session_1") is True
assert registry.has_active_for_session("other") is False
def test_exited_not_active(self, registry):
s = _make_session(task_id="t1", exited=True, exit_code=0)
registry._finished[s.id] = s
assert registry.has_active_processes("t1") is False
# =========================================================================
# Pruning
# =========================================================================
class TestPruning:
def test_prune_expired_finished(self, registry):
old_session = _make_session(
sid="proc_old",
exited=True,
started_at=time.time() - FINISHED_TTL_SECONDS - 100,
)
registry._finished[old_session.id] = old_session
registry._prune_if_needed()
assert "proc_old" not in registry._finished
def test_prune_keeps_recent(self, registry):
recent = _make_session(sid="proc_recent", exited=True)
registry._finished[recent.id] = recent
registry._prune_if_needed()
assert "proc_recent" in registry._finished
def test_prune_over_max_removes_oldest(self, registry):
# Fill up to MAX_PROCESSES
for i in range(MAX_PROCESSES):
s = _make_session(
sid=f"proc_{i}",
exited=True,
started_at=time.time() - i, # older as i increases
)
registry._finished[s.id] = s
# Add one more running to trigger prune
s = _make_session(sid="proc_new")
registry._running[s.id] = s
registry._prune_if_needed()
total = len(registry._running) + len(registry._finished)
assert total <= MAX_PROCESSES
# =========================================================================
# Spawn env sanitization
# =========================================================================
class TestSpawnEnvSanitization:
def test_spawn_local_strips_blocked_vars_from_background_env(self, registry):
captured = {}
def fake_popen(cmd, **kwargs):
captured["env"] = kwargs["env"]
proc = MagicMock()
proc.pid = 4321
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
return proc
fake_thread = MagicMock()
with patch.dict(os.environ, {
"PATH": "/usr/bin:/bin",
"HOME": "/home/user",
"USER": "tester",
"TELEGRAM_BOT_TOKEN": "bot-secret",
"FIRECRAWL_API_KEY": "fc-secret",
}, clear=True), \
patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", side_effect=fake_popen), \
patch("threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint"):
registry.spawn_local(
"echo hello",
cwd="/tmp",
env_vars={
"MY_CUSTOM_VAR": "keep-me",
"TELEGRAM_BOT_TOKEN": "drop-me",
f"{_HERMES_PROVIDER_ENV_FORCE_PREFIX}TELEGRAM_BOT_TOKEN": "forced-bot-token",
},
)
env = captured["env"]
assert env["MY_CUSTOM_VAR"] == "keep-me"
assert env["TELEGRAM_BOT_TOKEN"] == "forced-bot-token"
assert "FIRECRAWL_API_KEY" not in env
assert f"{_HERMES_PROVIDER_ENV_FORCE_PREFIX}TELEGRAM_BOT_TOKEN" not in env
assert env["PYTHONUNBUFFERED"] == "1"
def test_spawn_via_env_uses_backend_temp_dir_for_artifacts(self, registry):
class FakeEnv:
def __init__(self):
self.commands = []
def get_temp_dir(self):
return "/data/data/com.termux/files/usr/tmp"
def execute(self, command, timeout=None):
self.commands.append((command, timeout))
return {"output": "4321\n"}
env = FakeEnv()
fake_thread = MagicMock()
with patch("tools.process_registry.threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint"):
session = registry.spawn_via_env(env, "echo hello")
bg_command = env.commands[0][0]
assert session.pid == 4321
assert "/data/data/com.termux/files/usr/tmp/hermes_bg_" in bg_command
assert ".exit" in bg_command
assert "rc=$?;" in bg_command
assert " > /tmp/hermes_bg_" not in bg_command
assert "cat /tmp/hermes_bg_" not in bg_command
fake_thread.start.assert_called_once()
def test_env_poller_quotes_temp_paths_with_spaces(self, registry):
session = _make_session(sid="proc_space")
session.exited = False
class FakeEnv:
def __init__(self):
self.commands = []
self._responses = iter([
{"output": "hello\n"},
{"output": "1\n"},
{"output": "0\n"},
])
def execute(self, command, timeout=None):
self.commands.append((command, timeout))
return next(self._responses)
env = FakeEnv()
with patch("tools.process_registry.time.sleep", return_value=None), \
patch.object(registry, "_move_to_finished"):
registry._env_poller_loop(
session,
env,
"/path with spaces/hermes_bg.log",
"/path with spaces/hermes_bg.pid",
"/path with spaces/hermes_bg.exit",
)
assert env.commands[0][0] == "cat '/path with spaces/hermes_bg.log' 2>/dev/null"
assert env.commands[1][0] == "kill -0 \"$(cat '/path with spaces/hermes_bg.pid' 2>/dev/null)\" 2>/dev/null; echo $?"
assert env.commands[2][0] == "cat '/path with spaces/hermes_bg.exit' 2>/dev/null"
# =========================================================================
# Popen leak prevention
# =========================================================================
class TestPopenLeakOnSetupFailure:
"""Regression for issue #2749: subprocess orphaned when post-Popen setup raises."""
def test_popen_killed_when_thread_creation_fails(self, registry):
"""If Thread() raises after Popen, proc must be killed — not orphaned."""
killed = []
proc = MagicMock()
proc.pid = 9999
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
def fake_kill():
killed.append(True)
proc.kill = fake_kill
proc.wait = MagicMock()
def boom(*args, **kwargs):
raise RuntimeError("Thread creation failed")
with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", return_value=proc), \
patch("threading.Thread", side_effect=boom), \
patch.object(registry, "_write_checkpoint"):
with pytest.raises(RuntimeError, match="Thread creation failed"):
registry.spawn_local("echo hello", cwd="/tmp")
assert killed, "proc.kill() must be called when post-Popen setup raises"
def test_popen_killed_when_write_checkpoint_fails(self, registry):
"""If _write_checkpoint raises after Popen, proc must still be killed."""
killed = []
proc = MagicMock()
proc.pid = 8888
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
def fake_kill():
killed.append(True)
proc.kill = fake_kill
proc.wait = MagicMock()
fake_thread = MagicMock()
with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", return_value=proc), \
patch("threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint", side_effect=OSError("disk full")):
with pytest.raises(OSError, match="disk full"):
registry.spawn_local("echo hello", cwd="/tmp")
assert killed, "proc.kill() must be called when _write_checkpoint raises"
def test_popen_not_killed_on_success(self, registry):
"""Successful spawn must NOT kill the process."""
killed = []
proc = MagicMock()
proc.pid = 7777
proc.stdout = iter([])
proc.stdin = MagicMock()
proc.poll.return_value = None
def fake_kill():
killed.append(True)
proc.kill = fake_kill
proc.wait = MagicMock()
fake_thread = MagicMock()
with patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
patch("subprocess.Popen", return_value=proc), \
patch("threading.Thread", return_value=fake_thread), \
patch.object(registry, "_write_checkpoint"):
session = registry.spawn_local("echo hello", cwd="/tmp")
assert not killed, "proc.kill() must NOT be called on successful spawn"
assert session.pid == 7777
# =========================================================================
# Checkpoint
# =========================================================================
class TestCheckpoint:
def test_write_checkpoint(self, registry, tmp_path):
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
s = _make_session()
registry._running[s.id] = s
registry._write_checkpoint()
data = json.loads((tmp_path / "procs.json").read_text())
assert len(data) == 1
assert data[0]["session_id"] == s.id
def test_recover_no_file(self, registry, tmp_path):
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "missing.json"):
assert registry.recover_from_checkpoint() == 0
def test_recover_dead_pid(self, registry, tmp_path):
checkpoint = tmp_path / "procs.json"
checkpoint.write_text(json.dumps([{
"session_id": "proc_dead",
"command": "sleep 999",
"pid": 999999999, # almost certainly not running
"task_id": "t1",
}]))
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
recovered = registry.recover_from_checkpoint()
assert recovered == 0
def test_write_checkpoint_includes_watcher_metadata(self, registry, tmp_path):
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
s = _make_session()
s.watcher_platform = "telegram"
s.watcher_chat_id = "999"
s.watcher_user_id = "u123"
s.watcher_user_name = "alice"
s.watcher_thread_id = "42"
s.watcher_interval = 60
registry._running[s.id] = s
registry._write_checkpoint()
data = json.loads((tmp_path / "procs.json").read_text())
assert len(data) == 1
assert data[0]["watcher_platform"] == "telegram"
assert data[0]["watcher_chat_id"] == "999"
assert data[0]["watcher_user_id"] == "u123"
assert data[0]["watcher_user_name"] == "alice"
assert data[0]["watcher_thread_id"] == "42"
assert data[0]["watcher_interval"] == 60
def test_recover_enqueues_watchers(self, registry, tmp_path):
checkpoint = tmp_path / "procs.json"
checkpoint.write_text(json.dumps([{
"session_id": "proc_live",
"command": "sleep 999",
"pid": os.getpid(), # current process — guaranteed alive
"task_id": "t1",
"session_key": "sk1",
"watcher_platform": "telegram",
"watcher_chat_id": "123",
"watcher_user_id": "u123",
"watcher_user_name": "alice",
"watcher_thread_id": "42",
"watcher_interval": 60,
}]))
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
recovered = registry.recover_from_checkpoint()
assert recovered == 1
assert len(registry.pending_watchers) == 1
w = registry.pending_watchers[0]
assert w["session_id"] == "proc_live"
assert w["platform"] == "telegram"
assert w["chat_id"] == "123"
assert w["user_id"] == "u123"
assert w["user_name"] == "alice"
assert w["thread_id"] == "42"
assert w["check_interval"] == 60
def test_recover_skips_watcher_when_no_interval(self, registry, tmp_path):
checkpoint = tmp_path / "procs.json"
checkpoint.write_text(json.dumps([{
"session_id": "proc_live",
"command": "sleep 999",
"pid": os.getpid(),
"task_id": "t1",
"watcher_interval": 0,
}]))
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
recovered = registry.recover_from_checkpoint()
assert recovered == 1
assert len(registry.pending_watchers) == 0
def test_recovery_keeps_live_checkpoint_entries(self, registry, tmp_path):
checkpoint = tmp_path / "procs.json"
checkpoint.write_text(json.dumps([{
"session_id": "proc_live",
"command": "sleep 999",
"pid": os.getpid(),
"task_id": "t1",
"session_key": "sk1",
}]))
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
recovered = registry.recover_from_checkpoint()
assert recovered == 1
assert registry.get("proc_live") is not None
data = json.loads(checkpoint.read_text())
assert len(data) == 1
assert data[0]["session_id"] == "proc_live"
assert data[0]["pid"] == os.getpid()
assert data != []
def test_recovery_skips_explicit_sandbox_backed_entries(self, registry, tmp_path):
checkpoint = tmp_path / "procs.json"
original = [{
"session_id": "proc_remote",
"command": "sleep 999",
"pid": os.getpid(),
"task_id": "t1",
"pid_scope": "sandbox",
}]
checkpoint.write_text(json.dumps(original))
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
recovered = registry.recover_from_checkpoint()
assert recovered == 0
assert registry.get("proc_remote") is None
data = json.loads(checkpoint.read_text())
assert data == []
def test_detached_recovered_process_eventually_exits(self, registry, tmp_path):
proc = _spawn_python_sleep(0.4)
checkpoint = tmp_path / "procs.json"
checkpoint.write_text(json.dumps([{
"session_id": "proc_live",
"command": "python -c 'import time; time.sleep(0.4)'",
"pid": proc.pid,
"task_id": "t1",
"session_key": "sk1",
}]))
try:
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
recovered = registry.recover_from_checkpoint()
assert recovered == 1
session = registry.get("proc_live")
assert session is not None
assert session.detached is True
proc.wait(timeout=5)
assert _wait_until(
lambda: registry.get("proc_live") is not None
and registry.get("proc_live").exited,
timeout=5,
)
poll_result = registry.poll("proc_live")
assert poll_result["status"] == "exited"
wait_result = registry.wait("proc_live", timeout=1)
assert wait_result["status"] == "exited"
finally:
if proc.poll() is None:
proc.terminate()
try:
proc.wait(timeout=5)
except Exception:
proc.kill()
proc.wait(timeout=5)
# =========================================================================
# Kill process
# =========================================================================
class TestKillProcess:
def test_kill_not_found(self, registry):
result = registry.kill_process("nonexistent")
assert result["status"] == "not_found"
def test_kill_already_exited(self, registry):
s = _make_session(exited=True, exit_code=0)
registry._finished[s.id] = s
result = registry.kill_process(s.id)
assert result["status"] == "already_exited"
def test_kill_detached_session_uses_host_pid(self, registry):
s = _make_session(sid="proc_detached", command="sleep 999")
s.pid = 424242
s.detached = True
registry._running[s.id] = s
terminate_calls = []
class FakeProcess:
def __init__(self, pid):
self.pid = pid
def children(self, recursive=False):
return []
def terminate(self):
terminate_calls.append(("terminate", self.pid))
import psutil as _psutil
try:
# Post-#21561: liveness probe routes through
# ``ProcessRegistry._is_host_pid_alive`` (→
# ``gateway.status._pid_exists``), and the actual kill on POSIX
# routes through ``psutil.Process(pid).terminate()``. Neither
# touches ``os.kill`` directly. Mock both seams.
with patch("gateway.status._pid_exists", return_value=True), \
patch.object(_psutil, "Process", side_effect=lambda pid: FakeProcess(pid)):
result = registry.kill_process(s.id)
assert result["status"] == "killed"
assert ("terminate", 424242) in terminate_calls
finally:
registry._running.pop(s.id, None)
# =========================================================================
# Tool handler
# =========================================================================
class TestProcessToolHandler:
def test_list_action(self):
from tools.process_registry import _handle_process
result = json.loads(_handle_process({"action": "list"}))
assert "processes" in result
def test_poll_missing_session_id(self):
from tools.process_registry import _handle_process
result = json.loads(_handle_process({"action": "poll"}))
assert "error" in result
def test_unknown_action(self):
from tools.process_registry import _handle_process
result = json.loads(_handle_process({"action": "unknown_action"}))
assert "error" in result
# =========================================================================
# format_process_notification + drain_notifications (shared helpers)
# =========================================================================
from tools.process_registry import format_process_notification
def test_format_completion_event():
evt = {
"type": "completion",
"session_id": "proc_abc",
"command": "sleep 5",
"exit_code": 0,
"output": "done",
}
result = format_process_notification(evt)
assert "[IMPORTANT: Background process proc_abc completed" in result
assert "exit code 0" in result
assert "Command: sleep 5" in result
assert "Output:\ndone]" in result
def test_format_watch_match_event():
evt = {
"type": "watch_match",
"session_id": "proc_xyz",
"command": "tail -f log",
"pattern": "ERROR",
"output": "ERROR: disk full",
"suppressed": 0,
}
result = format_process_notification(evt)
assert 'watch pattern "ERROR"' in result
assert "Matched output:\nERROR: disk full" in result
def test_format_watch_match_with_suppressed():
evt = {
"type": "watch_match",
"session_id": "proc_xyz",
"command": "tail -f log",
"pattern": "WARN",
"output": "WARN: low mem",
"suppressed": 3,
}
result = format_process_notification(evt)
assert "3 earlier matches were suppressed" in result
def test_format_watch_disabled_event():
evt = {
"type": "watch_disabled",
"message": "Watch disabled for proc_xyz: too many matches",
}
result = format_process_notification(evt)
assert "[IMPORTANT: Watch disabled for proc_xyz" in result
def test_format_returns_none_for_empty_event():
evt = {}
result = format_process_notification(evt)
assert result is not None
assert "unknown" in result
def test_drain_notifications_returns_pending_events():
from tools.process_registry import process_registry
while not process_registry.completion_queue.empty():
process_registry.completion_queue.get_nowait()
process_registry.completion_queue.put({
"type": "completion",
"session_id": "proc_drain1",
"command": "echo hi",
"exit_code": 0,
"output": "hi",
})
process_registry.completion_queue.put({
"type": "watch_match",
"session_id": "proc_drain2",
"command": "tail -f x",
"pattern": "ERR",
"output": "ERR found",
"suppressed": 0,
})
try:
results = process_registry.drain_notifications()
assert len(results) == 2
assert results[0][0]["session_id"] == "proc_drain1"
assert "proc_drain1 completed" in results[0][1]
assert results[1][0]["session_id"] == "proc_drain2"
assert "watch pattern" in results[1][1]
finally:
while not process_registry.completion_queue.empty():
process_registry.completion_queue.get_nowait()
process_registry._completion_consumed.discard("proc_drain1")
process_registry._completion_consumed.discard("proc_drain2")
def test_drain_notifications_skips_consumed():
from tools.process_registry import process_registry
while not process_registry.completion_queue.empty():
process_registry.completion_queue.get_nowait()
process_registry._completion_consumed.add("proc_consumed")
process_registry.completion_queue.put({
"type": "completion",
"session_id": "proc_consumed",
"command": "echo done",
"exit_code": 0,
"output": "done",
})
try:
results = process_registry.drain_notifications()
assert len(results) == 0
finally:
process_registry._completion_consumed.discard("proc_consumed")
while not process_registry.completion_queue.empty():
process_registry.completion_queue.get_nowait()
def test_drain_notifications_empty_queue():
from tools.process_registry import process_registry
while not process_registry.completion_queue.empty():
process_registry.completion_queue.get_nowait()
results = process_registry.drain_notifications()
assert results == []
# ---------------------------------------------------------------------------
# _terminate_host_pid — cross-platform process-tree termination
# ---------------------------------------------------------------------------
class TestTerminateHostPidWindows:
"""Windows branch uses ``taskkill /T /F`` — the documented MS tree-kill
primitive. We can't use psutil's ``children(recursive=True)`` /
``.terminate()`` path on Windows because (1) Windows doesn't maintain
a Unix-style process tree so the walk is unreliable, and (2)
``Process.terminate()`` on Windows is ``TerminateProcess()`` for the
target handle only, not the tree.
"""
def test_windows_invokes_taskkill_with_tree_and_force_flags(self, monkeypatch):
"""The Windows branch must shell out to ``taskkill /PID N /T /F``."""
from tools import process_registry as pr
captured = {}
def fake_run(args, **kwargs):
captured["args"] = args
captured["kwargs"] = kwargs
return MagicMock(returncode=0, stderr="", stdout="")
monkeypatch.setattr(pr, "_IS_WINDOWS", True)
monkeypatch.setattr(pr.subprocess, "run", fake_run)
pr.ProcessRegistry._terminate_host_pid(12345)
assert captured["args"][0] == "taskkill"
assert "/PID" in captured["args"]
assert "12345" in captured["args"]
assert "/T" in captured["args"], "Tree flag required to reach descendants"
assert "/F" in captured["args"], "Force flag required for headless Chromium"
def test_windows_falls_back_to_os_kill_when_taskkill_missing(self, monkeypatch):
"""If ``taskkill.exe`` is somehow unavailable, fall back to a bare
``os.kill(pid, SIGTERM)`` so we at least try to kill the parent."""
from tools import process_registry as pr
kill_calls = []
def fake_run(*args, **kwargs):
raise FileNotFoundError("taskkill not found")
def fake_kill(pid, sig):
kill_calls.append((pid, sig))
monkeypatch.setattr(pr, "_IS_WINDOWS", True)
monkeypatch.setattr(pr.subprocess, "run", fake_run)
monkeypatch.setattr(pr.os, "kill", fake_kill)
pr.ProcessRegistry._terminate_host_pid(12345)
assert kill_calls == [(12345, signal.SIGTERM)]
def test_windows_does_not_call_psutil(self, monkeypatch):
"""The Windows branch must NOT exercise the psutil tree-walk
(it's unreliable on Windows — see the function docstring)."""
from tools import process_registry as pr
import psutil
psutil_calls = []
class _BoomProcess:
def __init__(self, pid):
psutil_calls.append(("Process", pid))
def children(self, recursive=False):
psutil_calls.append(("children", recursive))
return []
def terminate(self):
psutil_calls.append(("terminate",))
def fake_run(args, **kwargs):
return MagicMock(returncode=0, stderr="", stdout="")
monkeypatch.setattr(pr, "_IS_WINDOWS", True)
monkeypatch.setattr(pr.subprocess, "run", fake_run)
monkeypatch.setattr(psutil, "Process", _BoomProcess)
pr.ProcessRegistry._terminate_host_pid(12345)
assert psutil_calls == [], (
f"Windows branch must not touch psutil, but saw {psutil_calls!r}"
)
class TestTerminateHostPidPosix:
"""POSIX branch walks the tree via psutil and SIGTERMs children first."""
def test_posix_walks_tree_and_terminates_children_then_parent(self, monkeypatch):
from tools import process_registry as pr
import psutil
terminate_order = []
class _FakeChild:
def __init__(self, pid):
self.pid = pid
def terminate(self):
terminate_order.append(self.pid)
class _FakeParent:
def __init__(self, pid):
self.pid = pid
def children(self, recursive=False):
assert recursive is True
return [_FakeChild(101), _FakeChild(102), _FakeChild(103)]
def terminate(self):
terminate_order.append(self.pid)
monkeypatch.setattr(pr, "_IS_WINDOWS", False)
monkeypatch.setattr(psutil, "Process", _FakeParent)
pr.ProcessRegistry._terminate_host_pid(12345)
assert terminate_order == [101, 102, 103, 12345], (
"Children must be terminated before the parent"
)
def test_posix_no_such_process_swallowed(self, monkeypatch):
from tools import process_registry as pr
import psutil
def boom(pid):
raise psutil.NoSuchProcess(pid)
monkeypatch.setattr(pr, "_IS_WINDOWS", False)
monkeypatch.setattr(psutil, "Process", boom)
# Must not raise.
pr.ProcessRegistry._terminate_host_pid(999999999)
def test_posix_oserror_falls_back_to_os_kill(self, monkeypatch):
from tools import process_registry as pr
import psutil
def boom(pid):
raise PermissionError("can't read /proc")
kill_calls = []
def fake_kill(pid, sig):
kill_calls.append((pid, sig))
monkeypatch.setattr(pr, "_IS_WINDOWS", False)
monkeypatch.setattr(psutil, "Process", boom)
monkeypatch.setattr(pr.os, "kill", fake_kill)
pr.ProcessRegistry._terminate_host_pid(12345)
assert kill_calls == [(12345, signal.SIGTERM)]