hermes-agent/tests/tools/test_zombie_process_cleanup.py
Teknium 18f3fc8a6f
fix(tests): resolve 17 persistent CI test failures (#15084)
Make the main-branch test suite pass again. Most failures were tests
still asserting old shapes after recent refactors; two were real source
bugs.

Source fixes:
- tools/mcp_tool.py: _kill_orphaned_mcp_children() slept 2s on every
  shutdown even when no tracked PIDs existed, making test_shutdown_is_parallel
  measure ~3s for 3 parallel 1s shutdowns. Early-return when pids is empty.
- hermes_cli/tips.py: tip 105 was 157 chars; corpus max is 150.

Test fixes (mostly stale mock targets / missing fixture fields):
- test_zombie_process_cleanup, test_agent_cache: patch run_agent.cleanup_vm
  (the local name bound at import), not tools.terminal_tool.cleanup_vm.
- test_browser_camofox: patch tools.browser_camofox.load_config, not
  hermes_cli.config.load_config (the source module, not the resolved one).
- test_flush_memories_codex._chat_response_with_memory_call: add
  finish_reason, tool_call.id, tool_call.type so the chat_completions
  transport normalizer doesn't raise AttributeError.
- test_concurrent_interrupt: polling_tool signature now accepts
  messages= kwarg that _invoke_tool() passes through.
- test_minimax_provider: add _fallback_chain=[] to the __new__'d agent
  so switch_model() doesn't raise AttributeError.
- test_skills_config: SKILLS_DIR MagicMock + .rglob stopped working
  after the scanner switched to agent.skill_utils.iter_skill_index_files
  (os.walk-based). Point SKILLS_DIR at a real tmp_path and patch
  agent.skill_utils.get_external_skills_dirs.
- test_browser_cdp_tool: browser_cdp toolset was intentionally split into
  'browser-cdp' (commit 96b0f3700) so its stricter check_fn doesn't gate
  the whole browser toolset; test now expects 'browser-cdp'.
- test_registry: add tools.browser_dialog_tool to the expected
  builtin-discovery set (PR #14540 added it).
- test_file_tools TestPatchHints: patch_tool surfaces hints as a '_hint'
  key on the JSON payload, not inline '[Hint: ...' text.
- test_write_deny test_hermes_env: resolve .env via get_hermes_home() so
  the path matches the profile-aware denylist under hermetic HERMES_HOME.
- test_checkpoint_manager test_falls_back_to_parent: guard the walk-up
  so a stray /tmp/pyproject.toml on the host doesn't pick up /tmp as the
  project root.
- test_quick_commands: set cli.session_id in the __new__'d CLI so the
  alias-args path doesn't trip AttributeError when fuzzy-matching leaks
  a skill command across xdist test distribution.
2026-04-24 03:46:46 -07:00

293 lines
10 KiB
Python

"""Tests for zombie process cleanup — verifies processes spawned by tools
are properly reaped when agent sessions end.
Reproduction for issue #7131: zombie process accumulation on long-running
gateway deployments.
"""
import os
import signal
import subprocess
import sys
import time
import threading
import pytest
def _spawn_sleep(seconds: float = 60) -> subprocess.Popen:
    """Start a child Python interpreter that sleeps for ``seconds``.

    The child is launched from an argv list using the current interpreter
    (``sys.executable``), with no shell wrapper in between.

    Args:
        seconds: How long the child should sleep before exiting.

    Returns:
        The ``Popen`` handle for the sleeping child.
    """
    child_source = f"import time; time.sleep({seconds})"
    argv = [sys.executable, "-c", child_source]
    return subprocess.Popen(argv)
def _pid_alive(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    Probes the PID with signal 0, which on POSIX performs the existence and
    permission checks without delivering a signal.

    Args:
        pid: Process ID to probe.

    Returns:
        False only when the OS reports that no such process exists;
        True otherwise.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists; we are just not allowed to signal it.
        return True
    return True
class TestZombieReproduction:
    """Demonstrate that subprocesses survive when cleanup is not called."""

    def test_orphaned_processes_survive_without_cleanup(self):
        """REPRODUCTION: processes spawned directly survive if no one kills
        them — this models the gap that causes zombie accumulation when
        the gateway drops agent references without calling close()."""
        pids = []
        try:
            procs = []
            for _ in range(3):
                proc = _spawn_sleep(60)
                procs.append(proc)
                pids.append(proc.pid)

            for pid in pids:
                assert _pid_alive(pid), f"PID {pid} should be alive after spawn"

            # Simulate "session end": drop every Popen handle at once so the
            # before/after comparison really brackets the reference drop.
            del proc, procs

            # BUG: processes are still alive after the references are dropped
            for pid in pids:
                assert _pid_alive(pid), (
                    f"PID {pid} died after ref drop — "
                    f"expected it to survive (demonstrating the bug)"
                )
        finally:
            for pid in pids:
                try:
                    os.kill(pid, signal.SIGKILL)
                    # Reap the child; killing without waiting would leave
                    # this test's own children behind as zombies.
                    os.waitpid(pid, 0)
                except (ProcessLookupError, PermissionError, ChildProcessError):
                    # Already gone or already reaped — nothing left to do.
                    pass

    def test_explicit_terminate_reaps_processes(self):
        """Explicitly terminating+waiting on Popen handles works.

        This models what ProcessRegistry.kill_process does internally."""
        procs = []
        try:
            for _ in range(3):
                procs.append(_spawn_sleep(60))

            for proc in procs:
                assert _pid_alive(proc.pid)

            for proc in procs:
                proc.terminate()
                proc.wait(timeout=5)

            for proc in procs:
                assert proc.returncode is not None, (
                    f"PID {proc.pid} should have exited after terminate+wait"
                )
        finally:
            # Best-effort safety net for anything that outlived terminate().
            for proc in procs:
                try:
                    proc.kill()
                    proc.wait(timeout=1)
                except (OSError, subprocess.TimeoutExpired):
                    pass
class TestAgentCloseMethod:
    """Verify AIAgent.close() exists, is idempotent, and calls cleanup."""

    @staticmethod
    def _prime(agent, session_id, children=()):
        """Assign the attributes these tests set on a bare agent.

        The agent is created with ``__new__``, so only what is assigned
        here exists on it.
        """
        agent.session_id = session_id
        agent._active_children = list(children)
        agent._active_children_lock = threading.Lock()
        agent.client = None
        return agent

    def test_close_calls_cleanup_functions(self):
        """close() should call kill_all, cleanup_vm, cleanup_browser."""
        from unittest.mock import patch

        session = "test-close-cleanup"
        with patch("run_agent.AIAgent.__init__", return_value=None):
            from run_agent import AIAgent

            agent = self._prime(AIAgent.__new__(AIAgent), session)

            with patch("tools.process_registry.process_registry") as registry:
                with patch("run_agent.cleanup_vm") as vm_cleanup:
                    with patch("run_agent.cleanup_browser") as browser_cleanup:
                        agent.close()

            registry.kill_all.assert_called_once_with(task_id=session)
            vm_cleanup.assert_called_once_with(session)
            browser_cleanup.assert_called_once_with(session)

    def test_close_is_idempotent(self):
        """close() can be called multiple times without error."""
        from unittest.mock import patch

        with patch("run_agent.AIAgent.__init__", return_value=None):
            from run_agent import AIAgent

            agent = self._prime(
                AIAgent.__new__(AIAgent), "test-close-idempotent"
            )
            # Three consecutive calls; none of them may raise.
            for _ in range(3):
                agent.close()

    def test_close_propagates_to_children(self):
        """close() should call close() on all active child agents."""
        from unittest.mock import MagicMock, patch

        with patch("run_agent.AIAgent.__init__", return_value=None):
            from run_agent import AIAgent

            first_child, second_child = MagicMock(), MagicMock()
            agent = self._prime(
                AIAgent.__new__(AIAgent),
                "test-close-children",
                children=[first_child, second_child],
            )

            agent.close()

            first_child.close.assert_called_once()
            second_child.close.assert_called_once()
            assert agent._active_children == []

    def test_close_survives_partial_failures(self):
        """close() continues cleanup even if one step fails."""
        from unittest.mock import patch

        with patch("run_agent.AIAgent.__init__", return_value=None):
            from run_agent import AIAgent

            agent = self._prime(AIAgent.__new__(AIAgent), "test-close-partial")

            with patch("tools.process_registry.process_registry") as registry:
                with patch("run_agent.cleanup_vm") as vm_cleanup:
                    with patch("run_agent.cleanup_browser") as browser_cleanup:
                        # Make the first cleanup step blow up.
                        registry.kill_all.side_effect = RuntimeError("boom")
                        agent.close()

            # The remaining steps must still have been attempted.
            vm_cleanup.assert_called_once()
            browser_cleanup.assert_called_once()
class TestGatewayCleanupWiring:
    """Verify gateway lifecycle calls close() on agents."""

    def test_gateway_stop_calls_close(self):
        """gateway stop() should call close() on all running agents."""
        import asyncio
        from unittest.mock import MagicMock, patch

        from gateway.run import GatewayRunner

        first_agent, second_agent = MagicMock(), MagicMock()

        # Build the runner without running __init__ and hand-assign the
        # state this test provides for stop().
        runner = object.__new__(GatewayRunner)
        runner_state = {
            "_running": True,
            "_running_agents": {
                "session-1": first_agent,
                "session-2": second_agent,
            },
            "_running_agents_ts": {},
            "adapters": {},
            "_background_tasks": set(),
            "_pending_messages": {},
            "_pending_approvals": {},
            "_pending_model_notes": {},
            "_shutdown_event": asyncio.Event(),
            "_exit_reason": None,
            "_exit_code": None,
            "_stop_task": None,
            "_draining": False,
            "_restart_requested": False,
            "_restart_task_started": False,
            "_restart_detached": False,
            "_restart_via_service": False,
            "_restart_drain_timeout": 5.0,
            "_voice_mode": {},
            "_session_model_overrides": {},
            "_update_prompt_pending": {},
            "_busy_input_mode": "interrupt",
            "_agent_cache": {},
            "_agent_cache_lock": threading.Lock(),
            "_shutdown_all_gateway_honcho": lambda: None,
            "_update_runtime_status": MagicMock(),
        }
        for attr_name, attr_value in runner_state.items():
            setattr(runner, attr_name, attr_value)

        event_loop = asyncio.new_event_loop()
        try:
            with patch("gateway.status.remove_pid_file"), \
                    patch("gateway.status.write_runtime_status"), \
                    patch("tools.terminal_tool.cleanup_all_environments"), \
                    patch("tools.browser_tool.cleanup_all_browsers"):
                event_loop.run_until_complete(GatewayRunner.stop(runner))
        finally:
            event_loop.close()

        first_agent.close.assert_called()
        second_agent.close.assert_called()

    def test_evict_does_not_call_close(self):
        """_evict_cached_agent() should NOT call close() — it's also used
        for non-destructive refreshes (model switch, branch, fallback)."""
        from unittest.mock import MagicMock

        from gateway.run import GatewayRunner

        cached_agent = MagicMock()
        cache_key = "session-key"

        runner = object.__new__(GatewayRunner)
        runner._agent_cache = {cache_key: (cached_agent, 12345)}
        runner._agent_cache_lock = threading.Lock()

        GatewayRunner._evict_cached_agent(runner, cache_key)

        cached_agent.close.assert_not_called()
        assert cache_key not in runner._agent_cache
class TestDelegationCleanup:
    """Verify subagent delegation cleans up child agents."""

    def test_run_single_child_calls_close(self):
        """_run_single_child should close() the child even when its run fails."""
        from unittest.mock import MagicMock

        from tools.delegate_tool import _run_single_child

        # A child whose conversation raises as soon as it is run.
        failing_child = MagicMock()
        failing_child._delegate_saved_tool_names = ["tool1"]
        failing_child.run_conversation.side_effect = RuntimeError("test abort")

        # A parent that is already tracking that child.
        parent = MagicMock()
        parent._active_children_lock = threading.Lock()
        parent._active_children = [failing_child]

        outcome = _run_single_child(
            task_index=0,
            goal="test goal",
            child=failing_child,
            parent_agent=parent,
        )

        failing_child.close.assert_called_once()
        assert failing_child not in parent._active_children
        assert outcome["status"] == "error"