test: migrate stale os.kill monkeypatches to gateway.status._pid_exists

PR #21561 migrated liveness probes across 14 call sites from
`os.kill(pid, 0)` to `gateway.status._pid_exists` (psutil-first) so
the gateway doesn't Ctrl+C-itself on Windows via bpo-14484. A handful of
tests still patched the old `os.kill` seam and either happened to pass
on POSIX (when PID 12345 incidentally wasn't alive on the CI worker) or
failed outright — on CI runs they surfaced as 7 flaky/stable failures.

Migrate each affected test to patch the correct seam:

- tests/tools/test_browser_orphan_reaper.py (5 tests)
    Patch `gateway.status._pid_exists` instead of `os.kill`.
    Rename test_permission_error_on_kill_check_skips to
    test_alive_legacy_daemon_is_reaped — the old assertion was
    "PermissionError on sig 0 → skip dir"; post-migration the
    untracked-alive-daemon path always reaps the dir after SIGTERM
    (best-effort semantics were preserved).

- tests/tools/test_windows_native_support.py (4 tests)
    Replace tests that asserted `os.kill` seam behavior with tests
    that exercise `ProcessRegistry._is_host_pid_alive` as a
    delegator and split out a new TestPidExistsOSErrorWidening class
    that hits `gateway.status._pid_exists` directly via the POSIX
    fallback branch (so Windows-style `OSError(WinError 87)` + `PermissionError`
    widening is still covered on Linux CI).

- tests/tools/test_process_registry.py (1 test)
    Mock `psutil.Process` + `_pid_exists` instead of `os.kill`
    for the detached-session kill path.

- tests/tools/test_mcp_stability.py::test_kill_orphaned_uses_sigkill_when_available
    SIGTERM → alive-check → SIGKILL flow now uses `_pid_exists`
    for the middle step; assertion count drops from 3 to 2.

- tests/gateway/test_status.py::TestScopedLocks (2 tests)
    `acquire_scoped_lock` consults `_pid_exists`; patch that
    seam directly instead of trying to control the nested psutil
    call via os.kill monkeypatch.

- tests/hermes_cli/test_gateway.py::test_stop_profile_gateway_keeps_pid_file_when_process_still_running
    The stop loop sends one SIGTERM via os.kill then polls 20x via
    _pid_exists; instrument both separately. Old assertion
    `calls["kill"] == 21` split into `kill == 1` + `alive_probes == 20`.

- tests/hermes_cli/test_auth_toctou_file_modes.py::test_shared_nous_store_writes_0o600_with_0o700_parent
    Commit c34884ea2 switched the pytest seat-belt guard in
    `_nous_shared_store_path()` from `Path.home() / ".hermes"`
    to `get_default_hermes_root()`, which honors HERMES_HOME. The
    test sets both HERMES_HOME and HERMES_SHARED_AUTH_DIR to
    subpaths of the same tmp_path, and the override now collapses
    onto the same path the guard is refusing. Renamed the override
    subdirectory so the two paths diverge — guard passes, test runs.

All 21 original CI failures and their local-flaky siblings now pass
(278 tests across the touched files, 0 failures).
This commit is contained in:
Teknium 2026-05-08 14:18:41 -07:00
parent 291a158441
commit f5ee780124
7 changed files with 160 additions and 80 deletions

View file

@ -81,19 +81,18 @@ class TestReapOrphanedBrowserSessions:
d = _make_socket_dir(fake_tmpdir, "h_orphan12345", pid=12345)
kill_calls = []
original_kill = os.kill
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
if sig == 0:
return # pretend process exists
# Don't actually kill anything
with patch("os.kill", side_effect=mock_kill):
# Post-#21561 the liveness probe goes through
# ``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists``
# so it's safe on Windows — ``os.kill(pid, 0)`` is bpo-14484).
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
_reap_orphaned_browser_sessions()
# Should have checked existence (sig 0) then killed (SIGTERM)
assert (12345, 0) in kill_calls
assert (12345, signal.SIGTERM) in kill_calls
def test_tracked_session_is_not_reaped(self, fake_tmpdir):
@ -120,21 +119,31 @@ class TestReapOrphanedBrowserSessions:
# Dir should still exist
assert d.exists()
def test_permission_error_on_kill_check_skips(self, fake_tmpdir):
"""If we can't check the PID (PermissionError), skip it."""
def test_alive_legacy_daemon_is_reaped(self, fake_tmpdir):
"""Alive, untracked, legacy (no owner_pid) daemon is reaped.
Post-#21561 the liveness probe goes through
``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists``
because ``os.kill(pid, 0)`` is a footgun on Windows bpo-14484).
With no owner_pid file and no tracked-name entry, the reaper
SIGTERMs the daemon and removes its socket dir regardless of
whether SIGTERM succeeded (best-effort semantics).
"""
from tools.browser_tool import _reap_orphaned_browser_sessions
d = _make_socket_dir(fake_tmpdir, "h_perm1234567", pid=12345)
def mock_kill(pid, sig):
if sig == 0:
raise PermissionError("not our process")
sigterm_calls = []
with patch("os.kill", side_effect=mock_kill):
def mock_kill(pid, sig):
sigterm_calls.append((pid, sig))
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
_reap_orphaned_browser_sessions()
# Dir should still exist (we didn't touch someone else's process)
assert d.exists()
assert (12345, signal.SIGTERM) in sigterm_calls
assert not d.exists()
def test_cdp_sessions_are_also_reaped(self, fake_tmpdir):
"""CDP sessions (cdp_ prefix) are also scanned."""
@ -196,19 +205,13 @@ class TestOwnerPidCrossProcess:
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
if pid == os.getpid() and sig == 0:
return # real existence check: owner alive
if sig == 0:
return # pretend daemon exists too
# Don't actually kill anything
with patch("os.kill", side_effect=mock_kill):
# Owner alive → reaper skips without ever probing the daemon.
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
_reap_orphaned_browser_sessions()
# We should have checked the owner (sig 0) but never tried to kill
# the daemon.
assert (12345, signal.SIGTERM) not in kill_calls
# Dir should still exist
assert d.exists()
def test_dead_owner_triggers_reap(self, fake_tmpdir):
@ -224,20 +227,15 @@ class TestOwnerPidCrossProcess:
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
if pid == 999999999 and sig == 0:
raise ProcessLookupError # owner dead
if pid == 12345 and sig == 0:
return # daemon still alive
# SIGTERM to daemon — noop in test
with patch("os.kill", side_effect=mock_kill):
# Owner 999999999 dead, daemon 12345 alive.
pid_alive = {999999999: False, 12345: True}
with patch("gateway.status._pid_exists",
side_effect=lambda pid: pid_alive.get(int(pid), False)), \
patch("os.kill", side_effect=mock_kill):
_reap_orphaned_browser_sessions()
# Owner checked (returned dead), daemon checked (alive), daemon killed
assert (999999999, 0) in kill_calls
assert (12345, 0) in kill_calls
assert (12345, signal.SIGTERM) in kill_calls
# Dir cleaned up
assert not d.exists()
def test_corrupt_owner_pid_falls_back_to_legacy(self, fake_tmpdir):
@ -258,7 +256,8 @@ class TestOwnerPidCrossProcess:
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
with patch("os.kill", side_effect=mock_kill):
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
_reap_orphaned_browser_sessions()
# Legacy path took over → tracked → not reaped
@ -266,10 +265,12 @@ class TestOwnerPidCrossProcess:
assert d.exists()
def test_owner_pid_permission_error_treated_as_alive(self, fake_tmpdir):
"""If os.kill(owner, 0) raises PermissionError, treat owner as alive.
"""Owner PID owned by another user → treat as alive.
PermissionError means the PID exists but is owned by a different user
we must not assume the owner is dead (could kill someone else's daemon).
Post-#21561 this is handled inside ``gateway.status._pid_exists``
(via psutil's ``OpenProcess`` returning ``ERROR_ACCESS_DENIED`` on
Windows, or via the POSIX fallback's ``except PermissionError``
branch). Exposed to callers as ``alive=True``.
"""
from tools.browser_tool import _reap_orphaned_browser_sessions
@ -281,13 +282,13 @@ class TestOwnerPidCrossProcess:
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
if pid == 22222 and sig == 0:
raise PermissionError("not our user")
with patch("os.kill", side_effect=mock_kill):
# Owner 22222 reported alive (PermissionError collapses to True
# inside _pid_exists). Daemon never probed, never SIGTERMed.
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
_reap_orphaned_browser_sessions()
# Must NOT have tried to kill the daemon
assert (12345, signal.SIGTERM) not in kill_calls
assert d.exists()