test: migrate stale os.kill monkeypatches to gateway.status._pid_exists

PR #21561 migrated liveness probes across 14 call sites from
`os.kill(pid, 0)` to `gateway.status._pid_exists` (psutil-first) so
the gateway doesn't send itself a Ctrl+C on Windows (bpo-14484). A handful
of tests still patched the old `os.kill` seam and either happened to pass
on POSIX (when PID 12345 incidentally wasn't alive on the CI worker) or
failed outright — on CI runs they surfaced as 7 failures, some flaky and
some consistent.

Migrate each affected test to patch the correct seam:

- tests/tools/test_browser_orphan_reaper.py (5 tests)
    Patch `gateway.status._pid_exists` instead of `os.kill`.
    Rename test_permission_error_on_kill_check_skips to
    test_alive_legacy_daemon_is_reaped — the old assertion was
    "PermissionError on sig 0 → skip dir"; post-migration the
    untracked-alive-daemon path always reaps the dir after SIGTERM
    (best-effort semantics were preserved).

- tests/tools/test_windows_native_support.py (4 tests)
    Replace tests that asserted `os.kill` seam behavior with tests
    that exercise `ProcessRegistry._is_host_pid_alive` as a
    delegator and split out a new TestPidExistsOSErrorWidening class
    that hits `gateway.status._pid_exists` directly via the POSIX
    fallback branch (so Windows-style `OSError(WinError 87)` + `PermissionError`
    widening is still covered on Linux CI).

- tests/tools/test_process_registry.py (1 test)
    Mock `psutil.Process` + `_pid_exists` instead of `os.kill`
    for the detached-session kill path.

- tests/tools/test_mcp_stability.py::test_kill_orphaned_uses_sigkill_when_available
    SIGTERM → alive-check → SIGKILL flow now uses `_pid_exists`
    for the middle step; assertion count drops from 3 to 2.

- tests/gateway/test_status.py::TestScopedLocks (2 tests)
    `acquire_scoped_lock` consults `_pid_exists`; patch that
    seam directly instead of trying to control the nested psutil
    call via os.kill monkeypatch.

- tests/hermes_cli/test_gateway.py::test_stop_profile_gateway_keeps_pid_file_when_process_still_running
    The stop loop sends one SIGTERM via os.kill then polls 20x via
    _pid_exists; instrument both separately. Old assertion
    `calls["kill"] == 21` split into `kill == 1` + `alive_probes == 20`.

- tests/hermes_cli/test_auth_toctou_file_modes.py::test_shared_nous_store_writes_0o600_with_0o700_parent
    Commit c34884ea2 switched the pytest seat-belt guard in
    `_nous_shared_store_path()` from `Path.home() / ".hermes"`
    to `get_default_hermes_root()`, which honors HERMES_HOME. The
    test sets both HERMES_HOME and HERMES_SHARED_AUTH_DIR to
    subpaths of the same tmp_path, and the override now collapses
    onto the same path the guard is refusing. Renamed the override
    subdirectory so the two paths diverge — guard passes, test runs.

All 21 original CI failures and their local-flaky siblings now pass
(278 tests across the touched files, 0 failures).
This commit is contained in:
Teknium 2026-05-08 14:18:41 -07:00
parent 291a158441
commit f5ee780124
7 changed files with 160 additions and 80 deletions

View file

@@ -308,54 +308,106 @@ class TestSigkillFallback:
# ---------------------------------------------------------------------------
# OSError widening on os.kill(pid, 0) probes
# OSError widening on liveness probes
#
# Post-#21561, ``ProcessRegistry._is_host_pid_alive`` delegates to
# ``gateway.status._pid_exists``, which is the cross-platform liveness
# primitive (psutil-first, ctypes/os.kill fallback). The tests below assert
# (a) the delegation is correct and (b) ``_pid_exists`` correctly widens
# Windows' ``OSError(WinError 87)`` / ``PermissionError`` behavior on the
# POSIX fallback branch.
# ---------------------------------------------------------------------------
class TestProcessRegistryOSErrorWidening:
"""_is_host_pid_alive must treat Windows' OSError as 'not alive'."""
"""_is_host_pid_alive delegates to gateway.status._pid_exists."""
def test_oserror_treated_as_not_alive(self, monkeypatch):
"""_pid_exists → False propagates as _is_host_pid_alive → False."""
from tools.process_registry import ProcessRegistry
def fake_kill(pid, sig):
# Simulate Windows' WinError 87 for an unknown PID
raise OSError(22, "Invalid argument")
monkeypatch.setattr("tools.process_registry.os.kill", fake_kill)
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: False)
assert ProcessRegistry._is_host_pid_alive(12345) is False
def test_permission_error_treated_as_not_alive(self, monkeypatch):
"""Conservative: PermissionError also means 'not alive' (matches existing behavior)."""
def test_permission_error_treated_as_alive(self, monkeypatch):
"""PermissionError is encoded by _pid_exists as alive=True; propagates as-is.
This is a meaningful semantic change from the pre-#21561 version of
this test (which asserted PermissionError not-alive). The old
``os.kill(pid, 0)``-based probe couldn't distinguish "gone" from
"owned by another user" on some platforms, so it conservatively
returned False. The new psutil-based probe CAN distinguish them via
``OpenProcess + ERROR_ACCESS_DENIED`` on Windows / ``except
PermissionError`` on POSIX, so alive=True is correct.
"""
from tools.process_registry import ProcessRegistry
def fake_kill(pid, sig):
raise PermissionError(1, "Operation not permitted")
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)
assert ProcessRegistry._is_host_pid_alive(12345) is True
monkeypatch.setattr("tools.process_registry.os.kill", fake_kill)
assert ProcessRegistry._is_host_pid_alive(12345) is False
def test_zero_or_none_pid_returns_false_without_calling_kill(self, monkeypatch):
def test_zero_or_none_pid_returns_false_without_probing(self, monkeypatch):
"""No wasted syscall on falsy pids."""
from tools.process_registry import ProcessRegistry
kill_calls = []
probes = []
monkeypatch.setattr(
"tools.process_registry.os.kill",
lambda pid, sig: kill_calls.append(pid),
"gateway.status._pid_exists",
lambda pid: probes.append(pid) or True,
)
assert ProcessRegistry._is_host_pid_alive(None) is False
assert ProcessRegistry._is_host_pid_alive(0) is False
assert kill_calls == []
assert probes == []
def test_alive_pid_returns_true(self, monkeypatch):
from tools.process_registry import ProcessRegistry
# os.kill returning None (default) means "probe succeeded → pid alive"
monkeypatch.setattr("tools.process_registry.os.kill", lambda pid, sig: None)
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)
assert ProcessRegistry._is_host_pid_alive(os.getpid()) is True
class TestPidExistsOSErrorWidening:
    """Exercise ``gateway.status._pid_exists`` directly via its ``os.kill`` path.

    Every test replaces ``sys.modules["psutil"]`` with a stand-in whose
    ``pid_exists`` raises ``ImportError`` (so the psutil-first branch misses),
    sets ``status._IS_WINDOWS`` to ``False``, and swaps ``status.os.kill`` for
    a function raising the error under test. This lets Windows-style errors
    such as ``OSError(WinError 87)`` be simulated on any host.
    """

    @staticmethod
    def _break_psutil(monkeypatch):
        """Install a psutil stand-in whose ``pid_exists`` raises ImportError."""
        import sys

        class _BrokenPsutil:
            @staticmethod
            def pid_exists(pid):
                raise ImportError()

        monkeypatch.setitem(sys.modules, "psutil", _BrokenPsutil())

    def test_oserror_gone_pid_returns_false(self, monkeypatch):
        """A plain ``OSError`` (errno 22, mimicking WinError 87) reads as 'gone'."""
        from gateway import status

        def raise_invalid_argument(pid, sig):
            raise OSError(22, "Invalid argument")

        # Same patch order as before: psutil stub, platform flag, then os.kill.
        self._break_psutil(monkeypatch)
        monkeypatch.setattr(status, "_IS_WINDOWS", False)
        monkeypatch.setattr(status.os, "kill", raise_invalid_argument)

        assert status._pid_exists(12345) is False

    def test_permission_error_returns_true(self, monkeypatch):
        """``PermissionError`` from ``os.kill`` reads as 'alive' (other user's PID)."""
        from gateway import status

        def raise_not_permitted(pid, sig):
            raise PermissionError(1, "Operation not permitted")

        # Same patch order as before: psutil stub, platform flag, then os.kill.
        self._break_psutil(monkeypatch)
        monkeypatch.setattr(status, "_IS_WINDOWS", False)
        monkeypatch.setattr(status.os, "kill", raise_not_permitted)

        assert status._pid_exists(12345) is True
# ---------------------------------------------------------------------------
# tzdata dependency
# ---------------------------------------------------------------------------