mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(gateway): treat zombie PIDs as dead in _pid_exists to unblock --replace (closes #42126)
Under systemd Restart=always, the old gateway becomes a zombie (in the process table, awaiting reap) when the replacement starts. _pid_exists() reported the zombie as alive, so --replace waited on a PID that never dies, then aborted with exit 1 — a silent crash loop. Standalone runs are unaffected because nothing respawns the gateway into a zombie. The live path is psutil.pid_exists(), which returns True for zombies, so the check is added there (Process.status() == STATUS_ZOMBIE -> dead). The psutil-less POSIX fallback also reads /proc/<pid>/stat (state Z) with a ps state= fallback for macOS/BSD, before the os.kill(pid, 0) liveness probe. Diagnosis and the /proc + ps POSIX fallback by MorAlekss (PR #44898); extended to cover the psutil hot path so the fix applies on normal installs. Co-authored-by: MorAlekss <mor.aleksandr@yahoo.com>
This commit is contained in:
parent
463225caf1
commit
acca526286
2 changed files with 144 additions and 1 deletions
|
|
@ -546,10 +546,26 @@ def _pid_exists(pid: int) -> bool:
|
|||
"""
|
||||
try:
|
||||
import psutil # type: ignore
|
||||
|
||||
# A zombie (defunct) process is still in the process table, so
|
||||
# ``psutil.pid_exists()`` returns True for it — but it is already
|
||||
# dead: SIGKILL has no effect and it cannot be a running gateway.
|
||||
# Treating a zombie as alive makes ``--replace`` wait for the old
|
||||
# PID to die (it never does, until its parent reaps it), then abort
|
||||
# with exit 1 — a silent crash loop under systemd ``Restart=always``,
|
||||
# which respawns the gateway before reaping the previous process
|
||||
# (issue #42126). Report zombies as dead so the takeover proceeds.
|
||||
try:
|
||||
if psutil.Process(int(pid)).status() == psutil.STATUS_ZOMBIE:
|
||||
return False
|
||||
except psutil.NoSuchProcess:
|
||||
return False
|
||||
except psutil.Error:
|
||||
pass # Access denied / transient — defer to pid_exists below.
|
||||
return bool(psutil.pid_exists(int(pid)))
|
||||
|
||||
except ImportError:
|
||||
pass # Fall through to stdlib fallback.
|
||||
|
||||
if _IS_WINDOWS:
|
||||
try:
|
||||
import ctypes
|
||||
|
|
@ -584,6 +600,31 @@ def _pid_exists(pid: int) -> bool:
|
|||
except (OSError, AttributeError):
|
||||
return False
|
||||
else:
|
||||
# psutil missing (stripped install / scaffold phase). Catch the same
|
||||
# zombie case as the psutil path above (issue #42126): a zombie
|
||||
# answers os.kill(pid, 0) successfully, so without this check
|
||||
# ``--replace`` would wait on a dead PID and abort with exit 1.
|
||||
try:
|
||||
stat_fields = (
|
||||
Path(f"/proc/{int(pid)}/stat").read_text(encoding="utf-8").split()
|
||||
)
|
||||
if len(stat_fields) > 2 and stat_fields[2] == "Z":
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
# No /proc (macOS/BSD) — fall back to ps state.
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["ps", "-o", "state=", "-p", str(int(pid))],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if r.returncode == 0 and r.stdout.strip().startswith("Z"):
|
||||
return False
|
||||
except Exception:
|
||||
pass
|
||||
except (IndexError, PermissionError, OSError):
|
||||
pass
|
||||
try:
|
||||
os.kill(int(pid), 0) # windows-footgun: ok — POSIX-only branch (the whole point of _pid_exists)
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -449,3 +449,105 @@ async def test_signal_initiated_restart_still_persists_stopped(tmp_path, monkeyp
|
|||
assert _stopped_state_persisted(runner), (
|
||||
"a restart must persist gateway_state=stopped via the normal path"
|
||||
)
|
||||
|
||||
|
||||
# ── #42126: zombie PID must be treated as dead in _pid_exists ────────────────
|
||||
# Under systemd Restart=always, the old gateway becomes a zombie (still in the
|
||||
# process table, not yet reaped) when the replacement starts. _pid_exists must
|
||||
# report it dead so --replace proceeds instead of waiting on it and aborting
|
||||
# with exit 1 (a silent crash loop).
|
||||
|
||||
|
||||
def test_pid_exists_zombie_via_psutil_returns_false(monkeypatch):
|
||||
"""The live path is psutil. psutil.pid_exists() returns True for a zombie,
|
||||
so _pid_exists must additionally check Process.status() == STATUS_ZOMBIE."""
|
||||
import sys
|
||||
import types
|
||||
|
||||
from gateway import status
|
||||
|
||||
fake_psutil = types.SimpleNamespace()
|
||||
fake_psutil.STATUS_ZOMBIE = "zombie"
|
||||
|
||||
class NoSuchProcess(Exception):
|
||||
pass
|
||||
|
||||
class PsutilError(Exception):
|
||||
pass
|
||||
|
||||
fake_psutil.NoSuchProcess = NoSuchProcess
|
||||
fake_psutil.Error = PsutilError
|
||||
|
||||
class _Proc:
|
||||
def __init__(self, pid):
|
||||
self.pid = pid
|
||||
|
||||
def status(self):
|
||||
return "zombie"
|
||||
|
||||
fake_psutil.Process = _Proc
|
||||
# Without the zombie guard, this True would make the caller treat the
|
||||
# zombie as a live gateway.
|
||||
fake_psutil.pid_exists = lambda pid: True
|
||||
|
||||
monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
|
||||
|
||||
assert status._pid_exists(4242) is False
|
||||
|
||||
|
||||
def test_pid_exists_live_via_psutil_returns_true(monkeypatch):
|
||||
"""A genuinely running (non-zombie) process is still reported alive."""
|
||||
import sys
|
||||
import types
|
||||
|
||||
from gateway import status
|
||||
|
||||
fake_psutil = types.SimpleNamespace()
|
||||
fake_psutil.STATUS_ZOMBIE = "zombie"
|
||||
fake_psutil.NoSuchProcess = type("NoSuchProcess", (Exception,), {})
|
||||
fake_psutil.Error = type("Error", (Exception,), {})
|
||||
|
||||
class _Proc:
|
||||
def __init__(self, pid):
|
||||
self.pid = pid
|
||||
|
||||
def status(self):
|
||||
return "running"
|
||||
|
||||
fake_psutil.Process = _Proc
|
||||
fake_psutil.pid_exists = lambda pid: True
|
||||
|
||||
monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
|
||||
|
||||
assert status._pid_exists(4242) is True
|
||||
|
||||
|
||||
def test_pid_exists_zombie_via_proc_fallback_returns_false(monkeypatch):
|
||||
"""When psutil is unavailable, the POSIX fallback reads /proc/<pid>/stat
|
||||
and must treat state 'Z' as dead before reaching os.kill."""
|
||||
import builtins
|
||||
import sys
|
||||
|
||||
from gateway import status
|
||||
|
||||
monkeypatch.setitem(sys.modules, "psutil", None) # force ImportError
|
||||
real_import = builtins.__import__
|
||||
|
||||
def _no_psutil(name, *a, **k):
|
||||
if name == "psutil":
|
||||
raise ImportError("psutil disabled for test")
|
||||
return real_import(name, *a, **k)
|
||||
|
||||
monkeypatch.setattr(builtins, "__import__", _no_psutil)
|
||||
monkeypatch.setattr(status, "_IS_WINDOWS", False)
|
||||
|
||||
fake_stat = "4242 (defunct) Z 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
|
||||
fake_path = MagicMock()
|
||||
fake_path.read_text.return_value = fake_stat
|
||||
monkeypatch.setattr(status, "Path", lambda *_a, **_k: fake_path)
|
||||
|
||||
kill = MagicMock()
|
||||
monkeypatch.setattr(status.os, "kill", kill)
|
||||
|
||||
assert status._pid_exists(4242) is False
|
||||
kill.assert_not_called()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue