fix(gateway): treat zombie PIDs as dead in _pid_exists to unblock --replace (closes #42126)

Under systemd Restart=always, the old gateway becomes a zombie (in the
process table, awaiting reap) when the replacement starts. _pid_exists()
reported the zombie as alive, so --replace waited on a PID that never
dies, then aborted with exit 1 — a silent crash loop. Standalone runs are
unaffected because nothing respawns the gateway into a zombie.

The live path is psutil.pid_exists(), which returns True for zombies, so
the check is added there (Process.status() == STATUS_ZOMBIE -> dead). The
psutil-less POSIX fallback also reads /proc/<pid>/stat (state Z) with a ps
state= fallback for macOS/BSD, before the os.kill(pid, 0) liveness probe.

Diagnosis and the /proc + ps POSIX fallback by MorAlekss (PR #44898);
extended to cover the psutil hot path so the fix applies on normal installs.

Co-authored-by: MorAlekss <mor.aleksandr@yahoo.com>
This commit is contained in:
MorAlekss 2026-06-28 02:52:57 -07:00 committed by Teknium
parent 463225caf1
commit acca526286
2 changed files with 144 additions and 1 deletions

View file

@ -546,10 +546,26 @@ def _pid_exists(pid: int) -> bool:
"""
try:
import psutil # type: ignore
# A zombie (defunct) process is still in the process table, so
# ``psutil.pid_exists()`` returns True for it — but it is already
# dead: SIGKILL has no effect and it cannot be a running gateway.
# Treating a zombie as alive makes ``--replace`` wait for the old
# PID to die (it never does, until its parent reaps it), then abort
# with exit 1 — a silent crash loop under systemd ``Restart=always``,
# which respawns the gateway before reaping the previous process
# (issue #42126). Report zombies as dead so the takeover proceeds.
try:
if psutil.Process(int(pid)).status() == psutil.STATUS_ZOMBIE:
return False
except psutil.NoSuchProcess:
return False
except psutil.Error:
pass # Access denied / transient — defer to pid_exists below.
return bool(psutil.pid_exists(int(pid)))
except ImportError:
pass # Fall through to stdlib fallback.
if _IS_WINDOWS:
try:
import ctypes
@ -584,6 +600,31 @@ def _pid_exists(pid: int) -> bool:
except (OSError, AttributeError):
return False
else:
# psutil missing (stripped install / scaffold phase). Catch the same
# zombie case as the psutil path above (issue #42126): a zombie
# answers os.kill(pid, 0) successfully, so without this check
# ``--replace`` would wait on a dead PID and abort with exit 1.
try:
stat_fields = (
Path(f"/proc/{int(pid)}/stat").read_text(encoding="utf-8").split()
)
if len(stat_fields) > 2 and stat_fields[2] == "Z":
return False
except FileNotFoundError:
# No /proc (macOS/BSD) — fall back to ps state.
try:
r = subprocess.run(
["ps", "-o", "state=", "-p", str(int(pid))],
capture_output=True,
text=True,
timeout=5,
)
if r.returncode == 0 and r.stdout.strip().startswith("Z"):
return False
except Exception:
pass
except (IndexError, PermissionError, OSError):
pass
try:
os.kill(int(pid), 0) # windows-footgun: ok — POSIX-only branch (the whole point of _pid_exists)
return True

View file

@ -449,3 +449,105 @@ async def test_signal_initiated_restart_still_persists_stopped(tmp_path, monkeyp
assert _stopped_state_persisted(runner), (
"a restart must persist gateway_state=stopped via the normal path"
)
# ── #42126: zombie PID must be treated as dead in _pid_exists ────────────────
# Under systemd Restart=always, the old gateway becomes a zombie (still in the
# process table, not yet reaped) when the replacement starts. _pid_exists must
# report it dead so --replace proceeds instead of waiting on it and aborting
# with exit 1 (a silent crash loop).
def test_pid_exists_zombie_via_psutil_returns_false(monkeypatch):
"""The live path is psutil. psutil.pid_exists() returns True for a zombie,
so _pid_exists must additionally check Process.status() == STATUS_ZOMBIE."""
import sys
import types
from gateway import status
fake_psutil = types.SimpleNamespace()
fake_psutil.STATUS_ZOMBIE = "zombie"
class NoSuchProcess(Exception):
pass
class PsutilError(Exception):
pass
fake_psutil.NoSuchProcess = NoSuchProcess
fake_psutil.Error = PsutilError
class _Proc:
def __init__(self, pid):
self.pid = pid
def status(self):
return "zombie"
fake_psutil.Process = _Proc
# Without the zombie guard, this True would make the caller treat the
# zombie as a live gateway.
fake_psutil.pid_exists = lambda pid: True
monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
assert status._pid_exists(4242) is False
def test_pid_exists_live_via_psutil_returns_true(monkeypatch):
"""A genuinely running (non-zombie) process is still reported alive."""
import sys
import types
from gateway import status
fake_psutil = types.SimpleNamespace()
fake_psutil.STATUS_ZOMBIE = "zombie"
fake_psutil.NoSuchProcess = type("NoSuchProcess", (Exception,), {})
fake_psutil.Error = type("Error", (Exception,), {})
class _Proc:
def __init__(self, pid):
self.pid = pid
def status(self):
return "running"
fake_psutil.Process = _Proc
fake_psutil.pid_exists = lambda pid: True
monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
assert status._pid_exists(4242) is True
def test_pid_exists_zombie_via_proc_fallback_returns_false(monkeypatch):
"""When psutil is unavailable, the POSIX fallback reads /proc/<pid>/stat
and must treat state 'Z' as dead before reaching os.kill."""
import builtins
import sys
from gateway import status
monkeypatch.setitem(sys.modules, "psutil", None) # force ImportError
real_import = builtins.__import__
def _no_psutil(name, *a, **k):
if name == "psutil":
raise ImportError("psutil disabled for test")
return real_import(name, *a, **k)
monkeypatch.setattr(builtins, "__import__", _no_psutil)
monkeypatch.setattr(status, "_IS_WINDOWS", False)
fake_stat = "4242 (defunct) Z 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
fake_path = MagicMock()
fake_path.read_text.return_value = fake_stat
monkeypatch.setattr(status, "Path", lambda *_a, **_k: fake_path)
kill = MagicMock()
monkeypatch.setattr(status.os, "kill", kill)
assert status._pid_exists(4242) is False
kill.assert_not_called()