diff --git a/gateway/status.py b/gateway/status.py index 9290e0917cd..c0798065afe 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -546,10 +546,26 @@ def _pid_exists(pid: int) -> bool: """ try: import psutil # type: ignore + + # A zombie (defunct) process is still in the process table, so + # ``psutil.pid_exists()`` returns True for it — but it is already + # dead: SIGKILL has no effect and it cannot be a running gateway. + # Treating a zombie as alive makes ``--replace`` wait for the old + # PID to die (it never does, until its parent reaps it), then abort + # with exit 1 — a silent crash loop under systemd ``Restart=always``, + # which respawns the gateway before reaping the previous process + # (issue #42126). Report zombies as dead so the takeover proceeds. + try: + if psutil.Process(int(pid)).status() == psutil.STATUS_ZOMBIE: + return False + except psutil.NoSuchProcess: + return False + except psutil.Error: + pass # Access denied / transient — defer to pid_exists below. return bool(psutil.pid_exists(int(pid))) + except ImportError: pass # Fall through to stdlib fallback. - if _IS_WINDOWS: try: import ctypes @@ -584,6 +600,31 @@ def _pid_exists(pid: int) -> bool: except (OSError, AttributeError): return False else: + # psutil missing (stripped install / scaffold phase). Catch the same + # zombie case as the psutil path above (issue #42126): a zombie + # answers os.kill(pid, 0) successfully, so without this check + # ``--replace`` would wait on a dead PID and abort with exit 1. + try: + stat_fields = ( + Path(f"/proc/{int(pid)}/stat").read_text(encoding="utf-8").split() + ) + if len(stat_fields) > 2 and stat_fields[2] == "Z": + return False + except FileNotFoundError: + # No /proc (macOS/BSD) — fall back to ps state. + try: + r = subprocess.run( + ["ps", "-o", "state=", "-p", str(int(pid))], + capture_output=True, + text=True, + timeout=5, + ) + if r.returncode == 0 and r.stdout.strip().startswith("Z"): + return False + except Exception: + pass + except (IndexError, PermissionError, OSError): + pass try: os.kill(int(pid), 0) # windows-footgun: ok — POSIX-only branch (the whole point of _pid_exists) return True diff --git a/tests/gateway/test_gateway_shutdown.py b/tests/gateway/test_gateway_shutdown.py index 0df0cb90411..9910f3923aa 100644 --- a/tests/gateway/test_gateway_shutdown.py +++ b/tests/gateway/test_gateway_shutdown.py @@ -449,3 +449,105 @@ async def test_signal_initiated_restart_still_persists_stopped(tmp_path, monkeyp assert _stopped_state_persisted(runner), ( "a restart must persist gateway_state=stopped via the normal path" ) + + +# ── #42126: zombie PID must be treated as dead in _pid_exists ──────────────── +# Under systemd Restart=always, the old gateway becomes a zombie (still in the +# process table, not yet reaped) when the replacement starts. _pid_exists must +# report it dead so --replace proceeds instead of waiting on it and aborting +# with exit 1 (a silent crash loop). + + +def test_pid_exists_zombie_via_psutil_returns_false(monkeypatch): + """The live path is psutil. psutil.pid_exists() returns True for a zombie, + so _pid_exists must additionally check Process.status() == STATUS_ZOMBIE.""" + import sys + import types + + from gateway import status + + fake_psutil = types.SimpleNamespace() + fake_psutil.STATUS_ZOMBIE = "zombie" + + class NoSuchProcess(Exception): + pass + + class PsutilError(Exception): + pass + + fake_psutil.NoSuchProcess = NoSuchProcess + fake_psutil.Error = PsutilError + + class _Proc: + def __init__(self, pid): + self.pid = pid + + def status(self): + return "zombie" + + fake_psutil.Process = _Proc + # Without the zombie guard, this True would make the caller treat the + # zombie as a live gateway. + fake_psutil.pid_exists = lambda pid: True + + monkeypatch.setitem(sys.modules, "psutil", fake_psutil) + + assert status._pid_exists(4242) is False + + +def test_pid_exists_live_via_psutil_returns_true(monkeypatch): + """A genuinely running (non-zombie) process is still reported alive.""" + import sys + import types + + from gateway import status + + fake_psutil = types.SimpleNamespace() + fake_psutil.STATUS_ZOMBIE = "zombie" + fake_psutil.NoSuchProcess = type("NoSuchProcess", (Exception,), {}) + fake_psutil.Error = type("Error", (Exception,), {}) + + class _Proc: + def __init__(self, pid): + self.pid = pid + + def status(self): + return "running" + + fake_psutil.Process = _Proc + fake_psutil.pid_exists = lambda pid: True + + monkeypatch.setitem(sys.modules, "psutil", fake_psutil) + + assert status._pid_exists(4242) is True + + +def test_pid_exists_zombie_via_proc_fallback_returns_false(monkeypatch): + """When psutil is unavailable, the POSIX fallback reads /proc//stat + and must treat state 'Z' as dead before reaching os.kill.""" + import builtins + import sys + + from gateway import status + + monkeypatch.setitem(sys.modules, "psutil", None) # force ImportError + real_import = builtins.__import__ + + def _no_psutil(name, *a, **k): + if name == "psutil": + raise ImportError("psutil disabled for test") + return real_import(name, *a, **k) + + monkeypatch.setattr(builtins, "__import__", _no_psutil) + monkeypatch.setattr(status, "_IS_WINDOWS", False) + + fake_stat = "4242 (defunct) Z 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0" + fake_path = MagicMock() + fake_path.read_text.return_value = fake_stat + monkeypatch.setattr(status, "Path", lambda *_a, **_k: fake_path) + + kill = MagicMock() + monkeypatch.setattr(status.os, "kill", kill) + + assert status._pid_exists(4242) is False + kill.assert_not_called()