fix(gateway): abort --replace when old PID survives SIGKILL

When --replace force-kills an unresponsive old gateway, SIGKILL can fail
to reap it (uninterruptible sleep, zombie-reaping parent, etc.). The old
code unconditionally cleared the PID file and scoped locks and started a
fresh instance anyway, leaving two live gateways fighting over the same
bot token — a duplicate-gateway failure mode of #19471.

Re-verify the process is actually gone (via the Windows-safe _pid_exists
helper) after the force-kill; if it still appears alive, clear the
takeover marker and abort the replacement instead of duplicating.

Co-authored-by: Hermes <noreply@nousresearch.com>
This commit is contained in:
LeonSGP43 2026-06-07 23:15:18 -07:00 committed by Teknium
parent 3714caa1b9
commit e02f4c03c3
2 changed files with 99 additions and 4 deletions

View file

@ -18765,8 +18765,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
# ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the
# handle-based existence check instead.
from gateway.status import _pid_exists
old_gateway_exited = False
for _ in range(20):
if not _pid_exists(existing_pid):
old_gateway_exited = True
break # Process is gone
time.sleep(0.5)
else:
@ -18777,9 +18779,34 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
)
try:
terminate_pid(existing_pid, force=True)
time.sleep(0.5)
except (ProcessLookupError, PermissionError, OSError):
except ProcessLookupError:
old_gateway_exited = True
except (PermissionError, OSError):
pass
# Confirm the force-kill actually reaped the process before we
# clear its PID file / scoped locks. SIGKILL can fail to take
# (e.g. an uninterruptible-sleep or zombie-reaping parent), and
# if we blindly clear the metadata and start a fresh instance
# we end up with two live gateways fighting over the same
# token — the duplicate-gateway failure in #19471.
if not old_gateway_exited:
for _ in range(20):
if not _pid_exists(existing_pid):
old_gateway_exited = True
break
time.sleep(0.25)
if not old_gateway_exited:
logger.error(
"Old gateway (PID %d) still appears alive after SIGKILL; "
"aborting replacement to avoid a duplicate gateway.",
existing_pid,
)
try:
from gateway.status import clear_takeover_marker
clear_takeover_marker()
except Exception:
pass
return False
remove_pid_file()
# remove_pid_file() is a no-op when the PID doesn't match.
# Force-unlink to cover the old-process-crashed case.

View file

@ -206,8 +206,17 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
"gateway.status.release_all_scoped_locks",
lambda **kwargs: 0,
)
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)
# force-kill reaps the process: terminate_pid(force=True) flips it dead,
# and the post-kill re-poll via _pid_exists then sees it gone so the
# replacement proceeds.
def _mock_terminate_pid(pid, force=False):
calls.append((pid, force))
if force:
_pid_state["alive"] = False
monkeypatch.setattr("gateway.status.terminate_pid", _mock_terminate_pid)
monkeypatch.setattr(
"gateway.status._pid_exists", lambda pid: _pid_state["alive"]
)
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
monkeypatch.setattr("time.sleep", lambda _: None)
@ -224,6 +233,65 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
assert calls == [(42, False), (42, True)]
@pytest.mark.asyncio
async def test_start_gateway_replace_aborts_when_force_killed_pid_still_alive(
monkeypatch, tmp_path
):
"""Regression for #19471 (duplicate-gateway half).
If SIGKILL fails to reap the old gateway, --replace must NOT clear the PID
file / scoped locks and start a fresh instance that leaves two live
gateways fighting over the same token. It should abort instead.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
calls = []
removed_pid = False
released_locks = False
class _RunnerShouldNotStart:
def __init__(self, config):
raise AssertionError("replacement must not start while old PID is alive")
def _mock_remove_pid_file():
nonlocal removed_pid
removed_pid = True
def _mock_release_all_scoped_locks(**kwargs):
nonlocal released_locks
released_locks = True
return 0
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
monkeypatch.setattr(
"gateway.status.release_all_scoped_locks",
_mock_release_all_scoped_locks,
)
monkeypatch.setattr(
"gateway.status.terminate_pid",
lambda pid, force=False: calls.append((pid, force)),
)
# _pid_exists never goes False — the force-kill did not take.
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
monkeypatch.setattr("gateway.run.GatewayRunner", _RunnerShouldNotStart)
from gateway.run import start_gateway
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
assert ok is False
assert calls == [(42, False), (42, True)]
assert removed_pid is False
assert released_locks is False
@pytest.mark.asyncio
async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
monkeypatch, tmp_path