From e02f4c03c312adfbea316da57aa248104a26e678 Mon Sep 17 00:00:00 2001 From: LeonSGP43 Date: Sun, 7 Jun 2026 23:15:18 -0700 Subject: [PATCH] fix(gateway): abort --replace when old PID survives SIGKILL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When --replace force-kills an unresponsive old gateway, SIGKILL can fail to reap it (uninterruptible sleep, zombie-reaping parent, etc.). The old code unconditionally cleared the PID file and scoped locks and started a fresh instance anyway, leaving two live gateways fighting over the same bot token — a duplicate-gateway failure mode of #19471. Re-verify the process is actually gone (via the Windows-safe _pid_exists helper) after the force-kill; if it still appears alive, clear the takeover marker and abort the replacement instead of duplicating. Co-authored-by: Hermes --- gateway/run.py | 31 +++++++- tests/gateway/test_runner_startup_failures.py | 72 ++++++++++++++++++- 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 1af44364acc..3cef89b96fd 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -18765,8 +18765,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = # ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the # handle-based existence check instead. from gateway.status import _pid_exists + old_gateway_exited = False for _ in range(20): if not _pid_exists(existing_pid): + old_gateway_exited = True break # Process is gone time.sleep(0.5) else: @@ -18777,9 +18779,34 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = ) try: terminate_pid(existing_pid, force=True) - time.sleep(0.5) - except (ProcessLookupError, PermissionError, OSError): + except ProcessLookupError: + old_gateway_exited = True + except (PermissionError, OSError): pass + # Confirm the force-kill actually reaped the process before we + # clear its PID file / scoped locks. SIGKILL can fail to take + # (e.g. an uninterruptible-sleep or zombie-reaping parent), and + # if we blindly clear the metadata and start a fresh instance + # we end up with two live gateways fighting over the same + # token — the duplicate-gateway failure in #19471. + if not old_gateway_exited: + for _ in range(20): + if not _pid_exists(existing_pid): + old_gateway_exited = True + break + time.sleep(0.25) + if not old_gateway_exited: + logger.error( + "Old gateway (PID %d) still appears alive after SIGKILL; " + "aborting replacement to avoid a duplicate gateway.", + existing_pid, + ) + try: + from gateway.status import clear_takeover_marker + clear_takeover_marker() + except Exception: + pass + return False remove_pid_file() # remove_pid_file() is a no-op when the PID doesn't match. # Force-unlink to cover the old-process-crashed case. diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py index 3fbf3708852..329ad1e9b63 100644 --- a/tests/gateway/test_runner_startup_failures.py +++ b/tests/gateway/test_runner_startup_failures.py @@ -206,8 +206,17 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p "gateway.status.release_all_scoped_locks", lambda **kwargs: 0, ) - monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force))) - monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True) + # force-kill reaps the process: terminate_pid(force=True) flips it dead, + # and the post-kill re-poll via _pid_exists then sees it gone so the + # replacement proceeds. + def _mock_terminate_pid(pid, force=False): + calls.append((pid, force)) + if force: + _pid_state["alive"] = False + monkeypatch.setattr("gateway.status.terminate_pid", _mock_terminate_pid) + monkeypatch.setattr( + "gateway.status._pid_exists", lambda pid: _pid_state["alive"] + ) monkeypatch.setattr("gateway.run.os.getpid", lambda: 100) monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None) monkeypatch.setattr("time.sleep", lambda _: None) @@ -224,6 +233,65 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p assert calls == [(42, False), (42, True)] +@pytest.mark.asyncio +async def test_start_gateway_replace_aborts_when_force_killed_pid_still_alive( + monkeypatch, tmp_path +): + """Regression for #19471 (duplicate-gateway half). + + If SIGKILL fails to reap the old gateway, --replace must NOT clear the PID + file / scoped locks and start a fresh instance — that leaves two live + gateways fighting over the same token. It should abort instead. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + calls = [] + removed_pid = False + released_locks = False + + class _RunnerShouldNotStart: + def __init__(self, config): + raise AssertionError("replacement must not start while old PID is alive") + + def _mock_remove_pid_file(): + nonlocal removed_pid + removed_pid = True + + def _mock_release_all_scoped_locks(**kwargs): + nonlocal released_locks + released_locks = True + return 0 + + monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42) + monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file) + monkeypatch.setattr( + "gateway.status.release_all_scoped_locks", + _mock_release_all_scoped_locks, + ) + monkeypatch.setattr( + "gateway.status.terminate_pid", + lambda pid, force=False: calls.append((pid, force)), + ) + # _pid_exists never goes False — the force-kill did not take. + monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True) + monkeypatch.setattr("gateway.run.os.getpid", lambda: 100) + monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None) + monkeypatch.setattr("time.sleep", lambda _: None) + monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None) + monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path) + monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None) + monkeypatch.setattr("gateway.run.GatewayRunner", _RunnerShouldNotStart) + + from gateway.run import start_gateway + + ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None) + + assert ok is False + assert calls == [(42, False), (42, True)] + assert removed_pid is False + assert released_locks is False + + @pytest.mark.asyncio async def test_start_gateway_replace_writes_takeover_marker_before_sigterm( monkeypatch, tmp_path