mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(gateway): abort --replace when old PID survives SIGKILL
When --replace force-kills an unresponsive old gateway, SIGKILL can fail to reap it (uninterruptible sleep, zombie-reaping parent, etc.). The old code unconditionally cleared the PID file and scoped locks and started a fresh instance anyway, leaving two live gateways fighting over the same bot token — a duplicate-gateway failure mode of #19471. Re-verify the process is actually gone (via the Windows-safe _pid_exists helper) after the force-kill; if it still appears alive, clear the takeover marker and abort the replacement instead of duplicating. Co-authored-by: Hermes <noreply@nousresearch.com>
This commit is contained in:
parent
3714caa1b9
commit
e02f4c03c3
2 changed files with 99 additions and 4 deletions
|
|
@ -18765,8 +18765,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
# ``os.kill(pid, 0)`` on Windows is NOT a no-op — use the
|
||||
# handle-based existence check instead.
|
||||
from gateway.status import _pid_exists
|
||||
old_gateway_exited = False
|
||||
for _ in range(20):
|
||||
if not _pid_exists(existing_pid):
|
||||
old_gateway_exited = True
|
||||
break # Process is gone
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
|
|
@ -18777,9 +18779,34 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
)
|
||||
try:
|
||||
terminate_pid(existing_pid, force=True)
|
||||
time.sleep(0.5)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
except ProcessLookupError:
|
||||
old_gateway_exited = True
|
||||
except (PermissionError, OSError):
|
||||
pass
|
||||
# Confirm the force-kill actually reaped the process before we
|
||||
# clear its PID file / scoped locks. SIGKILL can fail to take
|
||||
# (e.g. an uninterruptible-sleep or zombie-reaping parent), and
|
||||
# if we blindly clear the metadata and start a fresh instance
|
||||
# we end up with two live gateways fighting over the same
|
||||
# token — the duplicate-gateway failure in #19471.
|
||||
if not old_gateway_exited:
|
||||
for _ in range(20):
|
||||
if not _pid_exists(existing_pid):
|
||||
old_gateway_exited = True
|
||||
break
|
||||
time.sleep(0.25)
|
||||
if not old_gateway_exited:
|
||||
logger.error(
|
||||
"Old gateway (PID %d) still appears alive after SIGKILL; "
|
||||
"aborting replacement to avoid a duplicate gateway.",
|
||||
existing_pid,
|
||||
)
|
||||
try:
|
||||
from gateway.status import clear_takeover_marker
|
||||
clear_takeover_marker()
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
remove_pid_file()
|
||||
# remove_pid_file() is a no-op when the PID doesn't match.
|
||||
# Force-unlink to cover the old-process-crashed case.
|
||||
|
|
|
|||
|
|
@ -206,8 +206,17 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
|
|||
"gateway.status.release_all_scoped_locks",
|
||||
lambda **kwargs: 0,
|
||||
)
|
||||
monkeypatch.setattr("gateway.status.terminate_pid", lambda pid, force=False: calls.append((pid, force)))
|
||||
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)
|
||||
# force-kill reaps the process: terminate_pid(force=True) flips it dead,
|
||||
# and the post-kill re-poll via _pid_exists then sees it gone so the
|
||||
# replacement proceeds.
|
||||
def _mock_terminate_pid(pid, force=False):
|
||||
calls.append((pid, force))
|
||||
if force:
|
||||
_pid_state["alive"] = False
|
||||
monkeypatch.setattr("gateway.status.terminate_pid", _mock_terminate_pid)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status._pid_exists", lambda pid: _pid_state["alive"]
|
||||
)
|
||||
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
||||
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
|
|
@ -224,6 +233,65 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
|
|||
assert calls == [(42, False), (42, True)]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_start_gateway_replace_aborts_when_force_killed_pid_still_alive(
|
||||
monkeypatch, tmp_path
|
||||
):
|
||||
"""Regression for #19471 (duplicate-gateway half).
|
||||
|
||||
If SIGKILL fails to reap the old gateway, --replace must NOT clear the PID
|
||||
file / scoped locks and start a fresh instance — that leaves two live
|
||||
gateways fighting over the same token. It should abort instead.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
calls = []
|
||||
removed_pid = False
|
||||
released_locks = False
|
||||
|
||||
class _RunnerShouldNotStart:
|
||||
def __init__(self, config):
|
||||
raise AssertionError("replacement must not start while old PID is alive")
|
||||
|
||||
def _mock_remove_pid_file():
|
||||
nonlocal removed_pid
|
||||
removed_pid = True
|
||||
|
||||
def _mock_release_all_scoped_locks(**kwargs):
|
||||
nonlocal released_locks
|
||||
released_locks = True
|
||||
return 0
|
||||
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 42)
|
||||
monkeypatch.setattr("gateway.status.remove_pid_file", _mock_remove_pid_file)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.release_all_scoped_locks",
|
||||
_mock_release_all_scoped_locks,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.terminate_pid",
|
||||
lambda pid, force=False: calls.append((pid, force)),
|
||||
)
|
||||
# _pid_exists never goes False — the force-kill did not take.
|
||||
monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr("gateway.run.os.getpid", lambda: 100)
|
||||
monkeypatch.setattr("gateway.run.os.kill", lambda pid, sig: None)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
|
||||
monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
|
||||
monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr("gateway.run.GatewayRunner", _RunnerShouldNotStart)
|
||||
|
||||
from gateway.run import start_gateway
|
||||
|
||||
ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
|
||||
|
||||
assert ok is False
|
||||
assert calls == [(42, False), (42, True)]
|
||||
assert removed_pid is False
|
||||
assert released_locks is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
|
||||
monkeypatch, tmp_path
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue