fix(gateway): exit cleanly on replace startup races

This commit is contained in:
giugiu-a11y 2026-04-24 19:13:45 -03:00
parent 4fade39c90
commit ea0f07279e
No known key found for this signature in database
2 changed files with 131 additions and 8 deletions

View file

@ -11169,23 +11169,31 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
from gateway.status import write_pid_file, remove_pid_file, get_running_pid
_current_pid = get_running_pid()
if _current_pid is not None and _current_pid != os.getpid():
logger.error(
message = (
"Another gateway instance (PID %d) started during our startup. "
"Exiting to avoid double-running.", _current_pid
"Exiting to avoid double-running."
)
if replace:
logger.info(message, _current_pid)
return True
logger.error(message, _current_pid)
return False
if not acquire_gateway_runtime_lock():
logger.error(
"Gateway runtime lock is already held by another instance. Exiting."
)
message = "Gateway runtime lock is already held by another instance. Exiting."
if replace:
logger.info(message)
return True
logger.error(message)
return False
try:
write_pid_file()
except FileExistsError:
release_gateway_runtime_lock()
logger.error(
"PID file race lost to another gateway instance. Exiting."
)
message = "PID file race lost to another gateway instance. Exiting."
if replace:
logger.info(message)
return True
logger.error(message)
return False
atexit.register(remove_pid_file)
atexit.register(release_gateway_runtime_lock)

View file

@ -300,6 +300,121 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
assert not (tmp_path / ".gateway-takeover.json").exists()
@pytest.mark.asyncio
async def test_start_gateway_replace_loser_exits_cleanly_when_peer_wins_startup(
    monkeypatch, tmp_path
):
    """With ``replace=True``, seeing a peer PID during startup yields ``True``.

    ``gateway.status.get_running_pid`` is stubbed to report no gateway on its
    first call and PID 4242 on every later call, simulating a peer that
    appeared while this process was still starting. The runner stub raises
    from ``start()`` so that starting the adapters anyway fails the test.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    class _RunnerThatMustNotStart:
        """Runner stand-in whose ``start()`` must never be awaited."""

        def __init__(self, config):
            self.config = config
            self.should_exit_cleanly = True
            self.exit_reason = None
            self.adapters = {}

        async def start(self):
            raise AssertionError("racing loser must not start platform adapters")

        async def stop(self):
            return None

    # One ``None`` answer, then the iterator is exhausted and the default
    # (the peer's PID) is returned for every subsequent call.
    initial_answers = iter([None])

    def _fake_get_running_pid():
        return next(initial_answers, 4242)

    def _skip_skill_sync(quiet=True):
        return None

    def _fake_setup_logging(hermes_home, mode):
        return tmp_path

    def _ignore_rotating_handler(*args, **kwargs):
        return None

    replacements = {
        "gateway.status.get_running_pid": _fake_get_running_pid,
        "tools.skills_sync.sync_skills": _skip_skill_sync,
        "hermes_logging.setup_logging": _fake_setup_logging,
        "hermes_logging._add_rotating_handler": _ignore_rotating_handler,
        "gateway.run.GatewayRunner": _RunnerThatMustNotStart,
    }
    for dotted_target, stub in replacements.items():
        monkeypatch.setattr(dotted_target, stub)

    from gateway.run import start_gateway

    result = await start_gateway(
        config=GatewayConfig(), replace=True, verbosity=None
    )

    assert result is True
@pytest.mark.asyncio
async def test_start_gateway_replace_loser_exits_cleanly_when_runtime_lock_held(
    monkeypatch, tmp_path
):
    """With ``replace=True``, failing to take the runtime lock yields ``True``.

    ``gateway.status.acquire_gateway_runtime_lock`` is stubbed to return
    ``False`` while ``get_running_pid`` always reports no running gateway.
    The runner stub raises from ``start()`` so that starting the adapters
    anyway fails the test.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    class _RunnerThatMustNotStart:
        """Runner stand-in whose ``start()`` must never be awaited."""

        def __init__(self, config):
            self.config = config
            self.should_exit_cleanly = True
            self.exit_reason = None
            self.adapters = {}

        async def start(self):
            raise AssertionError("racing loser must not start platform adapters")

        async def stop(self):
            return None

    def _no_running_gateway():
        return None

    def _lock_unavailable():
        return False

    def _skip_skill_sync(quiet=True):
        return None

    def _fake_setup_logging(hermes_home, mode):
        return tmp_path

    def _ignore_rotating_handler(*args, **kwargs):
        return None

    replacements = {
        "gateway.status.get_running_pid": _no_running_gateway,
        "gateway.status.acquire_gateway_runtime_lock": _lock_unavailable,
        "tools.skills_sync.sync_skills": _skip_skill_sync,
        "hermes_logging.setup_logging": _fake_setup_logging,
        "hermes_logging._add_rotating_handler": _ignore_rotating_handler,
        "gateway.run.GatewayRunner": _RunnerThatMustNotStart,
    }
    for dotted_target, stub in replacements.items():
        monkeypatch.setattr(dotted_target, stub)

    from gateway.run import start_gateway

    result = await start_gateway(
        config=GatewayConfig(), replace=True, verbosity=None
    )

    assert result is True
@pytest.mark.asyncio
async def test_start_gateway_replace_loser_exits_cleanly_when_pid_file_race_lost(
    monkeypatch, tmp_path
):
    """With ``replace=True``, losing the PID-file write yields ``True``.

    The runtime lock is stubbed as acquired and ``write_pid_file`` is stubbed
    to raise ``FileExistsError``. Besides the ``True`` result, the test
    checks that the stubbed ``release_gateway_runtime_lock`` was called.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    class _RunnerThatMustNotStart:
        """Runner stand-in whose ``start()`` must never be awaited."""

        def __init__(self, config):
            self.config = config
            self.should_exit_cleanly = True
            self.exit_reason = None
            self.adapters = {}

        async def start(self):
            raise AssertionError("racing loser must not start platform adapters")

        async def stop(self):
            return None

    lock_released = False

    def _raise_file_exists():
        raise FileExistsError("simulated concurrent pid writer")

    def _release_runtime_lock():
        # Record the call so the test can verify the lock was given back.
        nonlocal lock_released
        lock_released = True

    def _no_running_gateway():
        return None

    def _lock_acquired():
        return True

    def _skip_skill_sync(quiet=True):
        return None

    def _fake_setup_logging(hermes_home, mode):
        return tmp_path

    def _ignore_rotating_handler(*args, **kwargs):
        return None

    replacements = {
        "gateway.status.get_running_pid": _no_running_gateway,
        "gateway.status.acquire_gateway_runtime_lock": _lock_acquired,
        "gateway.status.write_pid_file": _raise_file_exists,
        "gateway.status.release_gateway_runtime_lock": _release_runtime_lock,
        "tools.skills_sync.sync_skills": _skip_skill_sync,
        "hermes_logging.setup_logging": _fake_setup_logging,
        "hermes_logging._add_rotating_handler": _ignore_rotating_handler,
        "gateway.run.GatewayRunner": _RunnerThatMustNotStart,
    }
    for dotted_target, stub in replacements.items():
        monkeypatch.setattr(dotted_target, stub)

    from gateway.run import start_gateway

    result = await start_gateway(
        config=GatewayConfig(), replace=True, verbosity=None
    )

    assert result is True
    assert lock_released is True
@pytest.mark.asyncio
async def test_start_gateway_replace_clears_marker_on_permission_denied(
monkeypatch, tmp_path