gateway: let a `replace` startup that loses a startup race exit cleanly

start_gateway() has three points where it detects that another gateway
instance won the startup race: a different running PID appears, the runtime
lock cannot be acquired, or write_pid_file() raises FileExistsError. With this
patch, when `replace` is true each of those paths logs its message at INFO and
returns True; otherwise it still logs at ERROR and returns False. The new tests
drive each path with replace=True and a runner whose start() raises, asserting
that the result is True (and, for the PID-file path, that the runtime lock was
released).

NOTE(review): the line breaks and leading indentation below were reconstructed
from the hunk line counts and the Python block structure; confirm against the
target files before applying.

diff --git a/gateway/run.py b/gateway/run.py
index 14bd3ff0d..077b6d5f0 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -11169,23 +11169,31 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
     from gateway.status import write_pid_file, remove_pid_file, get_running_pid
     _current_pid = get_running_pid()
     if _current_pid is not None and _current_pid != os.getpid():
-        logger.error(
+        message = (
             "Another gateway instance (PID %d) started during our startup. "
-            "Exiting to avoid double-running.", _current_pid
+            "Exiting to avoid double-running."
         )
+        if replace:
+            logger.info(message, _current_pid)
+            return True
+        logger.error(message, _current_pid)
         return False
     if not acquire_gateway_runtime_lock():
-        logger.error(
-            "Gateway runtime lock is already held by another instance. Exiting."
-        )
+        message = "Gateway runtime lock is already held by another instance. Exiting."
+        if replace:
+            logger.info(message)
+            return True
+        logger.error(message)
         return False
     try:
         write_pid_file()
     except FileExistsError:
         release_gateway_runtime_lock()
-        logger.error(
-            "PID file race lost to another gateway instance. Exiting."
-        )
+        message = "PID file race lost to another gateway instance. Exiting."
+        if replace:
+            logger.info(message)
+            return True
+        logger.error(message)
         return False
     atexit.register(remove_pid_file)
     atexit.register(release_gateway_runtime_lock)
diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py
index d94e466ec..2a1d3dcc4 100644
--- a/tests/gateway/test_runner_startup_failures.py
+++ b/tests/gateway/test_runner_startup_failures.py
@@ -300,6 +300,121 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
     assert not (tmp_path / ".gateway-takeover.json").exists()
 
 
+@pytest.mark.asyncio
+async def test_start_gateway_replace_loser_exits_cleanly_when_peer_wins_startup(
+    monkeypatch, tmp_path
+):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+
+    class _RunnerThatMustNotStart:
+        def __init__(self, config):
+            self.config = config
+            self.should_exit_cleanly = True
+            self.exit_reason = None
+            self.adapters = {}
+
+        async def start(self):
+            raise AssertionError("racing loser must not start platform adapters")
+
+        async def stop(self):
+            return None
+
+    calls = {"get_running_pid": 0}
+
+    def _mock_get_running_pid():
+        calls["get_running_pid"] += 1
+        return None if calls["get_running_pid"] == 1 else 4242
+
+    monkeypatch.setattr("gateway.status.get_running_pid", _mock_get_running_pid)
+    monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
+    monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
+    monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
+    monkeypatch.setattr("gateway.run.GatewayRunner", _RunnerThatMustNotStart)
+
+    from gateway.run import start_gateway
+
+    ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
+
+    assert ok is True
+
+
+@pytest.mark.asyncio
+async def test_start_gateway_replace_loser_exits_cleanly_when_runtime_lock_held(
+    monkeypatch, tmp_path
+):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+
+    class _RunnerThatMustNotStart:
+        def __init__(self, config):
+            self.config = config
+            self.should_exit_cleanly = True
+            self.exit_reason = None
+            self.adapters = {}
+
+        async def start(self):
+            raise AssertionError("racing loser must not start platform adapters")
+
+        async def stop(self):
+            return None
+
+    monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
+    monkeypatch.setattr("gateway.status.acquire_gateway_runtime_lock", lambda: False)
+    monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
+    monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
+    monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
+    monkeypatch.setattr("gateway.run.GatewayRunner", _RunnerThatMustNotStart)
+
+    from gateway.run import start_gateway
+
+    ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
+
+    assert ok is True
+
+
+@pytest.mark.asyncio
+async def test_start_gateway_replace_loser_exits_cleanly_when_pid_file_race_lost(
+    monkeypatch, tmp_path
+):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+
+    class _RunnerThatMustNotStart:
+        def __init__(self, config):
+            self.config = config
+            self.should_exit_cleanly = True
+            self.exit_reason = None
+            self.adapters = {}
+
+        async def start(self):
+            raise AssertionError("racing loser must not start platform adapters")
+
+        async def stop(self):
+            return None
+
+    released = {"runtime_lock": False}
+
+    def _raise_file_exists():
+        raise FileExistsError("simulated concurrent pid writer")
+
+    def _release_runtime_lock():
+        released["runtime_lock"] = True
+
+    monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
+    monkeypatch.setattr("gateway.status.acquire_gateway_runtime_lock", lambda: True)
+    monkeypatch.setattr("gateway.status.write_pid_file", _raise_file_exists)
+    monkeypatch.setattr("gateway.status.release_gateway_runtime_lock", _release_runtime_lock)
+    monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None)
+    monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path)
+    monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None)
+    monkeypatch.setattr("gateway.run.GatewayRunner", _RunnerThatMustNotStart)
+
+    from gateway.run import start_gateway
+
+    ok = await start_gateway(config=GatewayConfig(), replace=True, verbosity=None)
+
+    assert ok is True
+    assert released["runtime_lock"] is True
+
+
 @pytest.mark.asyncio
 async def test_start_gateway_replace_clears_marker_on_permission_denied(
     monkeypatch, tmp_path