NOTE(review): this patch was received with its line structure collapsed.
  Newlines and leading indentation below are reconstructed from the hunk
  headers and from Python syntax. Context-line indentation (notably in
  gateway/run.py and in the two service-registration hunks) is a best
  guess: confirm against the target files, or apply with
  `git apply --ignore-whitespace`. Whitespace inside string literals is
  kept exactly as received.
NOTE(review): the hermes_cli/service_manager.py hunk at old line 709 was
  received with 25 added lines, while its header and the next hunk's
  offset (956 to 980) both say 24. The surplus blank added line after the
  closing parenthesis of _render_finish_script has been dropped.

diff --git a/gateway/restart.py b/gateway/restart.py
index fe9b70022af..97830872b9e 100644
--- a/gateway/restart.py
+++ b/gateway/restart.py
@@ -6,6 +6,12 @@ from hermes_cli.config import DEFAULT_CONFIG
 # the gateway after a graceful drain/reload path completes.
 GATEWAY_SERVICE_RESTART_EXIT_CODE = 75
 
+# EX_CONFIG from sysexits.h — fatal configuration error (e.g. token
+# collision, no messaging platforms). The s6 finish script translates
+# this into exit 125 (permanent failure) so the supervisor stops
+# restarting the gateway. See #51228.
+GATEWAY_FATAL_CONFIG_EXIT_CODE = 78
+
 DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT = float(
     DEFAULT_CONFIG["agent"]["restart_drain_timeout"]
 )
diff --git a/gateway/run.py b/gateway/run.py
index dbce984deb4..401a92bbd6b 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1701,6 +1701,7 @@ from gateway.platforms.base import (
 )
 from gateway.restart import (
     DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT,
+    GATEWAY_FATAL_CONFIG_EXIT_CODE,
     GATEWAY_SERVICE_RESTART_EXIT_CODE,
     parse_restart_drain_timeout,
 )
@@ -5747,6 +5748,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                     write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
                 except Exception:
                     pass
+                self._exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE
                 self._request_clean_exit(reason)
                 self._startup_restore_in_progress = False
                 return True
@@ -5762,6 +5764,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                     write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
                 except Exception:
                     pass
+                self._exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE
                 self._request_clean_exit(reason)
                 self._startup_restore_in_progress = False
                 return True
diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py
index c299bbcf966..cc14d87180b 100644
--- a/hermes_cli/container_boot.py
+++ b/hermes_cli/container_boot.py
@@ -398,6 +398,10 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
     run.write_text(S6ServiceManager._render_run_script(profile, extra_env={}))
     run.chmod(0o755)
 
+    finish = tmp_dir / "finish"
+    finish.write_text(S6ServiceManager._render_finish_script())
+    finish.chmod(0o755)
+
     # Persistent log rotation (OQ8-C).
     log_subdir = tmp_dir / "log"
     log_subdir.mkdir()
diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py
index f5254107b8b..28992a046b1 100644
--- a/hermes_cli/service_manager.py
+++ b/hermes_cli/service_manager.py
@@ -709,6 +709,30 @@ class S6ServiceManager:
         lines.append(f"exec s6-setuidgid hermes {gateway_cmd}")
         return "\n".join(lines) + "\n"
 
+    @staticmethod
+    def _render_finish_script() -> str:
+        """Return the s6 ``finish`` script text for a profile-gateway service.
+
+        The script receives the run script's exit code as ``$1``. When
+        that code is EX_CONFIG (78) — a fatal configuration error such
+        as a token collision or no messaging platforms — it exits 125
+        (permanent failure) so s6-supervise stops restarting. Any other
+        code makes it exit 0 and s6 restarts normally. See #51228.
+        """
+        from gateway.restart import GATEWAY_FATAL_CONFIG_EXIT_CODE
+
+        code = GATEWAY_FATAL_CONFIG_EXIT_CODE
+        return (
+            "#!/command/with-contenv sh\n"
+            "# shellcheck shell=sh\n"
+            "# $1 = exit code from the run script.\n"
+            f"# Exit {code} (EX_CONFIG) = fatal config error — don't restart.\n"
+            f'if [ "$1" = "{code}" ]; then\n'
+            " exit 125\n"
+            "fi\n"
+            "exit 0\n"
+        )
+
     @staticmethod
     def _render_log_run(profile: str) -> str:
         """Generate the log/run script for a profile-gateway service.
@@ -956,6 +980,10 @@ class S6ServiceManager:
         run_path.write_text(run_script)
         run_path.chmod(0o755)
 
+        finish_path = tmp_dir / "finish"
+        finish_path.write_text(self._render_finish_script())
+        finish_path.chmod(0o755)
+
         # Persistent log rotation (OQ8-C).
         log_subdir = tmp_dir / "log"
         log_subdir.mkdir()
diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py
index 329ad1e9b63..a70a5c6c4d1 100644
--- a/tests/gateway/test_runner_startup_failures.py
+++ b/tests/gateway/test_runner_startup_failures.py
@@ -3,6 +3,7 @@ from unittest.mock import AsyncMock
 
 from gateway.config import GatewayConfig, Platform, PlatformConfig
 from gateway.platforms.base import BasePlatformAdapter
+from gateway.restart import GATEWAY_FATAL_CONFIG_EXIT_CODE
 from gateway.run import GatewayRunner
 from gateway.status import read_runtime_status
 
@@ -458,6 +459,54 @@ async def test_runner_degrades_gracefully_when_all_adapters_missing(monkeypatch,
     ), "Expected degraded-mode warning when all adapters are missing"
 
 
+class _NonRetryableFailureAdapter(BasePlatformAdapter):
+    """Stub adapter: connect() sets a non-retryable fatal error and returns False."""
+    def __init__(self):
+        super().__init__(PlatformConfig(enabled=True, token="***"), Platform.DISCORD)
+
+    async def connect(self) -> bool:
+        self._set_fatal_error(
+            "discord-bot-token_lock",
+            "Discord bot token already in use (PID 999). Stop the other gateway first.",
+            retryable=False,
+        )
+        return False
+
+    async def disconnect(self) -> None:
+        self._mark_disconnected()
+
+    async def send(self, chat_id, content, reply_to=None, metadata=None):
+        raise NotImplementedError
+
+    async def get_chat_info(self, chat_id):
+        return {"id": chat_id}
+
+
+@pytest.mark.asyncio
+async def test_runner_exits_with_ex_config_on_nonretryable_startup_error(monkeypatch, tmp_path):
+    """Non-retryable startup errors (token collision, no platforms) must
+    set exit_code to 78 (EX_CONFIG) so the s6 finish script can translate
+    it to exit 125 (permanent failure). See #51228."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    config = GatewayConfig(
+        platforms={
+            Platform.DISCORD: PlatformConfig(enabled=True, token="***")
+        },
+        sessions_dir=tmp_path / "sessions",
+    )
+    runner = GatewayRunner(config)
+
+    monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _NonRetryableFailureAdapter())
+
+    ok = await runner.start()
+
+    assert ok is True  # start() returns True (clean exit requested)
+    assert runner.should_exit_cleanly is True
+    assert runner.exit_code == GATEWAY_FATAL_CONFIG_EXIT_CODE
+    state = read_runtime_status()
+    assert state["gateway_state"] == "startup_failed"
+
+
 def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
     monkeypatch.setenv("HERMES_HOME", str(tmp_path))
     monkeypatch.setenv("TERMINAL_ENV", "docker")
diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py
index 7dac6ced1a6..165712d2152 100644
--- a/tests/hermes_cli/test_container_boot.py
+++ b/tests/hermes_cli/test_container_boot.py
@@ -128,6 +128,24 @@ def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
     assert not (svc / "down").exists()
 
 
+def test_registered_profile_has_finish_script(tmp_path: Path) -> None:
+    """The finish script must be written so s6 stops restarting on
+    fatal config errors (exit 78 → exit 125). See #51228."""
+    scandir = tmp_path / "run-service"; scandir.mkdir()
+    _make_profile(tmp_path, "coder", state="running")
+
+    reconcile_profile_gateways(
+        hermes_home=tmp_path, scandir=scandir, dry_run=False,
+    )
+
+    finish = scandir / "gateway-coder" / "finish"
+    assert finish.exists()
+    assert finish.stat().st_mode & 0o111  # executable
+    text = finish.read_text()
+    assert "78" in text
+    assert "125" in text
+
+
 def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
     scandir = tmp_path / "run-service"; scandir.mkdir()
     _make_profile(tmp_path, "writer", state="stopped")
diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py
index 80c7432fd1e..cd78c35d55d 100644
--- a/tests/hermes_cli/test_service_manager.py
+++ b/tests/hermes_cli/test_service_manager.py
@@ -673,6 +673,30 @@ def test_render_run_script_uses_replace_to_take_over_stale_holder() -> None:
     )
 
 
+def test_render_finish_script_exits_125_on_ex_config() -> None:
+    """The finish script must translate exit 78 (EX_CONFIG) into exit 125
+    (permanent failure) so s6 stops restarting on fatal config errors.
+    See #51228."""
+    text = S6ServiceManager._render_finish_script()
+    assert '[ "$1" = "78" ]' in text
+    assert "exit 125" in text
+    assert "exit 0" in text
+
+
+def test_s6_register_writes_finish_script(
+    s6_scandir, fake_subprocess_run,
+) -> None:
+    """register_profile_gateway() must write an executable finish script into the service dir."""
+    mgr = S6ServiceManager(scandir=s6_scandir)
+    mgr.register_profile_gateway("coder")
+
+    finish_path = s6_scandir / "gateway-coder" / "finish"
+    assert finish_path.is_file()
+    assert finish_path.stat().st_mode & 0o111  # executable
+    assert "78" in finish_path.read_text()
+    assert "125" in finish_path.read_text()
+
+
 def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None:
     mgr = S6ServiceManager(scandir=s6_scandir)
     with pytest.raises(ValueError):