From 366c2a37669fb4cc02c7fc1f34aee1cc518cb3e0 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Tue, 23 Jun 2026 22:29:20 -0700 Subject: [PATCH] fix(gateway): propagate fatal-config exit code through start_gateway clean-exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The contributor PR stamped runner._exit_code=78 on non-retryable startup errors, but start_gateway()'s clean-exit branch returned True before the SystemExit(runner.exit_code) site, so main() exited 0. The s6 finish script's [ "$1" = "78" ] check never matched and s6 crash-looped the gateway anyway — the fix was dead as shipped (#51228). Honor runner.exit_code in the clean-exit branch: raise SystemExit(code) when set, else return True (normal /restart clean exit). Add a start_gateway()-level test that asserts process-level SystemExit(78) propagation — the gap the PR's object-level test missed — plus exit_code on the existing _CleanExitRunner mocks. --- gateway/run.py | 9 ++++ tests/gateway/test_runner_startup_failures.py | 43 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/gateway/run.py b/gateway/run.py index 401a92bbd6b..34c56edbb8c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -17783,6 +17783,15 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = if runner.should_exit_cleanly: if runner.exit_reason: logger.error("Gateway exiting cleanly: %s", runner.exit_reason) + # A clean exit that carries an explicit exit code (e.g. a fatal + # config error stamped with GATEWAY_FATAL_CONFIG_EXIT_CODE) must + # propagate that code to the process so the s6 finish script can + # translate it (78 → 125) and stop the supervisor restart loop. + # Without this, the early `return True` below makes main() exit 0, + # the finish script's `[ "$1" = "78" ]` check never matches, and + # s6 crash-loops the gateway anyway (#51228). + if runner.exit_code is not None: + raise SystemExit(runner.exit_code) return True # Start the background cron scheduler via the resolved provider so diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py index a70a5c6c4d1..12aa5c4a3d8 100644 --- a/tests/gateway/test_runner_startup_failures.py +++ b/tests/gateway/test_runner_startup_failures.py @@ -152,6 +152,7 @@ async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, self.config = config self.should_exit_cleanly = True self.exit_reason = None + self.exit_code = None self.adapters = {} async def start(self): @@ -186,6 +187,7 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p self.config = config self.should_exit_cleanly = True self.exit_reason = None + self.exit_code = None self.adapters = {} async def start(self): @@ -334,6 +336,7 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm( self.config = config self.should_exit_cleanly = True self.exit_reason = None + self.exit_code = None self.adapters = {} async def start(self): @@ -507,6 +510,46 @@ async def test_runner_exits_with_ex_config_on_nonretryable_startup_error(monkeyp assert state["gateway_state"] == "startup_failed" +@pytest.mark.asyncio +async def test_start_gateway_propagates_fatal_config_exit_code(monkeypatch, tmp_path): + """A clean exit carrying GATEWAY_FATAL_CONFIG_EXIT_CODE must surface as a + process-level SystemExit(78) — NOT a truthy return — so main() exits 78 + and the s6 finish script can translate it to 125 (no restart). + + This guards the propagation gap: runner.start() stamps exit_code=78 and + requests a clean exit, but start_gateway()'s clean-exit branch used to + `return True` before the SystemExit(exit_code) site, so main() exited 0 + and s6 crash-looped anyway (#51228).""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + class _FatalConfigRunner: + def __init__(self, config): + self.config = config + self.should_exit_cleanly = True + self.exit_reason = "discord: Discord bot token already in use" + self.exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE + self.adapters = {} + + async def start(self): + return True + + async def stop(self): + return None + + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + monkeypatch.setattr("tools.skills_sync.sync_skills", lambda quiet=True: None) + monkeypatch.setattr("hermes_logging.setup_logging", lambda hermes_home, mode: tmp_path) + monkeypatch.setattr("hermes_logging._add_rotating_handler", lambda *args, **kwargs: None) + monkeypatch.setattr("gateway.run.GatewayRunner", _FatalConfigRunner) + + from gateway.run import start_gateway + + with pytest.raises(SystemExit) as exc_info: + await start_gateway(config=GatewayConfig(), replace=False, verbosity=0) + + assert exc_info.value.code == GATEWAY_FATAL_CONFIG_EXIT_CODE + + def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog): monkeypatch.setenv("HERMES_HOME", str(tmp_path)) monkeypatch.setenv("TERMINAL_ENV", "docker")