fix(gateway): propagate fatal-config exit code through start_gateway clean-exit path

The contributor PR stamped runner._exit_code=78 on non-retryable startup
errors, but start_gateway()'s clean-exit branch returned True before the
SystemExit(runner.exit_code) site, so main() exited 0. The s6 finish
script's [ "$1" = "78" ] check never matched and s6 crash-looped the
gateway anyway — the fix was dead as shipped (#51228).

Honor runner.exit_code in the clean-exit branch: raise SystemExit(code)
when set, else return True (normal /restart clean exit). Add a
start_gateway()-level test that asserts process-level SystemExit(78)
propagation — the gap the PR's object-level test missed — plus exit_code
on the existing _CleanExitRunner mocks.
This commit is contained in:
teknium1 2026-06-23 22:29:20 -07:00 committed by Ben Barclay
parent 776f68e1ee
commit 366c2a3766
2 changed files with 52 additions and 0 deletions

View file

@ -17783,6 +17783,15 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
if runner.should_exit_cleanly:
if runner.exit_reason:
logger.error("Gateway exiting cleanly: %s", runner.exit_reason)
# A clean exit that carries an explicit exit code (e.g. a fatal
# config error stamped with GATEWAY_FATAL_CONFIG_EXIT_CODE) must
# propagate that code to the process so the s6 finish script can
# translate it (78 → 125) and stop the supervisor restart loop.
# Without this, the early `return True` below makes main() exit 0,
# the finish script's `[ "$1" = "78" ]` check never matches, and
# s6 crash-loops the gateway anyway (#51228).
if runner.exit_code is not None:
raise SystemExit(runner.exit_code)
return True
# Start the background cron scheduler via the resolved provider so

View file

@ -152,6 +152,7 @@ async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch,
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.exit_code = None
self.adapters = {}
async def start(self):
@ -186,6 +187,7 @@ async def test_start_gateway_replace_force_uses_terminate_pid(monkeypatch, tmp_p
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.exit_code = None
self.adapters = {}
async def start(self):
@ -334,6 +336,7 @@ async def test_start_gateway_replace_writes_takeover_marker_before_sigterm(
self.config = config
self.should_exit_cleanly = True
self.exit_reason = None
self.exit_code = None
self.adapters = {}
async def start(self):
@ -507,6 +510,46 @@ async def test_runner_exits_with_ex_config_on_nonretryable_startup_error(monkeyp
assert state["gateway_state"] == "startup_failed"
@pytest.mark.asyncio
async def test_start_gateway_propagates_fatal_config_exit_code(monkeypatch, tmp_path):
    """start_gateway() must raise SystemExit with the runner's exit code.

    The stub runner below requests a clean exit while carrying
    GATEWAY_FATAL_CONFIG_EXIT_CODE. The expected outcome is a process-level
    SystemExit with that code -- not a truthy return value -- so that main()
    exits with it and the s6 finish script can translate it to 125
    (no restart).

    Regression guard for #51228: the clean-exit branch of start_gateway()
    used to `return True` before reaching the SystemExit(exit_code) site,
    so main() exited 0 and s6 crash-looped the gateway anyway.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    expected_code = GATEWAY_FATAL_CONFIG_EXIT_CODE

    class _FatalConfigRunner:
        # Minimal runner stand-in: start() succeeds, but the instance is
        # already flagged for a clean exit that carries a fatal exit code.
        def __init__(self, config):
            self.config = config
            self.should_exit_cleanly = True
            self.exit_reason = "discord: Discord bot token already in use"
            self.exit_code = expected_code
            self.adapters = {}

        async def start(self):
            return True

        async def stop(self):
            return None

    # Applied in insertion order, matching the original sequence of patches.
    replacements = {
        "gateway.status.get_running_pid": lambda: None,
        "tools.skills_sync.sync_skills": lambda quiet=True: None,
        "hermes_logging.setup_logging": lambda hermes_home, mode: tmp_path,
        "hermes_logging._add_rotating_handler": lambda *args, **kwargs: None,
        "gateway.run.GatewayRunner": _FatalConfigRunner,
    }
    for dotted_target, stand_in in replacements.items():
        monkeypatch.setattr(dotted_target, stand_in)

    # Imported only after the patches are in place, as in the original.
    from gateway.run import start_gateway

    with pytest.raises(SystemExit) as raised:
        await start_gateway(config=GatewayConfig(), replace=False, verbosity=0)

    assert raised.value.code == expected_code
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("TERMINAL_ENV", "docker")