fix(gateway): exit 78 (EX_CONFIG) on fatal startup errors, s6 finish script stops restart loop

Profiles without their own messaging token inherit the default
profile's token via os.getenv, hit a token collision, and exit with
startup_failed.  s6 restarts them immediately, creating ~30MB tirith
sandbox dirs in /tmp each cycle — filling the disk in hours (#51228).

Changes:
- gateway/restart.py: add GATEWAY_FATAL_CONFIG_EXIT_CODE = 78
- gateway/run.py: set exit_code=78 on non-retryable startup errors
  (token collision, no platforms)
- hermes_cli/service_manager.py: add _render_finish_script() that
  translates exit 78 → exit 125 (s6 permanent failure)
- hermes_cli/container_boot.py: write finish script alongside run
  script during profile registration

The s6 finish script pattern follows docker/s6-rc.d/dashboard/finish.

Closes #51228
This commit is contained in:
Francesco Mucio 2026-06-23 14:38:06 +02:00 committed by Ben Barclay
parent d93d0aee83
commit 776f68e1ee
7 changed files with 132 additions and 0 deletions

View file

@ -6,6 +6,12 @@ from hermes_cli.config import DEFAULT_CONFIG
# the gateway after a graceful drain/reload path completes.
GATEWAY_SERVICE_RESTART_EXIT_CODE = 75
# EX_CONFIG from sysexits.h — fatal configuration error (e.g. token
# collision, no messaging platforms). The s6 finish script translates
# this into exit 125 (permanent failure) so the supervisor stops
# restarting the gateway. See #51228.
GATEWAY_FATAL_CONFIG_EXIT_CODE = 78
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT = float(
DEFAULT_CONFIG["agent"]["restart_drain_timeout"]
)

View file

@ -1701,6 +1701,7 @@ from gateway.platforms.base import (
)
from gateway.restart import (
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT,
GATEWAY_FATAL_CONFIG_EXIT_CODE,
GATEWAY_SERVICE_RESTART_EXIT_CODE,
parse_restart_drain_timeout,
)
@ -5747,6 +5748,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
except Exception:
pass
self._exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE
self._request_clean_exit(reason)
self._startup_restore_in_progress = False
return True
@ -5762,6 +5764,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
except Exception:
pass
self._exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE
self._request_clean_exit(reason)
self._startup_restore_in_progress = False
return True

View file

@ -398,6 +398,10 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
run.write_text(S6ServiceManager._render_run_script(profile, extra_env={}))
run.chmod(0o755)
finish = tmp_dir / "finish"
finish.write_text(S6ServiceManager._render_finish_script())
finish.chmod(0o755)
# Persistent log rotation (OQ8-C).
log_subdir = tmp_dir / "log"
log_subdir.mkdir()

View file

@ -709,6 +709,30 @@ class S6ServiceManager:
lines.append(f"exec s6-setuidgid hermes {gateway_cmd}")
return "\n".join(lines) + "\n"
@staticmethod
def _render_finish_script() -> str:
"""Generate the finish script for a profile-gateway s6 service.
When the gateway exits with EX_CONFIG (78) a fatal
configuration error such as a token collision or no messaging
platforms we tell s6-supervise to stop restarting by exiting
125 (permanent failure). Any other exit code lets s6 restart
normally. See #51228.
"""
from gateway.restart import GATEWAY_FATAL_CONFIG_EXIT_CODE
code = GATEWAY_FATAL_CONFIG_EXIT_CODE
return (
"#!/command/with-contenv sh\n"
"# shellcheck shell=sh\n"
"# $1 = exit code from the run script.\n"
f"# Exit {code} (EX_CONFIG) = fatal config error — don't restart.\n"
f'if [ "$1" = "{code}" ]; then\n'
" exit 125\n"
"fi\n"
"exit 0\n"
)
@staticmethod
def _render_log_run(profile: str) -> str:
"""Generate the log/run script for a profile-gateway service.
@ -956,6 +980,10 @@ class S6ServiceManager:
run_path.write_text(run_script)
run_path.chmod(0o755)
finish_path = tmp_dir / "finish"
finish_path.write_text(self._render_finish_script())
finish_path.chmod(0o755)
# Persistent log rotation (OQ8-C).
log_subdir = tmp_dir / "log"
log_subdir.mkdir()

View file

@ -3,6 +3,7 @@ from unittest.mock import AsyncMock
from gateway.config import GatewayConfig, Platform, PlatformConfig
from gateway.platforms.base import BasePlatformAdapter
from gateway.restart import GATEWAY_FATAL_CONFIG_EXIT_CODE
from gateway.run import GatewayRunner
from gateway.status import read_runtime_status
@ -458,6 +459,54 @@ async def test_runner_degrades_gracefully_when_all_adapters_missing(monkeypatch,
), "Expected degraded-mode warning when all adapters are missing"
class _NonRetryableFailureAdapter(BasePlatformAdapter):
"""Simulates a fatal config error like token collision."""
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.DISCORD)
async def connect(self) -> bool:
self._set_fatal_error(
"discord-bot-token_lock",
"Discord bot token already in use (PID 999). Stop the other gateway first.",
retryable=False,
)
return False
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
@pytest.mark.asyncio
async def test_runner_exits_with_ex_config_on_nonretryable_startup_error(monkeypatch, tmp_path):
"""Non-retryable startup errors (token collision, no platforms) must
set exit_code to 78 (EX_CONFIG) so the s6 finish script can translate
it to exit 125 (permanent failure). See #51228."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _NonRetryableFailureAdapter())
ok = await runner.start()
assert ok is True # start() returns True (clean exit requested)
assert runner.should_exit_cleanly is True
assert runner.exit_code == GATEWAY_FATAL_CONFIG_EXIT_CODE
state = read_runtime_status()
assert state["gateway_state"] == "startup_failed"
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setenv("TERMINAL_ENV", "docker")

View file

@ -128,6 +128,24 @@ def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
assert not (svc / "down").exists()
def test_registered_profile_has_finish_script(tmp_path: Path) -> None:
"""The finish script must be written so s6 stops restarting on
fatal config errors (exit 78 exit 125). See #51228."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "coder", state="running")
reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
finish = scandir / "gateway-coder" / "finish"
assert finish.exists()
assert finish.stat().st_mode & 0o111 # executable
text = finish.read_text()
assert "78" in text
assert "125" in text
def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "writer", state="stopped")

View file

@ -673,6 +673,30 @@ def test_render_run_script_uses_replace_to_take_over_stale_holder() -> None:
)
def test_render_finish_script_exits_125_on_ex_config() -> None:
"""The finish script must translate exit 78 (EX_CONFIG) into exit 125
(permanent failure) so s6 stops restarting on fatal config errors.
See #51228."""
text = S6ServiceManager._render_finish_script()
assert '[ "$1" = "78" ]' in text
assert "exit 125" in text
assert "exit 0" in text
def test_s6_register_writes_finish_script(
s6_scandir, fake_subprocess_run,
) -> None:
"""The finish script must be written alongside the run script."""
mgr = S6ServiceManager(scandir=s6_scandir)
mgr.register_profile_gateway("coder")
finish_path = s6_scandir / "gateway-coder" / "finish"
assert finish_path.is_file()
assert finish_path.stat().st_mode & 0o111 # executable
assert "78" in finish_path.read_text()
assert "125" in finish_path.read_text()
def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None:
mgr = S6ServiceManager(scandir=s6_scandir)
with pytest.raises(ValueError):