mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): exit 78 (EX_CONFIG) on fatal startup errors, s6 finish script stops restart loop
Profiles without their own messaging token inherit the default profile's token via os.getenv, hit a token collision, and exit with startup_failed. s6 restarts them immediately, creating ~30MB tirith sandbox dirs in /tmp each cycle — filling the disk in hours (#51228). Changes: - gateway/restart.py: add GATEWAY_FATAL_CONFIG_EXIT_CODE = 78 - gateway/run.py: set exit_code=78 on non-retryable startup errors (token collision, no platforms) - hermes_cli/service_manager.py: add _render_finish_script() that translates exit 78 → exit 125 (s6 permanent failure) - hermes_cli/container_boot.py: write finish script alongside run script during profile registration The s6 finish script pattern follows docker/s6-rc.d/dashboard/finish. Closes #51228
This commit is contained in:
parent
d93d0aee83
commit
776f68e1ee
7 changed files with 132 additions and 0 deletions
|
|
@ -6,6 +6,12 @@ from hermes_cli.config import DEFAULT_CONFIG
|
|||
# the gateway after a graceful drain/reload path completes.
|
||||
GATEWAY_SERVICE_RESTART_EXIT_CODE = 75
|
||||
|
||||
# EX_CONFIG from sysexits.h — fatal configuration error (e.g. token
|
||||
# collision, no messaging platforms). The s6 finish script translates
|
||||
# this into exit 125 (permanent failure) so the supervisor stops
|
||||
# restarting the gateway. See #51228.
|
||||
GATEWAY_FATAL_CONFIG_EXIT_CODE = 78
|
||||
|
||||
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT = float(
|
||||
DEFAULT_CONFIG["agent"]["restart_drain_timeout"]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1701,6 +1701,7 @@ from gateway.platforms.base import (
|
|||
)
|
||||
from gateway.restart import (
|
||||
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT,
|
||||
GATEWAY_FATAL_CONFIG_EXIT_CODE,
|
||||
GATEWAY_SERVICE_RESTART_EXIT_CODE,
|
||||
parse_restart_drain_timeout,
|
||||
)
|
||||
|
|
@ -5747,6 +5748,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
|
||||
except Exception:
|
||||
pass
|
||||
self._exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE
|
||||
self._request_clean_exit(reason)
|
||||
self._startup_restore_in_progress = False
|
||||
return True
|
||||
|
|
@ -5762,6 +5764,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
|
||||
except Exception:
|
||||
pass
|
||||
self._exit_code = GATEWAY_FATAL_CONFIG_EXIT_CODE
|
||||
self._request_clean_exit(reason)
|
||||
self._startup_restore_in_progress = False
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -398,6 +398,10 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
|
|||
run.write_text(S6ServiceManager._render_run_script(profile, extra_env={}))
|
||||
run.chmod(0o755)
|
||||
|
||||
finish = tmp_dir / "finish"
|
||||
finish.write_text(S6ServiceManager._render_finish_script())
|
||||
finish.chmod(0o755)
|
||||
|
||||
# Persistent log rotation (OQ8-C).
|
||||
log_subdir = tmp_dir / "log"
|
||||
log_subdir.mkdir()
|
||||
|
|
|
|||
|
|
@ -709,6 +709,30 @@ class S6ServiceManager:
|
|||
lines.append(f"exec s6-setuidgid hermes {gateway_cmd}")
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
@staticmethod
|
||||
def _render_finish_script() -> str:
|
||||
"""Generate the finish script for a profile-gateway s6 service.
|
||||
|
||||
When the gateway exits with EX_CONFIG (78) — a fatal
|
||||
configuration error such as a token collision or no messaging
|
||||
platforms — we tell s6-supervise to stop restarting by exiting
|
||||
125 (permanent failure). Any other exit code lets s6 restart
|
||||
normally. See #51228.
|
||||
"""
|
||||
from gateway.restart import GATEWAY_FATAL_CONFIG_EXIT_CODE
|
||||
|
||||
code = GATEWAY_FATAL_CONFIG_EXIT_CODE
|
||||
return (
|
||||
"#!/command/with-contenv sh\n"
|
||||
"# shellcheck shell=sh\n"
|
||||
"# $1 = exit code from the run script.\n"
|
||||
f"# Exit {code} (EX_CONFIG) = fatal config error — don't restart.\n"
|
||||
f'if [ "$1" = "{code}" ]; then\n'
|
||||
" exit 125\n"
|
||||
"fi\n"
|
||||
"exit 0\n"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _render_log_run(profile: str) -> str:
|
||||
"""Generate the log/run script for a profile-gateway service.
|
||||
|
|
@ -956,6 +980,10 @@ class S6ServiceManager:
|
|||
run_path.write_text(run_script)
|
||||
run_path.chmod(0o755)
|
||||
|
||||
finish_path = tmp_dir / "finish"
|
||||
finish_path.write_text(self._render_finish_script())
|
||||
finish_path.chmod(0o755)
|
||||
|
||||
# Persistent log rotation (OQ8-C).
|
||||
log_subdir = tmp_dir / "log"
|
||||
log_subdir.mkdir()
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ from unittest.mock import AsyncMock
|
|||
|
||||
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
||||
from gateway.platforms.base import BasePlatformAdapter
|
||||
from gateway.restart import GATEWAY_FATAL_CONFIG_EXIT_CODE
|
||||
from gateway.run import GatewayRunner
|
||||
from gateway.status import read_runtime_status
|
||||
|
||||
|
|
@ -458,6 +459,54 @@ async def test_runner_degrades_gracefully_when_all_adapters_missing(monkeypatch,
|
|||
), "Expected degraded-mode warning when all adapters are missing"
|
||||
|
||||
|
||||
class _NonRetryableFailureAdapter(BasePlatformAdapter):
|
||||
"""Simulates a fatal config error like token collision."""
|
||||
def __init__(self):
|
||||
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.DISCORD)
|
||||
|
||||
async def connect(self) -> bool:
|
||||
self._set_fatal_error(
|
||||
"discord-bot-token_lock",
|
||||
"Discord bot token already in use (PID 999). Stop the other gateway first.",
|
||||
retryable=False,
|
||||
)
|
||||
return False
|
||||
|
||||
async def disconnect(self) -> None:
|
||||
self._mark_disconnected()
|
||||
|
||||
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
||||
raise NotImplementedError
|
||||
|
||||
async def get_chat_info(self, chat_id):
|
||||
return {"id": chat_id}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_runner_exits_with_ex_config_on_nonretryable_startup_error(monkeypatch, tmp_path):
|
||||
"""Non-retryable startup errors (token collision, no platforms) must
|
||||
set exit_code to 78 (EX_CONFIG) so the s6 finish script can translate
|
||||
it to exit 125 (permanent failure). See #51228."""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
config = GatewayConfig(
|
||||
platforms={
|
||||
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
|
||||
},
|
||||
sessions_dir=tmp_path / "sessions",
|
||||
)
|
||||
runner = GatewayRunner(config)
|
||||
|
||||
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _NonRetryableFailureAdapter())
|
||||
|
||||
ok = await runner.start()
|
||||
|
||||
assert ok is True # start() returns True (clean exit requested)
|
||||
assert runner.should_exit_cleanly is True
|
||||
assert runner.exit_code == GATEWAY_FATAL_CONFIG_EXIT_CODE
|
||||
state = read_runtime_status()
|
||||
assert state["gateway_state"] == "startup_failed"
|
||||
|
||||
|
||||
def test_runner_warns_when_docker_gateway_lacks_explicit_output_mount(monkeypatch, tmp_path, caplog):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setenv("TERMINAL_ENV", "docker")
|
||||
|
|
|
|||
|
|
@ -128,6 +128,24 @@ def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
|
|||
assert not (svc / "down").exists()
|
||||
|
||||
|
||||
def test_registered_profile_has_finish_script(tmp_path: Path) -> None:
|
||||
"""The finish script must be written so s6 stops restarting on
|
||||
fatal config errors (exit 78 → exit 125). See #51228."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "coder", state="running")
|
||||
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
finish = scandir / "gateway-coder" / "finish"
|
||||
assert finish.exists()
|
||||
assert finish.stat().st_mode & 0o111 # executable
|
||||
text = finish.read_text()
|
||||
assert "78" in text
|
||||
assert "125" in text
|
||||
|
||||
|
||||
def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "writer", state="stopped")
|
||||
|
|
|
|||
|
|
@ -673,6 +673,30 @@ def test_render_run_script_uses_replace_to_take_over_stale_holder() -> None:
|
|||
)
|
||||
|
||||
|
||||
def test_render_finish_script_exits_125_on_ex_config() -> None:
|
||||
"""The finish script must translate exit 78 (EX_CONFIG) into exit 125
|
||||
(permanent failure) so s6 stops restarting on fatal config errors.
|
||||
See #51228."""
|
||||
text = S6ServiceManager._render_finish_script()
|
||||
assert '[ "$1" = "78" ]' in text
|
||||
assert "exit 125" in text
|
||||
assert "exit 0" in text
|
||||
|
||||
|
||||
def test_s6_register_writes_finish_script(
|
||||
s6_scandir, fake_subprocess_run,
|
||||
) -> None:
|
||||
"""The finish script must be written alongside the run script."""
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
mgr.register_profile_gateway("coder")
|
||||
|
||||
finish_path = s6_scandir / "gateway-coder" / "finish"
|
||||
assert finish_path.is_file()
|
||||
assert finish_path.stat().st_mode & 0o111 # executable
|
||||
assert "78" in finish_path.read_text()
|
||||
assert "125" in finish_path.read_text()
|
||||
|
||||
|
||||
def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None:
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
with pytest.raises(ValueError):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue