add generic gateway startup readiness checks

This commit is contained in:
Shannon Sands 2026-04-15 10:03:23 +10:00
parent 10494b42a1
commit bad9fe2452
9 changed files with 637 additions and 8 deletions

View file

@ -125,6 +125,25 @@ async def test_gateway_stop_service_restart_sets_named_exit_code():
assert runner._exit_code == GATEWAY_SERVICE_RESTART_EXIT_CODE
@pytest.mark.asyncio
async def test_gateway_stop_emits_shutdown_hook_after_drain(monkeypatch):
    """Stopping for a service restart emits exactly one ``gateway:shutdown`` hook.

    The hook payload must carry the restart flags that were passed to
    ``runner.stop``, with ``detached_restart`` reported as ``False``.
    """
    runner, adapter = make_restart_runner()
    adapter.disconnect = AsyncMock()
    emit_mock = AsyncMock()
    runner.hooks.emit = emit_mock

    # Keep the test hermetic: no PID file removal, no runtime status writes.
    pid_patch = patch("gateway.status.remove_pid_file")
    status_patch = patch("gateway.status.write_runtime_status")
    with pid_patch, status_patch:
        await runner.stop(restart=True, service_restart=True)

    expected_context = {
        "restart": True,
        "service_restart": True,
        "detached_restart": False,
    }
    emit_mock.assert_awaited_once_with("gateway:shutdown", expected_context)
@pytest.mark.asyncio
async def test_drain_active_agents_throttles_status_updates():
runner, _adapter = make_restart_runner()

View file

@ -9,7 +9,7 @@ import pytest
from gateway.hooks import HookRegistry
def _create_hook(hooks_dir, hook_name, events, handler_code):
def _create_hook(hooks_dir, hook_name, events, handler_code, *, manifest_extra=""):
"""Helper to create a hook directory with HOOK.yaml and handler.py."""
hook_dir = hooks_dir / hook_name
hook_dir.mkdir(parents=True)
@ -17,6 +17,7 @@ def _create_hook(hooks_dir, hook_name, events, handler_code):
f"name: {hook_name}\n"
f"description: Test hook\n"
f"events: {events}\n"
f"{manifest_extra}"
)
(hook_dir / "handler.py").write_text(handler_code)
return hook_dir
@ -112,6 +113,24 @@ class TestDiscoverAndLoad:
assert len(reg.loaded_hooks) == 2
def test_preserves_optional_startup_readiness_metadata(self, tmp_path):
    """A ``startup_readiness`` mapping in the manifest survives hook loading."""
    readiness_yaml = "startup_readiness:\n id: beam-runtime\n required: false\n"
    _create_hook(
        tmp_path,
        "ready-hook",
        '["gateway:startup"]',
        "def handle(e, c): pass\n",
        manifest_extra=readiness_yaml,
    )

    registry = HookRegistry()
    with patch("gateway.hooks.HOOKS_DIR", tmp_path), _patch_no_builtins(registry):
        registry.discover_and_load()

    expected_readiness = {
        "id": "beam-runtime",
        "required": False,
    }
    assert registry.loaded_hooks[0]["startup_readiness"] == expected_readiness
class TestEmit:
@pytest.mark.asyncio

View file

@ -132,6 +132,68 @@ async def test_runner_records_connected_platform_state_on_success(monkeypatch, t
assert state["platforms"]["discord"]["error_message"] is None
@pytest.mark.asyncio
async def test_runner_discovers_plugins_before_loading_hooks(monkeypatch, tmp_path):
    """``runner.start()`` must call plugin discovery before hook discovery."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    discord_config = PlatformConfig(enabled=True, token="***")
    runner = GatewayRunner(
        GatewayConfig(
            platforms={Platform.DISCORD: discord_config},
            sessions_dir=tmp_path / "sessions",
        )
    )
    call_order: list[str] = []

    def _fake_create_adapter(platform, platform_config):
        return _SuccessfulAdapter()

    def _record_plugins():
        call_order.append("plugins")

    def _record_hooks():
        call_order.append("hooks")

    monkeypatch.setattr(runner, "_create_adapter", _fake_create_adapter)
    monkeypatch.setattr("hermes_cli.plugins.discover_plugins", _record_plugins)
    monkeypatch.setattr(runner.hooks, "discover_and_load", _record_hooks)
    monkeypatch.setattr(runner.hooks, "emit", AsyncMock())

    started = await runner.start()

    assert started is True
    assert call_order == ["plugins", "hooks"]
@pytest.mark.asyncio
async def test_runner_initializes_startup_checks_before_gateway_startup_emit(monkeypatch, tmp_path):
    """Startup checks are persisted as ``pending`` before ``gateway:startup`` fires.

    A loaded hook declaring ``startup_readiness`` is injected directly into
    the registry; the replacement ``emit`` coroutine then inspects the
    runtime status file at the moment the event is emitted.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config = GatewayConfig(
        platforms={
            Platform.DISCORD: PlatformConfig(enabled=True, token="***")
        },
        sessions_dir=tmp_path / "sessions",
    )
    runner = GatewayRunner(config)
    runner.hooks._loaded_hooks = [
        {
            "name": "beam-runtime",
            "events": ["gateway:startup"],
            "path": str(tmp_path / "hook"),
            "startup_readiness": {
                "id": "beam-runtime",
                "required": True,
            },
        }
    ]

    monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _SuccessfulAdapter())
    monkeypatch.setattr("hermes_cli.plugins.discover_plugins", lambda: None)
    monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: None)

    # Events whose in-hook assertions all passed.  Without this record the
    # test would pass vacuously if the event were never emitted, or if the
    # runner swallowed an AssertionError raised inside the hook.
    verified_events: list[str] = []

    async def _assert_checks(event_type, context):
        state = read_runtime_status()
        assert event_type == "gateway:startup"
        assert state["startup_checks"]["beam-runtime"]["state"] == "pending"
        assert state["startup_checks"]["beam-runtime"]["required"] is True
        verified_events.append(event_type)

    monkeypatch.setattr(runner.hooks, "emit", _assert_checks)

    ok = await runner.start()

    assert ok is True
    assert verified_events, "gateway:startup was never emitted with verified startup checks"
@pytest.mark.asyncio
async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, tmp_path):
"""Verbosity != None must not crash with NameError on RedactingFormatter (#8044)."""

View file

@ -132,6 +132,72 @@ class TestGatewayRuntimeStatus:
assert payload["platforms"]["discord"]["error_code"] is None
assert payload["platforms"]["discord"]["error_message"] is None
def test_reset_startup_checks_replaces_previous_run_entries(self, tmp_path, monkeypatch):
    """Resetting startup checks drops earlier entries and seeds new ones as pending."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # Seed the status file with a check left over from an earlier run.
    previous_checks = {
        "old-check": {
            "state": "ready",
            "required": True,
            "source": "old-hook",
            "detail": None,
        }
    }
    status.write_runtime_status(gateway_state="running", startup_checks=previous_checks)

    current_hooks = [
        {
            "name": "new-hook",
            "startup_readiness": {
                "id": "new-check",
                "required": False,
            },
        }
    ]
    status.reset_startup_checks(current_hooks)

    checks = status.read_runtime_status()["startup_checks"]
    assert set(checks) == {"new-check"}
    entry = checks["new-check"]
    assert entry["state"] == "pending"
    assert entry["required"] is False
    assert entry["source"] == "new-hook"
def test_mark_startup_check_ready_persists_detail(self, tmp_path, monkeypatch):
    """Marking a check ready stores both the new state and the detail text."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    beam_hook = {
        "name": "beam",
        "startup_readiness": {
            "id": "beam-runtime",
            "required": True,
        },
    }
    status.reset_startup_checks([beam_hook])

    status.mark_startup_check_ready("beam-runtime", detail="ready for RPC")

    entry = status.read_runtime_status()["startup_checks"]["beam-runtime"]
    assert entry["state"] == "ready"
    assert entry["detail"] == "ready for RPC"
def test_mark_startup_check_failed_creates_missing_entry(self, tmp_path, monkeypatch):
    """Failing a check that was never registered creates its entry on the fly."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # No reset_startup_checks() call here: the entry does not exist yet.
    status.mark_startup_check_failed(
        "late-hook",
        detail="startup hook crashed",
        required=False,
        source="late-hook",
    )

    entry = status.read_runtime_status()["startup_checks"]["late-hook"]
    assert entry["state"] == "failed"
    assert entry["required"] is False
    assert entry["source"] == "late-hook"
    assert entry["detail"] == "startup hook crashed"
class TestTerminatePid:
def test_force_uses_taskkill_on_windows(self, monkeypatch):

View file

@ -6,12 +6,21 @@ from pathlib import Path
from types import SimpleNamespace
import hermes_cli.gateway as gateway_cli
import pytest
from gateway.restart import (
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT,
GATEWAY_SERVICE_RESTART_EXIT_CODE,
)
# Keep a handle on the real implementation: the autouse fixture below stubs
# gateway_cli._await_service_ready_or_exit for every test, and the test that
# exercises the real function needs a way to put it back.
_REAL_AWAIT_SERVICE_READY = gateway_cli._await_service_ready_or_exit
@pytest.fixture(autouse=True)
def _stub_service_readiness(monkeypatch):
    """Replace the readiness wait with a no-op for every test in this module."""

    def _skip_readiness_wait(**kwargs):
        return None

    monkeypatch.setattr(gateway_cli, "_await_service_ready_or_exit", _skip_readiness_wait)
class TestSystemdServiceRefresh:
def test_systemd_install_repairs_outdated_unit_without_force(self, tmp_path, monkeypatch):
unit_path = tmp_path / "hermes-gateway.service"
@ -82,6 +91,30 @@ class TestSystemdServiceRefresh:
["systemctl", "--user", "reload-or-restart", gateway_cli.get_service_name()],
]
def test_systemd_start_waits_for_readiness_before_reporting_success(self, monkeypatch):
    """``systemd_start`` refreshes the unit, starts it, then waits for readiness."""
    calls = []

    def _fake_refresh(system=False):
        calls.append(("refresh", system))

    def _fake_systemctl(cmd, system=False, check=True, timeout=30, **kwargs):
        calls.append((tuple(cmd), system, timeout))

    def _fake_await_ready(**kwargs):
        calls.append(("ready", kwargs))

    monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
    monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", _fake_refresh)
    monkeypatch.setattr(gateway_cli, "_run_systemctl", _fake_systemctl)
    monkeypatch.setattr(gateway_cli, "_await_service_ready_or_exit", _fake_await_ready)

    gateway_cli.systemd_start()

    expected_calls = [
        ("refresh", False),
        (("start", gateway_cli.get_service_name()), False, 30),
        ("ready", {"action": "start"}),
    ]
    assert calls == expected_calls
class TestGeneratedSystemdUnits:
def test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self):
@ -268,6 +301,32 @@ class TestLaunchdServiceRecovery:
["launchctl", "kickstart", target],
]
def test_launchd_start_waits_for_readiness_before_reporting_success(self, tmp_path, monkeypatch):
    """``launchd_start`` kickstarts the service and then waits for readiness."""
    plist_path = tmp_path / "ai.hermes.gateway.plist"
    plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
    label = gateway_cli.get_launchd_label()
    calls = []

    def _fake_run(cmd, check=False, **kwargs):
        # Record the command and report success without spawning anything.
        calls.append(cmd)
        return SimpleNamespace(returncode=0, stdout="", stderr="")

    def _fake_await_ready(**kwargs):
        calls.append(("ready", kwargs))

    monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
    monkeypatch.setattr(gateway_cli, "refresh_launchd_plist_if_needed", lambda: None)
    monkeypatch.setattr(gateway_cli.subprocess, "run", _fake_run)
    monkeypatch.setattr(gateway_cli, "_await_service_ready_or_exit", _fake_await_ready)

    gateway_cli.launchd_start()

    expected_calls = [
        ["launchctl", "kickstart", f"{gateway_cli._launchd_domain()}/{label}"],
        ("ready", {"action": "start"}),
    ]
    assert calls == expected_calls
def test_launchd_restart_drains_running_gateway_before_kickstart(self, monkeypatch):
calls = []
target = f"{gateway_cli._launchd_domain()}/{gateway_cli.get_launchd_label()}"
@ -315,7 +374,7 @@ class TestLaunchdServiceRecovery:
gateway_cli.launchd_restart()
assert calls == [("self", 321)]
assert "restart requested" in capsys.readouterr().out.lower()
assert "service restarted" in capsys.readouterr().out.lower()
def test_launchd_stop_uses_bootout_not_kill(self, monkeypatch):
"""launchd_stop must bootout the service so KeepAlive doesn't respawn it."""
@ -393,6 +452,109 @@ class TestLaunchdServiceRecovery:
assert "not loaded" in output.lower()
class TestGatewayServiceReadiness:
    """Tests for the CLI helpers that wait for a started gateway to report ready."""

    def test_wait_for_service_readiness_accepts_running_gateway_without_checks(self, monkeypatch):
        """A running gateway with no startup checks produces no warnings."""

        def _runtime_status():
            return {"pid": 123, "gateway_state": "running", "startup_checks": {}}

        monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
        monkeypatch.setattr("gateway.status.read_runtime_status", _runtime_status)

        result = gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)

        assert result == []

    def test_wait_for_service_readiness_ignores_stale_runtime_state_until_pid_matches(self, monkeypatch):
        """A status record with a different pid is skipped; the matching one is accepted."""
        pending_states = iter(
            [
                {"pid": 999, "gateway_state": "running", "startup_checks": {}},
                {"pid": 123, "gateway_state": "running", "startup_checks": {}},
            ]
        )

        def _next_runtime_status():
            return next(pending_states)

        monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
        monkeypatch.setattr("gateway.status.read_runtime_status", _next_runtime_status)

        result = gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)

        assert result == []

    def test_wait_for_service_readiness_returns_optional_pending_warnings(self, monkeypatch):
        """A still-pending optional check is returned as a warning string."""

        def _runtime_status():
            optional_check = {
                "state": "pending",
                "required": False,
                "source": "test-hook",
                "detail": "still warming",
            }
            return {
                "pid": 123,
                "gateway_state": "running",
                "startup_checks": {"optional-check": optional_check},
            }

        monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
        monkeypatch.setattr("gateway.status.read_runtime_status", _runtime_status)

        result = gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)

        assert result == ["pending: optional-check (test-hook): still warming"]

    def test_wait_for_service_readiness_fails_when_required_check_fails(self, monkeypatch):
        """A failed required check raises ``RuntimeError`` naming the check."""

        def _runtime_status():
            failed_check = {
                "state": "failed",
                "required": True,
                "source": "beam",
                "detail": "RPC boot failed",
            }
            return {
                "pid": 123,
                "gateway_state": "running",
                "startup_checks": {"beam-runtime": failed_check},
            }

        monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
        monkeypatch.setattr("gateway.status.read_runtime_status", _runtime_status)

        expected_message = r"required startup checks failed: beam-runtime \(beam\): RPC boot failed"
        with pytest.raises(RuntimeError, match=expected_message):
            gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)

    def test_wait_for_service_readiness_times_out_on_pending_required_check(self, monkeypatch):
        """A required check that stays pending raises a timeout ``RuntimeError``."""

        def _runtime_status():
            pending_check = {
                "state": "pending",
                "required": True,
                "source": "beam",
                "detail": "waiting for runtime",
            }
            return {
                "pid": 123,
                "gateway_state": "running",
                "startup_checks": {"beam-runtime": pending_check},
            }

        monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
        monkeypatch.setattr("gateway.status.read_runtime_status", _runtime_status)

        expected_message = (
            r"timed out waiting for required startup checks: beam-runtime \(beam\): waiting for runtime"
        )
        with pytest.raises(RuntimeError, match=expected_message):
            gateway_cli._wait_for_service_readiness(action="start", timeout=0.01, poll_interval=0.0)

    def test_await_service_ready_or_exit_raises_system_exit_when_not_ready(self, monkeypatch):
        """A readiness failure is converted into ``SystemExit``."""

        def _raise_not_ready(**kwargs):
            raise RuntimeError("not ready")

        # Undo the module-wide autouse stub so the real wrapper is exercised.
        monkeypatch.setattr(gateway_cli, "_await_service_ready_or_exit", _REAL_AWAIT_SERVICE_READY)
        monkeypatch.setattr(gateway_cli, "_wait_for_service_readiness", _raise_not_ready)

        with pytest.raises(SystemExit, match="1"):
            gateway_cli._await_service_ready_or_exit(action="start")
class TestGatewayServiceDetection:
def test_supports_systemd_services_requires_systemctl_binary(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
@ -475,7 +637,7 @@ class TestGatewaySystemServiceRouting:
gateway_cli.systemd_restart()
assert calls == [("refresh", False), ("self", 654)]
assert "restart requested" in capsys.readouterr().out.lower()
assert "service restarted" in capsys.readouterr().out.lower()
def test_gateway_install_passes_system_flags(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)