mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
add generic gateway startup readiness checks
This commit is contained in:
parent
10494b42a1
commit
bad9fe2452
9 changed files with 637 additions and 8 deletions
|
|
@ -3,11 +3,12 @@ Event Hook System
|
|||
|
||||
A lightweight event-driven system that fires handlers at key lifecycle points.
|
||||
Hooks are discovered from ~/.hermes/hooks/ directories, each containing:
|
||||
- HOOK.yaml (metadata: name, description, events list)
|
||||
- HOOK.yaml (metadata: name, description, events list, optional startup_readiness)
|
||||
- handler.py (Python handler with async def handle(event_type, context))
|
||||
|
||||
Events:
|
||||
- gateway:startup -- Gateway process starts
|
||||
- gateway:shutdown -- Gateway process is shutting down
|
||||
- session:start -- New session created (first message of a new session)
|
||||
- session:end -- Session ends (user ran /new or /reset)
|
||||
- session:reset -- Session reset completed (new session entry created)
|
||||
|
|
@ -31,6 +32,26 @@ from hermes_cli.config import get_hermes_home
|
|||
HOOKS_DIR = get_hermes_home() / "hooks"
|
||||
|
||||
|
||||
def _normalize_startup_readiness(hook_name: str, manifest: dict[str, Any]) -> Optional[dict[str, Any]]:
|
||||
"""Validate and normalize optional startup readiness metadata."""
|
||||
readiness = manifest.get("startup_readiness")
|
||||
if readiness is None:
|
||||
return None
|
||||
if not isinstance(readiness, dict):
|
||||
print(f"[hooks] Ignoring startup_readiness for {hook_name}: expected mapping", flush=True)
|
||||
return None
|
||||
|
||||
check_id = str(readiness.get("id", "")).strip()
|
||||
if not check_id:
|
||||
print(f"[hooks] Ignoring startup_readiness for {hook_name}: missing id", flush=True)
|
||||
return None
|
||||
|
||||
return {
|
||||
"id": check_id,
|
||||
"required": bool(readiness.get("required", True)),
|
||||
}
|
||||
|
||||
|
||||
class HookRegistry:
|
||||
"""
|
||||
Discovers, loads, and fires event hooks.
|
||||
|
|
@ -62,6 +83,7 @@ class HookRegistry:
|
|||
"description": "Run ~/.hermes/BOOT.md on gateway startup",
|
||||
"events": ["gateway:startup"],
|
||||
"path": "(builtin)",
|
||||
"startup_readiness": None,
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"[hooks] Could not load built-in boot-md hook: {e}", flush=True)
|
||||
|
|
@ -102,6 +124,7 @@ class HookRegistry:
|
|||
if not events:
|
||||
print(f"[hooks] Skipping {hook_name}: no events declared", flush=True)
|
||||
continue
|
||||
startup_readiness = _normalize_startup_readiness(hook_name, manifest)
|
||||
|
||||
# Dynamically load the handler module
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
|
|
@ -128,6 +151,7 @@ class HookRegistry:
|
|||
"description": manifest.get("description", ""),
|
||||
"events": events,
|
||||
"path": str(hook_dir),
|
||||
"startup_readiness": startup_readiness,
|
||||
})
|
||||
|
||||
print(f"[hooks] Loaded hook '{hook_name}' for events: {events}", flush=True)
|
||||
|
|
|
|||
|
|
@ -1540,7 +1540,7 @@ class GatewayRunner:
|
|||
pass
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(gateway_state="starting", exit_reason=None)
|
||||
write_runtime_status(gateway_state="starting", exit_reason=None, startup_checks={})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -1582,8 +1582,23 @@ class GatewayRunner:
|
|||
"or configure platform allowlists (e.g., TELEGRAM_ALLOWED_USERS=your_id)."
|
||||
)
|
||||
|
||||
# Discover plugins before hooks so plugin-owned hook bundles can
|
||||
# participate in this same startup cycle.
|
||||
try:
|
||||
from hermes_cli.plugins import discover_plugins
|
||||
|
||||
discover_plugins()
|
||||
except Exception as e:
|
||||
logger.warning("Plugin discovery during gateway startup failed: %s", e)
|
||||
|
||||
# Discover and load event hooks
|
||||
self.hooks.discover_and_load()
|
||||
try:
|
||||
from gateway.status import reset_startup_checks
|
||||
|
||||
reset_startup_checks(self.hooks.loaded_hooks)
|
||||
except Exception as e:
|
||||
logger.warning("Startup readiness initialization failed: %s", e)
|
||||
|
||||
# Recover background processes from checkpoint (crash recovery)
|
||||
try:
|
||||
|
|
@ -2104,6 +2119,11 @@ class GatewayRunner:
|
|||
logger.error("Failed to launch detached gateway restart: %s", e)
|
||||
|
||||
self._finalize_shutdown_agents(active_agents)
|
||||
await self.hooks.emit("gateway:shutdown", {
|
||||
"restart": self._restart_requested,
|
||||
"service_restart": self._restart_via_service,
|
||||
"detached_restart": self._restart_detached,
|
||||
})
|
||||
|
||||
for platform, adapter in list(self.adapters.items()):
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ _RUNTIME_STATUS_FILE = "gateway_state.json"
|
|||
_LOCKS_DIRNAME = "gateway-locks"
|
||||
_IS_WINDOWS = sys.platform == "win32"
|
||||
_UNSET = object()
|
||||
_VALID_STARTUP_CHECK_STATES = {"pending", "ready", "failed"}
|
||||
|
||||
|
||||
def _get_pid_path() -> Path:
|
||||
|
|
@ -162,11 +163,39 @@ def _build_runtime_status_record() -> dict[str, Any]:
|
|||
"restart_requested": False,
|
||||
"active_agents": 0,
|
||||
"platforms": {},
|
||||
"startup_checks": {},
|
||||
"updated_at": _utc_now_iso(),
|
||||
})
|
||||
return payload
|
||||
|
||||
|
||||
def _normalize_startup_check_entries(
|
||||
startup_checks: Optional[dict[str, Any]],
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
"""Normalize persisted startup readiness entries."""
|
||||
if not isinstance(startup_checks, dict):
|
||||
return {}
|
||||
|
||||
now = _utc_now_iso()
|
||||
normalized: dict[str, dict[str, Any]] = {}
|
||||
for raw_id, raw_payload in startup_checks.items():
|
||||
check_id = str(raw_id).strip()
|
||||
if not check_id:
|
||||
continue
|
||||
payload = raw_payload if isinstance(raw_payload, dict) else {}
|
||||
state = str(payload.get("state", "pending")).strip().lower()
|
||||
if state not in _VALID_STARTUP_CHECK_STATES:
|
||||
state = "pending"
|
||||
normalized[check_id] = {
|
||||
"state": state,
|
||||
"required": bool(payload.get("required", True)),
|
||||
"source": payload.get("source"),
|
||||
"detail": payload.get("detail"),
|
||||
"updated_at": payload.get("updated_at") or now,
|
||||
}
|
||||
return normalized
|
||||
|
||||
|
||||
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
|
||||
if not path.exists():
|
||||
return None
|
||||
|
|
@ -223,6 +252,7 @@ def write_runtime_status(
|
|||
exit_reason: Any = _UNSET,
|
||||
restart_requested: Any = _UNSET,
|
||||
active_agents: Any = _UNSET,
|
||||
startup_checks: Any = _UNSET,
|
||||
platform: Any = _UNSET,
|
||||
platform_state: Any = _UNSET,
|
||||
error_code: Any = _UNSET,
|
||||
|
|
@ -245,6 +275,8 @@ def write_runtime_status(
|
|||
payload["restart_requested"] = bool(restart_requested)
|
||||
if active_agents is not _UNSET:
|
||||
payload["active_agents"] = max(0, int(active_agents))
|
||||
if startup_checks is not _UNSET:
|
||||
payload["startup_checks"] = _normalize_startup_check_entries(startup_checks)
|
||||
|
||||
if platform is not _UNSET:
|
||||
platform_payload = payload["platforms"].get(platform, {})
|
||||
|
|
@ -262,7 +294,109 @@ def write_runtime_status(
|
|||
|
||||
def read_runtime_status() -> Optional[dict[str, Any]]:
|
||||
"""Read the persisted gateway runtime health/status information."""
|
||||
return _read_json_file(_get_runtime_status_path())
|
||||
payload = _read_json_file(_get_runtime_status_path())
|
||||
if payload is None:
|
||||
return None
|
||||
payload.setdefault("platforms", {})
|
||||
payload["startup_checks"] = _normalize_startup_check_entries(payload.get("startup_checks"))
|
||||
return payload
|
||||
|
||||
|
||||
def reset_startup_checks(checks: Optional[list[dict[str, Any]]] = None) -> dict[str, dict[str, Any]]:
|
||||
"""Replace persisted startup readiness checks for the current run."""
|
||||
normalized: dict[str, dict[str, Any]] = {}
|
||||
now = _utc_now_iso()
|
||||
|
||||
for hook in checks or []:
|
||||
if not isinstance(hook, dict):
|
||||
continue
|
||||
readiness = hook.get("startup_readiness")
|
||||
if not isinstance(readiness, dict):
|
||||
continue
|
||||
check_id = str(readiness.get("id", "")).strip()
|
||||
if not check_id:
|
||||
continue
|
||||
normalized[check_id] = {
|
||||
"state": "pending",
|
||||
"required": bool(readiness.get("required", True)),
|
||||
"source": hook.get("name"),
|
||||
"detail": None,
|
||||
"updated_at": now,
|
||||
}
|
||||
|
||||
write_runtime_status(startup_checks=normalized)
|
||||
return normalized
|
||||
|
||||
|
||||
def update_startup_check(
|
||||
check_id: str,
|
||||
state: str,
|
||||
*,
|
||||
detail: Any = _UNSET,
|
||||
required: Any = _UNSET,
|
||||
source: Any = _UNSET,
|
||||
) -> dict[str, Any]:
|
||||
"""Update a single startup readiness check in the runtime status file."""
|
||||
normalized_id = str(check_id).strip()
|
||||
if not normalized_id:
|
||||
raise ValueError("startup readiness check id is required")
|
||||
|
||||
normalized_state = str(state).strip().lower()
|
||||
if normalized_state not in _VALID_STARTUP_CHECK_STATES:
|
||||
raise ValueError(f"invalid startup readiness state: {state}")
|
||||
|
||||
path = _get_runtime_status_path()
|
||||
payload = _read_json_file(path) or _build_runtime_status_record()
|
||||
checks = _normalize_startup_check_entries(payload.get("startup_checks"))
|
||||
existing = checks.get(normalized_id, {})
|
||||
now = _utc_now_iso()
|
||||
|
||||
checks[normalized_id] = {
|
||||
"state": normalized_state,
|
||||
"required": bool(existing.get("required", True) if required is _UNSET else required),
|
||||
"source": existing.get("source") if source is _UNSET else source,
|
||||
"detail": existing.get("detail") if detail is _UNSET else detail,
|
||||
"updated_at": now,
|
||||
}
|
||||
|
||||
payload["startup_checks"] = checks
|
||||
payload.setdefault("platforms", {})
|
||||
payload.setdefault("kind", _GATEWAY_KIND)
|
||||
payload["pid"] = os.getpid()
|
||||
payload["start_time"] = _get_process_start_time(os.getpid())
|
||||
payload["updated_at"] = now
|
||||
_write_json_file(path, payload)
|
||||
return checks[normalized_id]
|
||||
|
||||
|
||||
def mark_startup_check_pending(
|
||||
check_id: str,
|
||||
*,
|
||||
detail: Any = _UNSET,
|
||||
required: Any = _UNSET,
|
||||
source: Any = _UNSET,
|
||||
) -> dict[str, Any]:
|
||||
return update_startup_check(check_id, "pending", detail=detail, required=required, source=source)
|
||||
|
||||
|
||||
def mark_startup_check_ready(
|
||||
check_id: str,
|
||||
*,
|
||||
detail: Any = _UNSET,
|
||||
required: Any = _UNSET,
|
||||
source: Any = _UNSET,
|
||||
) -> dict[str, Any]:
|
||||
return update_startup_check(check_id, "ready", detail=detail, required=required, source=source)
|
||||
|
||||
|
||||
def mark_startup_check_failed(
|
||||
check_id: str,
|
||||
*,
|
||||
detail: Any = _UNSET,
|
||||
required: Any = _UNSET,
|
||||
source: Any = _UNSET,
|
||||
) -> dict[str, Any]:
|
||||
return update_startup_check(check_id, "failed", detail=detail, required=required, source=source)
|
||||
|
||||
|
||||
def remove_pid_file() -> None:
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import shutil
|
|||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
|
||||
|
|
@ -37,6 +38,10 @@ from hermes_cli.setup import (
|
|||
from hermes_cli.colors import Colors, color
|
||||
|
||||
|
||||
_SERVICE_READINESS_TIMEOUT = 30.0
|
||||
_SERVICE_READINESS_POLL_INTERVAL = 0.2
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Process Management (for manual gateway runs)
|
||||
# =============================================================================
|
||||
|
|
@ -1100,12 +1105,123 @@ def systemd_uninstall(system: bool = False):
|
|||
print(f"✓ {_service_scope_label(system).capitalize()} service uninstalled")
|
||||
|
||||
|
||||
def _describe_startup_check(check_id: str, check: dict) -> str:
|
||||
source = check.get("source")
|
||||
detail = check.get("detail")
|
||||
label = f"{check_id} ({source})" if source and source != check_id else check_id
|
||||
return f"{label}: {detail}" if detail else label
|
||||
|
||||
|
||||
def _classify_startup_checks(state: dict | None) -> tuple[list[str], list[str], list[str]]:
|
||||
checks = (state or {}).get("startup_checks") or {}
|
||||
pending_required: list[str] = []
|
||||
failed_required: list[str] = []
|
||||
optional_warnings: list[str] = []
|
||||
|
||||
if not isinstance(checks, dict):
|
||||
return pending_required, failed_required, optional_warnings
|
||||
|
||||
for check_id, raw_check in checks.items():
|
||||
check = raw_check if isinstance(raw_check, dict) else {}
|
||||
label = _describe_startup_check(str(check_id), check)
|
||||
check_state = str(check.get("state", "pending")).strip().lower()
|
||||
required = bool(check.get("required", True))
|
||||
|
||||
if check_state == "ready":
|
||||
continue
|
||||
if required:
|
||||
if check_state == "failed":
|
||||
failed_required.append(label)
|
||||
else:
|
||||
pending_required.append(label)
|
||||
else:
|
||||
prefix = "failed" if check_state == "failed" else "pending"
|
||||
optional_warnings.append(f"{prefix}: {label}")
|
||||
|
||||
return pending_required, failed_required, optional_warnings
|
||||
|
||||
|
||||
def _wait_for_service_readiness(
|
||||
*,
|
||||
action: str,
|
||||
previous_pid: int | None = None,
|
||||
timeout: float = _SERVICE_READINESS_TIMEOUT,
|
||||
poll_interval: float = _SERVICE_READINESS_POLL_INTERVAL,
|
||||
) -> list[str]:
|
||||
from gateway.status import get_running_pid, read_runtime_status
|
||||
|
||||
deadline = time.monotonic() + timeout
|
||||
last_pending: list[str] = []
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
live_pid = get_running_pid()
|
||||
if live_pid is None or (previous_pid is not None and live_pid == previous_pid):
|
||||
time.sleep(poll_interval)
|
||||
continue
|
||||
|
||||
runtime = read_runtime_status() or {}
|
||||
try:
|
||||
runtime_pid = int(runtime.get("pid"))
|
||||
except (TypeError, ValueError):
|
||||
runtime_pid = None
|
||||
if runtime_pid != live_pid:
|
||||
time.sleep(poll_interval)
|
||||
continue
|
||||
|
||||
gateway_state = runtime.get("gateway_state")
|
||||
pending_required, failed_required, optional_warnings = _classify_startup_checks(runtime)
|
||||
last_pending = pending_required
|
||||
|
||||
if gateway_state == "startup_failed":
|
||||
reason = runtime.get("exit_reason") or f"gateway {action} failed during startup"
|
||||
raise RuntimeError(reason)
|
||||
if failed_required:
|
||||
raise RuntimeError(
|
||||
"required startup checks failed: " + "; ".join(failed_required)
|
||||
)
|
||||
if gateway_state == "running" and not pending_required:
|
||||
return optional_warnings
|
||||
|
||||
time.sleep(poll_interval)
|
||||
|
||||
if last_pending:
|
||||
raise RuntimeError(
|
||||
"timed out waiting for required startup checks: " + "; ".join(last_pending)
|
||||
)
|
||||
if previous_pid is not None:
|
||||
raise RuntimeError(
|
||||
f"timed out waiting for gateway {action}; previous process is still active or no new runtime became ready"
|
||||
)
|
||||
raise RuntimeError(f"timed out waiting for gateway {action} readiness")
|
||||
|
||||
|
||||
def _await_service_ready_or_exit(
|
||||
*,
|
||||
action: str,
|
||||
previous_pid: int | None = None,
|
||||
timeout: float = _SERVICE_READINESS_TIMEOUT,
|
||||
) -> None:
|
||||
try:
|
||||
optional_warnings = _wait_for_service_readiness(
|
||||
action=action,
|
||||
previous_pid=previous_pid,
|
||||
timeout=timeout,
|
||||
)
|
||||
except RuntimeError as exc:
|
||||
print_error(f" Gateway {action} did not become ready: {exc}")
|
||||
raise SystemExit(1) from exc
|
||||
|
||||
for warning in optional_warnings:
|
||||
print_warning(f" Optional startup check {warning}")
|
||||
|
||||
|
||||
def systemd_start(system: bool = False):
|
||||
system = _select_systemd_scope(system)
|
||||
if system:
|
||||
_require_root_for_system_service("start")
|
||||
refresh_systemd_unit_if_needed(system=system)
|
||||
_run_systemctl(["start", get_service_name()], system=system, check=True, timeout=30)
|
||||
_await_service_ready_or_exit(action="start")
|
||||
print(f"✓ {_service_scope_label(system).capitalize()} service started")
|
||||
|
||||
|
||||
|
|
@ -1128,9 +1244,11 @@ def systemd_restart(system: bool = False):
|
|||
|
||||
pid = get_running_pid()
|
||||
if pid is not None and _request_gateway_self_restart(pid):
|
||||
print(f"✓ {_service_scope_label(system).capitalize()} service restart requested")
|
||||
_await_service_ready_or_exit(action="restart", previous_pid=pid)
|
||||
print(f"✓ {_service_scope_label(system).capitalize()} service restarted")
|
||||
return
|
||||
_run_systemctl(["reload-or-restart", get_service_name()], system=system, check=True, timeout=90)
|
||||
_await_service_ready_or_exit(action="restart", previous_pid=pid)
|
||||
print(f"✓ {_service_scope_label(system).capitalize()} service restarted")
|
||||
|
||||
|
||||
|
|
@ -1389,6 +1507,7 @@ def launchd_start():
|
|||
plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
|
||||
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
|
||||
subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True, timeout=30)
|
||||
_await_service_ready_or_exit(action="start")
|
||||
print("✓ Service started")
|
||||
return
|
||||
|
||||
|
|
@ -1401,6 +1520,7 @@ def launchd_start():
|
|||
print("↻ launchd job was unloaded; reloading service definition")
|
||||
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
|
||||
subprocess.run(["launchctl", "kickstart", f"{_launchd_domain()}/{label}"], check=True, timeout=30)
|
||||
_await_service_ready_or_exit(action="start")
|
||||
print("✓ Service started")
|
||||
|
||||
def launchd_stop():
|
||||
|
|
@ -1471,7 +1591,8 @@ def launchd_restart():
|
|||
try:
|
||||
pid = get_running_pid()
|
||||
if pid is not None and _request_gateway_self_restart(pid):
|
||||
print("✓ Service restart requested")
|
||||
_await_service_ready_or_exit(action="restart", previous_pid=pid)
|
||||
print("✓ Service restarted")
|
||||
return
|
||||
if pid is not None:
|
||||
try:
|
||||
|
|
@ -1483,6 +1604,7 @@ def launchd_restart():
|
|||
if not exited:
|
||||
print(f"⚠ Gateway drain timed out after {drain_timeout:.0f}s — forcing launchd restart")
|
||||
subprocess.run(["launchctl", "kickstart", "-k", target], check=True, timeout=90)
|
||||
_await_service_ready_or_exit(action="restart", previous_pid=pid)
|
||||
print("✓ Service restarted")
|
||||
except subprocess.CalledProcessError as e:
|
||||
if e.returncode not in (3, 113):
|
||||
|
|
@ -1492,6 +1614,7 @@ def launchd_restart():
|
|||
plist_path = get_launchd_plist_path()
|
||||
subprocess.run(["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], check=True, timeout=30)
|
||||
subprocess.run(["launchctl", "kickstart", target], check=True, timeout=30)
|
||||
_await_service_ready_or_exit(action="restart", previous_pid=pid)
|
||||
print("✓ Service restarted")
|
||||
|
||||
def launchd_status(deep: bool = False):
|
||||
|
|
|
|||
|
|
@ -125,6 +125,25 @@ async def test_gateway_stop_service_restart_sets_named_exit_code():
|
|||
assert runner._exit_code == GATEWAY_SERVICE_RESTART_EXIT_CODE
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gateway_stop_emits_shutdown_hook_after_drain(monkeypatch):
|
||||
runner, adapter = make_restart_runner()
|
||||
adapter.disconnect = AsyncMock()
|
||||
runner.hooks.emit = AsyncMock()
|
||||
|
||||
with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
|
||||
await runner.stop(restart=True, service_restart=True)
|
||||
|
||||
runner.hooks.emit.assert_awaited_once_with(
|
||||
"gateway:shutdown",
|
||||
{
|
||||
"restart": True,
|
||||
"service_restart": True,
|
||||
"detached_restart": False,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_drain_active_agents_throttles_status_updates():
|
||||
runner, _adapter = make_restart_runner()
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import pytest
|
|||
from gateway.hooks import HookRegistry
|
||||
|
||||
|
||||
def _create_hook(hooks_dir, hook_name, events, handler_code):
|
||||
def _create_hook(hooks_dir, hook_name, events, handler_code, *, manifest_extra=""):
|
||||
"""Helper to create a hook directory with HOOK.yaml and handler.py."""
|
||||
hook_dir = hooks_dir / hook_name
|
||||
hook_dir.mkdir(parents=True)
|
||||
|
|
@ -17,6 +17,7 @@ def _create_hook(hooks_dir, hook_name, events, handler_code):
|
|||
f"name: {hook_name}\n"
|
||||
f"description: Test hook\n"
|
||||
f"events: {events}\n"
|
||||
f"{manifest_extra}"
|
||||
)
|
||||
(hook_dir / "handler.py").write_text(handler_code)
|
||||
return hook_dir
|
||||
|
|
@ -112,6 +113,24 @@ class TestDiscoverAndLoad:
|
|||
|
||||
assert len(reg.loaded_hooks) == 2
|
||||
|
||||
def test_preserves_optional_startup_readiness_metadata(self, tmp_path):
|
||||
_create_hook(
|
||||
tmp_path,
|
||||
"ready-hook",
|
||||
'["gateway:startup"]',
|
||||
"def handle(e, c): pass\n",
|
||||
manifest_extra="startup_readiness:\n id: beam-runtime\n required: false\n",
|
||||
)
|
||||
|
||||
reg = HookRegistry()
|
||||
with patch("gateway.hooks.HOOKS_DIR", tmp_path), _patch_no_builtins(reg):
|
||||
reg.discover_and_load()
|
||||
|
||||
assert reg.loaded_hooks[0]["startup_readiness"] == {
|
||||
"id": "beam-runtime",
|
||||
"required": False,
|
||||
}
|
||||
|
||||
|
||||
class TestEmit:
|
||||
@pytest.mark.asyncio
|
||||
|
|
|
|||
|
|
@ -132,6 +132,68 @@ async def test_runner_records_connected_platform_state_on_success(monkeypatch, t
|
|||
assert state["platforms"]["discord"]["error_message"] is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_runner_discovers_plugins_before_loading_hooks(monkeypatch, tmp_path):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
config = GatewayConfig(
|
||||
platforms={
|
||||
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
|
||||
},
|
||||
sessions_dir=tmp_path / "sessions",
|
||||
)
|
||||
runner = GatewayRunner(config)
|
||||
order: list[str] = []
|
||||
|
||||
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _SuccessfulAdapter())
|
||||
monkeypatch.setattr("hermes_cli.plugins.discover_plugins", lambda: order.append("plugins"))
|
||||
monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: order.append("hooks"))
|
||||
monkeypatch.setattr(runner.hooks, "emit", AsyncMock())
|
||||
|
||||
ok = await runner.start()
|
||||
|
||||
assert ok is True
|
||||
assert order == ["plugins", "hooks"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_runner_initializes_startup_checks_before_gateway_startup_emit(monkeypatch, tmp_path):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
config = GatewayConfig(
|
||||
platforms={
|
||||
Platform.DISCORD: PlatformConfig(enabled=True, token="***")
|
||||
},
|
||||
sessions_dir=tmp_path / "sessions",
|
||||
)
|
||||
runner = GatewayRunner(config)
|
||||
|
||||
runner.hooks._loaded_hooks = [
|
||||
{
|
||||
"name": "beam-runtime",
|
||||
"events": ["gateway:startup"],
|
||||
"path": str(tmp_path / "hook"),
|
||||
"startup_readiness": {
|
||||
"id": "beam-runtime",
|
||||
"required": True,
|
||||
},
|
||||
}
|
||||
]
|
||||
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _SuccessfulAdapter())
|
||||
monkeypatch.setattr("hermes_cli.plugins.discover_plugins", lambda: None)
|
||||
monkeypatch.setattr(runner.hooks, "discover_and_load", lambda: None)
|
||||
|
||||
async def _assert_checks(event_type, context):
|
||||
state = read_runtime_status()
|
||||
assert event_type == "gateway:startup"
|
||||
assert state["startup_checks"]["beam-runtime"]["state"] == "pending"
|
||||
assert state["startup_checks"]["beam-runtime"]["required"] is True
|
||||
|
||||
monkeypatch.setattr(runner.hooks, "emit", _assert_checks)
|
||||
|
||||
ok = await runner.start()
|
||||
|
||||
assert ok is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_start_gateway_verbosity_imports_redacting_formatter(monkeypatch, tmp_path):
|
||||
"""Verbosity != None must not crash with NameError on RedactingFormatter (#8044)."""
|
||||
|
|
|
|||
|
|
@ -132,6 +132,72 @@ class TestGatewayRuntimeStatus:
|
|||
assert payload["platforms"]["discord"]["error_code"] is None
|
||||
assert payload["platforms"]["discord"]["error_message"] is None
|
||||
|
||||
def test_reset_startup_checks_replaces_previous_run_entries(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
status.write_runtime_status(
|
||||
gateway_state="running",
|
||||
startup_checks={
|
||||
"old-check": {
|
||||
"state": "ready",
|
||||
"required": True,
|
||||
"source": "old-hook",
|
||||
"detail": None,
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
status.reset_startup_checks([
|
||||
{
|
||||
"name": "new-hook",
|
||||
"startup_readiness": {
|
||||
"id": "new-check",
|
||||
"required": False,
|
||||
},
|
||||
}
|
||||
])
|
||||
|
||||
payload = status.read_runtime_status()
|
||||
assert set(payload["startup_checks"]) == {"new-check"}
|
||||
assert payload["startup_checks"]["new-check"]["state"] == "pending"
|
||||
assert payload["startup_checks"]["new-check"]["required"] is False
|
||||
assert payload["startup_checks"]["new-check"]["source"] == "new-hook"
|
||||
|
||||
def test_mark_startup_check_ready_persists_detail(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
status.reset_startup_checks([
|
||||
{
|
||||
"name": "beam",
|
||||
"startup_readiness": {
|
||||
"id": "beam-runtime",
|
||||
"required": True,
|
||||
},
|
||||
}
|
||||
])
|
||||
|
||||
status.mark_startup_check_ready("beam-runtime", detail="ready for RPC")
|
||||
|
||||
payload = status.read_runtime_status()
|
||||
assert payload["startup_checks"]["beam-runtime"]["state"] == "ready"
|
||||
assert payload["startup_checks"]["beam-runtime"]["detail"] == "ready for RPC"
|
||||
|
||||
def test_mark_startup_check_failed_creates_missing_entry(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
status.mark_startup_check_failed(
|
||||
"late-hook",
|
||||
detail="startup hook crashed",
|
||||
required=False,
|
||||
source="late-hook",
|
||||
)
|
||||
|
||||
payload = status.read_runtime_status()
|
||||
assert payload["startup_checks"]["late-hook"]["state"] == "failed"
|
||||
assert payload["startup_checks"]["late-hook"]["required"] is False
|
||||
assert payload["startup_checks"]["late-hook"]["source"] == "late-hook"
|
||||
assert payload["startup_checks"]["late-hook"]["detail"] == "startup hook crashed"
|
||||
|
||||
|
||||
class TestTerminatePid:
|
||||
def test_force_uses_taskkill_on_windows(self, monkeypatch):
|
||||
|
|
|
|||
|
|
@ -6,12 +6,21 @@ from pathlib import Path
|
|||
from types import SimpleNamespace
|
||||
|
||||
import hermes_cli.gateway as gateway_cli
|
||||
import pytest
|
||||
from gateway.restart import (
|
||||
DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT,
|
||||
GATEWAY_SERVICE_RESTART_EXIT_CODE,
|
||||
)
|
||||
|
||||
|
||||
_REAL_AWAIT_SERVICE_READY = gateway_cli._await_service_ready_or_exit
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _stub_service_readiness(monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "_await_service_ready_or_exit", lambda **kwargs: None)
|
||||
|
||||
|
||||
class TestSystemdServiceRefresh:
|
||||
def test_systemd_install_repairs_outdated_unit_without_force(self, tmp_path, monkeypatch):
|
||||
unit_path = tmp_path / "hermes-gateway.service"
|
||||
|
|
@ -82,6 +91,30 @@ class TestSystemdServiceRefresh:
|
|||
["systemctl", "--user", "reload-or-restart", gateway_cli.get_service_name()],
|
||||
]
|
||||
|
||||
def test_systemd_start_waits_for_readiness_before_reporting_success(self, monkeypatch):
|
||||
calls = []
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
|
||||
monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: calls.append(("refresh", system)))
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"_run_systemctl",
|
||||
lambda cmd, system=False, check=True, timeout=30, **kwargs: calls.append((tuple(cmd), system, timeout)),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"_await_service_ready_or_exit",
|
||||
lambda **kwargs: calls.append(("ready", kwargs)),
|
||||
)
|
||||
|
||||
gateway_cli.systemd_start()
|
||||
|
||||
assert calls == [
|
||||
("refresh", False),
|
||||
(("start", gateway_cli.get_service_name()), False, 30),
|
||||
("ready", {"action": "start"}),
|
||||
]
|
||||
|
||||
|
||||
class TestGeneratedSystemdUnits:
|
||||
def test_user_unit_avoids_recursive_execstop_and_uses_extended_stop_timeout(self):
|
||||
|
|
@ -268,6 +301,32 @@ class TestLaunchdServiceRecovery:
|
|||
["launchctl", "kickstart", target],
|
||||
]
|
||||
|
||||
def test_launchd_start_waits_for_readiness_before_reporting_success(self, tmp_path, monkeypatch):
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
|
||||
label = gateway_cli.get_launchd_label()
|
||||
calls = []
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
monkeypatch.setattr(gateway_cli, "refresh_launchd_plist_if_needed", lambda: None)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli.subprocess,
|
||||
"run",
|
||||
lambda cmd, check=False, **kwargs: calls.append(cmd) or SimpleNamespace(returncode=0, stdout="", stderr=""),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"_await_service_ready_or_exit",
|
||||
lambda **kwargs: calls.append(("ready", kwargs)),
|
||||
)
|
||||
|
||||
gateway_cli.launchd_start()
|
||||
|
||||
assert calls == [
|
||||
["launchctl", "kickstart", f"{gateway_cli._launchd_domain()}/{label}"],
|
||||
("ready", {"action": "start"}),
|
||||
]
|
||||
|
||||
def test_launchd_restart_drains_running_gateway_before_kickstart(self, monkeypatch):
|
||||
calls = []
|
||||
target = f"{gateway_cli._launchd_domain()}/{gateway_cli.get_launchd_label()}"
|
||||
|
|
@ -315,7 +374,7 @@ class TestLaunchdServiceRecovery:
|
|||
gateway_cli.launchd_restart()
|
||||
|
||||
assert calls == [("self", 321)]
|
||||
assert "restart requested" in capsys.readouterr().out.lower()
|
||||
assert "service restarted" in capsys.readouterr().out.lower()
|
||||
|
||||
def test_launchd_stop_uses_bootout_not_kill(self, monkeypatch):
|
||||
"""launchd_stop must bootout the service so KeepAlive doesn't respawn it."""
|
||||
|
|
@ -393,6 +452,109 @@ class TestLaunchdServiceRecovery:
|
|||
assert "not loaded" in output.lower()
|
||||
|
||||
|
||||
class TestGatewayServiceReadiness:
|
||||
def test_wait_for_service_readiness_accepts_running_gateway_without_checks(self, monkeypatch):
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.read_runtime_status",
|
||||
lambda: {"pid": 123, "gateway_state": "running", "startup_checks": {}},
|
||||
)
|
||||
|
||||
warnings = gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)
|
||||
|
||||
assert warnings == []
|
||||
|
||||
def test_wait_for_service_readiness_ignores_stale_runtime_state_until_pid_matches(self, monkeypatch):
|
||||
runtime_states = iter(
|
||||
[
|
||||
{"pid": 999, "gateway_state": "running", "startup_checks": {}},
|
||||
{"pid": 123, "gateway_state": "running", "startup_checks": {}},
|
||||
]
|
||||
)
|
||||
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
|
||||
monkeypatch.setattr("gateway.status.read_runtime_status", lambda: next(runtime_states))
|
||||
|
||||
warnings = gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)
|
||||
|
||||
assert warnings == []
|
||||
|
||||
def test_wait_for_service_readiness_returns_optional_pending_warnings(self, monkeypatch):
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.read_runtime_status",
|
||||
lambda: {
|
||||
"pid": 123,
|
||||
"gateway_state": "running",
|
||||
"startup_checks": {
|
||||
"optional-check": {
|
||||
"state": "pending",
|
||||
"required": False,
|
||||
"source": "test-hook",
|
||||
"detail": "still warming",
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
warnings = gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)
|
||||
|
||||
assert warnings == ["pending: optional-check (test-hook): still warming"]
|
||||
|
||||
def test_wait_for_service_readiness_fails_when_required_check_fails(self, monkeypatch):
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.read_runtime_status",
|
||||
lambda: {
|
||||
"pid": 123,
|
||||
"gateway_state": "running",
|
||||
"startup_checks": {
|
||||
"beam-runtime": {
|
||||
"state": "failed",
|
||||
"required": True,
|
||||
"source": "beam",
|
||||
"detail": "RPC boot failed",
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
with pytest.raises(RuntimeError, match=r"required startup checks failed: beam-runtime \(beam\): RPC boot failed"):
|
||||
gateway_cli._wait_for_service_readiness(action="start", timeout=0.1, poll_interval=0.0)
|
||||
|
||||
def test_wait_for_service_readiness_times_out_on_pending_required_check(self, monkeypatch):
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 123)
|
||||
monkeypatch.setattr(
|
||||
"gateway.status.read_runtime_status",
|
||||
lambda: {
|
||||
"pid": 123,
|
||||
"gateway_state": "running",
|
||||
"startup_checks": {
|
||||
"beam-runtime": {
|
||||
"state": "pending",
|
||||
"required": True,
|
||||
"source": "beam",
|
||||
"detail": "waiting for runtime",
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
with pytest.raises(RuntimeError, match=r"timed out waiting for required startup checks: beam-runtime \(beam\): waiting for runtime"):
|
||||
gateway_cli._wait_for_service_readiness(action="start", timeout=0.01, poll_interval=0.0)
|
||||
|
||||
def test_await_service_ready_or_exit_raises_system_exit_when_not_ready(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "_await_service_ready_or_exit", _REAL_AWAIT_SERVICE_READY)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"_wait_for_service_readiness",
|
||||
lambda **kwargs: (_ for _ in ()).throw(RuntimeError("not ready")),
|
||||
)
|
||||
|
||||
with pytest.raises(SystemExit, match="1"):
|
||||
gateway_cli._await_service_ready_or_exit(action="start")
|
||||
|
||||
|
||||
class TestGatewayServiceDetection:
|
||||
def test_supports_systemd_services_requires_systemctl_binary(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
|
|
@ -475,7 +637,7 @@ class TestGatewaySystemServiceRouting:
|
|||
gateway_cli.systemd_restart()
|
||||
|
||||
assert calls == [("refresh", False), ("self", 654)]
|
||||
assert "restart requested" in capsys.readouterr().out.lower()
|
||||
assert "service restarted" in capsys.readouterr().out.lower()
|
||||
|
||||
def test_gateway_install_passes_system_flags(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue