mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
Merge a3f878ba41 into 13038dc747
This commit is contained in:
commit
c640cf8d1c
4 changed files with 783 additions and 3 deletions
|
|
@ -126,8 +126,8 @@ def check_warn(text: str, detail: str = ""):
|
||||||
def check_fail(text: str, detail: str = ""):
    """Print a failure line for *text* with a red cross marker.

    When *detail* is non-empty it is appended after the text, dimmed.
    """
    suffix = f" {color(detail, Colors.DIM)}" if detail else ""
    print(f" {color('✗', Colors.RED)} {text}" + suffix)
|
||||||
|
|
||||||
def check_info(text: str, detail: str = ""):
    """Print an informational line for *text* with a cyan arrow marker.

    When *detail* is non-empty it is appended after the text, dimmed,
    mirroring the optional detail suffix of ``check_fail``.
    """
    suffix = f" {color(detail, Colors.DIM)}" if detail else ""
    print(f" {color('→', Colors.CYAN)} {text}" + suffix)
|
||||||
|
|
||||||
|
|
||||||
def _check_gateway_service_linger(issues: list[str]) -> None:
|
def _check_gateway_service_linger(issues: list[str]) -> None:
|
||||||
|
|
@ -163,6 +163,225 @@ def _check_gateway_service_linger(issues: list[str]) -> None:
|
||||||
check_warn("Could not verify systemd linger", f"({linger_detail})")
|
check_warn("Could not verify systemd linger", f"({linger_detail})")
|
||||||
|
|
||||||
|
|
||||||
|
_PLATFORM_ALERT_STATES = frozenset({"retrying", "fatal", "disconnected"})
|
||||||
|
|
||||||
|
|
||||||
|
def _count_active_cron_jobs() -> int:
|
||||||
|
"""Return the number of enabled cron jobs that depend on the gateway."""
|
||||||
|
try:
|
||||||
|
# Lazy import keeps doctor usable if cron helpers are unavailable.
|
||||||
|
from cron.jobs import list_jobs
|
||||||
|
|
||||||
|
return len(list_jobs(include_disabled=False))
|
||||||
|
except Exception:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def _append_runtime_issue(
    issues: list[str],
    warning: str,
    issue: str,
    detail: str = "",
) -> None:
    """Print a runtime warning and record the matching issue.

    Args:
        issues: List of issue strings, mutated in place.
        warning: Short text printed through ``check_warn``.
        issue: Text appended to *issues*.
        detail: Optional detail passed to ``check_warn`` with the warning.
    """
    check_warn(warning, detail)
    issues.append(issue)
|
||||||
|
|
||||||
|
|
||||||
|
def _platform_state_detail(platform_state: dict) -> str:
|
||||||
|
message = platform_state.get("error_message") or platform_state.get("error_code")
|
||||||
|
return f"({message})" if message else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _check_systemd_runtime_state(health, issues: list[str]) -> None:
|
||||||
|
# Lazy import avoids loading gateway restart machinery unless systemd state is rendered.
|
||||||
|
from gateway.restart import GATEWAY_SERVICE_RESTART_EXIT_CODE
|
||||||
|
|
||||||
|
props = health.systemd_unit or {}
|
||||||
|
active_state = props.get("ActiveState")
|
||||||
|
sub_state = props.get("SubState")
|
||||||
|
result = props.get("Result")
|
||||||
|
exec_status = props.get("ExecMainStatus")
|
||||||
|
|
||||||
|
if active_state == "activating" and sub_state == "auto-restart":
|
||||||
|
_append_runtime_issue(
|
||||||
|
issues,
|
||||||
|
"Gateway service is auto-restarting",
|
||||||
|
"Gateway service is auto-restarting — inspect logs with 'hermes gateway status --deep'",
|
||||||
|
)
|
||||||
|
elif active_state == "failed" and exec_status == str(GATEWAY_SERVICE_RESTART_EXIT_CODE):
|
||||||
|
_append_runtime_issue(
|
||||||
|
issues,
|
||||||
|
"Gateway service failed during planned restart",
|
||||||
|
"Gateway service is stuck after a planned restart — run 'hermes gateway status --deep'",
|
||||||
|
f"(ExecMainStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE})",
|
||||||
|
)
|
||||||
|
elif active_state == "failed":
|
||||||
|
detail = f"(Result={result})" if result else ""
|
||||||
|
_append_runtime_issue(
|
||||||
|
issues,
|
||||||
|
"Gateway service failed",
|
||||||
|
"Gateway service failed — inspect logs with 'hermes gateway status --deep'",
|
||||||
|
detail,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _runtime_updated_detail(updated_at: str | None) -> str:
|
||||||
|
return f"(updated {updated_at})" if updated_at else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _paren_list(parts: list[str]) -> str:
|
||||||
|
return f"({'; '.join(parts)})" if parts else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _check_runtime_health(issues: list[str]) -> None:
    """Check live gateway and delivery-surface runtime health.

    Prints a "Runtime Health" section and appends one entry to *issues* for
    every problem found. The checks run in this order: gateway process
    liveness, installed-service state, process/service mismatch, systemd unit
    state, and (only while the gateway is running) the gateway runtime state
    and the per-platform connection states.

    Args:
        issues: List of issue strings, mutated in place.
    """
    print()
    print(color("◆ Runtime Health", Colors.CYAN, Colors.BOLD))

    try:
        # Lazy import keeps doctor usable if gateway helpers fail to import.
        from hermes_cli.gateway import _format_gateway_pids, get_gateway_runtime_health

        health = get_gateway_runtime_health()
    except Exception as e:
        check_warn("Runtime health unavailable", f"({e})")
        return

    snapshot = health.snapshot
    configured_platforms = health.configured_platforms
    active_cron_jobs = _count_active_cron_jobs()
    # The gateway only matters when something depends on it being up.
    gateway_needed = bool(configured_platforms or active_cron_jobs)

    if not gateway_needed and not snapshot.running and not snapshot.service_installed:
        check_info("No long-lived gateway-managed runtime configured")
        return

    # --- Gateway process liveness -------------------------------------------
    if snapshot.running:
        detail_parts = []
        if snapshot.gateway_pids:
            detail_parts.append(f"PID(s): {_format_gateway_pids(snapshot.gateway_pids, limit=None)}")
        if snapshot.manager:
            detail_parts.append(snapshot.manager)
        check_ok("Gateway process running", _paren_list(detail_parts))
    else:
        if configured_platforms:
            detail_parts = [f"configured: {', '.join(configured_platforms)}"]
            issue = "Start the gateway so configured platforms can receive messages"
            if health.gateway_state == "startup_failed" and health.exit_reason:
                detail_parts.append(f"last startup issue: {health.exit_reason}")
                issue = f"{issue}; last startup issue: {health.exit_reason}"
            _append_runtime_issue(
                issues,
                "Gateway is not running",
                issue,
                _paren_list(detail_parts),
            )
        if active_cron_jobs:
            _append_runtime_issue(
                issues,
                "Gateway is not running — scheduled jobs will not fire automatically",
                "Start the gateway so scheduled jobs can fire automatically",
                f"({active_cron_jobs} active job(s))",
            )

    # --- Installed service state --------------------------------------------
    # A process/service mismatch is reported separately below, so it is
    # excluded here to avoid a second "stopped" message for the same cause.
    if (
        snapshot.service_installed
        and not snapshot.service_running
        and not snapshot.has_process_service_mismatch
    ):
        if gateway_needed:
            _append_runtime_issue(
                issues,
                "Gateway service installed but stopped",
                "Start the installed gateway service with 'hermes gateway start'",
            )
        else:
            check_info(
                "Gateway service installed but stopped",
                "(no configured delivery surfaces or scheduled jobs)",
            )

    if snapshot.has_process_service_mismatch:
        pid_detail = _format_gateway_pids(snapshot.gateway_pids, limit=None)
        _append_runtime_issue(
            issues,
            "Gateway process is running but the installed service is not active",
            "Gateway process is not service-managed — stop the manual process or start the service",
            f"(PID(s): {pid_detail})",
        )

    _check_systemd_runtime_state(health, issues)

    # Everything below describes a live gateway; skip it when nothing runs.
    if not snapshot.running:
        return

    if not configured_platforms and not active_cron_jobs:
        check_info("No configured delivery surfaces or scheduled jobs to check")

    if active_cron_jobs:
        check_ok(
            "Scheduled jobs can fire automatically",
            f"({active_cron_jobs} active job(s))",
        )

    # --- Gateway runtime state ----------------------------------------------
    if not health.runtime_status_available:
        check_warn("Gateway runtime status unavailable", f"({_DHH}/gateway_state.json missing or unreadable)")
    else:
        gateway_state = health.gateway_state
        updated_detail = _runtime_updated_detail(health.updated_at)
        if gateway_state == "running":
            check_ok("Gateway runtime state running", updated_detail)
        elif gateway_state == "draining":
            check_info("Gateway runtime state draining", updated_detail)
        elif gateway_state == "startup_failed":
            reason = health.exit_reason or "unknown startup issue"
            detail_parts = [reason]
            if health.updated_at:
                detail_parts.append(f"updated {health.updated_at}")
            _append_runtime_issue(
                issues,
                "Gateway startup failed",
                f"Gateway startup failed: {reason}",
                _paren_list(detail_parts),
            )
        elif gateway_state:
            check_info(f"Gateway runtime state {gateway_state}", updated_detail)
        else:
            check_warn("Gateway runtime state unknown")

    # --- Per-platform connection state --------------------------------------
    # Without a readable status payload there is nothing to compare against,
    # so no per-platform issues are emitted in that case.
    if configured_platforms and health.runtime_status_available:
        for platform in configured_platforms:
            platform_state = health.platforms.get(platform)
            if not platform_state:
                _append_runtime_issue(
                    issues,
                    f"{platform} runtime health unknown",
                    f"{platform} is configured but missing from gateway runtime status",
                )
                continue

            state = platform_state.get("state")
            if state == "connected":
                check_ok(f"{platform} connected")
            elif state == "connecting":
                check_info(f"{platform} connecting")
            elif state in _PLATFORM_ALERT_STATES:
                _append_runtime_issue(
                    issues,
                    f"{platform} {state}",
                    f"{platform} runtime state is {state}",
                    _platform_state_detail(platform_state),
                )
            elif state:
                check_info(f"{platform} {state}")
            else:
                _append_runtime_issue(
                    issues,
                    f"{platform} runtime health unknown",
                    f"{platform} runtime state is missing",
                )
|
||||||
|
|
||||||
|
|
||||||
def run_doctor(args):
|
def run_doctor(args):
|
||||||
"""Run diagnostic checks."""
|
"""Run diagnostic checks."""
|
||||||
should_fix = getattr(args, 'fix', False)
|
should_fix = getattr(args, 'fix', False)
|
||||||
|
|
@ -631,6 +850,7 @@ def run_doctor(args):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
_check_gateway_service_linger(issues)
|
_check_gateway_service_linger(issues)
|
||||||
|
_check_runtime_health(issues)
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Check: Command installation (hermes bin symlink)
|
# Check: Command installation (hermes bin symlink)
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
|
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
|
||||||
|
|
||||||
|
|
@ -59,6 +60,19 @@ class GatewayRuntimeSnapshot:
|
||||||
def has_process_service_mismatch(self) -> bool:
|
def has_process_service_mismatch(self) -> bool:
|
||||||
return self.service_installed and self.running and not self.service_running
|
return self.service_installed and self.running and not self.service_running
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class GatewayRuntimeHealth:
    """Normalized gateway runtime health, as built by ``get_gateway_runtime_health``."""

    # Process/service snapshot of the gateway.
    snapshot: GatewayRuntimeSnapshot
    # Slugs of the platforms reported as connected by the gateway config.
    configured_platforms: tuple[str, ...]
    # True when the runtime status payload was read and is a dict.
    runtime_status_available: bool
    # "gateway_state" from the runtime status; None when unavailable.
    gateway_state: str | None
    # "exit_reason" from the runtime status; None when unavailable.
    exit_reason: str | None
    # Per-platform status payloads, limited to configured platforms.
    platforms: dict[str, dict[str, Any]]
    # "updated_at" from the runtime status; None when unavailable.
    updated_at: str | None
    # systemd unit properties; empty when not applicable.
    systemd_unit: dict[str, str]
|
||||||
|
|
||||||
|
|
||||||
def _get_service_pids() -> set:
|
def _get_service_pids() -> set:
|
||||||
"""Return PIDs currently managed by systemd or launchd gateway services.
|
"""Return PIDs currently managed by systemd or launchd gateway services.
|
||||||
|
|
||||||
|
|
@ -586,6 +600,65 @@ def get_gateway_runtime_snapshot(system: bool = False) -> GatewayRuntimeSnapshot
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_gateway_runtime_health(system: bool = False) -> GatewayRuntimeHealth:
    """Return normalized gateway runtime health for diagnostics surfaces.

    Combines the process/service snapshot, the configured platforms, the
    runtime status payload and (when applicable) the systemd unit properties.
    Loading the gateway config and reading the runtime status are both
    best-effort: any failure yields "no configured platforms" or "status
    unavailable" respectively instead of raising.

    Args:
        system: Forwarded to the snapshot and systemd property readers.

    Returns:
        A ``GatewayRuntimeHealth`` whose ``platforms`` only contains
        configured platforms with dict payloads, and is empty when the
        gateway is not running.
    """
    snapshot = get_gateway_runtime_snapshot(system=system)

    try:
        from gateway.config import load_gateway_config

        configured_platforms = tuple(
            platform.value for platform in load_gateway_config().get_connected_platforms()
        )
    except Exception:
        configured_platforms = ()

    try:
        from gateway.status import read_runtime_status

        runtime_status = read_runtime_status()
    except Exception:
        runtime_status = None

    # Anything other than a dict is treated as "status unavailable".
    runtime_status_available = isinstance(runtime_status, dict)
    status = runtime_status if runtime_status_available else {}
    gateway_state = status.get("gateway_state")
    exit_reason = status.get("exit_reason")
    updated_at = status.get("updated_at")
    raw_platforms = status.get("platforms", {})
    if not isinstance(raw_platforms, dict):
        raw_platforms = {}

    platforms: dict[str, dict[str, Any]] = {}
    if snapshot.running:
        configured_set = set(configured_platforms)
        platforms = {
            str(platform): dict(payload)
            for platform, payload in raw_platforms.items()
            if str(platform) in configured_set and isinstance(payload, dict)
        }
    elif gateway_state == "running":
        # The status payload is stale: it still says "running" although no
        # gateway process was found, so platform entries are dropped and the
        # state is reported as stopped.
        gateway_state = "stopped"

    systemd_unit = (
        _read_systemd_unit_properties(system=system)
        if supports_systemd_services() and snapshot.service_installed
        else {}
    )

    return GatewayRuntimeHealth(
        snapshot=snapshot,
        configured_platforms=configured_platforms,
        runtime_status_available=runtime_status_available,
        gateway_state=gateway_state,
        exit_reason=exit_reason,
        platforms=platforms,
        updated_at=updated_at,
        systemd_unit=systemd_unit,
    )
|
||||||
|
|
||||||
|
|
||||||
def _format_gateway_pids(pids: tuple[int, ...] | list[int], *, limit: int | None = 3) -> str:
|
def _format_gateway_pids(pids: tuple[int, ...] | list[int], *, limit: int | None = 3) -> str:
|
||||||
rendered = [str(pid) for pid in pids[:limit] if pid > 0] if limit is not None else [str(pid) for pid in pids if pid > 0]
|
rendered = [str(pid) for pid in pids[:limit] if pid > 0] if limit is not None else [str(pid) for pid in pids if pid > 0]
|
||||||
if limit is not None and len(pids) > limit:
|
if limit is not None and len(pids) > limit:
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,9 @@ import hermes_cli.doctor as doctor
|
||||||
import hermes_cli.gateway as gateway_cli
|
import hermes_cli.gateway as gateway_cli
|
||||||
from hermes_cli import doctor as doctor_mod
|
from hermes_cli import doctor as doctor_mod
|
||||||
from hermes_cli.doctor import _has_provider_env_config
|
from hermes_cli.doctor import _has_provider_env_config
|
||||||
|
from hermes_cli.gateway import GatewayRuntimeHealth, GatewayRuntimeSnapshot
|
||||||
|
|
||||||
|
# Default ``updated_at`` timestamp used by the ``_gateway_health`` test helper.
_DEFAULT_RUNTIME_UPDATED_AT = "2026-04-23T00:00:00+00:00"
|
||||||
|
|
||||||
|
|
||||||
class TestDoctorPlatformHints:
|
class TestDoctorPlatformHints:
|
||||||
|
|
@ -161,6 +164,355 @@ def test_check_gateway_service_linger_skips_when_service_not_installed(monkeypat
|
||||||
assert issues == []
|
assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def _gateway_health(
    *,
    snapshot=None,
    configured=(),
    runtime_status_available=True,
    gateway_state="running",
    exit_reason=None,
    platforms=None,
    systemd_unit=None,
    updated_at=_DEFAULT_RUNTIME_UPDATED_AT,
):
    """Build a ``GatewayRuntimeHealth`` for tests with convenient defaults.

    Without an explicit *snapshot* a manual-process snapshot with PID 1234 is
    used. When the runtime status is marked unavailable and *updated_at* was
    left at its default, the timestamp is cleared as well.
    """
    if not runtime_status_available and updated_at == _DEFAULT_RUNTIME_UPDATED_AT:
        updated_at = None
    default_snapshot = GatewayRuntimeSnapshot(
        manager="manual process",
        gateway_pids=(1234,),
    )
    return GatewayRuntimeHealth(
        snapshot=snapshot or default_snapshot,
        configured_platforms=tuple(configured),
        runtime_status_available=runtime_status_available,
        gateway_state=gateway_state,
        exit_reason=exit_reason,
        platforms=platforms or {},
        updated_at=updated_at,
        systemd_unit=systemd_unit or {},
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _run_runtime_check(monkeypatch, capsys, health, *, active_cron_jobs=0):
    """Run ``doctor._check_runtime_health`` against a canned *health* object.

    Returns:
        A ``(captured_stdout, issues)`` tuple.
    """
    monkeypatch.setattr(gateway_cli, "get_gateway_runtime_health", lambda: health)
    monkeypatch.setattr(doctor, "_count_active_cron_jobs", lambda: active_cron_jobs)
    issues = []
    doctor._check_runtime_health(issues)
    return capsys.readouterr().out, issues
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_no_gateway_configured_is_info_only(monkeypatch, capsys):
    """No platforms, jobs, process or service: one info line and no issues."""
    health = _gateway_health(
        snapshot=GatewayRuntimeSnapshot(manager="manual process"),
        configured=(),
        runtime_status_available=False,
        gateway_state=None,
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "No long-lived gateway-managed runtime configured" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_gateway_not_running_adds_one_liveness_issue(monkeypatch, capsys):
    """A stopped gateway with a configured platform yields exactly one issue."""
    health = _gateway_health(
        snapshot=GatewayRuntimeSnapshot(manager="manual process"),
        configured=("telegram",),
        gateway_state="stopped",
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway is not running" in out
    assert len(issues) == 1
    assert issues[0] == "Start the gateway so configured platforms can receive messages"
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_gateway_not_running_includes_startup_failure(monkeypatch, capsys):
    """The last startup failure reason is attached to the liveness issue."""
    health = _gateway_health(
        snapshot=GatewayRuntimeSnapshot(manager="manual process"),
        configured=("telegram",),
        gateway_state="startup_failed",
        exit_reason="telegram conflict",
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "last startup issue: telegram conflict" in out
    assert issues == [
        "Start the gateway so configured platforms can receive messages; last startup issue: telegram conflict"
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_missing_status_file_does_not_emit_platform_issues(monkeypatch, capsys):
    """An unavailable runtime status warns once and skips per-platform issues."""
    health = _gateway_health(
        configured=("telegram", "discord", "slack"),
        runtime_status_available=False,
        gateway_state=None,
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway runtime status unavailable" in out
    assert "runtime health unknown" not in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_startup_failed_adds_issue(monkeypatch, capsys):
    """A running gateway whose runtime state is startup_failed records an issue."""
    health = _gateway_health(
        configured=("telegram",),
        gateway_state="startup_failed",
        exit_reason="telegram conflict",
        platforms={"telegram": {"state": "connected"}},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway startup failed" in out
    assert "telegram conflict" in out
    assert issues == ["Gateway startup failed: telegram conflict"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_platform_retrying_adds_issue(monkeypatch, capsys):
    """A platform in the retrying state is reported with its error message."""
    telegram_state = {
        "state": "retrying",
        "error_message": "another poller is active",
    }
    health = _gateway_health(
        configured=("telegram",),
        platforms={"telegram": telegram_state},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "telegram retrying" in out
    assert "another poller is active" in out
    assert issues == ["telegram runtime state is retrying"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_unknown_non_alert_platform_state_is_info_only(monkeypatch, capsys):
    """An unrecognised, non-alert platform state is printed without an issue."""
    health = _gateway_health(
        configured=("telegram",),
        platforms={"telegram": {"state": "idle"}},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "telegram idle" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_missing_configured_platform_entry_adds_issue(monkeypatch, capsys):
    """A configured platform absent from the runtime status is an issue."""
    health = _gateway_health(configured=("telegram",), platforms={})

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "telegram runtime health unknown" in out
    assert issues == ["telegram is configured but missing from gateway runtime status"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_transient_states_are_info_only(monkeypatch, capsys):
    """Draining gateway and connecting platform are informational only."""
    health = _gateway_health(
        configured=("telegram",),
        gateway_state="draining",
        platforms={"telegram": {"state": "connecting"}},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway runtime state draining" in out
    assert "telegram connecting" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_cron_jobs_without_gateway_adds_issue(monkeypatch, capsys):
    """Active cron jobs with no running gateway record a scheduling issue."""
    health = _gateway_health(
        snapshot=GatewayRuntimeSnapshot(manager="manual process"),
        configured=(),
        runtime_status_available=False,
        gateway_state=None,
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health, active_cron_jobs=2)

    assert "scheduled jobs will not fire automatically" in out
    assert issues == ["Start the gateway so scheduled jobs can fire automatically"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_cron_jobs_with_gateway_are_ok(monkeypatch, capsys):
    """Active cron jobs with a running gateway are reported as OK."""
    health = _gateway_health(configured=(), platforms={})

    out, issues = _run_runtime_check(monkeypatch, capsys, health, active_cron_jobs=2)

    assert "Scheduled jobs can fire automatically" in out
    assert "scheduled jobs will not fire automatically" not in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_renders_updated_at_for_running_state(monkeypatch, capsys):
    """The runtime status timestamp is shown next to the running state."""
    health = _gateway_health(
        configured=("telegram",),
        platforms={"telegram": {"state": "connected"}},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "updated 2026-04-23T00:00:00+00:00" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_running_state_without_updated_at_has_no_empty_detail(monkeypatch, capsys):
    """Without a timestamp, no "updated" detail is rendered at all."""
    health = _gateway_health(
        configured=("telegram",),
        platforms={"telegram": {"state": "connected"}},
        updated_at=None,
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway runtime state running" in out
    assert "updated " not in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_running_gateway_with_no_surfaces_is_info_only(monkeypatch, capsys):
    """A running gateway with nothing configured produces info lines only."""
    health = _gateway_health(configured=(), platforms={})

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway process running" in out
    assert "No configured delivery surfaces or scheduled jobs to check" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_unknown_runtime_state_is_warn_only(monkeypatch, capsys):
    """An available status without a gateway state warns but adds no issue."""
    health = _gateway_health(
        configured=(),
        runtime_status_available=True,
        gateway_state=None,
        platforms={},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway runtime state unknown" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_stopped_service_without_consumers_is_info_only(monkeypatch, capsys):
    """A stopped service that nothing depends on is informational only."""
    stopped_service = GatewayRuntimeSnapshot(
        manager="systemd (user)",
        service_installed=True,
        service_running=False,
    )
    health = _gateway_health(
        snapshot=stopped_service,
        configured=(),
        runtime_status_available=False,
        gateway_state=None,
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway service installed but stopped" in out
    assert issues == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_stopped_service_with_configured_platform_adds_issue(monkeypatch, capsys):
    """A stopped service with a configured platform asks to start the service."""
    stopped_service = GatewayRuntimeSnapshot(
        manager="systemd (user)",
        service_installed=True,
        service_running=False,
    )
    health = _gateway_health(
        snapshot=stopped_service,
        configured=("telegram",),
        gateway_state="stopped",
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway service installed but stopped" in out
    assert "Start the installed gateway service with 'hermes gateway start'" in issues
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_service_process_mismatch_adds_issue(monkeypatch, capsys):
    """A gateway process beside an inactive installed service is an issue."""
    mismatched = GatewayRuntimeSnapshot(
        manager="systemd (user)",
        service_installed=True,
        service_running=False,
        gateway_pids=(1234,),
    )
    health = _gateway_health(
        snapshot=mismatched,
        configured=(),
        platforms={},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "installed service is not active" in out
    assert issues == [
        "Gateway process is not service-managed — stop the manual process or start the service"
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_health_service_process_mismatch_suppresses_stopped_service_issue(monkeypatch, capsys):
    """The mismatch issue replaces, rather than joins, the stopped-service issue."""
    mismatched = GatewayRuntimeSnapshot(
        manager="systemd (user)",
        service_installed=True,
        service_running=False,
        gateway_pids=(1234,),
    )
    health = _gateway_health(
        snapshot=mismatched,
        configured=("telegram",),
        platforms={"telegram": {"state": "connected"}},
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert "Gateway process is running but the installed service is not active" in out
    assert "Gateway service installed but stopped" not in out
    assert issues == [
        "Gateway process is not service-managed — stop the manual process or start the service"
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("systemd_unit", "expected"),
    [
        (
            {"ActiveState": "activating", "SubState": "auto-restart"},
            "Gateway service is auto-restarting",
        ),
        (
            {"ActiveState": "failed", "Result": "exit-code", "ExecMainStatus": "1"},
            "Gateway service failed",
        ),
    ],
)
def test_runtime_health_systemd_failure_states_add_issue(
    monkeypatch,
    capsys,
    systemd_unit,
    expected,
):
    """Auto-restarting and failed systemd units each record exactly one issue."""
    stopped_service = GatewayRuntimeSnapshot(
        manager="systemd (user)",
        service_installed=True,
        service_running=False,
    )
    health = _gateway_health(
        snapshot=stopped_service,
        configured=(),
        gateway_state="stopped",
        systemd_unit=systemd_unit,
    )

    out, issues = _run_runtime_check(monkeypatch, capsys, health)

    assert expected in out
    assert len(issues) == 1
    assert expected in issues[0]
|
||||||
|
|
||||||
|
|
||||||
# ── Memory provider section (doctor should only check the *active* provider) ──
|
# ── Memory provider section (doctor should only check the *active* provider) ──
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,139 @@
|
||||||
from hermes_cli.gateway import _runtime_health_lines
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import hermes_cli.gateway as gateway_cli
|
||||||
|
from hermes_cli.gateway import (
|
||||||
|
GatewayRuntimeSnapshot,
|
||||||
|
get_gateway_runtime_health,
|
||||||
|
_runtime_health_lines,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _platform(slug: str):
|
||||||
|
return SimpleNamespace(value=slug)
|
||||||
|
|
||||||
|
|
||||||
|
def _config(*platforms: str):
    """Return a stand-in gateway config whose connected platforms are *platforms*."""

    def get_connected_platforms():
        return [_platform(slug) for slug in platforms]

    return SimpleNamespace(get_connected_platforms=get_connected_platforms)
|
||||||
|
|
||||||
|
|
||||||
|
def _snapshot(*, running: bool = True, service_installed: bool = False, service_running: bool = False):
    """Build a manual-process ``GatewayRuntimeSnapshot`` for tests.

    PID 1234 is attached only when the gateway should count as running
    without the service itself running; otherwise no PIDs are set.
    """
    gateway_pids = (1234,) if running and not service_running else ()
    return GatewayRuntimeSnapshot(
        manager="manual process",
        service_installed=service_installed,
        service_running=service_running,
        gateway_pids=gateway_pids,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_runtime_health_deps(monkeypatch, *, snapshot=None, config=None, status=None):
    """Stub everything ``get_gateway_runtime_health`` reads.

    Replaces the snapshot getter, the gateway config loader and the runtime
    status reader with canned values, and disables systemd support. A falsy
    *snapshot* or *config* falls back to ``_snapshot()`` / ``_config()``;
    *status* is returned as given.
    """
    monkeypatch.setattr(
        gateway_cli,
        "get_gateway_runtime_snapshot",
        lambda system=False: snapshot or _snapshot(),
    )
    monkeypatch.setattr(
        "gateway.config.load_gateway_config",
        lambda: config or _config(),
    )
    monkeypatch.setattr("gateway.status.read_runtime_status", lambda: status)
    monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: False)
|
||||||
|
|
||||||
|
|
||||||
|
def test_gateway_runtime_health_handles_missing_status(monkeypatch):
    """A missing runtime status (``None``) is reported as unavailable."""
    _patch_runtime_health_deps(
        monkeypatch,
        snapshot=_snapshot(running=True),
        config=_config("telegram"),
        status=None,
    )

    health = get_gateway_runtime_health()

    assert health.runtime_status_available is False
    assert health.gateway_state is None
    assert health.platforms == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_gateway_runtime_health_treats_unparseable_status_as_unavailable(monkeypatch):
    """A status payload that is not a dict is treated as unavailable.

    Uses a non-dict payload (rather than ``None``) so this test covers the
    type guard instead of repeating the missing-status case.
    """
    _patch_runtime_health_deps(
        monkeypatch,
        snapshot=_snapshot(running=True),
        config=_config("telegram"),
        status=["not", "a", "status", "mapping"],
    )

    health = get_gateway_runtime_health()

    assert health.runtime_status_available is False
    assert health.gateway_state is None
    assert health.platforms == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_gateway_runtime_health_filters_configured_platforms(monkeypatch):
    """Platforms in the status payload that are not configured are dropped."""
    status = {
        "gateway_state": "running",
        "updated_at": "2026-04-23T00:00:00+00:00",
        "platforms": {
            "telegram": {"state": "connected"},
            "discord": {"state": "connecting"},
            "slack": {"state": "connected"},
        },
    }
    _patch_runtime_health_deps(
        monkeypatch,
        snapshot=_snapshot(running=True),
        config=_config("telegram", "discord"),
        status=status,
    )

    health = get_gateway_runtime_health()

    assert health.runtime_status_available is True
    assert health.gateway_state == "running"
    assert health.updated_at == "2026-04-23T00:00:00+00:00"
    assert set(health.platforms) == {"telegram", "discord"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_gateway_runtime_health_drops_stale_platforms_when_not_running(monkeypatch):
    """A stale "running" status for a stopped gateway is normalized to stopped."""
    stale_status = {
        "gateway_state": "running",
        "platforms": {"telegram": {"state": "connected"}},
    }
    _patch_runtime_health_deps(
        monkeypatch,
        snapshot=_snapshot(running=False),
        config=_config("telegram"),
        status=stale_status,
    )

    health = get_gateway_runtime_health()

    assert health.gateway_state == "stopped"
    assert health.platforms == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_gateway_runtime_health_exposes_systemd_properties(monkeypatch):
    """systemd unit properties are passed through for an installed service."""
    unit_properties = {
        "ActiveState": "activating",
        "SubState": "auto-restart",
    }
    _patch_runtime_health_deps(
        monkeypatch,
        snapshot=_snapshot(running=False, service_installed=True),
        config=_config("telegram"),
        status={"gateway_state": "stopped", "platforms": {}},
    )
    # Override the helper's default (systemd disabled) for this test.
    monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
    monkeypatch.setattr(
        gateway_cli,
        "_read_systemd_unit_properties",
        lambda system=False: dict(unit_properties),
    )

    health = get_gateway_runtime_health()

    assert health.systemd_unit == {
        "ActiveState": "activating",
        "SubState": "auto-restart",
    }
|
||||||
|
|
||||||
|
|
||||||
def test_runtime_health_lines_include_fatal_platform_and_startup_reason(monkeypatch):
|
def test_runtime_health_lines_include_fatal_platform_and_startup_reason(monkeypatch):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue