feat(dashboard): add HTTP health probe for cross-container gateway detection

The dashboard's gateway status detection relied solely on local PID checks (os.kill + /proc), which fails when the gateway runs in a separate container. Changes: - web_server.py: Add _probe_gateway_health() that queries the gateway's HTTP /health/detailed endpoint when the local PID check fails. Activated by setting the GATEWAY_HEALTH_URL env var (e.g. http://gateway:8642/health). Falls back to standard PID check when the env var is not set. - api_server.py: Add GET /health/detailed endpoint that returns full gateway state (platforms, gateway_state, active_agents, pid, etc.) without auth. The existing GET /health remains unchanged for backwards compatibility. - StatusPage.tsx: Handle the case where gateway_pid is null but the gateway is running remotely, displaying 'Running (remote)' instead of 'PID null'. Environment variables: - GATEWAY_HEALTH_URL: URL of the gateway health endpoint (e.g. http://gateway-container:8642/health). Unset = local PID check only. - GATEWAY_HEALTH_TIMEOUT: Probe timeout in seconds (default: 3).
2026-04-25 00:51:20 +00:00 · 2026-04-14 05:17:17 +00:00 · 2026-04-14 05:17:17 +00:00 · 45595f4805
commit 45595f4805
parent 397386cae2
6 changed files with 78 additions and 1 deletions
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@ -10,6 +10,7 @@ Exposes an HTTP server with endpoints:
 - POST /v1/runs                    — start a run, returns run_id immediately (202)
 - GET  /v1/runs/{run_id}/events    — SSE stream of structured lifecycle events
 - GET  /health                     — health check
+- GET  /health/detailed            — rich status for cross-container dashboard probing

 Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat,
 AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent
@ -565,6 +566,27 @@ class APIServerAdapter(BasePlatformAdapter):
        """GET /health — simple health check."""
        return web.json_response({"status": "ok", "platform": "hermes-agent"})

+    async def _handle_health_detailed(self, request: "web.Request") -> "web.Response":
+        """GET /health/detailed — rich status for cross-container dashboard probing.
+
+        Returns gateway state, connected platforms, PID, and uptime so the
+        dashboard can display full status without needing a shared PID file or
+        /proc access.  No authentication required.
+        """
+        from gateway.status import read_runtime_status
+
+        runtime = read_runtime_status() or {}
+        return web.json_response({
+            "status": "ok",
+            "platform": "hermes-agent",
+            "gateway_state": runtime.get("gateway_state"),
+            "platforms": runtime.get("platforms", {}),
+            "active_agents": runtime.get("active_agents", 0),
+            "exit_reason": runtime.get("exit_reason"),
+            "updated_at": runtime.get("updated_at"),
+            "pid": os.getpid(),
+        })
+
    async def _handle_models(self, request: "web.Request") -> "web.Response":
        """GET /v1/models — return hermes-agent as an available model."""
        auth_err = self._check_auth(request)
@ -1783,6 +1805,7 @@ class APIServerAdapter(BasePlatformAdapter):
            self._app = web.Application(middlewares=mws)
            self._app["api_server_adapter"] = self
            self._app.router.add_get("/health", self._handle_health)
+            self._app.router.add_get("/health/detailed", self._handle_health_detailed)
            self._app.router.add_get("/v1/health", self._handle_health)
            self._app.router.add_get("/v1/models", self._handle_models)
            self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
--- a/hermes_cli/web_server.py
+++ b/hermes_cli/web_server.py
@ -13,6 +13,7 @@ import asyncio
 import hmac
 import json
 import logging
+import os
 import secrets
 import sys
 import threading
@ -319,12 +320,56 @@ class EnvVarReveal(BaseModel):
    key: str


+_GATEWAY_HEALTH_URL = os.getenv("GATEWAY_HEALTH_URL")
+_GATEWAY_HEALTH_TIMEOUT = float(os.getenv("GATEWAY_HEALTH_TIMEOUT", "3"))
+
+
+def _probe_gateway_health() -> tuple[bool, dict | None]:
+    """Probe the gateway via its HTTP health endpoint (cross-container).
+
+    Uses ``/health/detailed`` first (returns full state), falling back to
+    the simpler ``/health`` endpoint.  Returns ``(is_alive, body_dict)``.
+
+    This is a **blocking** call — run via ``run_in_executor`` from async code.
+    """
+    if not _GATEWAY_HEALTH_URL:
+        return False, None
+
+    base = _GATEWAY_HEALTH_URL.rstrip("/")
+    for path in (f"{base}/detailed", base):
+        try:
+            req = urllib.request.Request(path, method="GET")
+            with urllib.request.urlopen(req, timeout=_GATEWAY_HEALTH_TIMEOUT) as resp:
+                if resp.status == 200:
+                    body = json.loads(resp.read())
+                    return True, body
+        except Exception:
+            continue
+    return False, None
+
+
@app.get("/api/status")
 async def get_status():
    current_ver, latest_ver = check_config_version()

+    # --- Gateway liveness detection ---
+    # Try local PID check first (same-host).  If that fails and a remote
+    # GATEWAY_HEALTH_URL is configured, probe the gateway over HTTP so the
+    # dashboard works when the gateway runs in a separate container.
    gateway_pid = get_running_pid()
    gateway_running = gateway_pid is not None
+    remote_health_body: dict | None = None
+
+    if not gateway_running and _GATEWAY_HEALTH_URL:
+        loop = asyncio.get_event_loop()
+        alive, remote_health_body = await loop.run_in_executor(
+            None, _probe_gateway_health
+        )
+        if alive:
+            gateway_running = True
+            # PID from the remote container (display only — not locally valid)
+            if remote_health_body:
+                gateway_pid = remote_health_body.get("pid")

    gateway_state = None
    gateway_platforms: dict = {}
@ -341,7 +386,12 @@ async def get_status():
    except Exception:
        configured_gateway_platforms = None

+    # Prefer the detailed health endpoint response (has full state) when the
+    # local runtime status file is absent or stale (cross-container).
    runtime = read_runtime_status()
+    if runtime is None and remote_health_body and remote_health_body.get("gateway_state"):
+        runtime = remote_health_body
+
    if runtime:
        gateway_state = runtime.get("gateway_state")
        gateway_platforms = runtime.get("platforms") or {}
--- a/web/src/i18n/en.ts
+++ b/web/src/i18n/en.ts
@ -80,6 +80,7 @@ export const en: Translations = {
    notRunning: "Not running",
    startFailed: "Start failed",
    pid: "PID",
+    runningRemote: "Running (remote)",
    noneRunning: "None",
    gatewayFailedToStart: "Gateway failed to start",
    lastUpdate: "Last update",
--- a/web/src/i18n/types.ts
+++ b/web/src/i18n/types.ts
@ -83,6 +83,7 @@ export interface Translations {
    notRunning: string;
    startFailed: string;
    pid: string;
+    runningRemote: string;
    noneRunning: string;
    gatewayFailedToStart: string;
    lastUpdate: string;
--- a/web/src/i18n/zh.ts
+++ b/web/src/i18n/zh.ts
@ -80,6 +80,7 @@ export const zh: Translations = {
    notRunning: "未运行",
    startFailed: "启动失败",
    pid: "进程",
+    runningRemote: "运行中（远程）",
    noneRunning: "无",
    gatewayFailedToStart: "网关启动失败",
    lastUpdate: "最后更新",
--- a/web/src/pages/StatusPage.tsx
+++ b/web/src/pages/StatusPage.tsx
@ -53,7 +53,8 @@ export default function StatusPage() {
  };

  function gatewayValue(): string {
-    if (status!.gateway_running) return `${t.status.pid} ${status!.gateway_pid}`;
+    if (status!.gateway_running && status!.gateway_pid) return `${t.status.pid} ${status!.gateway_pid}`;
+    if (status!.gateway_running) return t.status.runningRemote;
    if (status!.gateway_state === "startup_failed") return t.status.startFailed;
    return t.status.notRunning;
  }