NOTE(review): line breaks and indentation in this patch were reconstructed from
a whitespace-collapsed copy; confirm context-line indentation against the tree
before applying.  The "index" blob hashes are carried over from the original
patch and do not reflect the revised added lines.

diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py
index 9a4990465..2077c9c85 100644
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@@ -10,6 +10,7 @@ Exposes an HTTP server with endpoints:
 - POST /v1/runs — start a run, returns run_id immediately (202)
 - GET /v1/runs/{run_id}/events — SSE stream of structured lifecycle events
 - GET /health — health check
+- GET /health/detailed — rich status for cross-container dashboard probing
 
 Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat,
 AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent
@@ -565,6 +566,32 @@ class APIServerAdapter(BasePlatformAdapter):
         """GET /health — simple health check."""
         return web.json_response({"status": "ok", "platform": "hermes-agent"})
 
+    async def _handle_health_detailed(self, request: "web.Request") -> "web.Response":
+        """GET /health/detailed — rich status for cross-container dashboard probing.
+
+        Returns the gateway state, connected platforms, active agent count,
+        last exit reason and ``updated_at`` value from the runtime status
+        record, plus this process's PID, so the dashboard can display full
+        status without needing a shared PID file or /proc access.  Fields
+        missing from the record fall back to ``None`` / ``{}`` / ``0``.
+        No authentication required.
+        """
+        from gateway.status import read_runtime_status
+
+        # A falsy result (no status available) is treated as an empty record.
+        runtime = read_runtime_status() or {}
+        return web.json_response({
+            "status": "ok",
+            "platform": "hermes-agent",
+            "gateway_state": runtime.get("gateway_state"),
+            # ``or {}`` also covers an explicit null value in the record.
+            "platforms": runtime.get("platforms") or {},
+            "active_agents": runtime.get("active_agents", 0),
+            "exit_reason": runtime.get("exit_reason"),
+            "updated_at": runtime.get("updated_at"),
+            "pid": os.getpid(),
+        })
+
     async def _handle_models(self, request: "web.Request") -> "web.Response":
         """GET /v1/models — return hermes-agent as an available model."""
         auth_err = self._check_auth(request)
@@ -1783,6 +1810,7 @@ class APIServerAdapter(BasePlatformAdapter):
             self._app = web.Application(middlewares=mws)
             self._app["api_server_adapter"] = self
             self._app.router.add_get("/health", self._handle_health)
+            self._app.router.add_get("/health/detailed", self._handle_health_detailed)
             self._app.router.add_get("/v1/health", self._handle_health)
             self._app.router.add_get("/v1/models", self._handle_models)
             self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py
index f73104ce8..be6047533 100644
--- a/hermes_cli/web_server.py
+++ b/hermes_cli/web_server.py
@@ -12,6 +12,7 @@ Usage:
 import asyncio
 import json
 import logging
+import os
 import secrets
 import sys
 import threading
@@ -280,12 +281,96 @@ class EnvVarReveal(BaseModel):
     key: str
 
 
+_GATEWAY_HEALTH_URL = os.getenv("GATEWAY_HEALTH_URL")
+_DEFAULT_GATEWAY_HEALTH_TIMEOUT = 3.0
+
+
+def _read_gateway_health_timeout() -> float:
+    """Return the probe timeout in seconds from ``GATEWAY_HEALTH_TIMEOUT``.
+
+    Falls back to the default when the variable is unset, empty, not a
+    number, or not a positive finite value, so a bad value in the
+    environment cannot crash the dashboard at import time.
+    """
+    raw = os.getenv("GATEWAY_HEALTH_TIMEOUT", "")
+    try:
+        timeout = float(raw)
+    except ValueError:
+        return _DEFAULT_GATEWAY_HEALTH_TIMEOUT
+    # Rejects nan, inf, zero and negatives: not usable as a socket timeout.
+    if not 0 < timeout < float("inf"):
+        return _DEFAULT_GATEWAY_HEALTH_TIMEOUT
+    return timeout
+
+
+_GATEWAY_HEALTH_TIMEOUT = _read_gateway_health_timeout()
+
+
+def _probe_gateway_health() -> tuple[bool, dict | None]:
+    """Probe the gateway via its HTTP health endpoint (cross-container).
+
+    ``GATEWAY_HEALTH_URL`` is expected to be the gateway's ``/health`` URL.
+    Uses ``/health/detailed`` first (returns full state), falling back to
+    the simpler ``/health`` endpoint.
+
+    Returns ``(is_alive, body_dict)``.  ``body_dict`` is ``None`` when the
+    probe fails or the response body is not a JSON object.
+
+    This is a **blocking** call — run via ``run_in_executor`` from async code.
+    """
+    # NOTE(review): ``urllib.request`` is not among the imports visible in
+    # this patch, so it is imported here to keep the probe self-contained.
+    import urllib.request
+
+    if not _GATEWAY_HEALTH_URL:
+        return False, None
+
+    base = _GATEWAY_HEALTH_URL.rstrip("/")
+    for url in (f"{base}/detailed", base):
+        try:
+            req = urllib.request.Request(url, method="GET")
+            with urllib.request.urlopen(req, timeout=_GATEWAY_HEALTH_TIMEOUT) as resp:
+                if resp.status != 200:
+                    continue
+                body = json.loads(resp.read())
+        except Exception:
+            # Best-effort: any failure (unreachable, timeout, bad JSON) just
+            # moves on to the next endpoint, but is logged for debugging.
+            logging.getLogger(__name__).debug(
+                "Gateway health probe failed for %s", url, exc_info=True
+            )
+            continue
+        # get_status() calls ``.get()`` on the body, so never return a non-dict.
+        if not isinstance(body, dict):
+            body = None
+        return True, body
+    return False, None
+
+
 @app.get("/api/status")
 async def get_status():
     current_ver, latest_ver = check_config_version()
 
+    # --- Gateway liveness detection ---
+    # Try local PID check first (same-host).  If that fails and a remote
+    # GATEWAY_HEALTH_URL is configured, probe the gateway over HTTP so the
+    # dashboard works when the gateway runs in a separate container.
     gateway_pid = get_running_pid()
     gateway_running = gateway_pid is not None
+    remote_health_body: dict | None = None
+
+    if not gateway_running and _GATEWAY_HEALTH_URL:
+        # get_running_loop() is the documented way to reach the loop from
+        # inside a coroutine; the probe blocks, so it runs in the executor.
+        loop = asyncio.get_running_loop()
+        alive, remote_health_body = await loop.run_in_executor(
+            None, _probe_gateway_health
+        )
+        if alive:
+            gateway_running = True
+            # PID from the remote container (display only — not locally valid)
+            if remote_health_body:
+                gateway_pid = remote_health_body.get("pid")
 
     gateway_state = None
     gateway_platforms: dict = {}
@@ -302,7 +387,13 @@ async def get_status():
     except Exception:
         configured_gateway_platforms = None
 
+    # Prefer the detailed health endpoint response (has full state) over the
+    # local runtime status file: the remote probe only runs after the local
+    # PID check failed, so a local file may be absent or stale (cross-container).
     runtime = read_runtime_status()
+    if remote_health_body and remote_health_body.get("gateway_state"):
+        runtime = remote_health_body
+
     if runtime:
         gateway_state = runtime.get("gateway_state")
         gateway_platforms = runtime.get("platforms") or {}
diff --git a/web/src/pages/StatusPage.tsx b/web/src/pages/StatusPage.tsx
index 4e22239eb..b4a5e362a 100644
--- a/web/src/pages/StatusPage.tsx
+++ b/web/src/pages/StatusPage.tsx
@@ -29,7 +29,8 @@ const GATEWAY_STATE_DISPLAY: Record