feat(dashboard): add HTTP health probe for cross-container gateway detection

The dashboard's gateway status detection relied solely on local PID checks
(os.kill + /proc), which fails when the gateway runs in a separate container.

Changes:
- web_server.py: Add _probe_gateway_health() that queries the gateway's HTTP
  /health/detailed endpoint when the local PID check fails. Activated by
  setting the GATEWAY_HEALTH_URL env var (e.g. http://gateway:8642/health).
  Falls back to standard PID check when the env var is not set.
- api_server.py: Add GET /health/detailed endpoint that returns full gateway
  state (platforms, gateway_state, active_agents, pid, etc.) without auth.
  The existing GET /health remains unchanged for backwards compatibility.
- StatusPage.tsx: Handle the case where gateway_pid is null but the gateway
  is running remotely, displaying 'Running (remote)' instead of 'PID null'.

Environment variables:
- GATEWAY_HEALTH_URL: URL of the gateway health endpoint (e.g.
  http://gateway-container:8642/health). Unset = local PID check only.
- GATEWAY_HEALTH_TIMEOUT: Probe timeout in seconds (default: 3).
This commit is contained in:
Hermes Agent 2026-04-14 05:17:17 +00:00 committed by Teknium
parent 397386cae2
commit 45595f4805
6 changed files with 78 additions and 1 deletions

View file

@ -13,6 +13,7 @@ import asyncio
import hmac
import json
import logging
import os
import secrets
import sys
import threading
@ -319,12 +320,56 @@ class EnvVarReveal(BaseModel):
key: str
_GATEWAY_HEALTH_URL = os.getenv("GATEWAY_HEALTH_URL")
_GATEWAY_HEALTH_TIMEOUT = float(os.getenv("GATEWAY_HEALTH_TIMEOUT", "3"))
def _probe_gateway_health() -> tuple[bool, dict | None]:
"""Probe the gateway via its HTTP health endpoint (cross-container).
Uses ``/health/detailed`` first (returns full state), falling back to
the simpler ``/health`` endpoint. Returns ``(is_alive, body_dict)``.
This is a **blocking** call run via ``run_in_executor`` from async code.
"""
if not _GATEWAY_HEALTH_URL:
return False, None
base = _GATEWAY_HEALTH_URL.rstrip("/")
for path in (f"{base}/detailed", base):
try:
req = urllib.request.Request(path, method="GET")
with urllib.request.urlopen(req, timeout=_GATEWAY_HEALTH_TIMEOUT) as resp:
if resp.status == 200:
body = json.loads(resp.read())
return True, body
except Exception:
continue
return False, None
@app.get("/api/status")
async def get_status():
current_ver, latest_ver = check_config_version()
# --- Gateway liveness detection ---
# Try local PID check first (same-host). If that fails and a remote
# GATEWAY_HEALTH_URL is configured, probe the gateway over HTTP so the
# dashboard works when the gateway runs in a separate container.
gateway_pid = get_running_pid()
gateway_running = gateway_pid is not None
remote_health_body: dict | None = None
if not gateway_running and _GATEWAY_HEALTH_URL:
loop = asyncio.get_event_loop()
alive, remote_health_body = await loop.run_in_executor(
None, _probe_gateway_health
)
if alive:
gateway_running = True
# PID from the remote container (display only — not locally valid)
if remote_health_body:
gateway_pid = remote_health_body.get("pid")
gateway_state = None
gateway_platforms: dict = {}
@ -341,7 +386,12 @@ async def get_status():
except Exception:
configured_gateway_platforms = None
# Prefer the detailed health endpoint response (has full state) when the
# local runtime status file is absent or stale (cross-container).
runtime = read_runtime_status()
if runtime is None and remote_health_body and remote_health_body.get("gateway_state"):
runtime = remote_health_body
if runtime:
gateway_state = runtime.get("gateway_state")
gateway_platforms = runtime.get("platforms") or {}