fix(gateway): fix discrepancies in gateway status

This commit is contained in:
Sara Reynolds 2026-04-16 13:16:02 -04:00 committed by Teknium
parent 511ed4dacc
commit 8ab1aa2efc
9 changed files with 329 additions and 166 deletions

View file

@ -43,41 +43,20 @@ def _redact(value: str) -> str:
def _gateway_status() -> str:
"""Return a short gateway status string."""
if sys.platform.startswith("linux"):
from hermes_constants import is_container
if is_container():
try:
from hermes_cli.gateway import find_gateway_pids
pids = find_gateway_pids()
if pids:
return f"running (docker, pid {pids[0]})"
return "stopped (docker)"
except Exception:
return "stopped (docker)"
try:
from hermes_cli.gateway import get_service_name
svc = get_service_name()
except Exception:
svc = "hermes-gateway"
try:
r = subprocess.run(
["systemctl", "--user", "is-active", svc],
capture_output=True, text=True, timeout=5,
)
return "running (systemd)" if r.stdout.strip() == "active" else "stopped"
except Exception:
return "unknown"
elif sys.platform == "darwin":
try:
from hermes_cli.gateway import get_launchd_label
r = subprocess.run(
["launchctl", "list", get_launchd_label()],
capture_output=True, text=True, timeout=5,
)
return "loaded (launchd)" if r.returncode == 0 else "not loaded"
except Exception:
return "unknown"
return "N/A"
try:
from hermes_cli.gateway import get_gateway_runtime_snapshot
snapshot = get_gateway_runtime_snapshot()
if snapshot.running:
mode = snapshot.manager
if snapshot.has_process_service_mismatch:
mode = "manual"
return f"running ({mode}, pid {snapshot.gateway_pids[0]})"
if snapshot.service_installed and not snapshot.service_running:
return f"stopped ({snapshot.manager})"
return f"stopped ({snapshot.manager})"
except Exception:
return "unknown" if sys.platform.startswith(("linux", "darwin")) else "N/A"
def _count_skills(hermes_home: Path) -> int:

View file

@ -10,6 +10,7 @@ import shutil
import signal
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
PROJECT_ROOT = Path(__file__).parent.parent.resolve()
@ -41,6 +42,23 @@ from hermes_cli.colors import Colors, color
# Process Management (for manual gateway runs)
# =============================================================================
@dataclass(frozen=True)
class GatewayRuntimeSnapshot:
manager: str
service_installed: bool = False
service_running: bool = False
gateway_pids: tuple[int, ...] = ()
service_scope: str | None = None
@property
def running(self) -> bool:
return self.service_running or bool(self.gateway_pids)
@property
def has_process_service_mismatch(self) -> bool:
return self.service_installed and self.running and not self.service_running
def _get_service_pids() -> set:
"""Return PIDs currently managed by systemd or launchd gateway services.
@ -157,20 +175,22 @@ def _request_gateway_self_restart(pid: int) -> bool:
return True
def find_gateway_pids(exclude_pids: set | None = None, all_profiles: bool = False) -> list:
"""Find PIDs of running gateway processes.
def _append_unique_pid(pids: list[int], pid: int | None, exclude_pids: set[int]) -> None:
if pid is None or pid <= 0:
return
if pid == os.getpid() or pid in exclude_pids or pid in pids:
return
pids.append(pid)
Args:
exclude_pids: PIDs to exclude from the result (e.g. service-managed
PIDs that should not be killed during a stale-process sweep).
all_profiles: When ``True``, return gateway PIDs across **all**
profiles (the pre-7923 global behaviour). ``hermes update``
needs this because a code update affects every profile.
When ``False`` (default), only PIDs belonging to the current
Hermes profile are returned.
def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> list[int]:
"""Best-effort process-table scan for gateway PIDs.
This supplements the profile-scoped PID file so status views can still spot
a live gateway when the PID file is stale/missing, and ``--all`` sweeps can
discover gateways outside the current profile.
"""
_exclude = exclude_pids or set()
pids = [pid for pid in _get_service_pids() if pid not in _exclude]
pids: list[int] = []
patterns = [
"hermes_cli.main gateway",
"hermes_cli.main --profile",
@ -203,20 +223,24 @@ def find_gateway_pids(exclude_pids: set | None = None, all_profiles: bool = Fals
if is_windows():
result = subprocess.run(
["wmic", "process", "get", "ProcessId,CommandLine", "/FORMAT:LIST"],
capture_output=True, text=True, timeout=10
capture_output=True,
text=True,
timeout=10,
)
if result.returncode != 0:
return []
current_cmd = ""
for line in result.stdout.split('\n'):
for line in result.stdout.split("\n"):
line = line.strip()
if line.startswith("CommandLine="):
current_cmd = line[len("CommandLine="):]
elif line.startswith("ProcessId="):
pid_str = line[len("ProcessId="):]
if any(p in current_cmd for p in patterns) and (all_profiles or _matches_current_profile(current_cmd)):
if any(p in current_cmd for p in patterns) and (
all_profiles or _matches_current_profile(current_cmd)
):
try:
pid = int(pid_str)
if pid != os.getpid() and pid not in pids and pid not in _exclude:
pids.append(pid)
_append_unique_pid(pids, int(pid_str), exclude_pids)
except ValueError:
pass
current_cmd = ""
@ -227,9 +251,11 @@ def find_gateway_pids(exclude_pids: set | None = None, all_profiles: bool = Fals
text=True,
timeout=10,
)
for line in result.stdout.split('\n'):
if result.returncode != 0:
return []
for line in result.stdout.split("\n"):
stripped = line.strip()
if not stripped or 'grep' in stripped:
if not stripped or "grep" in stripped:
continue
pid = None
@ -251,16 +277,137 @@ def find_gateway_pids(exclude_pids: set | None = None, all_profiles: bool = Fals
if pid is None:
continue
if pid == os.getpid() or pid in pids or pid in _exclude:
continue
if any(pattern in command for pattern in patterns) and (all_profiles or _matches_current_profile(command)):
pids.append(pid)
if any(pattern in command for pattern in patterns) and (
all_profiles or _matches_current_profile(command)
):
_append_unique_pid(pids, pid, exclude_pids)
except (OSError, subprocess.TimeoutExpired):
pass
return []
return pids
def find_gateway_pids(exclude_pids: set | None = None, all_profiles: bool = False) -> list:
"""Find PIDs of running gateway processes.
Args:
exclude_pids: PIDs to exclude from the result (e.g. service-managed
PIDs that should not be killed during a stale-process sweep).
all_profiles: When ``True``, return gateway PIDs across **all**
profiles (the pre-7923 global behaviour). ``hermes update``
needs this because a code update affects every profile.
When ``False`` (default), only PIDs belonging to the current
Hermes profile are returned.
"""
_exclude = set(exclude_pids or set())
pids: list[int] = []
if not all_profiles:
try:
from gateway.status import get_running_pid
_append_unique_pid(pids, get_running_pid(), _exclude)
except Exception:
pass
for pid in _get_service_pids():
_append_unique_pid(pids, pid, _exclude)
for pid in _scan_gateway_pids(_exclude, all_profiles=all_profiles):
_append_unique_pid(pids, pid, _exclude)
return pids
def _probe_systemd_service_running(system: bool = False) -> tuple[bool, bool]:
selected_system = _select_systemd_scope(system)
unit_exists = get_systemd_unit_path(system=selected_system).exists()
if not unit_exists:
return selected_system, False
try:
result = _run_systemctl(
["is-active", get_service_name()],
system=selected_system,
capture_output=True,
text=True,
timeout=10,
)
except (RuntimeError, subprocess.TimeoutExpired):
return selected_system, False
return selected_system, result.stdout.strip() == "active"
def _probe_launchd_service_running() -> bool:
if not get_launchd_plist_path().exists():
return False
try:
result = subprocess.run(
["launchctl", "list", get_launchd_label()],
capture_output=True,
text=True,
timeout=10,
)
except subprocess.TimeoutExpired:
return False
return result.returncode == 0
def get_gateway_runtime_snapshot(system: bool = False) -> GatewayRuntimeSnapshot:
"""Return a unified view of gateway liveness for the current profile."""
gateway_pids = tuple(find_gateway_pids())
if is_termux():
return GatewayRuntimeSnapshot(
manager="Termux / manual process",
gateway_pids=gateway_pids,
)
from hermes_constants import is_container
if is_linux() and is_container():
return GatewayRuntimeSnapshot(
manager="docker (foreground)",
gateway_pids=gateway_pids,
)
if supports_systemd_services():
selected_system, service_running = _probe_systemd_service_running(system=system)
scope_label = _service_scope_label(selected_system)
return GatewayRuntimeSnapshot(
manager=f"systemd ({scope_label})",
service_installed=get_systemd_unit_path(system=selected_system).exists(),
service_running=service_running,
gateway_pids=gateway_pids,
service_scope=scope_label,
)
if is_macos():
return GatewayRuntimeSnapshot(
manager="launchd",
service_installed=get_launchd_plist_path().exists(),
service_running=_probe_launchd_service_running(),
gateway_pids=gateway_pids,
service_scope="launchd",
)
return GatewayRuntimeSnapshot(
manager="manual process",
gateway_pids=gateway_pids,
)
def _format_gateway_pids(pids: tuple[int, ...] | list[int], *, limit: int | None = 3) -> str:
rendered = [str(pid) for pid in pids[:limit] if pid > 0] if limit is not None else [str(pid) for pid in pids if pid > 0]
if limit is not None and len(pids) > limit:
rendered.append("...")
return ", ".join(rendered)
def _print_gateway_process_mismatch(snapshot: GatewayRuntimeSnapshot) -> None:
if not snapshot.has_process_service_mismatch:
return
print()
print("⚠ Gateway process is running for this profile, but the service is not active")
print(f" PID(s): {_format_gateway_pids(snapshot.gateway_pids, limit=None)}")
print(" This is usually a manual foreground/tmux/nohup run, so `hermes gateway`")
print(" can refuse to start another copy until this process stops.")
def kill_gateway_processes(force: bool = False, exclude_pids: set | None = None,
all_profiles: bool = False) -> int:
"""Kill any running gateway processes. Returns count killed.
@ -3376,15 +3523,18 @@ def gateway_command(args):
elif subcmd == "status":
deep = getattr(args, 'deep', False)
system = getattr(args, 'system', False)
snapshot = get_gateway_runtime_snapshot(system=system)
# Check for service first
if supports_systemd_services() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
systemd_status(deep, system=system)
_print_gateway_process_mismatch(snapshot)
elif is_macos() and get_launchd_plist_path().exists():
launchd_status(deep)
_print_gateway_process_mismatch(snapshot)
else:
# Check for manually running processes
pids = find_gateway_pids()
pids = list(snapshot.gateway_pids)
if pids:
print(f"✓ Gateway is running (PID: {', '.join(map(str, pids))})")
print(" (Running manually, not as a system service)")

View file

@ -300,19 +300,10 @@ def _read_config_model(profile_dir: Path) -> tuple:
def _check_gateway_running(profile_dir: Path) -> bool:
"""Check if a gateway is running for a given profile directory."""
pid_file = profile_dir / "gateway.pid"
if not pid_file.exists():
return False
try:
raw = pid_file.read_text().strip()
if not raw:
return False
data = json.loads(raw) if raw.startswith("{") else {"pid": int(raw)}
pid = int(data["pid"])
os.kill(pid, 0) # existence check
return True
except (json.JSONDecodeError, KeyError, ValueError, TypeError,
ProcessLookupError, PermissionError, OSError):
from gateway.status import get_running_pid
return get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False) is not None
except Exception:
return False

View file

@ -342,73 +342,36 @@ def show_status(args):
# =========================================================================
print()
print(color("◆ Gateway Service", Colors.CYAN, Colors.BOLD))
if _is_termux():
try:
from hermes_cli.gateway import find_gateway_pids
gateway_pids = find_gateway_pids()
except Exception:
gateway_pids = []
is_running = bool(gateway_pids)
try:
from hermes_cli.gateway import get_gateway_runtime_snapshot, _format_gateway_pids
snapshot = get_gateway_runtime_snapshot()
is_running = snapshot.running
print(f" Status: {check_mark(is_running)} {'running' if is_running else 'stopped'}")
print(" Manager: Termux / manual process")
if gateway_pids:
rendered = ", ".join(str(pid) for pid in gateway_pids[:3])
if len(gateway_pids) > 3:
rendered += ", ..."
print(f" PID(s): {rendered}")
else:
print(f" Manager: {snapshot.manager}")
if snapshot.gateway_pids:
print(f" PID(s): {_format_gateway_pids(snapshot.gateway_pids)}")
if snapshot.has_process_service_mismatch:
print(" Service: installed but not managing the current running gateway")
elif _is_termux() and not snapshot.gateway_pids:
print(" Start with: hermes gateway")
print(" Note: Android may stop background jobs when Termux is suspended")
elif sys.platform.startswith('linux'):
from hermes_constants import is_container
if is_container():
# Docker/Podman: no systemd — check for running gateway processes
try:
from hermes_cli.gateway import find_gateway_pids
gateway_pids = find_gateway_pids()
is_active = len(gateway_pids) > 0
except Exception:
is_active = False
print(f" Status: {check_mark(is_active)} {'running' if is_active else 'stopped'}")
print(" Manager: docker (foreground)")
elif snapshot.service_installed and not snapshot.service_running:
print(" Service: installed but stopped")
except Exception:
if _is_termux():
print(f" Status: {color('unknown', Colors.DIM)}")
print(" Manager: Termux / manual process")
elif sys.platform.startswith('linux'):
print(f" Status: {color('unknown', Colors.DIM)}")
print(" Manager: systemd/manual")
elif sys.platform == 'darwin':
print(f" Status: {color('unknown', Colors.DIM)}")
print(" Manager: launchd")
else:
try:
from hermes_cli.gateway import get_service_name
_gw_svc = get_service_name()
except Exception:
_gw_svc = "hermes-gateway"
try:
result = subprocess.run(
["systemctl", "--user", "is-active", _gw_svc],
capture_output=True,
text=True,
timeout=5
)
is_active = result.stdout.strip() == "active"
except (FileNotFoundError, subprocess.TimeoutExpired):
is_active = False
print(f" Status: {check_mark(is_active)} {'running' if is_active else 'stopped'}")
print(" Manager: systemd (user)")
elif sys.platform == 'darwin':
from hermes_cli.gateway import get_launchd_label
try:
result = subprocess.run(
["launchctl", "list", get_launchd_label()],
capture_output=True,
text=True,
timeout=5
)
is_loaded = result.returncode == 0
except subprocess.TimeoutExpired:
is_loaded = False
print(f" Status: {check_mark(is_loaded)} {'loaded' if is_loaded else 'not loaded'}")
print(" Manager: launchd")
else:
print(f" Status: {color('N/A', Colors.DIM)}")
print(" Manager: (not supported on this platform)")
print(f" Status: {color('N/A', Colors.DIM)}")
print(" Manager: (not supported on this platform)")
# =========================================================================
# Cron Jobs