mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Root cause: when the gateway received SIGTERM (from hermes update, external kill, WSL2 runtime, etc.), it exited with status 0. systemd's Restart=on-failure only restarts on non-zero exit, so the gateway stayed dead permanently. Users had to manually restart. Fix 1: Signal-initiated shutdown exits non-zero When SIGTERM/SIGINT is received and no restart was requested (via /restart, /update, or SIGUSR1), start_gateway() returns False which causes sys.exit(1). systemd sees a failure exit and auto-restarts after RestartSec=30. This is safe because systemctl stop tracks its own stop-requested state independently of exit code — Restart= never fires for a deliberate stop, regardless of exit code. Also logs 'Received SIGTERM/SIGINT — initiating shutdown' so the cause of unexpected shutdowns is visible in agent.log. Fix 2: PID file ownership guard remove_pid_file() now checks that the PID file belongs to the current process before removing it. During --replace handoffs, the old process's atexit handler could fire AFTER the new process wrote its PID file, deleting the new record. This left the gateway running but invisible to get_running_pid(), causing 'Another gateway already running' errors on next restart. Test plan: - All restart drain tests pass (13) - All gateway service tests pass (84) - All update gateway restart tests pass (34)
455 lines
15 KiB
Python
455 lines
15 KiB
Python
"""
|
|
Gateway runtime status helpers.
|
|
|
|
Provides PID-file based detection of whether the gateway daemon is running,
|
|
used by send_message's check_fn to gate availability in the CLI.
|
|
|
|
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
|
|
``~/.hermes`` but can be overridden via the environment variable. This means
|
|
separate HERMES_HOME directories naturally get separate PID files — a property
|
|
that will be useful when we add named profiles (multiple agents running
|
|
concurrently under distinct configurations).
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from hermes_constants import get_hermes_home
|
|
from typing import Any, Optional
|
|
|
|
_GATEWAY_KIND = "hermes-gateway"
|
|
_RUNTIME_STATUS_FILE = "gateway_state.json"
|
|
_LOCKS_DIRNAME = "gateway-locks"
|
|
_IS_WINDOWS = sys.platform == "win32"
|
|
_UNSET = object()
|
|
|
|
|
|
def _get_pid_path() -> Path:
    """Locate ``gateway.pid`` inside the directory returned by get_hermes_home()."""
    return get_hermes_home() / "gateway.pid"
|
|
|
|
|
|
def _get_runtime_status_path() -> Path:
    """Return the runtime status file path, a sibling of the gateway PID file."""
    pid_path = _get_pid_path()
    return pid_path.with_name(_RUNTIME_STATUS_FILE)
|
|
|
|
|
|
def _get_lock_dir() -> Path:
    """Resolve the directory that stores token-scoped gateway lock files.

    A non-empty ``HERMES_GATEWAY_LOCK_DIR`` wins outright. Otherwise the
    directory is ``<state home>/hermes/gateway-locks`` where the state home is
    ``XDG_STATE_HOME`` when that variable is set, else ``~/.local/state``.
    """
    explicit_dir = os.getenv("HERMES_GATEWAY_LOCK_DIR")
    if explicit_dir:
        return Path(explicit_dir)

    fallback_state_home = Path.home() / ".local" / "state"
    state_root = Path(os.getenv("XDG_STATE_HOME", fallback_state_home))
    return state_root / "hermes" / _LOCKS_DIRNAME
|
|
|
|
|
|
def _utc_now_iso() -> str:
|
|
return datetime.now(timezone.utc).isoformat()
|
|
|
|
|
|
def terminate_pid(pid: int, *, force: bool = False) -> None:
    """Stop the process identified by *pid*.

    Without ``force`` a SIGTERM is delivered through ``os.kill``. With
    ``force`` the behaviour is platform specific: Windows runs
    ``taskkill /PID <pid> /T /F`` (a tree-killing hard stop, which
    ``os.kill(..., SIGTERM)`` is not equivalent to), every other platform
    sends SIGKILL.

    Raises:
        OSError: when ``taskkill`` exits with a non-zero status (the message is
            its stderr/stdout when available), or whatever ``os.kill`` raises.
    """
    if not (force and _IS_WINDOWS):
        if force:
            # Fall back to SIGTERM where the signal module has no SIGKILL.
            chosen_signal = getattr(signal, "SIGKILL", signal.SIGTERM)
        else:
            chosen_signal = signal.SIGTERM
        os.kill(pid, chosen_signal)
        return

    command = ["taskkill", "/PID", str(pid), "/T", "/F"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except FileNotFoundError:
        # taskkill executable is not available; use the plain signal path.
        os.kill(pid, signal.SIGTERM)
        return

    if completed.returncode != 0:
        details = (completed.stderr or completed.stdout or "").strip()
        raise OSError(details or f"taskkill failed for PID {pid}")
|
|
|
|
|
|
def _scope_hash(identity: str) -> str:
|
|
return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16]
|
|
|
|
|
|
def _get_scope_lock_path(scope: str, identity: str) -> Path:
    """Return ``<lock dir>/<scope>-<identity hash>.lock`` for the given pair."""
    lock_dir = _get_lock_dir()
    identity_digest = _scope_hash(identity)
    return lock_dir / f"{scope}-{identity_digest}.lock"
|
|
|
|
|
|
def _get_process_start_time(pid: int) -> Optional[int]:
|
|
"""Return the kernel start time for a process when available."""
|
|
stat_path = Path(f"/proc/{pid}/stat")
|
|
try:
|
|
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
|
|
return int(stat_path.read_text().split()[21])
|
|
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
|
|
return None
|
|
|
|
|
|
def _read_process_cmdline(pid: int) -> Optional[str]:
|
|
"""Return the process command line as a space-separated string."""
|
|
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
|
try:
|
|
raw = cmdline_path.read_bytes()
|
|
except (FileNotFoundError, PermissionError, OSError):
|
|
return None
|
|
|
|
if not raw:
|
|
return None
|
|
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
|
|
|
|
|
def _looks_like_gateway_process(pid: int) -> bool:
    """Tell whether the live command line of *pid* matches a gateway launch.

    Returns False when the command line is unavailable or empty.
    """
    live_cmdline = _read_process_cmdline(pid)
    if not live_cmdline:
        return False

    gateway_markers = (
        "hermes_cli.main gateway",
        "hermes_cli/main.py gateway",
        "hermes gateway",
        "gateway/run.py",
    )
    for marker in gateway_markers:
        if marker in live_cmdline:
            return True
    return False
|
|
|
|
|
|
def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
    """Check gateway identity using only the metadata stored in *record*.

    This is the fallback for when the live command line cannot be read. The
    record must carry the gateway ``kind`` marker and a non-empty ``argv`` list
    whose space-joined form contains one of the known gateway launch patterns.
    """
    if record.get("kind") != _GATEWAY_KIND:
        return False

    recorded_argv = record.get("argv")
    if not (isinstance(recorded_argv, list) and recorded_argv):
        return False

    joined_argv = " ".join(str(part) for part in recorded_argv)
    gateway_markers = (
        "hermes_cli.main gateway",
        "hermes_cli/main.py gateway",
        "hermes gateway",
        "gateway/run.py",
    )
    for marker in gateway_markers:
        if marker in joined_argv:
            return True
    return False
|
|
|
|
|
|
def _build_pid_record() -> dict:
    """Build the identity record (pid, kind, argv, start time) of this process."""
    own_pid = os.getpid()
    record = {
        "pid": own_pid,
        "kind": _GATEWAY_KIND,
        "argv": list(sys.argv),
        "start_time": _get_process_start_time(own_pid),
    }
    return record
|
|
|
|
|
|
def _build_runtime_status_record() -> dict[str, Any]:
    """Build the initial runtime status document for this process.

    Starts from the PID record and adds the status fields with their initial
    values: state ``"starting"``, no exit reason, no restart requested, zero
    active agents, no platforms, and a fresh ``updated_at`` timestamp.
    """
    status = _build_pid_record()
    status["gateway_state"] = "starting"
    status["exit_reason"] = None
    status["restart_requested"] = False
    status["active_agents"] = 0
    status["platforms"] = {}
    status["updated_at"] = _utc_now_iso()
    return status
|
|
|
|
|
|
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
|
|
if not path.exists():
|
|
return None
|
|
try:
|
|
raw = path.read_text().strip()
|
|
except OSError:
|
|
return None
|
|
if not raw:
|
|
return None
|
|
try:
|
|
payload = json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
return None
|
|
return payload if isinstance(payload, dict) else None
|
|
|
|
|
|
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
path.write_text(json.dumps(payload))
|
|
|
|
|
|
def _read_pid_record() -> Optional[dict]:
    """Read the gateway PID file and normalise it to a record dict.

    Three on-disk shapes are accepted: a JSON object (returned as-is), a JSON
    integer, and a bare integer string; the latter two become
    ``{"pid": <int>}``. ``None`` is returned when the file is missing,
    unreadable, empty, or holds anything else.
    """
    try:
        # EAFP: the file may disappear between an exists() probe and the read
        # (e.g. another process removing it), so just read and handle failure.
        raw = _get_pid_path().read_text().strip()
    except (OSError, UnicodeDecodeError):
        return None

    if not raw:
        return None

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: accept a plain integer PID, reject everything else.
        try:
            return {"pid": int(raw)}
        except ValueError:
            return None

    # bool is a subclass of int; JSON true/false is not a PID.
    if isinstance(payload, int) and not isinstance(payload, bool):
        return {"pid": payload}
    if isinstance(payload, dict):
        return payload
    return None
|
|
|
|
|
|
def write_pid_file() -> None:
    """Persist this process's PID record to the gateway PID file."""
    pid_path = _get_pid_path()
    own_record = _build_pid_record()
    _write_json_file(pid_path, own_record)
|
|
|
|
|
|
def write_runtime_status(
    *,
    gateway_state: Any = _UNSET,
    exit_reason: Any = _UNSET,
    restart_requested: Any = _UNSET,
    active_agents: Any = _UNSET,
    platform: Any = _UNSET,
    platform_state: Any = _UNSET,
    error_code: Any = _UNSET,
    error_message: Any = _UNSET,
) -> None:
    """Merge the supplied fields into the persisted runtime status file.

    The current document is loaded (or a fresh one built when it is missing or
    unreadable), re-stamped with this process's pid, start time and the
    current timestamp, then written back. Only arguments that were actually
    passed are applied; an explicit ``None`` is stored as a value.

    ``restart_requested`` is coerced to bool and ``active_agents`` to a
    non-negative int. The ``platform_state`` / ``error_code`` /
    ``error_message`` arguments only take effect when ``platform`` is given;
    they update that platform's entry under ``"platforms"``.
    """
    status_path = _get_runtime_status_path()
    status = _read_json_file(status_path) or _build_runtime_status_record()
    status.setdefault("platforms", {})
    status.setdefault("kind", _GATEWAY_KIND)

    own_pid = os.getpid()
    status["pid"] = own_pid
    status["start_time"] = _get_process_start_time(own_pid)
    status["updated_at"] = _utc_now_iso()

    # Fields stored verbatim.
    for key, value in (("gateway_state", gateway_state), ("exit_reason", exit_reason)):
        if value is not _UNSET:
            status[key] = value

    # Fields normalised before storing.
    if restart_requested is not _UNSET:
        status["restart_requested"] = bool(restart_requested)
    if active_agents is not _UNSET:
        status["active_agents"] = max(0, int(active_agents))

    if platform is not _UNSET:
        platforms = status["platforms"]
        entry = platforms.get(platform, {})
        per_platform_fields = (
            ("state", platform_state),
            ("error_code", error_code),
            ("error_message", error_message),
        )
        for key, value in per_platform_fields:
            if value is not _UNSET:
                entry[key] = value
        entry["updated_at"] = _utc_now_iso()
        platforms[platform] = entry

    _write_json_file(status_path, status)
|
|
|
|
|
|
def read_runtime_status() -> Optional[dict[str, Any]]:
    """Return the persisted runtime status document, or ``None`` if unavailable."""
    status_path = _get_runtime_status_path()
    return _read_json_file(status_path)
|
|
|
|
|
|
def remove_pid_file() -> None:
    """Remove the gateway PID file, but only if it belongs to this process.

    During --replace handoffs, the old process's atexit handler can fire AFTER
    the new process has written its own PID file. Blindly removing the file
    would delete the new process's record, leaving the gateway running with no
    PID file (invisible to ``get_running_pid()``).

    The file is removed when it names this process, or when no PID can be
    extracted from it (missing/corrupt content). It is left in place when it
    names any other PID. Never raises.
    """
    try:
        path = _get_pid_path()
        # Parse with the same reader the rest of the module uses so that
        # legacy bare-integer PID files are subject to the ownership guard
        # too, instead of being treated as "no record" and deleted.
        try:
            record = _read_pid_record()
        except (OSError, ValueError):
            record = None

        file_pid = None
        if record is not None:
            try:
                file_pid = int(record["pid"])
            except (KeyError, TypeError, ValueError):
                file_pid = None

        if file_pid is not None and file_pid != os.getpid():
            # PID file belongs to a different process — leave it alone.
            return

        path.unlink(missing_ok=True)
    except Exception:
        # Deliberately best-effort: cleanup must not raise into its caller.
        pass
|
|
|
|
|
|
def acquire_scoped_lock(
    scope: str,
    identity: str,
    metadata: Optional[dict[str, Any]] = None,
) -> tuple[bool, Optional[dict[str, Any]]]:
    """Acquire a machine-local lock keyed by scope + identity.

    Used to prevent multiple local gateways from using the same external
    identity at once (e.g. the same Telegram bot token across different
    HERMES_HOME dirs).

    Returns:
        ``(True, None)`` when the lock was newly created for this process.
        ``(True, previous_record)`` when this process already held the lock;
        the lock file is refreshed with a new record.
        ``(False, holder_record)`` when another live, non-stopped process
        holds it, or when the exclusive create lost a race (the record may
        then be ``None`` if the winner has not finished writing).

    Raises:
        Any exception raised while writing the new lock file; the partially
        created file is removed first.
    """
    lock_path = _get_scope_lock_path(scope, identity)
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    record = {
        **_build_pid_record(),
        "scope": scope,
        "identity_hash": _scope_hash(identity),
        "metadata": metadata or {},
        "updated_at": _utc_now_iso(),
    }

    existing = _read_json_file(lock_path)
    if existing is None:
        # Lock file is absent, or exists but is empty / invalid JSON — treat
        # the latter as stale. This happens when a previous process was killed
        # between O_CREAT|O_EXCL and the subsequent json.dump().
        if lock_path.exists():
            try:
                lock_path.unlink(missing_ok=True)
            except OSError:
                pass
    else:
        # NOTE: tested with ``is not None`` rather than truthiness so that a
        # lock file holding an empty object ``{}`` is examined (and reclaimed
        # as stale) instead of blocking acquisition forever.
        try:
            existing_pid = int(existing["pid"])
        except (KeyError, TypeError, ValueError):
            existing_pid = None

        # Re-entrant acquire by the very same process: refresh and succeed.
        if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"):
            _write_json_file(lock_path, record)
            return True, existing

        stale = existing_pid is None
        if not stale:
            try:
                # NOTE(review): signal 0 is a POSIX existence probe; Python
                # documents different os.kill semantics on Windows — confirm
                # this path is never reached there.
                os.kill(existing_pid, 0)
            except (ProcessLookupError, PermissionError):
                stale = True
            else:
                # The PID is alive; make sure it is still the same process
                # and not an unrelated one that reused the PID.
                current_start = _get_process_start_time(existing_pid)
                recorded_start = existing.get("start_time")
                if (
                    recorded_start is not None
                    and current_start is not None
                    and current_start != recorded_start
                ):
                    stale = True

                # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
                # processes still respond to os.kill(pid, 0) but are not
                # actually running. Treat them as stale so --replace works.
                if not stale:
                    try:
                        proc_status = Path(f"/proc/{existing_pid}/status")
                        if proc_status.exists():
                            for status_line in proc_status.read_text().splitlines():
                                if status_line.startswith("State:"):
                                    state_code = status_line.split()[1]
                                    if state_code in ("T", "t"):  # stopped or tracing stop
                                        stale = True
                                    break
                    except (OSError, IndexError):
                        # Unreadable or malformed status: keep "not stale".
                        pass

        if not stale:
            return False, existing

        try:
            lock_path.unlink(missing_ok=True)
        except OSError:
            pass

    # Exclusive create: exactly one contender can succeed.
    try:
        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        return False, _read_json_file(lock_path)

    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            json.dump(record, handle)
    except Exception:
        # Do not leave an empty/partial lock file behind.
        try:
            lock_path.unlink(missing_ok=True)
        except OSError:
            pass
        raise
    return True, None
|
|
|
|
|
|
def release_scoped_lock(scope: str, identity: str) -> None:
    """Delete the scope lock file if, and only if, this process owns it.

    Ownership means the stored ``pid`` and ``start_time`` both match the
    current process. A missing/unreadable lock, a foreign lock, or a failure
    to delete the file are all silently ignored.
    """
    lock_path = _get_scope_lock_path(scope, identity)
    holder = _read_json_file(lock_path)
    if not holder:
        return

    own_pid = os.getpid()
    owned_by_us = (
        holder.get("pid") == own_pid
        and holder.get("start_time") == _get_process_start_time(own_pid)
    )
    if not owned_by_us:
        return

    try:
        lock_path.unlink(missing_ok=True)
    except OSError:
        pass
|
|
|
|
|
|
def release_all_scoped_locks() -> int:
    """Delete every ``*.lock`` file in the lock directory, regardless of owner.

    Called during --replace to clean up stale locks left by stopped/killed
    gateway processes that did not release their locks gracefully.

    Returns:
        The number of lock files whose removal did not raise.
    """
    lock_dir = _get_lock_dir()
    if not lock_dir.exists():
        return 0

    removed_count = 0
    for candidate in lock_dir.glob("*.lock"):
        try:
            candidate.unlink(missing_ok=True)
        except OSError:
            continue
        removed_count += 1
    return removed_count
|
|
|
|
|
|
def get_running_pid() -> Optional[int]:
    """Return the PID of the running gateway, or ``None`` when there is none.

    The PID file record is accepted only if the PID answers a signal-0 probe,
    its kernel start time (when both recorded and readable) is unchanged, and
    either the live command line or the stored record looks like a gateway.
    On every rejection ``remove_pid_file()`` is invoked before returning
    ``None``.
    """
    record = _read_pid_record()
    if not record:
        remove_pid_file()
        return None

    try:
        candidate_pid = int(record["pid"])
    except (KeyError, TypeError, ValueError):
        remove_pid_file()
        return None

    try:
        # Signal 0 performs the existence check without delivering a signal.
        # NOTE(review): Python documents different os.kill semantics on
        # Windows — confirm this probe is only reached on POSIX.
        os.kill(candidate_pid, 0)
    except (ProcessLookupError, PermissionError):
        remove_pid_file()
        return None

    # A differing start time means the PID now belongs to another process.
    recorded_start = record.get("start_time")
    live_start = _get_process_start_time(candidate_pid)
    pid_was_reused = (
        recorded_start is not None
        and live_start is not None
        and live_start != recorded_start
    )
    if pid_was_reused:
        remove_pid_file()
        return None

    # Prefer the live command line; fall back to the stored metadata.
    is_gateway = _looks_like_gateway_process(candidate_pid) or _record_looks_like_gateway(record)
    if not is_gateway:
        remove_pid_file()
        return None

    return candidate_pid
|
|
|
|
|
|
def is_gateway_running() -> bool:
    """Report whether get_running_pid() currently finds a live gateway."""
    running_pid = get_running_pid()
    return running_pid is not None
|