mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): track no-systemd restart runtimes
This commit is contained in:
parent
d539cd9004
commit
044996e403
6 changed files with 176 additions and 18 deletions
|
|
@ -190,8 +190,8 @@ def _read_process_cmdline(pid: int) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def looks_like_gateway_command_line(command: str | None) -> bool:
|
||||
"""Return True only for a real ``gateway run`` process command line.
|
||||
def _gateway_command_subcommand(command: str | None) -> str | None:
|
||||
"""Return the Hermes gateway lifecycle subcommand from a command line.
|
||||
|
||||
Lifecycle decisions (is the gateway up? did restart relaunch it?) must not
|
||||
fire on loose substring matches. The previous ``"... gateway" in cmdline``
|
||||
|
|
@ -211,7 +211,7 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
|
|||
either side of the ``gateway`` subcommand.
|
||||
"""
|
||||
if not command:
|
||||
return False
|
||||
return None
|
||||
|
||||
try:
|
||||
raw_tokens = shlex.split(command, posix=False)
|
||||
|
|
@ -220,15 +220,15 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
|
|||
# Strip surrounding quotes, normalize slashes + case per token.
|
||||
tokens = [t.strip("\"'").replace("\\", "/").lower() for t in raw_tokens]
|
||||
if not tokens:
|
||||
return False
|
||||
return None
|
||||
|
||||
# Gateway-dedicated entrypoints carry no subcommand to inspect.
|
||||
for token in tokens:
|
||||
if token == "gateway/run.py" or token.endswith("/gateway/run.py"):
|
||||
return True
|
||||
return "run"
|
||||
basename = token.rsplit("/", 1)[-1]
|
||||
if basename in ("hermes-gateway", "hermes-gateway.exe"):
|
||||
return True
|
||||
return "run"
|
||||
|
||||
joined = " ".join(tokens)
|
||||
has_gateway_entry = (
|
||||
|
|
@ -237,7 +237,7 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
|
|||
or any(t.rsplit("/", 1)[-1] in ("hermes", "hermes.exe") for t in tokens)
|
||||
)
|
||||
if not has_gateway_entry:
|
||||
return False
|
||||
return None
|
||||
|
||||
# Drop profile selectors anywhere: --profile X / -p X / --profile=X / -p=X.
|
||||
# This consumes a profile VALUE of "gateway" too, so the real subcommand
|
||||
|
|
@ -259,9 +259,28 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
|
|||
if token != "gateway":
|
||||
continue
|
||||
if i + 1 >= len(filtered):
|
||||
return True # bare `hermes gateway` defaults to `run`
|
||||
return filtered[i + 1] == "run"
|
||||
return False
|
||||
return "run" # bare `hermes gateway` defaults to `run`
|
||||
return filtered[i + 1]
|
||||
return None
|
||||
|
||||
|
||||
def looks_like_gateway_command_line(command: str | None) -> bool:
    """Tell whether *command* is a genuine ``gateway run`` invocation.

    The command line is parsed by ``_gateway_command_subcommand``; only a
    resolved subcommand of exactly ``run`` counts as a match.
    """
    subcommand = _gateway_command_subcommand(command)
    return subcommand == "run"
|
||||
|
||||
|
||||
def looks_like_gateway_runtime_command_line(command: str | None) -> bool:
    """Tell whether *command* may be hosting the gateway runtime.

    Besides ``gateway run``, this also accepts ``gateway restart``. Restart is
    ordinarily a management command, but on hosts without a service manager
    the manual restart fallback executes ``run_gateway()`` inside that very
    process: its argv remains ``gateway restart`` even though it owns the
    webhook port and writes runtime state.

    ``looks_like_gateway_command_line()`` stays strict on purpose. Reach for
    this broader matcher only when validating Hermes-owned runtime records or
    during no-supervisor cleanup scans.
    """
    subcommand = _gateway_command_subcommand(command)
    return subcommand == "run" or subcommand == "restart"
|
||||
|
||||
|
||||
def _looks_like_gateway_process(pid: int) -> bool:
|
||||
|
|
@ -282,7 +301,7 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
|
|||
return False
|
||||
|
||||
cmdline = " ".join(str(part) for part in argv)
|
||||
return looks_like_gateway_command_line(cmdline)
|
||||
return looks_like_gateway_runtime_command_line(cmdline)
|
||||
|
||||
|
||||
def _build_pid_record() -> dict:
|
||||
|
|
@ -1213,6 +1232,10 @@ def get_running_pid(
|
|||
resolved_lock_path = _get_gateway_lock_path(resolved_pid_path)
|
||||
lock_active = is_gateway_runtime_lock_active(resolved_lock_path)
|
||||
if not lock_active:
|
||||
if pid_path is None:
|
||||
runtime_pid = get_runtime_status_running_pid()
|
||||
if runtime_pid is not None:
|
||||
return runtime_pid
|
||||
_cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
|
||||
return None
|
||||
|
||||
|
|
@ -1236,6 +1259,10 @@ def get_running_pid(
|
|||
return pid
|
||||
|
||||
_cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
|
||||
if pid_path is None:
|
||||
runtime_pid = get_runtime_status_running_pid()
|
||||
if runtime_pid is not None:
|
||||
return runtime_pid
|
||||
return None
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -308,7 +308,11 @@ def _append_unique_pid(
|
|||
pids.append(pid)
|
||||
|
||||
|
||||
def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> list[int]:
|
||||
def _scan_gateway_pids(
|
||||
exclude_pids: set[int],
|
||||
all_profiles: bool = False,
|
||||
include_restart_managers: bool = False,
|
||||
) -> list[int]:
|
||||
"""Best-effort process-table scan for gateway PIDs.
|
||||
|
||||
This supplements the profile-scoped PID file so status views can still spot
|
||||
|
|
@ -325,7 +329,10 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
# scan no longer false-matches ``gateway status``/``dashboard`` siblings or
|
||||
# unrelated processes like ``python -m tui_gateway``. Lazy import mirrors the
|
||||
# circular-import avoidance used elsewhere in this module.
|
||||
from gateway.status import looks_like_gateway_command_line
|
||||
from gateway.status import (
|
||||
looks_like_gateway_command_line,
|
||||
looks_like_gateway_runtime_command_line,
|
||||
)
|
||||
current_home = str(get_hermes_home().resolve())
|
||||
current_home_lc = current_home.lower()
|
||||
current_profile_arg = _profile_arg(current_home)
|
||||
|
|
@ -357,6 +364,11 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
return False
|
||||
return True
|
||||
|
||||
def _matches_gateway_runtime(command: str) -> bool:
    """Match strict ``gateway run`` lines, plus runtime hosts when opted in.

    The broader runtime matcher is consulted only when the enclosing scan
    was called with ``include_restart_managers`` set.
    """
    if not looks_like_gateway_command_line(command):
        return include_restart_managers and looks_like_gateway_runtime_command_line(command)
    return True
|
||||
|
||||
try:
|
||||
if is_windows():
|
||||
# Prefer wmic when present (fast, stable output format). On
|
||||
|
|
@ -420,7 +432,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
current_cmd = line[len("CommandLine=") :]
|
||||
elif line.startswith("ProcessId="):
|
||||
pid_str = line[len("ProcessId=") :]
|
||||
if looks_like_gateway_command_line(current_cmd) and (
|
||||
if _matches_gateway_runtime(current_cmd) and (
|
||||
all_profiles or _matches_current_profile(current_cmd)
|
||||
):
|
||||
try:
|
||||
|
|
@ -445,7 +457,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
with open(f"/proc/{pid}/cmdline", "rb") as _f:
|
||||
cmdline = _f.read().decode("utf-8", errors="replace")
|
||||
cmdline = cmdline.replace("\x00", " ")
|
||||
if looks_like_gateway_command_line(cmdline) and (
|
||||
if _matches_gateway_runtime(cmdline) and (
|
||||
all_profiles or _matches_current_profile(cmdline)
|
||||
):
|
||||
_append_unique_pid(pids, pid, exclude_pids)
|
||||
|
|
@ -488,7 +500,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
|
||||
if pid is None:
|
||||
continue
|
||||
if looks_like_gateway_command_line(command) and (
|
||||
if _matches_gateway_runtime(command) and (
|
||||
all_profiles or _matches_current_profile(command)
|
||||
):
|
||||
_append_unique_pid(pids, pid, exclude_pids)
|
||||
|
|
@ -567,7 +579,15 @@ def find_gateway_pids(
|
|||
pass
|
||||
for pid in _get_service_pids():
|
||||
_append_unique_pid(pids, pid, _exclude)
|
||||
for pid in _scan_gateway_pids(_exclude, all_profiles=all_profiles):
|
||||
try:
|
||||
include_restart_managers = not supports_systemd_services()
|
||||
except Exception:
|
||||
include_restart_managers = False
|
||||
for pid in _scan_gateway_pids(
|
||||
_exclude,
|
||||
all_profiles=all_profiles,
|
||||
include_restart_managers=include_restart_managers,
|
||||
):
|
||||
_append_unique_pid(pids, pid, _exclude)
|
||||
return pids
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,10 @@ from __future__ import annotations
|
|||
|
||||
import pytest
|
||||
|
||||
from gateway.status import looks_like_gateway_command_line as matches
|
||||
from gateway.status import (
|
||||
looks_like_gateway_command_line as matches,
|
||||
looks_like_gateway_runtime_command_line as matches_runtime,
|
||||
)
|
||||
|
||||
|
||||
ACCEPT = [
|
||||
|
|
@ -58,3 +61,9 @@ def test_accepts_real_gateway_run(cmd):
|
|||
@pytest.mark.parametrize("cmd", REJECT)
|
||||
def test_rejects_non_gateway_run(cmd):
|
||||
assert matches(cmd) is False
|
||||
|
||||
|
||||
def test_runtime_matcher_accepts_no_supervisor_restart_process():
    """The runtime matcher admits ``gateway restart``; the strict one does not."""
    restart_cmd = "python -m hermes_cli.main gateway restart"
    status_cmd = "python -m hermes_cli.main gateway status"

    # The strict matcher still treats restart as a management command.
    assert matches(restart_cmd) is False
    # The runtime matcher accepts restart but not other lifecycle subcommands.
    assert matches_runtime(restart_cmd) is True
    assert matches_runtime(status_cmd) is False
|
||||
|
|
|
|||
|
|
@ -174,6 +174,54 @@ class TestGatewayPidState:
|
|||
assert status.get_running_pid() is None
|
||||
assert not pid_path.exists()
|
||||
|
||||
def test_get_running_pid_accepts_no_supervisor_restart_runtime(self, tmp_path, monkeypatch):
    """A PID record with a ``gateway restart`` argv must still count as running.

    Models the WSL/no-systemd restart fallback, where the gateway runs inside
    the process that was launched as ``gateway restart``.
    """
    own_pid = os.getpid()
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    pid_file = tmp_path / "gateway.pid"
    pid_file.write_text(
        json.dumps(
            {
                "pid": own_pid,
                "kind": "hermes-gateway",
                "argv": ["python", "-m", "hermes_cli.main", "gateway", "restart"],
                "start_time": 123,
            }
        )
    )

    # Stub the process-inspection hooks with fixed answers that agree with
    # the record written above (same start time, same restart command line).
    stubs = (
        (status.os, "kill", lambda pid, sig: None),
        (status, "_get_process_start_time", lambda pid: 123),
        (
            status,
            "_read_process_cmdline",
            lambda pid: "python -m hermes_cli.main gateway restart",
        ),
    )
    for target, attr, replacement in stubs:
        monkeypatch.setattr(target, attr, replacement)

    assert status.acquire_gateway_runtime_lock() is True
    try:
        assert status.get_running_pid() == own_pid
    finally:
        # Always drop the lock so later tests start clean.
        status.release_gateway_runtime_lock()
|
||||
|
||||
def test_get_running_pid_falls_back_to_no_supervisor_runtime_state(self, tmp_path, monkeypatch):
    """With no pidfile, a live PID in gateway_state.json should still be reported."""
    own_pid = os.getpid()
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    runtime_state = {
        "gateway_state": "running",
        "pid": own_pid,
        "kind": "hermes-gateway",
        "argv": ["python", "-m", "hermes_cli.main", "gateway", "restart"],
        "start_time": 123,
    }
    (tmp_path / "gateway_state.json").write_text(json.dumps(runtime_state))

    # Stub the process-inspection hooks with fixed answers that agree with
    # the state file (same start time, same restart command line).
    stubs = (
        (status.os, "kill", lambda pid, sig: None),
        (status, "_get_process_start_time", lambda pid: 123),
        (
            status,
            "_read_process_cmdline",
            lambda pid: "python -m hermes_cli.main gateway restart",
        ),
    )
    for target, attr, replacement in stubs:
        monkeypatch.setattr(target, attr, replacement)

    assert status.get_running_pid() == own_pid
|
||||
|
||||
def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch):
|
||||
"""Stale PID file from a *different* PID (crashed process) must still be cleaned.
|
||||
|
||||
|
|
|
|||
|
|
@ -821,6 +821,23 @@ def test_find_gateway_pids_falls_back_to_pid_file_when_process_scan_fails(monkey
|
|||
assert gateway.find_gateway_pids() == [321]
|
||||
|
||||
|
||||
def test_find_gateway_pids_includes_restart_managers_without_systemd(monkeypatch):
    """Without systemd, ``find_gateway_pids`` must ask the scan for restart managers."""
    recorded_calls = []

    def fake_scan(exclude_pids, all_profiles=False, include_restart_managers=False):
        # Record the arguments and return a PID only when the flag is set.
        recorded_calls.append((set(exclude_pids), all_profiles, include_restart_managers))
        if include_restart_managers:
            return [708]
        return []

    monkeypatch.setattr(gateway, "_get_service_pids", lambda: set())
    monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "_scan_gateway_pids", fake_scan)

    found = gateway.find_gateway_pids(all_profiles=True)

    assert found == [708]
    assert recorded_calls == [(set(), True, True)]
|
||||
|
||||
|
||||
def test_scan_gateway_pids_detects_windows_hermes_exe_case_variants(monkeypatch):
|
||||
monkeypatch.setattr(gateway, "is_windows", lambda: True)
|
||||
monkeypatch.setattr(gateway, "_get_ancestor_pids", lambda: set())
|
||||
|
|
|
|||
|
|
@ -77,6 +77,43 @@ class TestProcFallback:
|
|||
assert 99999 not in pids
|
||||
mock_ps.assert_not_called() # ps must NOT be called when /proc worked
|
||||
|
||||
def test_detects_no_supervisor_restart_process_only_when_enabled(self):
    """A ``gateway restart`` process is reported only with ``include_restart_managers``."""
    entries = {
        12345: "python -m hermes_cli.main gateway restart",
        99999: _OTHER_CMD,
    }

    def run_scan(**scan_options):
        # Build fresh fake /proc callables for each scan, then run it with
        # the platform and filesystem hooks patched.
        _isdir, _listdir, _open = _fake_proc_dir(entries)
        with (
            patch("hermes_cli.gateway.is_windows", return_value=False),
            patch("os.path.isdir", side_effect=_isdir),
            patch("os.listdir", side_effect=_listdir),
            patch("builtins.open", side_effect=_open),
            patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()),
            patch("subprocess.run") as ps_mock,
        ):
            found = gateway_mod._scan_gateway_pids(
                set(),
                all_profiles=True,
                **scan_options,
            )
        return found, ps_mock

    strict_pids, mock_ps = run_scan()
    fallback_pids, mock_ps_enabled = run_scan(include_restart_managers=True)

    assert strict_pids == []
    assert fallback_pids == [12345]
    # Neither scan should have called subprocess.run.
    mock_ps.assert_not_called()
    mock_ps_enabled.assert_not_called()
|
||||
|
||||
def test_excludes_own_pid_from_proc_scan(self):
|
||||
my_pid = os.getpid()
|
||||
entries = {my_pid: _GATEWAY_CMD}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue