fix(gateway): track no-systemd restart runtimes

This commit is contained in:
jeremy gu 2026-06-23 10:37:56 -07:00 committed by Teknium
parent d539cd9004
commit 044996e403
6 changed files with 176 additions and 18 deletions

View file

@ -190,8 +190,8 @@ def _read_process_cmdline(pid: int) -> Optional[str]:
return None
def looks_like_gateway_command_line(command: str | None) -> bool:
"""Return True only for a real ``gateway run`` process command line.
def _gateway_command_subcommand(command: str | None) -> str | None:
"""Return the Hermes gateway lifecycle subcommand from a command line.
Lifecycle decisions (is the gateway up? did restart relaunch it?) must not
fire on loose substring matches. The previous ``"... gateway" in cmdline``
@ -211,7 +211,7 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
either side of the ``gateway`` subcommand.
"""
if not command:
return False
return None
try:
raw_tokens = shlex.split(command, posix=False)
@ -220,15 +220,15 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
# Strip surrounding quotes, normalize slashes + case per token.
tokens = [t.strip("\"'").replace("\\", "/").lower() for t in raw_tokens]
if not tokens:
return False
return None
# Gateway-dedicated entrypoints carry no subcommand to inspect.
for token in tokens:
if token == "gateway/run.py" or token.endswith("/gateway/run.py"):
return True
return "run"
basename = token.rsplit("/", 1)[-1]
if basename in ("hermes-gateway", "hermes-gateway.exe"):
return True
return "run"
joined = " ".join(tokens)
has_gateway_entry = (
@ -237,7 +237,7 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
or any(t.rsplit("/", 1)[-1] in ("hermes", "hermes.exe") for t in tokens)
)
if not has_gateway_entry:
return False
return None
# Drop profile selectors anywhere: --profile X / -p X / --profile=X / -p=X.
# This consumes a profile VALUE of "gateway" too, so the real subcommand
@ -259,9 +259,28 @@ def looks_like_gateway_command_line(command: str | None) -> bool:
if token != "gateway":
continue
if i + 1 >= len(filtered):
return True # bare `hermes gateway` defaults to `run`
return filtered[i + 1] == "run"
return False
return "run" # bare `hermes gateway` defaults to `run`
return filtered[i + 1]
return None
def looks_like_gateway_command_line(command: str | None) -> bool:
    """Report whether *command* is a genuine ``gateway run`` command line.

    Parsing is delegated to ``_gateway_command_subcommand``; the result is
    True only when that helper resolves the subcommand to ``"run"``.
    """
    subcommand = _gateway_command_subcommand(command)
    return subcommand == "run"
def looks_like_gateway_runtime_command_line(command: str | None) -> bool:
    """Report whether *command* may be hosting the gateway runtime.

    Accepts the ``run`` subcommand and, additionally, ``restart``.

    ``gateway restart`` is usually just a management command. When no service
    manager is available, however, the manual restart fallback calls
    ``run_gateway()`` inside that very process, so the argv remains
    ``gateway restart`` while the process owns the webhook port and writes
    runtime state. ``looks_like_gateway_command_line()`` stays strict; this
    wider matcher is meant only for validating Hermes-owned runtime records
    and for no-supervisor cleanup scans.
    """
    subcommand = _gateway_command_subcommand(command)
    return subcommand == "run" or subcommand == "restart"
def _looks_like_gateway_process(pid: int) -> bool:
@ -282,7 +301,7 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
return False
cmdline = " ".join(str(part) for part in argv)
return looks_like_gateway_command_line(cmdline)
return looks_like_gateway_runtime_command_line(cmdline)
def _build_pid_record() -> dict:
@ -1213,6 +1232,10 @@ def get_running_pid(
resolved_lock_path = _get_gateway_lock_path(resolved_pid_path)
lock_active = is_gateway_runtime_lock_active(resolved_lock_path)
if not lock_active:
if pid_path is None:
runtime_pid = get_runtime_status_running_pid()
if runtime_pid is not None:
return runtime_pid
_cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
return None
@ -1236,6 +1259,10 @@ def get_running_pid(
return pid
_cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
if pid_path is None:
runtime_pid = get_runtime_status_running_pid()
if runtime_pid is not None:
return runtime_pid
return None

View file

@ -308,7 +308,11 @@ def _append_unique_pid(
pids.append(pid)
def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> list[int]:
def _scan_gateway_pids(
exclude_pids: set[int],
all_profiles: bool = False,
include_restart_managers: bool = False,
) -> list[int]:
"""Best-effort process-table scan for gateway PIDs.
This supplements the profile-scoped PID file so status views can still spot
@ -325,7 +329,10 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
# scan no longer false-matches ``gateway status``/``dashboard`` siblings or
# unrelated processes like ``python -m tui_gateway``. Lazy import mirrors the
# circular-import avoidance used elsewhere in this module.
from gateway.status import looks_like_gateway_command_line
from gateway.status import (
looks_like_gateway_command_line,
looks_like_gateway_runtime_command_line,
)
current_home = str(get_hermes_home().resolve())
current_home_lc = current_home.lower()
current_profile_arg = _profile_arg(current_home)
@ -357,6 +364,11 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
return False
return True
def _matches_gateway_runtime(command: str) -> bool:
    # The strict ``gateway run`` matcher is always tried first; the broader
    # runtime matcher is consulted only when the caller opted in through
    # ``include_restart_managers``.
    return looks_like_gateway_command_line(command) or (
        include_restart_managers
        and looks_like_gateway_runtime_command_line(command)
    )
try:
if is_windows():
# Prefer wmic when present (fast, stable output format). On
@ -420,7 +432,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
current_cmd = line[len("CommandLine=") :]
elif line.startswith("ProcessId="):
pid_str = line[len("ProcessId=") :]
if looks_like_gateway_command_line(current_cmd) and (
if _matches_gateway_runtime(current_cmd) and (
all_profiles or _matches_current_profile(current_cmd)
):
try:
@ -445,7 +457,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
with open(f"/proc/{pid}/cmdline", "rb") as _f:
cmdline = _f.read().decode("utf-8", errors="replace")
cmdline = cmdline.replace("\x00", " ")
if looks_like_gateway_command_line(cmdline) and (
if _matches_gateway_runtime(cmdline) and (
all_profiles or _matches_current_profile(cmdline)
):
_append_unique_pid(pids, pid, exclude_pids)
@ -488,7 +500,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
if pid is None:
continue
if looks_like_gateway_command_line(command) and (
if _matches_gateway_runtime(command) and (
all_profiles or _matches_current_profile(command)
):
_append_unique_pid(pids, pid, exclude_pids)
@ -567,7 +579,15 @@ def find_gateway_pids(
pass
for pid in _get_service_pids():
_append_unique_pid(pids, pid, _exclude)
for pid in _scan_gateway_pids(_exclude, all_profiles=all_profiles):
try:
include_restart_managers = not supports_systemd_services()
except Exception:
include_restart_managers = False
for pid in _scan_gateway_pids(
_exclude,
all_profiles=all_profiles,
include_restart_managers=include_restart_managers,
):
_append_unique_pid(pids, pid, _exclude)
return pids

View file

@ -11,7 +11,10 @@ from __future__ import annotations
import pytest
from gateway.status import looks_like_gateway_command_line as matches
from gateway.status import (
looks_like_gateway_command_line as matches,
looks_like_gateway_runtime_command_line as matches_runtime,
)
ACCEPT = [
@ -58,3 +61,9 @@ def test_accepts_real_gateway_run(cmd):
@pytest.mark.parametrize("cmd", REJECT)
def test_rejects_non_gateway_run(cmd):
assert matches(cmd) is False
def test_runtime_matcher_accepts_no_supervisor_restart_process():
    """The runtime matcher admits ``gateway restart``; the strict one does not."""
    restart_cmd = "python -m hermes_cli.main gateway restart"
    status_cmd = "python -m hermes_cli.main gateway status"

    # Strict matcher: ``restart`` is not a ``gateway run`` command line.
    assert matches(restart_cmd) is False
    # Runtime matcher: ``restart`` counts, other management commands do not.
    assert matches_runtime(restart_cmd) is True
    assert matches_runtime(status_cmd) is False

View file

@ -174,6 +174,54 @@ class TestGatewayPidState:
assert status.get_running_pid() is None
assert not pid_path.exists()
def test_get_running_pid_accepts_no_supervisor_restart_runtime(self, tmp_path, monkeypatch):
    """A PID record whose argv is ``gateway restart`` is still reported as running.

    Exercises the WSL/no-systemd restart fallback, where the gateway runs in
    a process whose argv is ``gateway restart``.
    """
    restart_cmdline = "python -m hermes_cli.main gateway restart"
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # PID file describing this very test process, launched "as" a restart.
    pid_record = {
        "pid": os.getpid(),
        "kind": "hermes-gateway",
        "argv": ["python", "-m", "hermes_cli.main", "gateway", "restart"],
        "start_time": 123,
    }
    (tmp_path / "gateway.pid").write_text(json.dumps(pid_record))

    # Stub the process probes so the record appears to match a live process.
    monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
    monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
    monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: restart_cmdline)

    lock_acquired = status.acquire_gateway_runtime_lock()
    assert lock_acquired is True
    try:
        running_pid = status.get_running_pid()
        assert running_pid == os.getpid()
    finally:
        status.release_gateway_runtime_lock()
def test_get_running_pid_falls_back_to_no_supervisor_runtime_state(self, tmp_path, monkeypatch):
    """With no pidfile, a live PID recorded in gateway_state.json is returned."""
    restart_cmdline = "python -m hermes_cli.main gateway restart"
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # Only the runtime state file exists; no gateway.pid is written.
    runtime_state = {
        "gateway_state": "running",
        "pid": os.getpid(),
        "kind": "hermes-gateway",
        "argv": ["python", "-m", "hermes_cli.main", "gateway", "restart"],
        "start_time": 123,
    }
    (tmp_path / "gateway_state.json").write_text(json.dumps(runtime_state))

    # Stub the process probes so the state record appears to match a live process.
    monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
    monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
    monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: restart_cmdline)

    running_pid = status.get_running_pid()
    assert running_pid == os.getpid()
def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch):
"""Stale PID file from a *different* PID (crashed process) must still be cleaned.

View file

@ -821,6 +821,23 @@ def test_find_gateway_pids_falls_back_to_pid_file_when_process_scan_fails(monkey
assert gateway.find_gateway_pids() == [321]
def test_find_gateway_pids_includes_restart_managers_without_systemd(monkeypatch):
    """Without systemd support, the process scan is asked to include restart managers."""
    scan_calls = []

    def fake_scan(exclude_pids, all_profiles=False, include_restart_managers=False):
        # Record the arguments and yield a PID only when the flag was set.
        scan_calls.append((set(exclude_pids), all_profiles, include_restart_managers))
        if include_restart_managers:
            return [708]
        return []

    monkeypatch.setattr(gateway, "_get_service_pids", lambda: set())
    monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "_scan_gateway_pids", fake_scan)

    found = gateway.find_gateway_pids(all_profiles=True)

    assert found == [708]
    assert scan_calls == [(set(), True, True)]
def test_scan_gateway_pids_detects_windows_hermes_exe_case_variants(monkeypatch):
monkeypatch.setattr(gateway, "is_windows", lambda: True)
monkeypatch.setattr(gateway, "_get_ancestor_pids", lambda: set())

View file

@ -77,6 +77,43 @@ class TestProcFallback:
assert 99999 not in pids
mock_ps.assert_not_called() # ps must NOT be called when /proc worked
def test_detects_no_supervisor_restart_process_only_when_enabled(self):
    """A ``gateway restart`` process is found only with ``include_restart_managers=True``."""
    entries = {
        12345: "python -m hermes_cli.main gateway restart",
        99999: _OTHER_CMD,
    }

    def scan_fake_proc(**scan_kwargs):
        # Run one scan against a fresh fake /proc tree; hand back the PIDs
        # found together with the ``subprocess.run`` mock for later checks.
        fake_isdir, fake_listdir, fake_open = _fake_proc_dir(entries)
        with (
            patch("hermes_cli.gateway.is_windows", return_value=False),
            patch("os.path.isdir", side_effect=fake_isdir),
            patch("os.listdir", side_effect=fake_listdir),
            patch("builtins.open", side_effect=fake_open),
            patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()),
            patch("subprocess.run") as ps_mock,
        ):
            found = gateway_mod._scan_gateway_pids(set(), all_profiles=True, **scan_kwargs)
        return found, ps_mock

    strict_pids, mock_ps = scan_fake_proc()
    fallback_pids, mock_ps_enabled = scan_fake_proc(include_restart_managers=True)

    assert strict_pids == []
    assert fallback_pids == [12345]
    # The /proc path served both scans, so ``subprocess.run`` was never called.
    mock_ps.assert_not_called()
    mock_ps_enabled.assert_not_called()
def test_excludes_own_pid_from_proc_scan(self):
my_pid = os.getpid()
entries = {my_pid: _GATEWAY_CMD}