mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
fix(gateway): Windows restart no longer causes a silent outage
`hermes gateway restart` on Windows could take the gateway offline with no
replacement. restart() was stop() -> sleep(1.0) -> start(), but the graceful
drain can run up to ~180s while the detached pythonw process stays alive. The
1s sleep let start() run against the still-draining old process; its
"already running" guard then no-opped, and when the old process finally exited
nothing relaunched it.
Two root causes, both fixed:
1. Loose PID detection. `_scan_gateway_pids` and the gateway.status helpers
used substring matches ("... gateway" in cmdline) for lifecycle decisions,
so they false-matched `gateway status`/`dashboard` siblings and unrelated
processes like `python -m tui_gateway`, plus stale gateway.pid records.
Add a shared strict matcher `looks_like_gateway_command_line()` in
gateway/status.py that requires the real `gateway run` subcommand (or the
dedicated entrypoints), and route `_looks_like_gateway_process`,
`_record_looks_like_gateway`, and `_scan_gateway_pids` through it.
2. restart() race. Wait until the gateway is authoritatively gone
(`get_running_pid()` + strict `_gateway_pids()`) before relaunch; force-kill
once if it lingers and raise rather than start a duplicate; verify the
relaunch produced a running gateway and raise loudly if not (no more
exit-0 silent outage).
Scoped to Windows; systemd/launchd restart paths are already drain-aware.
Adds tests/gateway/test_gateway_command_line_matcher.py.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
144834b2f7
commit
fd92a3a5c9
4 changed files with 147 additions and 42 deletions
|
|
@ -14,6 +14,7 @@ concurrently under distinct configurations).
|
|||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
|
|
@ -164,20 +165,53 @@ def _read_process_cmdline(pid: int) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def looks_like_gateway_command_line(command: str | None) -> bool:
|
||||
"""Return True only for a real ``gateway run`` process command line.
|
||||
|
||||
Lifecycle decisions (is the gateway up? did restart relaunch it?) must not
|
||||
fire on loose substring matches. The previous ``"... gateway" in cmdline``
|
||||
test also matched ``hermes_cli.main gateway status`` and even unrelated
|
||||
processes like ``python -m tui_gateway`` -- which made ``restart()`` race
|
||||
against a still-draining old process and ``status``/``start`` report false
|
||||
positives. This requires the actual ``gateway`` subcommand to be followed
|
||||
by ``run`` (or the gateway-dedicated entrypoints), excluding the other
|
||||
``gateway`` management subcommands and any process that merely contains the
|
||||
word "gateway".
|
||||
"""
|
||||
if not command:
|
||||
return False
|
||||
normalized = command.replace("\\", "/").lower()
|
||||
|
||||
# Gateway-dedicated entrypoints carry no subcommand to inspect.
|
||||
if re.search(r"(^|[/\s])gateway/run\.py(\s|$)", normalized):
|
||||
return True
|
||||
if re.search(r"(^|[/\s])hermes-gateway(?:\.exe)?(\s|$)", normalized):
|
||||
return True
|
||||
|
||||
has_gateway_entry = (
|
||||
"hermes_cli.main" in normalized
|
||||
or "hermes_cli/main.py" in normalized
|
||||
or re.search(r"(^|[/\s])hermes(?:\.exe)?(\s|$)", normalized) is not None
|
||||
)
|
||||
if not has_gateway_entry:
|
||||
return False
|
||||
|
||||
tokens = [t.strip("\"'").replace("\\", "/").lower() for t in command.split()]
|
||||
for i, token in enumerate(tokens):
|
||||
if token != "gateway":
|
||||
continue
|
||||
if i + 1 >= len(tokens):
|
||||
return True # bare `hermes gateway` defaults to `run`
|
||||
return tokens[i + 1] == "run"
|
||||
return False
|
||||
|
||||
|
||||
def _looks_like_gateway_process(pid: int) -> bool:
|
||||
"""Return True when the live PID still looks like the Hermes gateway."""
|
||||
cmdline = _read_process_cmdline(pid)
|
||||
if not cmdline:
|
||||
return False
|
||||
|
||||
patterns = (
|
||||
"hermes_cli.main gateway",
|
||||
"hermes_cli/main.py gateway",
|
||||
"hermes gateway",
|
||||
"hermes-gateway",
|
||||
"gateway/run.py",
|
||||
)
|
||||
return any(pattern in cmdline for pattern in patterns)
|
||||
return looks_like_gateway_command_line(cmdline)
|
||||
|
||||
|
||||
def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
|
||||
|
|
@ -189,15 +223,8 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
|
|||
if not isinstance(argv, list) or not argv:
|
||||
return False
|
||||
|
||||
# Normalize Windows backslashes so patterns match cross-platform.
|
||||
cmdline = " ".join(str(part) for part in argv).replace("\\", "/")
|
||||
patterns = (
|
||||
"hermes_cli.main gateway",
|
||||
"hermes_cli/main.py gateway",
|
||||
"hermes gateway",
|
||||
"gateway/run.py",
|
||||
)
|
||||
return any(pattern in cmdline for pattern in patterns)
|
||||
cmdline = " ".join(str(part) for part in argv)
|
||||
return looks_like_gateway_command_line(cmdline)
|
||||
|
||||
|
||||
def _build_pid_record() -> dict:
|
||||
|
|
|
|||
|
|
@ -319,23 +319,12 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
# gateway. See #13242.
|
||||
exclude_pids = exclude_pids | _get_ancestor_pids()
|
||||
pids: list[int] = []
|
||||
patterns = [
|
||||
"hermes_cli.main gateway",
|
||||
"hermes_cli.main --profile",
|
||||
"hermes_cli.main -p",
|
||||
"hermes_cli/main.py gateway",
|
||||
"hermes_cli/main.py --profile",
|
||||
"hermes_cli/main.py -p",
|
||||
"hermes gateway",
|
||||
# Windows: only match invocations that actually carry the ``gateway``
|
||||
# subcommand or the gateway-dedicated console-script shim. Bare
|
||||
# ``hermes.exe --profile`` / ``hermes.exe -p`` would also match
|
||||
# ``hermes.exe --profile foo dashboard`` and other CLI subcommands,
|
||||
# producing false-positive gateway PIDs (Copilot review).
|
||||
"hermes.exe gateway",
|
||||
"hermes-gateway.exe",
|
||||
"gateway/run.py",
|
||||
]
|
||||
# Strict command-line matcher shared with gateway.status: requires the
|
||||
# actual ``gateway run`` subcommand (or the dedicated entrypoints), so this
|
||||
# scan no longer false-matches ``gateway status``/``dashboard`` siblings or
|
||||
# unrelated processes like ``python -m tui_gateway``. Lazy import mirrors the
|
||||
# circular-import avoidance used elsewhere in this module.
|
||||
from gateway.status import looks_like_gateway_command_line
|
||||
current_home = str(get_hermes_home().resolve())
|
||||
current_home_lc = current_home.lower()
|
||||
current_profile_arg = _profile_arg(current_home)
|
||||
|
|
@ -430,8 +419,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
current_cmd = line[len("CommandLine=") :]
|
||||
elif line.startswith("ProcessId="):
|
||||
pid_str = line[len("ProcessId=") :]
|
||||
current_cmd_lc = current_cmd.lower()
|
||||
if any(p in current_cmd_lc for p in patterns) and (
|
||||
if looks_like_gateway_command_line(current_cmd) and (
|
||||
all_profiles or _matches_current_profile(current_cmd)
|
||||
):
|
||||
try:
|
||||
|
|
@ -456,8 +444,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
with open(f"/proc/{pid}/cmdline", "rb") as _f:
|
||||
cmdline = _f.read().decode("utf-8", errors="replace")
|
||||
cmdline = cmdline.replace("\x00", " ")
|
||||
cmdline_lc = cmdline.lower()
|
||||
if any(p in cmdline_lc for p in patterns) and (
|
||||
if looks_like_gateway_command_line(cmdline) and (
|
||||
all_profiles or _matches_current_profile(cmdline)
|
||||
):
|
||||
_append_unique_pid(pids, pid, exclude_pids)
|
||||
|
|
@ -500,8 +487,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
|
|||
|
||||
if pid is None:
|
||||
continue
|
||||
command_lc = command.lower()
|
||||
if any(pattern in command_lc for pattern in patterns) and (
|
||||
if looks_like_gateway_command_line(command) and (
|
||||
all_profiles or _matches_current_profile(command)
|
||||
):
|
||||
_append_unique_pid(pids, pid, exclude_pids)
|
||||
|
|
|
|||
|
|
@ -1302,10 +1302,54 @@ def stop() -> None:
|
|||
print("✗ No gateway was running")
|
||||
|
||||
|
||||
def _wait_for_gateway_absent(timeout_s: float = 30.0, interval_s: float = 0.5) -> bool:
|
||||
"""Block until no gateway process is detectable, or the timeout elapses.
|
||||
|
||||
``stop()`` can return while the previous gateway is still draining
|
||||
in-flight agents (the drain runs up to the restart-drain timeout). Uses the
|
||||
authoritative ``get_running_pid()`` (lock + liveness + start-time +
|
||||
gateway-shape) plus the now-strict ``_gateway_pids()`` scan so a relaunch
|
||||
never races a still-alive old process.
|
||||
"""
|
||||
from gateway.status import get_running_pid
|
||||
|
||||
deadline = time.monotonic() + max(timeout_s, interval_s)
|
||||
while time.monotonic() < deadline:
|
||||
if get_running_pid() is None and not _gateway_pids():
|
||||
return True
|
||||
time.sleep(interval_s)
|
||||
return get_running_pid() is None and not _gateway_pids()
|
||||
|
||||
|
||||
def restart() -> None:
|
||||
"""Stop the gateway then start it again."""
|
||||
"""Stop the gateway then start it again.
|
||||
|
||||
Waits for the old gateway to be authoritatively gone before relaunching --
|
||||
otherwise ``start()``'s "already running" guard sees the still-draining old
|
||||
process and no-ops, and when that process later exits nothing replaces it (a
|
||||
silent outage). Fails loudly if the process can't be cleared or the relaunch
|
||||
doesn't produce a running gateway.
|
||||
"""
|
||||
_assert_windows()
|
||||
from hermes_cli.gateway import kill_gateway_processes
|
||||
|
||||
stop()
|
||||
|
||||
if not _wait_for_gateway_absent(timeout_s=30.0):
|
||||
print("⚠ Gateway still present after stop; forcing termination before restart...")
|
||||
kill_gateway_processes(all_profiles=False, force=True)
|
||||
if not _wait_for_gateway_absent(timeout_s=10.0):
|
||||
raise RuntimeError(
|
||||
"Gateway process still detected after force kill; refusing to "
|
||||
"start a duplicate. Investigate stray PIDs before retrying."
|
||||
)
|
||||
|
||||
# Give Windows a moment to release the listening port.
|
||||
time.sleep(1.0)
|
||||
start()
|
||||
|
||||
if not _wait_for_gateway_ready(timeout_s=15.0):
|
||||
raise RuntimeError(
|
||||
"Gateway restart did not produce a running gateway process. "
|
||||
"Check logs/gateway.log and run `hermes gateway status`."
|
||||
)
|
||||
|
|
|
|||
48
tests/gateway/test_gateway_command_line_matcher.py
Normal file
48
tests/gateway/test_gateway_command_line_matcher.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
"""Tests for the strict gateway command-line matcher.
|
||||
|
||||
Regression guard for the Windows ``hermes gateway restart`` silent-outage bug:
|
||||
the previous loose substring match (``"... gateway" in cmdline``) false-matched
|
||||
``gateway status``/``dashboard`` siblings and unrelated processes such as
|
||||
``python -m tui_gateway``, which let ``restart()`` race a still-draining old
|
||||
process and ``status``/``start`` report false positives.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from gateway.status import looks_like_gateway_command_line as matches
|
||||
|
||||
|
||||
ACCEPT = [
|
||||
"pythonw.exe -m hermes_cli.main gateway run",
|
||||
r"C:\Users\me\hermes\venv\Scripts\pythonw.exe -m hermes_cli.main gateway run",
|
||||
"python -m hermes_cli.main --profile work gateway run",
|
||||
"python -m hermes_cli.main gateway run --replace",
|
||||
"python -m hermes_cli/main.py gateway run",
|
||||
"python gateway/run.py",
|
||||
"hermes-gateway.exe",
|
||||
"hermes gateway", # bare `hermes gateway` defaults to run
|
||||
"hermes gateway run",
|
||||
]
|
||||
|
||||
REJECT = [
|
||||
"python -m tui_gateway", # unrelated module
|
||||
"python -m hermes_cli.main gateway status", # other subcommand
|
||||
"python -m hermes_cli.main gateway restart",
|
||||
"python -m hermes_cli.main gateway stop",
|
||||
"python -m hermes_cli.main --profile x dashboard", # non-gateway subcommand
|
||||
"some random python -m mygateway thing",
|
||||
"",
|
||||
None,
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cmd", ACCEPT)
|
||||
def test_accepts_real_gateway_run(cmd):
|
||||
assert matches(cmd) is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize("cmd", REJECT)
|
||||
def test_rejects_non_gateway_run(cmd):
|
||||
assert matches(cmd) is False
|
||||
Loading…
Add table
Add a link
Reference in a new issue