fix(gateway): Windows restart no longer causes a silent outage

`hermes gateway restart` on Windows could take the gateway offline with no
replacement. restart() was stop() -> sleep(1.0) -> start(), but the graceful
drain can run up to ~180s while the detached pythonw process stays alive. The
1s sleep let start() run against the still-draining old process; its
"already running" guard then no-opped, and when the old process finally exited
nothing relaunched it.

Two root causes, both fixed:

1. Loose PID detection. `_scan_gateway_pids` and the gateway.status helpers
   used substring matches ("... gateway" in cmdline) for lifecycle decisions,
   so they false-matched `gateway status`/`dashboard` siblings and unrelated
   processes like `python -m tui_gateway`, plus stale gateway.pid records.
   Add a shared strict matcher `looks_like_gateway_command_line()` in
   gateway/status.py that requires the real `gateway run` subcommand (or the
   dedicated entrypoints), and route `_looks_like_gateway_process`,
   `_record_looks_like_gateway`, and `_scan_gateway_pids` through it.

2. restart() race. Wait until the gateway is authoritatively gone
   (`get_running_pid()` + strict `_gateway_pids()`) before relaunch; force-kill
   once if it lingers and raise rather than start a duplicate; verify the
   relaunch produced a running gateway and raise loudly if not (no more
   exit-0 silent outage).

Scoped to Windows; systemd/launchd restart paths are already drain-aware.
Adds tests/gateway/test_gateway_command_line_matcher.py.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Charles Power 2026-06-07 21:39:14 -07:00 committed by Teknium
parent 144834b2f7
commit fd92a3a5c9
4 changed files with 147 additions and 42 deletions

View file

@ -14,6 +14,7 @@ concurrently under distinct configurations).
import hashlib
import json
import os
import re
import signal
import subprocess
import sys
@ -164,20 +165,53 @@ def _read_process_cmdline(pid: int) -> Optional[str]:
return None
def looks_like_gateway_command_line(command: str | None) -> bool:
"""Return True only for a real ``gateway run`` process command line.
Lifecycle decisions (is the gateway up? did restart relaunch it?) must not
fire on loose substring matches. The previous ``"... gateway" in cmdline``
test also matched ``hermes_cli.main gateway status`` and even unrelated
processes like ``python -m tui_gateway`` -- which made ``restart()`` race
against a still-draining old process and ``status``/``start`` report false
positives. This requires the actual ``gateway`` subcommand to be followed
by ``run`` (or the gateway-dedicated entrypoints), excluding the other
``gateway`` management subcommands and any process that merely contains the
word "gateway".
"""
if not command:
return False
normalized = command.replace("\\", "/").lower()
# Gateway-dedicated entrypoints carry no subcommand to inspect.
if re.search(r"(^|[/\s])gateway/run\.py(\s|$)", normalized):
return True
if re.search(r"(^|[/\s])hermes-gateway(?:\.exe)?(\s|$)", normalized):
return True
has_gateway_entry = (
"hermes_cli.main" in normalized
or "hermes_cli/main.py" in normalized
or re.search(r"(^|[/\s])hermes(?:\.exe)?(\s|$)", normalized) is not None
)
if not has_gateway_entry:
return False
tokens = [t.strip("\"'").replace("\\", "/").lower() for t in command.split()]
for i, token in enumerate(tokens):
if token != "gateway":
continue
if i + 1 >= len(tokens):
return True # bare `hermes gateway` defaults to `run`
return tokens[i + 1] == "run"
return False
def _looks_like_gateway_process(pid: int) -> bool:
"""Return True when the live PID still looks like the Hermes gateway."""
cmdline = _read_process_cmdline(pid)
if not cmdline:
return False
patterns = (
"hermes_cli.main gateway",
"hermes_cli/main.py gateway",
"hermes gateway",
"hermes-gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
return looks_like_gateway_command_line(cmdline)
def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
@ -189,15 +223,8 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
if not isinstance(argv, list) or not argv:
return False
# Normalize Windows backslashes so patterns match cross-platform.
cmdline = " ".join(str(part) for part in argv).replace("\\", "/")
patterns = (
"hermes_cli.main gateway",
"hermes_cli/main.py gateway",
"hermes gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
cmdline = " ".join(str(part) for part in argv)
return looks_like_gateway_command_line(cmdline)
def _build_pid_record() -> dict:

View file

@ -319,23 +319,12 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
# gateway. See #13242.
exclude_pids = exclude_pids | _get_ancestor_pids()
pids: list[int] = []
patterns = [
"hermes_cli.main gateway",
"hermes_cli.main --profile",
"hermes_cli.main -p",
"hermes_cli/main.py gateway",
"hermes_cli/main.py --profile",
"hermes_cli/main.py -p",
"hermes gateway",
# Windows: only match invocations that actually carry the ``gateway``
# subcommand or the gateway-dedicated console-script shim. Bare
# ``hermes.exe --profile`` / ``hermes.exe -p`` would also match
# ``hermes.exe --profile foo dashboard`` and other CLI subcommands,
# producing false-positive gateway PIDs (Copilot review).
"hermes.exe gateway",
"hermes-gateway.exe",
"gateway/run.py",
]
# Strict command-line matcher shared with gateway.status: requires the
# actual ``gateway run`` subcommand (or the dedicated entrypoints), so this
# scan no longer false-matches ``gateway status``/``dashboard`` siblings or
# unrelated processes like ``python -m tui_gateway``. Lazy import mirrors the
# circular-import avoidance used elsewhere in this module.
from gateway.status import looks_like_gateway_command_line
current_home = str(get_hermes_home().resolve())
current_home_lc = current_home.lower()
current_profile_arg = _profile_arg(current_home)
@ -430,8 +419,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
current_cmd = line[len("CommandLine=") :]
elif line.startswith("ProcessId="):
pid_str = line[len("ProcessId=") :]
current_cmd_lc = current_cmd.lower()
if any(p in current_cmd_lc for p in patterns) and (
if looks_like_gateway_command_line(current_cmd) and (
all_profiles or _matches_current_profile(current_cmd)
):
try:
@ -456,8 +444,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
with open(f"/proc/{pid}/cmdline", "rb") as _f:
cmdline = _f.read().decode("utf-8", errors="replace")
cmdline = cmdline.replace("\x00", " ")
cmdline_lc = cmdline.lower()
if any(p in cmdline_lc for p in patterns) and (
if looks_like_gateway_command_line(cmdline) and (
all_profiles or _matches_current_profile(cmdline)
):
_append_unique_pid(pids, pid, exclude_pids)
@ -500,8 +487,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
if pid is None:
continue
command_lc = command.lower()
if any(pattern in command_lc for pattern in patterns) and (
if looks_like_gateway_command_line(command) and (
all_profiles or _matches_current_profile(command)
):
_append_unique_pid(pids, pid, exclude_pids)

View file

@ -1302,10 +1302,54 @@ def stop() -> None:
print("✗ No gateway was running")
def _wait_for_gateway_absent(timeout_s: float = 30.0, interval_s: float = 0.5) -> bool:
"""Block until no gateway process is detectable, or the timeout elapses.
``stop()`` can return while the previous gateway is still draining
in-flight agents (the drain runs up to the restart-drain timeout). Uses the
authoritative ``get_running_pid()`` (lock + liveness + start-time +
gateway-shape) plus the now-strict ``_gateway_pids()`` scan so a relaunch
never races a still-alive old process.
"""
from gateway.status import get_running_pid
deadline = time.monotonic() + max(timeout_s, interval_s)
while time.monotonic() < deadline:
if get_running_pid() is None and not _gateway_pids():
return True
time.sleep(interval_s)
return get_running_pid() is None and not _gateway_pids()
def restart() -> None:
"""Stop the gateway then start it again."""
"""Stop the gateway then start it again.
Waits for the old gateway to be authoritatively gone before relaunching --
otherwise ``start()``'s "already running" guard sees the still-draining old
process and no-ops, and when that process later exits nothing replaces it (a
silent outage). Fails loudly if the process can't be cleared or the relaunch
doesn't produce a running gateway.
"""
_assert_windows()
from hermes_cli.gateway import kill_gateway_processes
stop()
if not _wait_for_gateway_absent(timeout_s=30.0):
print("⚠ Gateway still present after stop; forcing termination before restart...")
kill_gateway_processes(all_profiles=False, force=True)
if not _wait_for_gateway_absent(timeout_s=10.0):
raise RuntimeError(
"Gateway process still detected after force kill; refusing to "
"start a duplicate. Investigate stray PIDs before retrying."
)
# Give Windows a moment to release the listening port.
time.sleep(1.0)
start()
if not _wait_for_gateway_ready(timeout_s=15.0):
raise RuntimeError(
"Gateway restart did not produce a running gateway process. "
"Check logs/gateway.log and run `hermes gateway status`."
)

View file

@ -0,0 +1,48 @@
"""Tests for the strict gateway command-line matcher.
Regression guard for the Windows ``hermes gateway restart`` silent-outage bug:
the previous loose substring match (``"... gateway" in cmdline``) false-matched
``gateway status``/``dashboard`` siblings and unrelated processes such as
``python -m tui_gateway``, which let ``restart()`` race a still-draining old
process and ``status``/``start`` report false positives.
"""
from __future__ import annotations
import pytest
from gateway.status import looks_like_gateway_command_line as matches
ACCEPT = [
"pythonw.exe -m hermes_cli.main gateway run",
r"C:\Users\me\hermes\venv\Scripts\pythonw.exe -m hermes_cli.main gateway run",
"python -m hermes_cli.main --profile work gateway run",
"python -m hermes_cli.main gateway run --replace",
"python -m hermes_cli/main.py gateway run",
"python gateway/run.py",
"hermes-gateway.exe",
"hermes gateway", # bare `hermes gateway` defaults to run
"hermes gateway run",
]
REJECT = [
"python -m tui_gateway", # unrelated module
"python -m hermes_cli.main gateway status", # other subcommand
"python -m hermes_cli.main gateway restart",
"python -m hermes_cli.main gateway stop",
"python -m hermes_cli.main --profile x dashboard", # non-gateway subcommand
"some random python -m mygateway thing",
"",
None,
]
@pytest.mark.parametrize("cmd", ACCEPT)
def test_accepts_real_gateway_run(cmd):
assert matches(cmd) is True
@pytest.mark.parametrize("cmd", REJECT)
def test_rejects_non_gateway_run(cmd):
assert matches(cmd) is False