diff --git a/gateway/status.py b/gateway/status.py index 367ac33c4d7..5e5584a1ed8 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -14,6 +14,7 @@ concurrently under distinct configurations). import hashlib import json import os +import re import signal import subprocess import sys @@ -164,20 +165,53 @@ def _read_process_cmdline(pid: int) -> Optional[str]: return None +def looks_like_gateway_command_line(command: str | None) -> bool: + """Return True only for a real ``gateway run`` process command line. + + Lifecycle decisions (is the gateway up? did restart relaunch it?) must not + fire on loose substring matches. The previous ``"... gateway" in cmdline`` + test also matched ``hermes_cli.main gateway status`` and even unrelated + processes like ``python -m tui_gateway`` -- which made ``restart()`` race + against a still-draining old process and ``status``/``start`` report false + positives. This requires the actual ``gateway`` subcommand to be followed + by ``run`` (or the gateway-dedicated entrypoints), excluding the other + ``gateway`` management subcommands and any process that merely contains the + word "gateway". + """ + if not command: + return False + normalized = command.replace("\\", "/").lower() + + # Gateway-dedicated entrypoints carry no subcommand to inspect. + if re.search(r"(^|[/\s])gateway/run\.py(\s|$)", normalized): + return True + if re.search(r"(^|[/\s])hermes-gateway(?:\.exe)?(\s|$)", normalized): + return True + + has_gateway_entry = ( + "hermes_cli.main" in normalized + or "hermes_cli/main.py" in normalized + or re.search(r"(^|[/\s])hermes(?:\.exe)?(\s|$)", normalized) is not None + ) + if not has_gateway_entry: + return False + + tokens = [t.strip("\"'").replace("\\", "/").lower() for t in command.split()] + for i, token in enumerate(tokens): + if token != "gateway": + continue + if i + 1 >= len(tokens): + return True # bare `hermes gateway` defaults to `run` + return tokens[i + 1] == "run" + return False + + def _looks_like_gateway_process(pid: int) -> bool: """Return True when the live PID still looks like the Hermes gateway.""" cmdline = _read_process_cmdline(pid) if not cmdline: return False - - patterns = ( - "hermes_cli.main gateway", - "hermes_cli/main.py gateway", - "hermes gateway", - "hermes-gateway", - "gateway/run.py", - ) - return any(pattern in cmdline for pattern in patterns) + return looks_like_gateway_command_line(cmdline) def _record_looks_like_gateway(record: dict[str, Any]) -> bool: @@ -189,15 +223,8 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool: if not isinstance(argv, list) or not argv: return False - # Normalize Windows backslashes so patterns match cross-platform. - cmdline = " ".join(str(part) for part in argv).replace("\\", "/") - patterns = ( - "hermes_cli.main gateway", - "hermes_cli/main.py gateway", - "hermes gateway", - "gateway/run.py", - ) - return any(pattern in cmdline for pattern in patterns) + cmdline = " ".join(str(part) for part in argv) + return looks_like_gateway_command_line(cmdline) def _build_pid_record() -> dict: diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 7e5406a11dd..06f9c49b916 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -319,23 +319,12 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li # gateway. See #13242. exclude_pids = exclude_pids | _get_ancestor_pids() pids: list[int] = [] - patterns = [ - "hermes_cli.main gateway", - "hermes_cli.main --profile", - "hermes_cli.main -p", - "hermes_cli/main.py gateway", - "hermes_cli/main.py --profile", - "hermes_cli/main.py -p", - "hermes gateway", - # Windows: only match invocations that actually carry the ``gateway`` - # subcommand or the gateway-dedicated console-script shim. Bare - # ``hermes.exe --profile`` / ``hermes.exe -p`` would also match - # ``hermes.exe --profile foo dashboard`` and other CLI subcommands, - # producing false-positive gateway PIDs (Copilot review). - "hermes.exe gateway", - "hermes-gateway.exe", - "gateway/run.py", - ] + # Strict command-line matcher shared with gateway.status: requires the + # actual ``gateway run`` subcommand (or the dedicated entrypoints), so this + # scan no longer false-matches ``gateway status``/``dashboard`` siblings or + # unrelated processes like ``python -m tui_gateway``. Lazy import mirrors the + # circular-import avoidance used elsewhere in this module. + from gateway.status import looks_like_gateway_command_line current_home = str(get_hermes_home().resolve()) current_home_lc = current_home.lower() current_profile_arg = _profile_arg(current_home) @@ -430,8 +419,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li current_cmd = line[len("CommandLine=") :] elif line.startswith("ProcessId="): pid_str = line[len("ProcessId=") :] - current_cmd_lc = current_cmd.lower() - if any(p in current_cmd_lc for p in patterns) and ( + if looks_like_gateway_command_line(current_cmd) and ( all_profiles or _matches_current_profile(current_cmd) ): try: @@ -456,8 +444,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li with open(f"/proc/{pid}/cmdline", "rb") as _f: cmdline = _f.read().decode("utf-8", errors="replace") cmdline = cmdline.replace("\x00", " ") - cmdline_lc = cmdline.lower() - if any(p in cmdline_lc for p in patterns) and ( + if looks_like_gateway_command_line(cmdline) and ( all_profiles or _matches_current_profile(cmdline) ): _append_unique_pid(pids, pid, exclude_pids) @@ -500,8 +487,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li if pid is None: continue - command_lc = command.lower() - if any(pattern in command_lc for pattern in patterns) and ( + if looks_like_gateway_command_line(command) and ( all_profiles or _matches_current_profile(command) ): _append_unique_pid(pids, pid, exclude_pids) diff --git a/hermes_cli/gateway_windows.py b/hermes_cli/gateway_windows.py index 08c7d8c019c..466031bfaa7 100644 --- a/hermes_cli/gateway_windows.py +++ b/hermes_cli/gateway_windows.py @@ -1302,10 +1302,54 @@ def stop() -> None: print("✗ No gateway was running") +def _wait_for_gateway_absent(timeout_s: float = 30.0, interval_s: float = 0.5) -> bool: + """Block until no gateway process is detectable, or the timeout elapses. + + ``stop()`` can return while the previous gateway is still draining + in-flight agents (the drain runs up to the restart-drain timeout). Uses the + authoritative ``get_running_pid()`` (lock + liveness + start-time + + gateway-shape) plus the now-strict ``_gateway_pids()`` scan so a relaunch + never races a still-alive old process. + """ + from gateway.status import get_running_pid + + deadline = time.monotonic() + max(timeout_s, interval_s) + while time.monotonic() < deadline: + if get_running_pid() is None and not _gateway_pids(): + return True + time.sleep(interval_s) + return get_running_pid() is None and not _gateway_pids() + + def restart() -> None: - """Stop the gateway then start it again.""" + """Stop the gateway then start it again. + + Waits for the old gateway to be authoritatively gone before relaunching -- + otherwise ``start()``'s "already running" guard sees the still-draining old + process and no-ops, and when that process later exits nothing replaces it (a + silent outage). Fails loudly if the process can't be cleared or the relaunch + doesn't produce a running gateway. + """ _assert_windows() + from hermes_cli.gateway import kill_gateway_processes + stop() + + if not _wait_for_gateway_absent(timeout_s=30.0): + print("⚠ Gateway still present after stop; forcing termination before restart...") + kill_gateway_processes(all_profiles=False, force=True) + if not _wait_for_gateway_absent(timeout_s=10.0): + raise RuntimeError( + "Gateway process still detected after force kill; refusing to " + "start a duplicate. Investigate stray PIDs before retrying." + ) + # Give Windows a moment to release the listening port. time.sleep(1.0) start() + + if not _wait_for_gateway_ready(timeout_s=15.0): + raise RuntimeError( + "Gateway restart did not produce a running gateway process. " + "Check logs/gateway.log and run `hermes gateway status`." + ) diff --git a/tests/gateway/test_gateway_command_line_matcher.py b/tests/gateway/test_gateway_command_line_matcher.py new file mode 100644 index 00000000000..5b8b16a7d54 --- /dev/null +++ b/tests/gateway/test_gateway_command_line_matcher.py @@ -0,0 +1,48 @@ +"""Tests for the strict gateway command-line matcher. + +Regression guard for the Windows ``hermes gateway restart`` silent-outage bug: +the previous loose substring match (``"... gateway" in cmdline``) false-matched +``gateway status``/``dashboard`` siblings and unrelated processes such as +``python -m tui_gateway``, which let ``restart()`` race a still-draining old +process and ``status``/``start`` report false positives. +""" + +from __future__ import annotations + +import pytest + +from gateway.status import looks_like_gateway_command_line as matches + + +ACCEPT = [ + "pythonw.exe -m hermes_cli.main gateway run", + r"C:\Users\me\hermes\venv\Scripts\pythonw.exe -m hermes_cli.main gateway run", + "python -m hermes_cli.main --profile work gateway run", + "python -m hermes_cli.main gateway run --replace", + "python -m hermes_cli/main.py gateway run", + "python gateway/run.py", + "hermes-gateway.exe", + "hermes gateway", # bare `hermes gateway` defaults to run + "hermes gateway run", +] + +REJECT = [ + "python -m tui_gateway", # unrelated module + "python -m hermes_cli.main gateway status", # other subcommand + "python -m hermes_cli.main gateway restart", + "python -m hermes_cli.main gateway stop", + "python -m hermes_cli.main --profile x dashboard", # non-gateway subcommand + "some random python -m mygateway thing", + "", + None, +] + + +@pytest.mark.parametrize("cmd", ACCEPT) +def test_accepts_real_gateway_run(cmd): + assert matches(cmd) is True + + +@pytest.mark.parametrize("cmd", REJECT) +def test_rejects_non_gateway_run(cmd): + assert matches(cmd) is False