def _reap_unsupervised_gateway_orphans() -> bool:
    """Kill no-supervisor gateway orphans the pidfile/runtime record can't see.

    On WSL/no-systemd hosts the manual restart fallback runs the gateway
    in-process under a ``gateway restart`` argv (hermes_cli/gateway.py restart
    branch → ``run_gateway()``). If its pidfile or runtime record goes missing
    or stale, ``get_running_pid()`` returns ``None`` even though a live orphan
    still holds the webhook port, so a follow-up restart stacks a duplicate on
    the same port (#51325). This is a no-op on hosts WITH a service supervisor,
    where a ``gateway restart`` argv is a transient management command, not the
    running gateway — gating on ``supports_systemd_services()`` keeps the
    orphan-aware scan from killing live management processes there.

    Each orphan gets a best-effort planned-stop marker followed by SIGTERM.
    Only the PIDs that were actually signalled are then polled (for up to
    five seconds) and force-killed if they are still alive; PIDs we could not
    signal are reported and left alone, so they never stall the caller.

    Returns:
        True if at least one orphan was signalled. False on supervised hosts,
        when the supervisor probe or the process scan raises, or when no
        orphan could be signalled.
    """
    try:
        if supports_systemd_services():
            return False
    except Exception:
        # Fail closed: without a reliable answer about the supervisor we must
        # not risk killing a live management process.
        return False

    from gateway.status import _pid_exists, write_planned_stop_marker

    try:
        # find_gateway_pids() includes no-supervisor `gateway restart` runtimes
        # for the current profile when no systemd supervisor is present.
        orphans = [
            pid
            for pid in find_gateway_pids(exclude_pids={os.getpid()})
            if pid and pid > 0
        ]
    except Exception:
        return False

    signalled = []
    for pid in orphans:
        try:
            write_planned_stop_marker(pid)
        except Exception:
            # The marker is advisory; failing to write it must not block the kill.
            pass
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Already gone between the scan and the signal.
            continue
        except PermissionError:
            print(f"⚠ Permission denied to kill orphaned gateway PID {pid}")
            continue
        except OSError as exc:
            # Any other OS-level failure: report it and keep going with the
            # remaining orphans instead of aborting the whole stop.
            print(f"⚠ Could not signal orphaned gateway PID {pid}: {exc}")
            continue
        signalled.append(pid)

    if not signalled:
        return False

    # SIGTERM released the port in the field report but the orphan kept
    # running until a follow-up SIGKILL — wait briefly, then force-kill
    # any survivor so the replacement can bind the port cleanly.
    deadline = time.monotonic() + 5.0
    survivors = signalled
    while survivors and time.monotonic() < deadline:
        survivors = [pid for pid in survivors if _pid_exists(pid)]
        if survivors:
            time.sleep(0.2)

    # SIGKILL does not exist on every platform; fall back to SIGTERM there.
    force_signal = getattr(signal, "SIGKILL", signal.SIGTERM)
    for pid in survivors:
        # Re-check right before signalling: the process may have exited
        # during the last sleep of the drain window.
        if not _pid_exists(pid):
            continue
        try:
            os.kill(pid, force_signal)
        except OSError:
            # Covers ProcessLookupError/PermissionError too; nothing more we
            # can do for this PID.
            pass

    return True
""" try: from gateway.status import get_running_pid, remove_pid_file @@ -1358,7 +1432,7 @@ def stop_profile_gateway() -> bool: pid = get_running_pid() if pid is None: - return False + return _reap_unsupervised_gateway_orphans() try: from gateway.status import write_planned_stop_marker diff --git a/scripts/release.py b/scripts/release.py index ebfbfb570c1..530e73a9b94 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json" # Auto-extracted from noreply emails + manual overrides AUTHOR_MAP = { + "145739220+wgu9@users.noreply.github.com": "wgu9", # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325) "minz0721@outlook.com": "s010mn", # PR #29221 salvage (ollama-cloud reasoning_effort xhigh→max) "jeevesassistant00@gmail.com": "jeeves-assistant", # PR #50771 (computer-use CuaDriver vision capture routing) "21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk", # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126) diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py index ba29d2f3347..9fb3e99caca 100644 --- a/tests/hermes_cli/test_gateway.py +++ b/tests/hermes_cli/test_gateway.py @@ -1,6 +1,7 @@ """Tests for hermes_cli.gateway.""" import argparse +import signal import sys from types import ModuleType, SimpleNamespace @@ -838,6 +839,53 @@ def test_find_gateway_pids_includes_restart_managers_without_systemd(monkeypatch assert calls == [(set(), True, True)] +def test_reap_unsupervised_orphans_noop_on_systemd_hosts(monkeypatch): + """On supervised hosts a `gateway restart` argv is transient — never reap.""" + monkeypatch.setattr(gateway, "supports_systemd_services", lambda: True) + killed = [] + monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: killed.append((pid, sig))) + # Should not even consult the scan when a supervisor is present. 
def test_reap_unsupervised_orphans_sigterms_then_sigkills_survivor(monkeypatch):
    """No-systemd: orphan gets SIGTERM, and a survivor is force-killed."""
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "find_gateway_pids", lambda exclude_pids=None: [708])
    monkeypatch.setattr("gateway.status.write_planned_stop_marker", lambda pid: True)
    # Orphan ignores SIGTERM (matches the field report) and stays alive, so the
    # follow-up force-kill must fire.
    monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)

    sent = []
    monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: sent.append((pid, sig)))
    # Collapse the drain window: no real sleeping, and every clock read after
    # the first is already far past the deadline, so the test never waits.
    monkeypatch.setattr(gateway.time, "sleep", lambda _s: None)
    ticks = iter([0.0, 100.0, 200.0])
    monkeypatch.setattr(gateway.time, "monotonic", lambda: next(ticks, 200.0))

    # signal.SIGKILL is not defined on every platform; mirror the fallback the
    # code under test uses so this test does not raise AttributeError there.
    force_signal = getattr(signal, "SIGKILL", signal.SIGTERM)

    assert gateway._reap_unsupervised_gateway_orphans() is True
    # Order matters: graceful SIGTERM first, force-kill second.
    assert sent == [(708, signal.SIGTERM), (708, force_signal)]


def test_reap_unsupervised_orphans_returns_false_when_none_found(monkeypatch):
    """No-systemd with an empty scan: nothing is signalled and False is returned."""
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "find_gateway_pids", lambda exclude_pids=None: [])
    killed = []
    monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: killed.append((pid, sig)))

    assert gateway._reap_unsupervised_gateway_orphans() is False
    assert killed == []