fix(gateway): actively reap no-systemd gateway orphan before restart

Builds on @wgu9's runtime-tracking fix: now that find_gateway_pids() can
see a no-supervisor `gateway restart` runtime, have stop_profile_gateway()
fall back to an orphan-aware, profile-scoped reap (SIGTERM then SIGKILL)
when the pidfile/runtime record is missing or stale. Closes the duplicate-
accumulation path in #51325 — a follow-up restart now kills the prior
orphan instead of stacking another listener on :8644. The reap is gated on
`not supports_systemd_services()`, so a transient `gateway restart` process
on supervised hosts is never killed.

Also adds the AUTHOR_MAP entry for the salvaged contributor.
This commit is contained in:
teknium1 2026-06-23 21:55:06 -07:00 committed by Teknium
parent 044996e403
commit 3d56807fbd
3 changed files with 124 additions and 1 deletions

View file

@ -13,6 +13,7 @@ import signal
import subprocess
import sys
import textwrap
import time
from dataclasses import dataclass
from pathlib import Path
@ -1344,12 +1345,85 @@ def kill_gateway_processes(
return killed
def _reap_unsupervised_gateway_orphans() -> bool:
    """Kill no-supervisor gateway orphans the pidfile/runtime record can't see.

    On WSL/no-systemd hosts the manual restart fallback runs the gateway
    in-process under a ``gateway restart`` argv (hermes_cli/gateway.py restart
    branch ``run_gateway()``). If its pidfile or runtime record goes missing
    or stale, ``get_running_pid()`` returns ``None`` even though a live orphan
    still holds the webhook port, so a follow-up restart stacks a duplicate on
    the same port (#51325).

    This is a no-op on hosts WITH a service supervisor, where a
    ``gateway restart`` argv is a transient management command, not the
    running gateway -- gating on ``supports_systemd_services()`` keeps the
    orphan-aware scan from killing live management processes there.

    Each orphan found by ``find_gateway_pids()`` (excluding this process) is
    sent SIGTERM. Orphans that accepted the signal are then polled for up to
    five seconds and any survivor is force-killed. PIDs that were already gone
    or that we lack permission to signal are skipped and never waited on.

    Returns:
        True if at least one orphan was successfully signalled, False
        otherwise (supervised host, failed detection/scan, nothing found, or
        no orphan could be signalled).
    """
    try:
        if supports_systemd_services():
            return False
    except Exception:
        # Fail closed: if supervisor detection itself breaks, do not risk
        # killing a transient management process.
        return False

    from gateway.status import _pid_exists, write_planned_stop_marker

    try:
        # find_gateway_pids() includes no-supervisor `gateway restart` runtimes
        # for the current profile when no systemd supervisor is present.
        orphans = [
            pid
            for pid in find_gateway_pids(exclude_pids={os.getpid()})
            if pid and pid > 0
        ]
    except Exception:
        return False

    # Only PIDs that actually received SIGTERM are worth waiting on; a PID
    # that vanished or that we cannot signal would otherwise stall the drain
    # loop for the whole timeout and then fail the SIGKILL anyway.
    signalled = []
    for pid in orphans:
        try:
            write_planned_stop_marker(pid)
        except Exception:
            # Best-effort bookkeeping; a missing marker must not block the reap.
            pass
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            continue
        except PermissionError:
            print(f"⚠ Permission denied to kill orphaned gateway PID {pid}")
            continue
        signalled.append(pid)

    if not signalled:
        return False

    # SIGTERM released the port in the field report but the orphan kept
    # running until a follow-up SIGKILL — wait briefly, then force-kill
    # any survivor so the replacement can bind the port cleanly.
    drain_timeout = 5.0
    poll_interval = 0.2
    deadline = time.monotonic() + drain_timeout
    survivors = list(signalled)
    while survivors and time.monotonic() < deadline:
        survivors = [pid for pid in survivors if _pid_exists(pid)]
        if survivors:
            time.sleep(poll_interval)

    # SIGKILL is unavailable on some platforms; fall back to SIGTERM there.
    force_signal = getattr(signal, "SIGKILL", signal.SIGTERM)
    for pid in survivors:
        try:
            os.kill(pid, force_signal)
        except OSError:
            # Covers ProcessLookupError / PermissionError: the process exited
            # in the meantime or cannot be force-killed; nothing more to do.
            pass
    return True
def stop_profile_gateway() -> bool:
"""Stop only the gateway for the current profile (HERMES_HOME-scoped).
Uses the PID file written by start_gateway(), so it only kills the
gateway belonging to this profile not gateways from other profiles.
Returns True if a process was stopped, False if none was found.
On hosts without a service supervisor (e.g. WSL/no-systemd, where the
manual restart fallback runs the gateway in-process under a ``gateway
restart`` argv), the pidfile/runtime record can be missing or stale while
a live orphan still holds the webhook port. In that case fall back to the
orphan-aware process scan so the replacement reaps the prior instance
instead of stacking a duplicate on the same port (#51325).
"""
try:
from gateway.status import get_running_pid, remove_pid_file
@ -1358,7 +1432,7 @@ def stop_profile_gateway() -> bool:
pid = get_running_pid()
if pid is None:
return False
return _reap_unsupervised_gateway_orphans()
try:
from gateway.status import write_planned_stop_marker

View file

@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
# Auto-extracted from noreply emails + manual overrides
AUTHOR_MAP = {
"145739220+wgu9@users.noreply.github.com": "wgu9", # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325)
"minz0721@outlook.com": "s010mn", # PR #29221 salvage (ollama-cloud reasoning_effort xhigh→max)
"jeevesassistant00@gmail.com": "jeeves-assistant", # PR #50771 (computer-use CuaDriver vision capture routing)
"21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk", # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126)

View file

@ -1,6 +1,7 @@
"""Tests for hermes_cli.gateway."""
import argparse
import signal
import sys
from types import ModuleType, SimpleNamespace
@ -838,6 +839,53 @@ def test_find_gateway_pids_includes_restart_managers_without_systemd(monkeypatch
assert calls == [(set(), True, True)]
def test_reap_unsupervised_orphans_noop_on_systemd_hosts(monkeypatch):
    """On supervised hosts a `gateway restart` argv is transient — never reap."""
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: True)

    killed = []
    monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: killed.append((pid, sig)))

    # Should not even consult the scan when a supervisor is present. Record
    # calls instead of raising from the stub: the reaper wraps the scan in
    # `except Exception`, so an AssertionError raised here would be swallowed
    # and the test could never fail on a regression.
    scan_calls = []

    def _record_scan(*args, **kwargs):
        scan_calls.append((args, kwargs))
        return []

    monkeypatch.setattr(gateway, "find_gateway_pids", _record_scan)

    assert gateway._reap_unsupervised_gateway_orphans() is False
    assert scan_calls == []
    assert killed == []
def test_reap_unsupervised_orphans_sigterms_then_sigkills_survivor(monkeypatch):
    """No-systemd: orphan gets SIGTERM, and a survivor is force-killed."""
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "find_gateway_pids", lambda exclude_pids=None: [708])
    monkeypatch.setattr("gateway.status.write_planned_stop_marker", lambda pid: True)
    # Orphan ignores SIGTERM (matches the field report) and stays alive, so the
    # follow-up SIGKILL must fire.
    monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)

    sent = []
    monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: sent.append((pid, sig)))

    # Collapse the drain window: no real sleeping, and the second clock read
    # is already far past the deadline computed from the first, so the wait
    # loop exits immediately.
    monkeypatch.setattr(gateway.time, "sleep", lambda _s: None)
    ticks = iter([0.0, 100.0, 200.0])
    monkeypatch.setattr(gateway.time, "monotonic", lambda: next(ticks, 200.0))

    assert gateway._reap_unsupervised_gateway_orphans() is True

    # Mirror the implementation's fallback so the test does not raise
    # AttributeError on platforms that lack SIGKILL, and pin the ordering the
    # test name promises: graceful signal first, forced signal second.
    force_signal = getattr(signal, "SIGKILL", signal.SIGTERM)
    assert sent == [(708, signal.SIGTERM), (708, force_signal)]
def test_reap_unsupervised_orphans_returns_false_when_none_found(monkeypatch):
    """No-systemd host with an empty scan: nothing is signalled, result is False."""
    signals_sent = []

    def _empty_scan(exclude_pids=None):
        # Stand-in for find_gateway_pids() reporting no orphaned gateways.
        return []

    def _record_kill(pid, sig):
        signals_sent.append((pid, sig))

    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "find_gateway_pids", _empty_scan)
    monkeypatch.setattr(gateway.os, "kill", _record_kill)

    outcome = gateway._reap_unsupervised_gateway_orphans()

    assert outcome is False
    assert signals_sent == []
def test_scan_gateway_pids_detects_windows_hermes_exe_case_variants(monkeypatch):
monkeypatch.setattr(gateway, "is_windows", lambda: True)
monkeypatch.setattr(gateway, "_get_ancestor_pids", lambda: set())