mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): actively reap no-systemd gateway orphan before restart
Builds on @wgu9's runtime-tracking fix: now that find_gateway_pids() can see a no-supervisor `gateway restart` runtime, have stop_profile_gateway() fall back to an orphan-aware, profile-scoped reap (SIGTERM then SIGKILL) when the pidfile/runtime record is missing or stale. Closes the duplicate-accumulation path in #51325 — a follow-up restart now kills the prior orphan instead of stacking another listener on :8644. Gated on not supports_systemd_services() so a transient `gateway restart` argv on supervised hosts is never killed. Also adds the AUTHOR_MAP entry for the salvaged contributor.
This commit is contained in:
parent
044996e403
commit
3d56807fbd
3 changed files with 124 additions and 1 deletions
|
|
@ -13,6 +13,7 @@ import signal
|
|||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -1344,12 +1345,85 @@ def kill_gateway_processes(
|
|||
return killed
|
||||
|
||||
|
||||
def _reap_unsupervised_gateway_orphans() -> bool:
    """Kill no-supervisor gateway orphans the pidfile/runtime record can't see.

    On WSL/no-systemd hosts the manual restart fallback runs the gateway
    in-process under a ``gateway restart`` argv (hermes_cli/gateway.py restart
    branch → ``run_gateway()``). If its pidfile or runtime record goes missing
    or stale, ``get_running_pid()`` returns ``None`` even though a live orphan
    still holds the webhook port, so a follow-up restart stacks a duplicate on
    the same port (#51325). This is a no-op on hosts WITH a service supervisor,
    where a ``gateway restart`` argv is a transient management command, not the
    running gateway — gating on ``supports_systemd_services()`` keeps the
    orphan-aware scan from killing live management processes there.

    Each orphan is sent SIGTERM; any that is still alive after a short drain
    window (5 seconds) is force-killed. Only PIDs that actually received
    SIGTERM are waited on, so an orphan we lack permission to signal does not
    stall the restart for the whole drain window.

    Returns:
        True if SIGTERM was delivered to at least one orphan, False otherwise
        (supervised host, supervisor probe or scan failure, no orphans found,
        or no orphan could be signalled).
    """
    try:
        if supports_systemd_services():
            return False
    except Exception:
        # If the supervisor probe itself fails, err on the side of not killing.
        return False

    from gateway.status import _pid_exists, write_planned_stop_marker

    own = {os.getpid()}
    try:
        # find_gateway_pids() includes no-supervisor `gateway restart` runtimes
        # for the current profile when no systemd supervisor is present.
        orphans = [p for p in find_gateway_pids(exclude_pids=own) if p and p > 0]
    except Exception:
        return False
    if not orphans:
        return False

    # PIDs that accepted SIGTERM; only these are worth waiting on / escalating.
    signalled = []
    for pid in orphans:
        try:
            # Best-effort: failing to record the marker must not block the
            # kill. (Presumably the marker flags this stop as intentional —
            # confirm against gateway.status.)
            write_planned_stop_marker(pid)
        except Exception:
            pass
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Already gone between the scan and the signal.
            continue
        except PermissionError:
            print(f"⚠ Permission denied to kill orphaned gateway PID {pid}")
            continue
        signalled.append(pid)

    if not signalled:
        return False

    # SIGTERM released the port in the field report but the orphan kept
    # running until a follow-up SIGKILL — wait briefly, then force-kill
    # any survivor so the replacement can bind the port cleanly.
    deadline = time.monotonic() + 5.0
    survivors = list(signalled)
    while survivors and time.monotonic() < deadline:
        survivors = [p for p in survivors if _pid_exists(p)]
        if survivors:
            time.sleep(0.2)

    # Platforms without SIGKILL fall back to a second SIGTERM.
    force_signal = getattr(signal, "SIGKILL", signal.SIGTERM)
    for pid in survivors:
        try:
            os.kill(pid, force_signal)
        except OSError:
            # Covers ProcessLookupError / PermissionError: the process exited
            # on its own or cannot be force-killed; nothing more to do.
            pass

    return True
|
||||
|
||||
|
||||
def stop_profile_gateway() -> bool:
|
||||
"""Stop only the gateway for the current profile (HERMES_HOME-scoped).
|
||||
|
||||
Uses the PID file written by start_gateway(), so it only kills the
|
||||
gateway belonging to this profile — not gateways from other profiles.
|
||||
Returns True if a process was stopped, False if none was found.
|
||||
|
||||
On hosts without a service supervisor (e.g. WSL/no-systemd, where the
|
||||
manual restart fallback runs the gateway in-process under a ``gateway
|
||||
restart`` argv), the pidfile/runtime record can be missing or stale while
|
||||
a live orphan still holds the webhook port. In that case fall back to the
|
||||
orphan-aware process scan so the replacement reaps the prior instance
|
||||
instead of stacking a duplicate on the same port (#51325).
|
||||
"""
|
||||
try:
|
||||
from gateway.status import get_running_pid, remove_pid_file
|
||||
|
|
@ -1358,7 +1432,7 @@ def stop_profile_gateway() -> bool:
|
|||
|
||||
pid = get_running_pid()
|
||||
if pid is None:
|
||||
return False
|
||||
return _reap_unsupervised_gateway_orphans()
|
||||
|
||||
try:
|
||||
from gateway.status import write_planned_stop_marker
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
|
|||
|
||||
# Auto-extracted from noreply emails + manual overrides
|
||||
AUTHOR_MAP = {
|
||||
"145739220+wgu9@users.noreply.github.com": "wgu9", # PR #51468 salvage (WSL/no-systemd orphan gateway tracking, #51325)
|
||||
"minz0721@outlook.com": "s010mn", # PR #29221 salvage (ollama-cloud reasoning_effort xhigh→max)
|
||||
"jeevesassistant00@gmail.com": "jeeves-assistant", # PR #50771 (computer-use CuaDriver vision capture routing)
|
||||
"21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk", # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
"""Tests for hermes_cli.gateway."""
|
||||
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
from types import ModuleType, SimpleNamespace
|
||||
|
||||
|
|
@ -838,6 +839,53 @@ def test_find_gateway_pids_includes_restart_managers_without_systemd(monkeypatch
|
|||
assert calls == [(set(), True, True)]
|
||||
|
||||
|
||||
def test_reap_unsupervised_orphans_noop_on_systemd_hosts(monkeypatch):
    """On supervised hosts a `gateway restart` argv is transient — never reap.

    The process scan must not be consulted at all when a supervisor is
    present. Scan calls are recorded rather than made to raise: the function
    under test wraps the scan in ``except Exception`` and returns False, so an
    exception raised from the stub would be swallowed and the test would pass
    even if the systemd gate were broken.
    """
    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: True)

    killed = []
    monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: killed.append((pid, sig)))

    scans = []

    def _recording_scan(*args, **kwargs):
        scans.append((args, kwargs))
        return []

    monkeypatch.setattr(gateway, "find_gateway_pids", _recording_scan)

    assert gateway._reap_unsupervised_gateway_orphans() is False
    assert scans == [], "scanned on systemd host"
    assert killed == []
|
||||
|
||||
|
||||
def test_reap_unsupervised_orphans_sigterms_then_sigkills_survivor(monkeypatch):
    """No-systemd: orphan gets SIGTERM, and a survivor is force-killed."""
    # Mirror the production fallback so the test does not raise AttributeError
    # on platforms where the signal module has no SIGKILL.
    force_signal = getattr(signal, "SIGKILL", signal.SIGTERM)

    monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
    monkeypatch.setattr(gateway, "find_gateway_pids", lambda exclude_pids=None: [708])
    monkeypatch.setattr("gateway.status.write_planned_stop_marker", lambda pid: True)
    # Orphan ignores SIGTERM (matches the field report) and stays alive, so the
    # follow-up force-kill must fire.
    monkeypatch.setattr("gateway.status._pid_exists", lambda pid: True)

    sent = []
    monkeypatch.setattr(gateway.os, "kill", lambda pid, sig: sent.append((pid, sig)))

    # Collapse the drain window: no real sleeping, and jump past the deadline
    # after the first check so the loop exits immediately.
    monkeypatch.setattr(gateway.time, "sleep", lambda _s: None)
    ticks = iter([0.0, 100.0, 200.0])
    monkeypatch.setattr(gateway.time, "monotonic", lambda: next(ticks, 200.0))

    assert gateway._reap_unsupervised_gateway_orphans() is True
    # Exactly one graceful signal followed by exactly one forced signal.
    assert sent == [(708, signal.SIGTERM), (708, force_signal)]
|
||||
|
||||
|
||||
def test_reap_unsupervised_orphans_returns_false_when_none_found(monkeypatch):
    """Without a supervisor and with an empty scan, nothing is signalled."""
    signals_sent = []

    def _no_supervisor():
        return False

    def _empty_scan(exclude_pids=None):
        return []

    def _record_kill(pid, sig):
        signals_sent.append((pid, sig))

    monkeypatch.setattr(gateway, "supports_systemd_services", _no_supervisor)
    monkeypatch.setattr(gateway, "find_gateway_pids", _empty_scan)
    monkeypatch.setattr(gateway.os, "kill", _record_kill)

    outcome = gateway._reap_unsupervised_gateway_orphans()

    assert outcome is False
    assert signals_sent == []
|
||||
|
||||
|
||||
def test_scan_gateway_pids_detects_windows_hermes_exe_case_variants(monkeypatch):
|
||||
monkeypatch.setattr(gateway, "is_windows", lambda: True)
|
||||
monkeypatch.setattr(gateway, "_get_ancestor_pids", lambda: set())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue