mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): actively reap no-systemd gateway orphan before restart
Builds on @wgu9's runtime-tracking fix: now that find_gateway_pids() can see a no-supervisor `gateway restart` runtime, have stop_profile_gateway() fall back to an orphan-aware, profile-scoped reap (SIGTERM then SIGKILL) when the pidfile/runtime record is missing or stale. Closes the duplicate- accumulation path in #51325 — a follow-up restart now kills the prior orphan instead of stacking another listener on :8644. Gated on not supports_systemd_services() so a transient `gateway restart` argv on supervised hosts is never killed. Also adds the AUTHOR_MAP entry for the salvaged contributor.
This commit is contained in:
parent
044996e403
commit
3d56807fbd
3 changed files with 124 additions and 1 deletions
|
|
@ -13,6 +13,7 @@ import signal
|
|||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -1344,12 +1345,85 @@ def kill_gateway_processes(
|
|||
return killed
|
||||
|
||||
|
||||
def _reap_unsupervised_gateway_orphans() -> bool:
|
||||
"""Kill no-supervisor gateway orphans the pidfile/runtime record can't see.
|
||||
|
||||
On WSL/no-systemd hosts the manual restart fallback runs the gateway
|
||||
in-process under a ``gateway restart`` argv (hermes_cli/gateway.py restart
|
||||
branch → ``run_gateway()``). If its pidfile or runtime record goes missing
|
||||
or stale, ``get_running_pid()`` returns ``None`` even though a live orphan
|
||||
still holds the webhook port, so a follow-up restart stacks a duplicate on
|
||||
the same port (#51325). This is a no-op on hosts WITH a service supervisor,
|
||||
where a ``gateway restart`` argv is a transient management command, not the
|
||||
running gateway — gating on ``supports_systemd_services()`` keeps the
|
||||
orphan-aware scan from killing live management processes there.
|
||||
|
||||
Returns True if at least one orphan was reaped.
|
||||
"""
|
||||
try:
|
||||
if supports_systemd_services():
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
from gateway.status import _pid_exists, write_planned_stop_marker
|
||||
|
||||
own = {os.getpid()}
|
||||
try:
|
||||
# find_gateway_pids() includes no-supervisor `gateway restart` runtimes
|
||||
# for the current profile when no systemd supervisor is present.
|
||||
orphans = [p for p in find_gateway_pids(exclude_pids=own) if p and p > 0]
|
||||
except Exception:
|
||||
return False
|
||||
if not orphans:
|
||||
return False
|
||||
|
||||
reaped = False
|
||||
for pid in orphans:
|
||||
try:
|
||||
write_planned_stop_marker(pid)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
continue
|
||||
except PermissionError:
|
||||
print(f"⚠ Permission denied to kill orphaned gateway PID {pid}")
|
||||
continue
|
||||
reaped = True
|
||||
|
||||
# SIGTERM released the port in the field report but the orphan kept
|
||||
# running until a follow-up SIGKILL — wait briefly, then force-kill
|
||||
# any survivor so the replacement can bind the port cleanly.
|
||||
deadline = time.monotonic() + 5.0
|
||||
survivors = list(orphans)
|
||||
while survivors and time.monotonic() < deadline:
|
||||
survivors = [p for p in survivors if _pid_exists(p)]
|
||||
if survivors:
|
||||
time.sleep(0.2)
|
||||
for pid in survivors:
|
||||
try:
|
||||
os.kill(pid, getattr(signal, "SIGKILL", signal.SIGTERM))
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
|
||||
return reaped
|
||||
|
||||
|
||||
def stop_profile_gateway() -> bool:
|
||||
"""Stop only the gateway for the current profile (HERMES_HOME-scoped).
|
||||
|
||||
Uses the PID file written by start_gateway(), so it only kills the
|
||||
gateway belonging to this profile — not gateways from other profiles.
|
||||
Returns True if a process was stopped, False if none was found.
|
||||
|
||||
On hosts without a service supervisor (e.g. WSL/no-systemd, where the
|
||||
manual restart fallback runs the gateway in-process under a ``gateway
|
||||
restart`` argv), the pidfile/runtime record can be missing or stale while
|
||||
a live orphan still holds the webhook port. In that case fall back to the
|
||||
orphan-aware process scan so the replacement reaps the prior instance
|
||||
instead of stacking a duplicate on the same port (#51325).
|
||||
"""
|
||||
try:
|
||||
from gateway.status import get_running_pid, remove_pid_file
|
||||
|
|
@ -1358,7 +1432,7 @@ def stop_profile_gateway() -> bool:
|
|||
|
||||
pid = get_running_pid()
|
||||
if pid is None:
|
||||
return False
|
||||
return _reap_unsupervised_gateway_orphans()
|
||||
|
||||
try:
|
||||
from gateway.status import write_planned_stop_marker
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue