fix(gateway): actively reap no-systemd gateway orphan before restart

Builds on @wgu9's runtime-tracking fix: now that find_gateway_pids() can
see a no-supervisor `gateway restart` runtime, have stop_profile_gateway()
fall back to an orphan-aware, profile-scoped reap (SIGTERM then SIGKILL)
when the pidfile/runtime record is missing or stale. Closes the duplicate-
accumulation path in #51325 — a follow-up restart now kills the prior
orphan instead of stacking another listener on :8644. The reap is gated on
`not supports_systemd_services()`, so a transient `gateway restart` argv on
supervised hosts is never killed.

Also adds the AUTHOR_MAP entry for the salvaged contributor.
This commit is contained in:
teknium1 2026-06-23 21:55:06 -07:00 committed by Teknium
parent 044996e403
commit 3d56807fbd
3 changed files with 124 additions and 1 deletions

View file

@ -13,6 +13,7 @@ import signal
import subprocess
import sys
import textwrap
import time
from dataclasses import dataclass
from pathlib import Path
@ -1344,12 +1345,85 @@ def kill_gateway_processes(
return killed
def _reap_unsupervised_gateway_orphans() -> bool:
"""Kill no-supervisor gateway orphans the pidfile/runtime record can't see.
On WSL/no-systemd hosts the manual restart fallback runs the gateway
in-process under a ``gateway restart`` argv (hermes_cli/gateway.py restart
branch ``run_gateway()``). If its pidfile or runtime record goes missing
or stale, ``get_running_pid()`` returns ``None`` even though a live orphan
still holds the webhook port, so a follow-up restart stacks a duplicate on
the same port (#51325). This is a no-op on hosts WITH a service supervisor,
where a ``gateway restart`` argv is a transient management command, not the
running gateway gating on ``supports_systemd_services()`` keeps the
orphan-aware scan from killing live management processes there.
Returns True if at least one orphan was reaped.
"""
try:
if supports_systemd_services():
return False
except Exception:
return False
from gateway.status import _pid_exists, write_planned_stop_marker
own = {os.getpid()}
try:
# find_gateway_pids() includes no-supervisor `gateway restart` runtimes
# for the current profile when no systemd supervisor is present.
orphans = [p for p in find_gateway_pids(exclude_pids=own) if p and p > 0]
except Exception:
return False
if not orphans:
return False
reaped = False
for pid in orphans:
try:
write_planned_stop_marker(pid)
except Exception:
pass
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
continue
except PermissionError:
print(f"⚠ Permission denied to kill orphaned gateway PID {pid}")
continue
reaped = True
# SIGTERM released the port in the field report but the orphan kept
# running until a follow-up SIGKILL — wait briefly, then force-kill
# any survivor so the replacement can bind the port cleanly.
deadline = time.monotonic() + 5.0
survivors = list(orphans)
while survivors and time.monotonic() < deadline:
survivors = [p for p in survivors if _pid_exists(p)]
if survivors:
time.sleep(0.2)
for pid in survivors:
try:
os.kill(pid, getattr(signal, "SIGKILL", signal.SIGTERM))
except (ProcessLookupError, PermissionError, OSError):
pass
return reaped
def stop_profile_gateway() -> bool:
"""Stop only the gateway for the current profile (HERMES_HOME-scoped).
Uses the PID file written by start_gateway(), so it only kills the
gateway belonging to this profile not gateways from other profiles.
Returns True if a process was stopped, False if none was found.
On hosts without a service supervisor (e.g. WSL/no-systemd, where the
manual restart fallback runs the gateway in-process under a ``gateway
restart`` argv), the pidfile/runtime record can be missing or stale while
a live orphan still holds the webhook port. In that case fall back to the
orphan-aware process scan so the replacement reaps the prior instance
instead of stacking a duplicate on the same port (#51325).
"""
try:
from gateway.status import get_running_pid, remove_pid_file
@ -1358,7 +1432,7 @@ def stop_profile_gateway() -> bool:
pid = get_running_pid()
if pid is None:
return False
return _reap_unsupervised_gateway_orphans()
try:
from gateway.status import write_planned_stop_marker