feat(gateway): shutdown forensics — non-blocking diag, per-phase timing, stale-unit warning (#23285)

When the gateway received SIGTERM, the shutdown_signal_handler ran a
synchronous 'ps aux' (3s timeout) inside the asyncio event loop and only
then called asyncio.create_task(runner.stop()).  On a busy host that ate 1-3s of
the teardown budget before draining could even start, and the resulting
log line was a multi-line ps dump that didn't tell us who sent the
signal.  The shutdown path itself logged 'Stopping gateway...' and then
nothing until 'Gateway stopped' — when systemd SIGKILLed mid-drain,
there was no way to see which phase wedged.

Changes:
- New gateway/shutdown_forensics.py:
  * snapshot_shutdown_context(sig) — sub-millisecond /proc-only capture
    of signal name, parent pid+name+cmdline, INVOCATION_ID (systemd
    marker), loadavg_1m, TracerPid, takeover/planned-stop marker
    presence + whether-it-names-self.  Pure stdlib, never raises.
  * spawn_async_diagnostic(log_path, sig) — detached subprocess with
    its own 'timeout 5s', start_new_session=True, writes ps auxf +
    pstree + dmesg to ~/.hermes/logs/gateway-shutdown-diag.log.
    Returns immediately, can't block the event loop or the cgroup
    teardown.
  * check_systemd_timing_alignment(drain_timeout) — reads
    /proc/self/cgroup for our unit, asks systemctl show for
    TimeoutStopUSec, returns mismatch info when the unit's stop
    timeout is smaller than restart_drain_timeout + 30s headroom
    (the case where systemd SIGKILLs mid-drain).
  * _parse_systemd_duration_to_us — covers '90s', '1min 30s',
    '500ms', '1h' style values from systemctl show.
  * format_context_for_log — single scannable key=value line, parent
    cmdline last.
- gateway/run.py shutdown_signal_handler:
  * Replaces synchronous ps aux + ad-hoc 'hermes-related lines' filter
    with snapshot + detached spawn.
  * Always logs 'Shutdown context: signal=... parent_pid=...
    parent_cmdline=...' regardless of planned/unexpected so we can
    correlate signal source even on planned restarts.
- gateway/run.py _stop_impl:
  * Per-phase '+X.XXs' timing for notify_active_sessions, drain
    (with drain_seconds, active_at_start, active_now, timed_out),
    post-interrupt tool kill, each adapter disconnect (Xs),
    all adapters disconnected, final-cleanup tool kill, SessionDB
    close, total teardown.
- gateway/run.py start():
  * Stale-unit warning at startup when the running systemd unit's
    TimeoutStopSec is smaller than the configured drain timeout plus
    the 30s headroom (i.e. below the expected minimum stop timeout).
    Points the user at 'hermes gateway service install --replace'
    to regenerate, or at shortening agent.restart_drain_timeout.

Tests: 30 new in tests/gateway/test_shutdown_forensics.py — snapshot
speed bound, signal name resolution, marker detection self-vs-other,
async diag spawn doesn't block caller, systemd duration parser, and
alignment check returns None outside systemd.  Wider tests/gateway/
suite: 5258 passing; the 3 failures are pre-existing TTS-routing
failures that also fail on main and are unchanged by this commit.
This commit is contained in:
Teknium 2026-05-10 09:01:51 -07:00 committed by GitHub
parent 1f5983c4c8
commit cede612987
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 828 additions and 26 deletions

View file

@ -3206,6 +3206,28 @@ class GatewayRunner:
except RuntimeError:
self._gateway_loop = None
logger.info("Session storage: %s", self.config.sessions_dir)
# Sanity-check that systemd's TimeoutStopSec covers our drain
# window. When the user upgraded hermes-agent without re-running
# ``hermes setup``, their unit file may still encode the old
# default — in which case SIGKILL hits mid-drain and looks like
# a phantom kill in the journal. Best-effort, never raises.
try:
from gateway.shutdown_forensics import check_systemd_timing_alignment
_alignment = check_systemd_timing_alignment(self._restart_drain_timeout)
if _alignment is not None and _alignment.get("mismatch"):
logger.warning(
"Stale systemd unit detected: %s has TimeoutStopSec=%.0fs but "
"drain_timeout=%.0fs (expected >=%.0fs). systemd may SIGKILL the "
"gateway mid-drain. Run `hermes gateway service install --replace` "
"to regenerate the unit, or shorten agent.restart_drain_timeout.",
_alignment.get("unit", "(unknown)"),
_alignment["timeout_stop_sec"],
_alignment["drain_timeout"],
_alignment["expected_min"],
)
except Exception as _e:
logger.debug("check_systemd_timing_alignment failed: %s", _e)
# Log the resolved max_iterations budget so operators can verify the
# config.yaml → env bridge did the right thing at a glance (instead
# of silently running at a stale .env value for weeks).
@ -4498,15 +4520,34 @@ class GatewayRunner:
"Stopping gateway%s...",
" for restart" if self._restart_requested else "",
)
_stop_started_at = time.monotonic()
def _phase_elapsed() -> float:
return time.monotonic() - _stop_started_at
self._running = False
self._draining = True
# Notify all chats with active agents BEFORE draining.
# Adapters are still connected here, so messages can be sent.
await self._notify_active_sessions_of_shutdown()
logger.info(
"Shutdown phase: notify_active_sessions done at +%.2fs",
_phase_elapsed(),
)
timeout = self._restart_drain_timeout
_drain_started_at = time.monotonic()
active_agents, timed_out = await self._drain_active_agents(timeout)
logger.info(
"Shutdown phase: drain done at +%.2fs (drain took %.2fs, "
"timed_out=%s, active_at_start=%d, active_now=%d)",
_phase_elapsed(),
time.monotonic() - _drain_started_at,
timed_out,
len(active_agents),
self._running_agent_count(),
)
if timed_out:
logger.warning(
"Gateway drain timed out after %.1fs with %d active agent(s); interrupting remaining work.",
@ -4564,6 +4605,10 @@ class GatewayRunner:
# killed by systemd instead of us (issue #8202). The final
# catch-all cleanup below still runs for the graceful path.
_kill_tool_subprocesses("post-interrupt")
logger.info(
"Shutdown phase: post-interrupt tool kill done at +%.2fs",
_phase_elapsed(),
)
if self._restart_requested and self._restart_detached:
try:
@ -4591,15 +4636,29 @@ class GatewayRunner:
self._cleanup_agent_resources(_agent)
for platform, adapter in list(self.adapters.items()):
_adapter_started_at = time.monotonic()
try:
await adapter.cancel_background_tasks()
except Exception as e:
logger.debug("%s background-task cancel error: %s", platform.value, e)
try:
await adapter.disconnect()
logger.info("%s disconnected", platform.value)
logger.info(
"%s disconnected (%.2fs)",
platform.value,
time.monotonic() - _adapter_started_at,
)
except Exception as e:
logger.error("%s disconnect error: %s", platform.value, e)
logger.error(
"%s disconnect error after %.2fs: %s",
platform.value,
time.monotonic() - _adapter_started_at,
e,
)
logger.info(
"Shutdown phase: all adapters disconnected at +%.2fs",
_phase_elapsed(),
)
for _task in list(self._background_tasks):
if _task is self._stop_task:
@ -4624,6 +4683,10 @@ class GatewayRunner:
# that got respawned between the earlier call and adapter
# disconnect (defense in depth; safe to call repeatedly).
_kill_tool_subprocesses("final-cleanup")
logger.info(
"Shutdown phase: final-cleanup tool kill done at +%.2fs",
_phase_elapsed(),
)
# Reap the process-global auxiliary-client cache once at the very
# end of teardown. Per-turn cleanup runs in _cleanup_agent_resources
@ -4651,6 +4714,10 @@ class GatewayRunner:
_db.close()
except Exception as _e:
logger.debug("SessionDB close error: %s", _e)
logger.info(
"Shutdown phase: SessionDB close done at +%.2fs",
_phase_elapsed(),
)
from gateway.status import remove_pid_file, release_gateway_runtime_lock
remove_pid_file()
@ -4690,7 +4757,7 @@ class GatewayRunner:
self._draining = False
self._update_runtime_status("stopped", self._exit_reason)
logger.info("Gateway stopped")
logger.info("Gateway stopped (total teardown %.2fs)", _phase_elapsed())
self._stop_task = asyncio.create_task(_stop_impl())
await self._stop_task
@ -15762,40 +15829,62 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
except Exception as e:
logger.debug("Planned stop marker check failed: %s", e)
# Fast (<10ms) snapshot of who's asking us to shut down — runs
# synchronously inside the asyncio signal handler, so we keep it
# purely stdlib + /proc reads, no subprocesses. See PR #15826
# (May 2026): the previous implementation called `ps aux` here
# synchronously, blocking the event loop for up to 3s while
# adapter teardown couldn't begin.
try:
from gateway.shutdown_forensics import (
format_context_for_log,
snapshot_shutdown_context,
spawn_async_diagnostic,
)
_shutdown_ctx = snapshot_shutdown_context(received_signal)
except Exception as _e:
_shutdown_ctx = None
logger.debug("snapshot_shutdown_context failed: %s", _e)
if planned_takeover:
logger.info(
"Received SIGTERM as a planned --replace takeover — exiting cleanly"
"Received %s as a planned --replace takeover — exiting cleanly",
_shutdown_ctx["signal"] if _shutdown_ctx else "SIGTERM",
)
elif planned_stop:
logger.info(
"Received SIGTERM/SIGINT as a planned gateway stop — exiting cleanly"
"Received %s as a planned gateway stop — exiting cleanly",
_shutdown_ctx["signal"] if _shutdown_ctx else "SIGTERM/SIGINT",
)
else:
_signal_initiated_shutdown = True
logger.info("Received SIGTERM/SIGINT — initiating shutdown")
# Diagnostic: log all hermes-related processes so we can identify
# what triggered the signal (hermes update, hermes gateway restart,
# a stale detached subprocess, etc.).
try:
import subprocess as _sp
_ps = _sp.run(
["ps", "aux"],
capture_output=True, text=True, timeout=3,
logger.info(
"Received %s — initiating shutdown",
_shutdown_ctx["signal"] if _shutdown_ctx else "SIGTERM/SIGINT",
)
_hermes_procs = [
line for line in _ps.stdout.splitlines()
if ("hermes" in line.lower() or "gateway" in line.lower())
and str(os.getpid()) not in line.split()[1:2] # exclude self
]
if _hermes_procs:
# Always log who/what triggered the signal — most useful single
# line when diagnosing "the gateway keeps dying" tickets. Format
# is one line, key=value, parent_cmdline last (often long).
if _shutdown_ctx is not None:
try:
logger.warning(
"Shutdown diagnostic — other hermes processes running:\n %s",
"\n ".join(_hermes_procs),
"Shutdown context: %s", format_context_for_log(_shutdown_ctx)
)
else:
logger.info("Shutdown diagnostic — no other hermes processes found")
except Exception:
pass
except Exception as _e:
logger.debug("format_context_for_log failed: %s", _e)
# Spawn the heavyweight diagnostic (ps auxf, pstree, dmesg) in
# a detached subprocess so it can finish writing to disk even
# if our cgroup is being torn down. Bounded by an internal
# timeout; never blocks the event loop here.
try:
_diag_log = _hermes_home / "logs" / "gateway-shutdown-diag.log"
spawn_async_diagnostic(
_diag_log, _shutdown_ctx["signal"], timeout_seconds=5.0
)
except Exception as _e:
logger.debug("spawn_async_diagnostic failed: %s", _e)
asyncio.create_task(runner.stop())
def restart_signal_handler():