From 039023f49747599199c8aee26a7a73b3640f9b6a Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 14 Apr 2026 16:26:36 -0700 Subject: [PATCH] diag: log all hermes processes on unexpected gateway shutdown (#9905) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the gateway receives SIGTERM/SIGINT, the shutdown handler now runs 'ps aux' and logs every hermes/gateway-related process (excluding itself). This will show in agent.log as: WARNING: Shutdown diagnostic — other hermes processes running: hermes 1234 ... hermes update --gateway hermes 5678 ... hermes gateway restart This is the missing diagnostic for #5646 / #6666 — we can prove the restarts are from systemctl but can't determine WHO issues the systemctl command. Next time it happens, the agent.log will contain the evidence (the process that sent the signal or called systemctl should still be alive when the handler fires). --- gateway/run.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/gateway/run.py b/gateway/run.py index da3560cf743..5c3e5f13c6e 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9273,6 +9273,29 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = nonlocal _signal_initiated_shutdown _signal_initiated_shutdown = True logger.info("Received SIGTERM/SIGINT — initiating shutdown") + # Diagnostic: log all hermes-related processes so we can identify + # what triggered the signal (hermes update, hermes gateway restart, + # a stale detached subprocess, etc.). + try: + import subprocess as _sp + _ps = _sp.run( + ["ps", "aux"], + capture_output=True, text=True, timeout=3, + ) + _hermes_procs = [ + line for line in _ps.stdout.splitlines() + if ("hermes" in line.lower() or "gateway" in line.lower()) + and str(os.getpid()) not in line.split()[1:2] # exclude self + ] + if _hermes_procs: + logger.warning( + "Shutdown diagnostic — other hermes processes running:\n %s", + "\n ".join(_hermes_procs), + ) + else: + logger.info("Shutdown diagnostic — no other hermes processes found") + except Exception: + pass asyncio.create_task(runner.stop()) def restart_signal_handler():