mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: gateway auto-recovers from unexpected SIGTERM via systemd (#5646)
Root cause: when the gateway received SIGTERM (from hermes update, an external kill, the WSL2 runtime, etc.), it exited with status 0. systemd's Restart=on-failure only restarts on a non-zero exit, so the gateway stayed dead permanently and users had to restart it manually.

Fix 1: Signal-initiated shutdown exits non-zero.
When SIGTERM/SIGINT is received and no restart was requested (via /restart, /update, or SIGUSR1), start_gateway() returns False, which causes sys.exit(1). systemd sees a failure exit and auto-restarts after RestartSec=30. This is safe because systemd tracks the stop-requested state of systemctl stop independently of the exit code — Restart= never fires for a deliberate stop, regardless of exit code. The gateway also logs 'Received SIGTERM/SIGINT — initiating shutdown' so the cause of unexpected shutdowns is visible in agent.log.

Fix 2: PID file ownership guard.
remove_pid_file() now checks that the PID file belongs to the current process before removing it. During --replace handoffs, the old process's atexit handler could fire AFTER the new process wrote its PID file, deleting the new record. This left the gateway running but invisible to get_running_pid(), causing 'Another gateway already running' errors on the next restart.

Test plan:
- All restart drain tests pass (13)
- All gateway service tests pass (84)
- All update gateway restart tests pass (34)
This commit is contained in:
parent
eed891f1bb
commit
397386cae2
2 changed files with 43 additions and 2 deletions
|
|
@ -9261,8 +9261,18 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
|
||||
runner = GatewayRunner(config)
|
||||
|
||||
# Track whether a signal initiated the shutdown (vs. internal request).
|
||||
# When an unexpected SIGTERM kills the gateway, we exit non-zero so
|
||||
# systemd's Restart=on-failure revives the process. systemctl stop
|
||||
# is safe: systemd tracks stop-requested state independently of exit
|
||||
# code, so Restart= never fires for a deliberate stop.
|
||||
_signal_initiated_shutdown = False
|
||||
|
||||
# Set up signal handlers
|
||||
def shutdown_signal_handler():
|
||||
nonlocal _signal_initiated_shutdown
|
||||
_signal_initiated_shutdown = True
|
||||
logger.info("Received SIGTERM/SIGINT — initiating shutdown")
|
||||
asyncio.create_task(runner.stop())
|
||||
|
||||
def restart_signal_handler():
|
||||
|
|
@ -9332,6 +9342,21 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
if runner.exit_code is not None:
|
||||
raise SystemExit(runner.exit_code)
|
||||
|
||||
# When a signal (SIGTERM/SIGINT) caused the shutdown and it wasn't a
|
||||
# planned restart (/restart, /update, SIGUSR1), exit non-zero so
|
||||
# systemd's Restart=on-failure revives the process. This covers:
|
||||
# - hermes update killing the gateway mid-work
|
||||
# - External kill commands
|
||||
# - WSL2/container runtime sending unexpected signals
|
||||
# systemctl stop is safe: systemd tracks "stop requested" state
|
||||
# independently of exit code, so Restart= never fires for it.
|
||||
if _signal_initiated_shutdown and not runner._restart_requested:
|
||||
logger.info(
|
||||
"Exiting with code 1 (signal-initiated shutdown without restart "
|
||||
"request) so systemd Restart=on-failure can revive the gateway."
|
||||
)
|
||||
return False # → sys.exit(1) in the caller
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -266,9 +266,25 @@ def read_runtime_status() -> Optional[dict[str, Any]]:
|
|||
|
||||
|
||||
def remove_pid_file() -> None:
    """Remove the gateway PID file, but only if it belongs to this process.

    During --replace handoffs, the old process's atexit handler can fire AFTER
    the new process has written its own PID file. Blindly removing the file
    would delete the new process's record, leaving the gateway running with no
    PID file (invisible to ``get_running_pid()``).

    The file is left in place only when its record carries a ``pid`` that
    parses as an integer and differs from ``os.getpid()``. When
    ``_read_json_file`` returns ``None``, or the record has no usable ``pid``,
    ownership cannot be established and the file is removed as before.

    This is best-effort cleanup: any exception raised while locating, reading
    or unlinking the file is swallowed so shutdown is never interrupted.
    """
    try:
        path = _get_pid_path()
        # Read the record BEFORE unlinking; unlinking first would make the
        # ownership check below meaningless.
        record = _read_json_file(path)
        if record is not None:
            try:
                file_pid = int(record["pid"])
            except (KeyError, TypeError, ValueError):
                # Missing, null or non-numeric pid: treat the owner as unknown.
                file_pid = None
            if file_pid is not None and file_pid != os.getpid():
                # PID file belongs to a different process — leave it alone.
                return
        path.unlink(missing_ok=True)
    except Exception:
        # Deliberately silent: this runs during shutdown, where a failure to
        # clean up must not raise.
        pass
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue