diff --git a/gateway/run.py b/gateway/run.py index 0cdfb71466..da3560cf74 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9261,8 +9261,18 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = runner = GatewayRunner(config) + # Track whether a signal initiated the shutdown (vs. internal request). + # When an unexpected SIGTERM kills the gateway, we exit non-zero so + # systemd's Restart=on-failure revives the process. systemctl stop + # is safe: systemd tracks stop-requested state independently of exit + # code, so Restart= never fires for a deliberate stop. + _signal_initiated_shutdown = False + # Set up signal handlers def shutdown_signal_handler(): + nonlocal _signal_initiated_shutdown + _signal_initiated_shutdown = True + logger.info("Received SIGTERM/SIGINT — initiating shutdown") asyncio.create_task(runner.stop()) def restart_signal_handler(): @@ -9332,6 +9342,21 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = if runner.exit_code is not None: raise SystemExit(runner.exit_code) + # When a signal (SIGTERM/SIGINT) caused the shutdown and it wasn't a + # planned restart (/restart, /update, SIGUSR1), exit non-zero so + # systemd's Restart=on-failure revives the process. This covers: + # - hermes update killing the gateway mid-work + # - External kill commands + # - WSL2/container runtime sending unexpected signals + # systemctl stop is safe: systemd tracks "stop requested" state + # independently of exit code, so Restart= never fires for it. + if _signal_initiated_shutdown and not runner._restart_requested: + logger.info( + "Exiting with code 1 (signal-initiated shutdown without restart " + "request) so systemd Restart=on-failure can revive the gateway." 
def remove_pid_file() -> None:
    """Delete the gateway PID file when this process is its owner.

    The file is treated as a JSON record carrying a ``pid`` entry. It is
    unlinked only when one of the following holds:

    * no record could be read from the file,
    * the record has no usable integer ``pid``, or
    * the recorded ``pid`` equals ``os.getpid()``.

    If the record names a different process the file is left in place.
    This matters during ``--replace`` handoffs: the old process's atexit
    handler can fire AFTER the new process has written its own PID file,
    and removing it unconditionally would leave the new gateway running
    without a PID file (invisible to ``get_running_pid()``).

    This is best-effort cleanup: any exception raised along the way is
    swallowed, so the function never raises.
    """
    try:
        pid_path = _get_pid_path()
        payload = _read_json_file(pid_path)

        # Work out which process, if any, the file claims to belong to.
        owner_pid = None
        if payload is not None:
            try:
                owner_pid = int(payload["pid"])
            except (KeyError, TypeError, ValueError):
                # Missing or malformed pid: treat the file as unowned.
                owner_pid = None

        # An unowned file, or one we own, is safe to remove; a file owned
        # by another process must be left alone.
        if owner_pid is None or owner_pid == os.getpid():
            pid_path.unlink(missing_ok=True)
    except Exception:
        pass