fix(gateway): handle planned service stops

This commit is contained in:
helix4u 2026-05-04 13:35:23 -06:00 committed by Teknium
parent 20428f5e60
commit b632290166
5 changed files with 286 additions and 72 deletions

View file

@ -15003,15 +15003,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
runner = GatewayRunner(config)
# Track whether a signal initiated the shutdown (vs. internal request).
# When an unexpected SIGTERM kills the gateway, we exit non-zero so
# systemd's Restart=on-failure revives the process. systemctl stop
# is safe: systemd tracks stop-requested state independently of exit
# code, so Restart= never fires for a deliberate stop.
# Track whether an unexpected signal initiated the shutdown. When an
# unexpected SIGTERM kills the gateway, we exit non-zero so service
# managers can revive the process. Planned stop paths write a marker
# before signalling us so that we can exit cleanly instead.
_signal_initiated_shutdown = False
# Set up signal handlers
def shutdown_signal_handler():
def shutdown_signal_handler(received_signal=None):
nonlocal _signal_initiated_shutdown
# Planned --replace takeover check: when a sibling gateway is
# taking over via --replace, it wrote a marker naming this PID
@ -15027,10 +15026,28 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
except Exception as e:
logger.debug("Takeover marker check failed: %s", e)
# Planned stop check: service managers and `hermes gateway stop`
# also send SIGTERM, which is indistinguishable from an unexpected
# external kill unless the CLI marks it first. SIGINT comes from an
# interactive Ctrl+C and is likewise an intentional foreground stop.
planned_stop = False
if received_signal == signal.SIGINT:
planned_stop = True
elif not planned_takeover:
try:
from gateway.status import consume_planned_stop_marker_for_self
planned_stop = consume_planned_stop_marker_for_self()
except Exception as e:
logger.debug("Planned stop marker check failed: %s", e)
if planned_takeover:
logger.info(
"Received SIGTERM as a planned --replace takeover — exiting cleanly"
)
elif planned_stop:
logger.info(
"Received SIGTERM/SIGINT as a planned gateway stop — exiting cleanly"
)
else:
_signal_initiated_shutdown = True
logger.info("Received SIGTERM/SIGINT — initiating shutdown")
@ -15066,7 +15083,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
if threading.current_thread() is threading.main_thread():
for sig in (signal.SIGINT, signal.SIGTERM):
try:
loop.add_signal_handler(sig, shutdown_signal_handler)
loop.add_signal_handler(sig, shutdown_signal_handler, sig)
except NotImplementedError:
pass
if hasattr(signal, "SIGUSR1"):
@ -15164,14 +15181,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
if runner.exit_code is not None:
raise SystemExit(runner.exit_code)
# When a signal (SIGTERM/SIGINT) caused the shutdown and it wasn't a
# planned restart (/restart, /update, SIGUSR1), exit non-zero so
# systemd's Restart=on-failure revives the process. This covers:
# When an unexpected SIGTERM caused the shutdown and it wasn't a planned
# restart (/restart, /update, SIGUSR1), exit non-zero so systemd's
# Restart=on-failure revives the process. This covers:
# - hermes update killing the gateway mid-work
# - External kill commands
# - WSL2/container runtime sending unexpected signals
# systemctl stop is safe: systemd tracks "stop requested" state
# independently of exit code, so Restart= never fires for it.
# `hermes gateway stop` and interactive Ctrl+C are handled above as
# planned stops and should not trigger service-manager revival.
if _signal_initiated_shutdown and not runner._restart_requested:
logger.info(
"Exiting with code 1 (signal-initiated shutdown without restart "

View file

@ -637,6 +637,8 @@ def release_all_scoped_locks(
_TAKEOVER_MARKER_FILENAME = ".gateway-takeover.json"
_TAKEOVER_MARKER_TTL_S = 60 # Marker older than this is treated as stale
# File name of the planned-stop marker, created inside the Hermes home directory.
_PLANNED_STOP_MARKER_FILENAME = ".gateway-planned-stop.json"
# Seconds; a planned-stop marker older than this is treated as stale.
_PLANNED_STOP_MARKER_TTL_S = 60
def _get_takeover_marker_path() -> Path:
@ -645,6 +647,67 @@ def _get_takeover_marker_path() -> Path:
return home / _TAKEOVER_MARKER_FILENAME
def _get_planned_stop_marker_path() -> Path:
    """Locate the planned-stop marker file.

    Returns:
        The Hermes home directory (from ``get_hermes_home()``) joined with
        ``_PLANNED_STOP_MARKER_FILENAME``.
    """
    return get_hermes_home() / _PLANNED_STOP_MARKER_FILENAME
def _marker_is_stale(written_at: str, ttl_s: int) -> bool:
    """Report whether a marker timestamp is more than ``ttl_s`` seconds old.

    Args:
        written_at: ISO-8601 timestamp recorded when the marker was written.
        ttl_s: Maximum accepted age of the marker, in seconds.

    Returns:
        ``True`` when the age strictly exceeds ``ttl_s``, or when the
        timestamp cannot be used at all: a non-string or unparseable value,
        or a timezone-naive value that cannot be subtracted from the aware
        UTC "now". ``False`` otherwise.
    """
    try:
        stamped = datetime.fromisoformat(written_at)
        elapsed = datetime.now(timezone.utc) - stamped
        expired = elapsed.total_seconds() > ttl_s
    except (TypeError, ValueError):
        # Anything we cannot date reliably is treated as expired.
        expired = True
    return expired
def _consume_pid_marker_for_self(
    path: Path,
    *,
    pid_field: str,
    start_time_field: str,
    ttl_s: int,
) -> bool:
    """Read a PID-targeted marker, delete it, and report whether it names us.

    Args:
        path: Location of the JSON marker file.
        pid_field: Key in the marker holding the targeted process ID.
        start_time_field: Key holding the targeted process's start time.
        ttl_s: Maximum accepted marker age in seconds.

    Returns:
        ``True`` only when the marker is well-formed, not stale, and both its
        PID and its (non-``None``) start time equal those of the current
        process. ``False`` in every other case.

    When ``_read_json_file`` yields nothing the file is left untouched.
    Otherwise the marker is unlinked (best effort) whether it was malformed,
    stale, matching, or aimed at a different process.
    """

    def _discard() -> None:
        # Best-effort removal; a failure to delete is deliberately ignored.
        try:
            path.unlink(missing_ok=True)
        except OSError:
            pass

    payload = _read_json_file(path)
    if not payload:
        return False

    try:
        marked_pid = int(payload[pid_field])
        marked_start = payload.get(start_time_field)
        stamp = payload.get("written_at") or ""
    except (KeyError, TypeError, ValueError):
        # Malformed marker: drop it and report no match.
        _discard()
        return False

    if _marker_is_stale(stamp, ttl_s):
        _discard()
        return False

    self_pid = os.getpid()
    self_start = _get_process_start_time(self_pid)
    # The start time is compared as well as the PID, and a missing start time
    # on either side never counts as a match.
    if marked_pid != self_pid or marked_start is None or self_start is None:
        names_us = False
    else:
        names_us = marked_start == self_start

    _discard()
    return names_us
def write_takeover_marker(target_pid: int) -> bool:
"""Record that ``target_pid`` is being replaced by the current process.
@ -681,59 +744,13 @@ def consume_takeover_marker_for_self() -> bool:
Always unlinks the marker on match (and on detected staleness) so
subsequent unrelated signals don't re-trigger.
"""
path = _get_takeover_marker_path()
record = _read_json_file(path)
if not record:
return False
# Any malformed or stale marker → drop it and return False
try:
target_pid = int(record["target_pid"])
target_start_time = record.get("target_start_time")
written_at = record.get("written_at") or ""
except (KeyError, TypeError, ValueError):
try:
path.unlink(missing_ok=True)
except OSError:
pass
return False
# TTL guard: a stale marker older than _TAKEOVER_MARKER_TTL_S is ignored.
stale = False
try:
written_dt = datetime.fromisoformat(written_at)
age = (datetime.now(timezone.utc) - written_dt).total_seconds()
if age > _TAKEOVER_MARKER_TTL_S:
stale = True
except (TypeError, ValueError):
stale = True # Unparseable timestamp — treat as stale
if stale:
try:
path.unlink(missing_ok=True)
except OSError:
pass
return False
# Does the marker name THIS process?
our_pid = os.getpid()
our_start_time = _get_process_start_time(our_pid)
matches = (
target_pid == our_pid
and target_start_time is not None
and our_start_time is not None
and target_start_time == our_start_time
return _consume_pid_marker_for_self(
_get_takeover_marker_path(),
pid_field="target_pid",
start_time_field="target_start_time",
ttl_s=_TAKEOVER_MARKER_TTL_S,
)
# Consume the marker whether it matched or not — a marker that doesn't
# match our identity is stale-for-us anyway.
try:
path.unlink(missing_ok=True)
except OSError:
pass
return matches
def clear_takeover_marker() -> None:
"""Remove the takeover marker unconditionally. Safe to call repeatedly."""
@ -743,6 +760,45 @@ def clear_takeover_marker() -> None:
pass
def write_planned_stop_marker(target_pid: int) -> bool:
    """Leave a marker announcing that ``target_pid`` is being stopped on purpose.

    An unexpected SIGTERM makes the gateway exit non-zero so that a service
    manager can bring it back. A deliberate stop delivers the very same
    signal, so the CLI drops this short-lived marker beforehand, allowing the
    targeted process to recognise the stop as planned and exit cleanly.

    Args:
        target_pid: Process ID of the gateway about to be stopped.

    Returns:
        ``True`` once the marker has been written, ``False`` if an
        ``OSError`` (which includes ``PermissionError``) was raised.
    """
    try:
        payload = {
            "target_pid": target_pid,
            # Stored alongside the PID so the reader can compare both.
            "target_start_time": _get_process_start_time(target_pid),
            "stopper_pid": os.getpid(),
            "written_at": _utc_now_iso(),
        }
        _write_json_file(_get_planned_stop_marker_path(), payload)
    except OSError:
        return False
    return True
def consume_planned_stop_marker_for_self() -> bool:
    """Check for, and consume, a planned-stop marker aimed at this process.

    Returns:
        ``True`` when the current process is being intentionally stopped,
        i.e. ``_consume_pid_marker_for_self`` reports a match for the
        planned-stop marker within ``_PLANNED_STOP_MARKER_TTL_S`` seconds.
    """
    marker_path = _get_planned_stop_marker_path()
    return _consume_pid_marker_for_self(
        marker_path,
        pid_field="target_pid",
        start_time_field="target_start_time",
        ttl_s=_PLANNED_STOP_MARKER_TTL_S,
    )
def clear_planned_stop_marker() -> None:
    """Delete the planned-stop marker unconditionally.

    A missing file is not an error (``missing_ok=True``), and any ``OSError``
    raised while resolving or removing the marker is swallowed.
    """
    try:
        marker_path = _get_planned_stop_marker_path()
        marker_path.unlink(missing_ok=True)
    except OSError:
        return
def get_running_pid(
pid_path: Optional[Path] = None,
*,