fix(gateway): only fire planned-stop watcher for self-targeting markers + fix Windows consume (#34749)

* fix(gateway): only fire planned-stop watcher for markers targeting self

Salvaged from #34599 — rebased onto current main.

The planned-stop watcher now only fires shutdown for a marker that targets
the current process, instead of any marker that exists on disk. Fixes the
Windows crash loop (#34597) where a stale marker from a previous Gateway
instance kills a freshly booted Gateway ~400ms after start with a false
"Received UNKNOWN — initiating shutdown".

Co-authored-by: Bartok9 <danielrpike9@gmail.com>

* fix(gateway): match planned-stop/takeover markers by PID alone when start_time is unavailable

Follow-up to the #34599 salvage. The watcher's non-destructive probe
(planned_stop_marker_targets_self) already falls back to PID equality when
a process start_time is unavailable, but the authoritative consume it gates
(_consume_pid_marker_for_self) still required a non-None start_time match.

_get_process_start_time reads /proc/<pid>/stat and returns None on macOS and
native Windows — the only platform the planned-stop watcher exists for. So on
Windows the probe would fire the shutdown handler (PID matches) but the
handler's consume_planned_stop_marker_for_self() would return False, and a
legitimate 'hermes gateway stop' was still misclassified as an unexpected
UNKNOWN exit (exit 1) and revived by the service manager — a residual half of
the #34597 crash loop on the legitimate-stop path.

Align the consume with the probe: when both start_times are known they must
match (PID-reuse guard preserved on Linux); when either is unavailable, fall
back to PID equality alone, bounded by the existing short marker TTL. This
also fixes the parallel --replace takeover consume on Windows, which shares
the same helper.

Adds regression tests for the Windows (None start_time) path, the foreign-PID
rejection under that fallback, and confirmation the start_time-mismatch guard
still rejects when both are known.

---------

Co-authored-by: Bartok9 <danielrpike9@gmail.com>
This commit is contained in:
kshitij 2026-05-29 10:36:58 -07:00 committed by GitHub
parent 0563ab0652
commit 7379f17556
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 335 additions and 15 deletions

View file

@ -18442,7 +18442,10 @@ def _run_planned_stop_watcher(
poll_interval: seconds between marker checks. 0.5s gives a
responsive shutdown without burning CPU.
"""
from gateway.status import _get_planned_stop_marker_path
from gateway.status import (
_get_planned_stop_marker_path,
planned_stop_marker_targets_self,
)
marker_path = _get_planned_stop_marker_path()
while not stop_event.is_set():
try:
@ -18451,6 +18454,26 @@ def _run_planned_stop_watcher(
and not getattr(runner, "_draining", False)
and getattr(runner, "_running", False)
):
# A marker existing is NOT sufficient — it may have been
# written for a PREVIOUS gateway instance (different PID)
# and left behind because that process exited before the
# CLI's stop() could clean it up. Firing the handler on a
# stale/foreign marker drives the gateway into shutdown,
# then consume_planned_stop_marker_for_self() correctly
# reports a PID mismatch — but by then we're already
# stopping, so it's logged as an unexpected "UNKNOWN" exit
# and the watchdog crash-loops the gateway (issue #34597,
# a regression from PR #33798 which added this watcher
# without the PID check).
#
# Only fire when the marker actually targets us. The probe
# is non-destructive on a match (the handler does the
# authoritative consume on the loop thread) and self-heals
# by unlinking stale/malformed markers so they cannot wedge
# a freshly booted gateway.
if not planned_stop_marker_targets_self():
stop_event.wait(poll_interval)
continue
# Drive the same path as a real signal handler.
# Pass signal=None — the handler tolerates that and consumes
# the marker via consume_planned_stop_marker_for_self,

View file

@ -816,12 +816,24 @@ def _consume_pid_marker_for_self(
our_pid = os.getpid()
our_start_time = _get_process_start_time(our_pid)
matches = (
target_pid == our_pid
and target_start_time is not None
and our_start_time is not None
and target_start_time == our_start_time
)
# Start-time is a PID-reuse guard. It is only meaningful when both
# sides actually have it: ``_get_process_start_time`` returns None on
# platforms without ``/proc`` (macOS, native Windows — the very
# platform the planned-stop watcher exists for). Requiring a non-None
# match there would make every consume return False, so a legitimate
# ``hermes gateway stop`` on Windows would be misclassified as an
# unexpected ``UNKNOWN`` exit (exit 1) and revived by the service
# manager. So: when both start_times are known they must match; when
# either is unknown, fall back to PID equality alone (bounded by the
# marker's short TTL). This mirrors ``planned_stop_marker_targets_self``
# so the watcher's non-destructive probe and this authoritative
# consume agree on every platform (issue #34597).
if target_pid != our_pid:
matches = False
elif target_start_time is not None and our_start_time is not None:
matches = target_start_time == our_start_time
else:
matches = True
try:
path.unlink(missing_ok=True)
@ -914,6 +926,68 @@ def consume_planned_stop_marker_for_self() -> bool:
)
def planned_stop_marker_targets_self() -> bool:
"""Return True only when a live planned-stop marker names the current process.
This is a **non-destructive** probe used by the watcher thread
(``gateway/run.py:_run_planned_stop_watcher``) to decide whether to
trigger shutdown. Unlike :func:`consume_planned_stop_marker_for_self`,
it never unlinks a marker that matches us the shutdown handler does
the authoritative consume on its own thread.
It *does* clean up markers that can never apply to this process:
malformed markers and markers older than the TTL are unlinked so a
stale file left behind by a previous gateway instance cannot wedge
the new one. Markers naming a different PID/start_time are left in
place (they may still be consumed legitimately by the process they
name) but report False here.
Returns False (without raising) on any read/parse error.
"""
path = _get_planned_stop_marker_path()
record = _read_json_file(path)
if not record:
return False
try:
target_pid = int(record["target_pid"])
target_start_time = record.get("target_start_time")
written_at = record.get("written_at") or ""
except (KeyError, TypeError, ValueError):
# Malformed marker can never match anyone — drop it.
try:
path.unlink(missing_ok=True)
except OSError:
pass
return False
if _marker_is_stale(written_at, _PLANNED_STOP_MARKER_TTL_S):
# A marker this old is past its useful life regardless of target —
# clean it up so it cannot crash-loop a freshly booted gateway.
try:
path.unlink(missing_ok=True)
except OSError:
pass
return False
our_pid = os.getpid()
if target_pid != our_pid:
return False
# Start-time is a PID-reuse guard. It is only meaningful when both
# sides actually have it: ``_get_process_start_time`` returns None on
# platforms without ``/proc`` (macOS, native Windows — the very
# platform this watcher exists for). Requiring a non-None match there
# would make the watcher never fire and re-break the #33778 Windows
# session-resume path. So: when both start_times are known they must
# match; when either is unknown, fall back to PID equality alone
# (the marker is short-lived under a 60s TTL, bounding reuse risk).
our_start_time = _get_process_start_time(our_pid)
if target_start_time is not None and our_start_time is not None:
return target_start_time == our_start_time
return True
def clear_planned_stop_marker() -> None:
"""Remove the planned-stop marker unconditionally."""
try:

View file

@ -12,12 +12,33 @@ See issue #33778 for the original Windows session-loss bug report.
"""
import asyncio
import json
import os
import threading
import time
from unittest.mock import MagicMock
from gateway.run import _run_planned_stop_watcher
from gateway import status as status_mod
def _write_self_marker(marker, *, stale: bool = False):
"""Write a planned-stop marker that targets the CURRENT process.
The watcher only fires for markers naming our PID + start_time (the
fix for issue #34597), so tests that expect a fire must write a
self-targeting marker. Pass ``stale=True`` to backdate ``written_at``
past the TTL.
"""
written_at = "2000-01-01T00:00:00+00:00" if stale else status_mod._utc_now_iso()
record = {
"target_pid": os.getpid(),
"target_start_time": status_mod._get_process_start_time(os.getpid()),
"stopper_pid": os.getpid(),
"written_at": written_at,
}
marker.write_text(json.dumps(record), encoding="utf-8")
class _FakeRunner:
@ -41,11 +62,10 @@ def _make_loop_capturing_calls():
def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch):
"""When the marker file exists, the watcher must call the shutdown handler."""
"""When a marker targeting THIS process exists, fire the shutdown handler."""
marker = tmp_path / ".gateway-planned-stop.json"
# Patch the marker-path resolver so the watcher polls our temp location.
from gateway import status as status_mod
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
runner = _FakeRunner(running=True, draining=False)
@ -53,8 +73,8 @@ def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch):
shutdown_handler = MagicMock(name="shutdown_signal_handler")
stop_event = threading.Event()
# Drop the marker before the thread starts.
marker.write_text('{"target_pid": 1234}', encoding="utf-8")
# Drop a self-targeting marker before the thread starts.
_write_self_marker(marker)
watcher = threading.Thread(
target=_run_planned_stop_watcher,
@ -114,9 +134,8 @@ def test_watcher_skips_when_runner_already_draining(tmp_path, monkeypatch):
so the watcher backs off once any shutdown is in flight.
"""
marker = tmp_path / ".gateway-planned-stop.json"
marker.write_text('{"target_pid": 1234}', encoding="utf-8")
_write_self_marker(marker)
from gateway import status as status_mod
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
# Already draining — watcher should be a no-op.
@ -204,9 +223,8 @@ def test_watcher_fires_only_once_when_marker_persists(tmp_path, monkeypatch):
times before the gateway actually shuts down.
"""
marker = tmp_path / ".gateway-planned-stop.json"
marker.write_text('{"target_pid": 1234}', encoding="utf-8")
_write_self_marker(marker)
from gateway import status as status_mod
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
runner = _FakeRunner(running=True, draining=False)
@ -263,3 +281,113 @@ def test_watcher_tolerates_marker_path_resolution_errors(tmp_path, monkeypatch,
assert not watcher.is_alive(), "Watcher should still honour stop_event after errors"
# No shutdown fired because the marker never reported existence.
assert loop._captured == []
# ---------------------------------------------------------------------------
# Regression coverage for issue #34597:
# A marker left behind by a PREVIOUS gateway instance (different PID, or
# past its TTL) must NOT crash the freshly booted gateway. The watcher
# only fires when the marker targets the current process, and self-heals
# by cleaning up stale/malformed markers.
# ---------------------------------------------------------------------------
def test_watcher_does_not_fire_for_foreign_pid_marker(tmp_path, monkeypatch):
"""A marker naming a DIFFERENT process must not trigger our shutdown.
This is the core #34597 regression: a stale marker from a prior
gateway instance was firing the handler, driving the new gateway into
a false "Received UNKNOWN" shutdown and a watchdog crash loop.
"""
marker = tmp_path / ".gateway-planned-stop.json"
# Foreign PID + a start_time that cannot match ours, freshly written
# so the TTL does NOT remove it — the watcher must still decline.
record = {
"target_pid": os.getpid() + 1,
"target_start_time": -1,
"stopper_pid": os.getpid() + 1,
"written_at": status_mod._utc_now_iso(),
}
marker.write_text(json.dumps(record), encoding="utf-8")
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
runner = _FakeRunner(running=True, draining=False)
loop = _make_loop_capturing_calls()
shutdown_handler = MagicMock(name="shutdown_signal_handler")
stop_event = threading.Event()
watcher = threading.Thread(
target=_run_planned_stop_watcher,
args=(stop_event, runner, loop, shutdown_handler),
kwargs={"poll_interval": 0.05},
daemon=True,
)
watcher.start()
time.sleep(0.3) # several poll cycles
stop_event.set()
watcher.join(timeout=2.0)
assert not watcher.is_alive()
assert loop._captured == [], (
f"Watcher fired on a foreign-PID marker (#34597 regression): {loop._captured}"
)
shutdown_handler.assert_not_called()
# Foreign (but live) marker is left in place — it may still belong to
# the process it names.
assert marker.exists()
def test_watcher_cleans_up_stale_marker_and_keeps_running(tmp_path, monkeypatch):
"""A marker older than the TTL is unlinked and never fires shutdown."""
marker = tmp_path / ".gateway-planned-stop.json"
# Self-targeting but backdated past the TTL: must be treated as dead.
_write_self_marker(marker, stale=True)
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
runner = _FakeRunner(running=True, draining=False)
loop = _make_loop_capturing_calls()
shutdown_handler = MagicMock(name="shutdown_signal_handler")
stop_event = threading.Event()
watcher = threading.Thread(
target=_run_planned_stop_watcher,
args=(stop_event, runner, loop, shutdown_handler),
kwargs={"poll_interval": 0.05},
daemon=True,
)
watcher.start()
time.sleep(0.3)
stop_event.set()
watcher.join(timeout=2.0)
assert not watcher.is_alive()
assert loop._captured == [], "Stale marker must not fire shutdown"
shutdown_handler.assert_not_called()
assert not marker.exists(), "Stale marker should have been cleaned up"
def test_planned_stop_marker_targets_self_probe_is_non_destructive(tmp_path, monkeypatch):
"""The probe returns True for a self-marker WITHOUT unlinking it.
The shutdown handler performs the authoritative consume on its own
thread, so the watcher's probe must leave a matching marker intact.
"""
marker = tmp_path / ".gateway-planned-stop.json"
_write_self_marker(marker)
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
assert status_mod.planned_stop_marker_targets_self() is True
assert marker.exists(), "Probe must not consume a matching marker"
# Idempotent: still True on a second call.
assert status_mod.planned_stop_marker_targets_self() is True
def test_planned_stop_marker_targets_self_drops_malformed(tmp_path, monkeypatch):
"""A malformed marker reports False and is cleaned up."""
marker = tmp_path / ".gateway-planned-stop.json"
marker.write_text("{not valid json", encoding="utf-8")
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
assert status_mod.planned_stop_marker_targets_self() is False

View file

@ -707,6 +707,33 @@ class TestTakeoverMarker:
assert result is False
def test_consume_returns_true_on_windows_when_start_time_unavailable(
self, tmp_path, monkeypatch
):
"""Takeover consume must also recognise a self-marker on platforms
without ``/proc`` (macOS / native Windows).
``consume_takeover_marker_for_self`` shares ``_consume_pid_marker_for_self``
with the planned-stop path, so the same start_time fallback applies:
a ``--replace`` SIGTERM on Windows (where start_time is None on both
sides) must be recognised as a planned takeover and exit 0, not be
misclassified as an unexpected UNKNOWN exit. With start_time
unavailable we fall back to PID equality alone, bounded by the TTL.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Simulate Windows: no start_time available for any PID.
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
ok = status.write_takeover_marker(target_pid=os.getpid())
assert ok is True
payload = json.loads((tmp_path / ".gateway-takeover.json").read_text())
assert payload["target_start_time"] is None
result = status.consume_takeover_marker_for_self()
assert result is True
assert not (tmp_path / ".gateway-takeover.json").exists()
def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
@ -899,6 +926,74 @@ class TestPlannedStopMarker:
assert ok is False
def test_consume_returns_true_on_windows_when_start_time_unavailable(
self, tmp_path, monkeypatch
):
"""Regression for #34597: a legitimate stop must be recognised on
platforms without ``/proc``.
``_get_process_start_time`` returns None on macOS / native Windows
(no ``/proc/<pid>/stat``). The planned-stop watcher only runs there,
so if the authoritative consume required a non-None start_time match
it would always return False and ``hermes gateway stop`` would be
misclassified as an unexpected ``UNKNOWN`` exit, exit 1, and revived
by the service manager (the very crash loop #34597 set out to fix).
With start_time unavailable on BOTH sides we fall back to PID
equality alone, bounded by the marker TTL.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Simulate Windows: no start_time available for any PID.
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
ok = status.write_planned_stop_marker(target_pid=os.getpid())
assert ok is True
# Marker carries a null start_time, exactly as written on Windows.
payload = json.loads((tmp_path / ".gateway-planned-stop.json").read_text())
assert payload["target_start_time"] is None
result = status.consume_planned_stop_marker_for_self()
assert result is True
assert not (tmp_path / ".gateway-planned-stop.json").exists()
def test_consume_still_rejects_foreign_pid_when_start_time_unavailable(
self, tmp_path, monkeypatch
):
"""The PID-only fallback must NOT match a marker naming another PID.
Falling back to PID equality when start_time is unknown must remain
a PID check a marker for a different process is never ours.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
ok = status.write_planned_stop_marker(target_pid=os.getpid() + 9999)
assert ok is True
result = status.consume_planned_stop_marker_for_self()
assert result is False
def test_consume_still_rejects_start_time_mismatch_when_both_known(
self, tmp_path, monkeypatch
):
"""PID-reuse defence is preserved when BOTH start_times are present.
The Windows fallback only relaxes matching when a start_time is
unavailable. When both sides report one (Linux), a mismatch must
still reject otherwise PID reuse could resurrect a stale marker.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
status.write_planned_stop_marker(target_pid=os.getpid())
# Simulate PID reuse: same PID, different start_time.
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
result = status.consume_planned_stop_marker_for_self()
assert result is False
class TestReadProcessCmdlinePsFallback:
"""Tests for _read_process_cmdline falling back to ps on non-Linux."""