fix(gateway): only fire planned-stop watcher for self-targeting markers + fix Windows consume (#34749)

* fix(gateway): only fire planned-stop watcher for markers targeting self Salvaged from #34599 — rebased onto current main. The planned-stop watcher now only fires shutdown for a marker that targets the current process, instead of any marker that exists on disk. Fixes the Windows crash loop (#34597) where a stale marker from a previous Gateway instance kills a freshly booted Gateway ~400ms after start with a false "Received UNKNOWN — initiating shutdown". Co-authored-by: Bartok9 <danielrpike9@gmail.com> * fix(gateway): match planned-stop/takeover markers by PID alone when start_time is unavailable Follow-up to the #34599 salvage. The watcher's non-destructive probe (planned_stop_marker_targets_self) already falls back to PID equality when a process start_time is unavailable, but the authoritative consume it gates (_consume_pid_marker_for_self) still required a non-None start_time match. _get_process_start_time reads /proc/<pid>/stat and returns None on macOS and native Windows — the only platform the planned-stop watcher exists for. So on Windows the probe would fire the shutdown handler (PID matches) but the handler's consume_planned_stop_marker_for_self() would return False, and a legitimate 'hermes gateway stop' was still misclassified as an unexpected UNKNOWN exit (exit 1) and revived by the service manager — a residual half of the #34597 crash loop on the legitimate-stop path. Align the consume with the probe: when both start_times are known they must match (PID-reuse guard preserved on Linux); when either is unavailable, fall back to PID equality alone, bounded by the existing short marker TTL. This also fixes the parallel --replace takeover consume on Windows, which shares the same helper. Adds regression tests for the Windows (None start_time) path, the foreign-PID rejection under that fallback, and confirmation the start_time-mismatch guard still rejects when both are known. --------- Co-authored-by: Bartok9 <danielrpike9@gmail.com>
2026-06-09 08:21:50 +00:00 · 2026-05-29 10:36:58 -07:00 · 2026-05-29 10:36:58 -07:00 · 7379f17556
commit 7379f17556
parent 0563ab0652
4 changed files with 335 additions and 15 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -18442,7 +18442,10 @@ def _run_planned_stop_watcher(
        poll_interval: seconds between marker checks. 0.5s gives a
            responsive shutdown without burning CPU.
    """
-    from gateway.status import _get_planned_stop_marker_path
+    from gateway.status import (
+        _get_planned_stop_marker_path,
+        planned_stop_marker_targets_self,
+    )
    marker_path = _get_planned_stop_marker_path()
    while not stop_event.is_set():
        try:
@ -18451,6 +18454,26 @@ def _run_planned_stop_watcher(
                and not getattr(runner, "_draining", False)
                and getattr(runner, "_running", False)
            ):
+                # A marker existing is NOT sufficient — it may have been
+                # written for a PREVIOUS gateway instance (different PID)
+                # and left behind because that process exited before the
+                # CLI's stop() could clean it up. Firing the handler on a
+                # stale/foreign marker drives the gateway into shutdown,
+                # then consume_planned_stop_marker_for_self() correctly
+                # reports a PID mismatch — but by then we're already
+                # stopping, so it's logged as an unexpected "UNKNOWN" exit
+                # and the watchdog crash-loops the gateway (issue #34597,
+                # a regression from PR #33798 which added this watcher
+                # without the PID check).
+                #
+                # Only fire when the marker actually targets us. The probe
+                # is non-destructive on a match (the handler does the
+                # authoritative consume on the loop thread) and self-heals
+                # by unlinking stale/malformed markers so they cannot wedge
+                # a freshly booted gateway.
+                if not planned_stop_marker_targets_self():
+                    stop_event.wait(poll_interval)
+                    continue
                # Drive the same path as a real signal handler.
                # Pass signal=None — the handler tolerates that and consumes
                # the marker via consume_planned_stop_marker_for_self,
--- a/gateway/status.py
+++ b/gateway/status.py
@ -816,12 +816,24 @@ def _consume_pid_marker_for_self(

    our_pid = os.getpid()
    our_start_time = _get_process_start_time(our_pid)
-    matches = (
-        target_pid == our_pid
-        and target_start_time is not None
-        and our_start_time is not None
-        and target_start_time == our_start_time
-    )
+    # Start-time is a PID-reuse guard. It is only meaningful when both
+    # sides actually have it: ``_get_process_start_time`` returns None on
+    # platforms without ``/proc`` (macOS, native Windows — the very
+    # platform the planned-stop watcher exists for). Requiring a non-None
+    # match there would make every consume return False, so a legitimate
+    # ``hermes gateway stop`` on Windows would be misclassified as an
+    # unexpected ``UNKNOWN`` exit (exit 1) and revived by the service
+    # manager. So: when both start_times are known they must match; when
+    # either is unknown, fall back to PID equality alone (bounded by the
+    # marker's short TTL). This mirrors ``planned_stop_marker_targets_self``
+    # so the watcher's non-destructive probe and this authoritative
+    # consume agree on every platform (issue #34597).
+    if target_pid != our_pid:
+        matches = False
+    elif target_start_time is not None and our_start_time is not None:
+        matches = target_start_time == our_start_time
+    else:
+        matches = True

    try:
        path.unlink(missing_ok=True)
@ -914,6 +926,68 @@ def consume_planned_stop_marker_for_self() -> bool:
    )


+def planned_stop_marker_targets_self() -> bool:
+    """Return True only when a live planned-stop marker names the current process.
+
+    This is a **non-destructive** probe used by the watcher thread
+    (``gateway/run.py:_run_planned_stop_watcher``) to decide whether to
+    trigger shutdown. Unlike :func:`consume_planned_stop_marker_for_self`,
+    it never unlinks a marker that matches us — the shutdown handler does
+    the authoritative consume on its own thread.
+
+    It *does* clean up markers that can never apply to this process:
+    malformed markers and markers older than the TTL are unlinked so a
+    stale file left behind by a previous gateway instance cannot wedge
+    the new one. Markers naming a different PID/start_time are left in
+    place (they may still be consumed legitimately by the process they
+    name) but report False here.
+
+    Returns False (without raising) on any read/parse error.
+    """
+    path = _get_planned_stop_marker_path()
+    record = _read_json_file(path)
+    if not record:
+        return False
+
+    try:
+        target_pid = int(record["target_pid"])
+        target_start_time = record.get("target_start_time")
+        written_at = record.get("written_at") or ""
+    except (KeyError, TypeError, ValueError):
+        # Malformed marker can never match anyone — drop it.
+        try:
+            path.unlink(missing_ok=True)
+        except OSError:
+            pass
+        return False
+
+    if _marker_is_stale(written_at, _PLANNED_STOP_MARKER_TTL_S):
+        # A marker this old is past its useful life regardless of target —
+        # clean it up so it cannot crash-loop a freshly booted gateway.
+        try:
+            path.unlink(missing_ok=True)
+        except OSError:
+            pass
+        return False
+
+    our_pid = os.getpid()
+    if target_pid != our_pid:
+        return False
+
+    # Start-time is a PID-reuse guard. It is only meaningful when both
+    # sides actually have it: ``_get_process_start_time`` returns None on
+    # platforms without ``/proc`` (macOS, native Windows — the very
+    # platform this watcher exists for). Requiring a non-None match there
+    # would make the watcher never fire and re-break the #33778 Windows
+    # session-resume path. So: when both start_times are known they must
+    # match; when either is unknown, fall back to PID equality alone
+    # (the marker is short-lived under a 60s TTL, bounding reuse risk).
+    our_start_time = _get_process_start_time(our_pid)
+    if target_start_time is not None and our_start_time is not None:
+        return target_start_time == our_start_time
+    return True
+
+
 def clear_planned_stop_marker() -> None:
    """Remove the planned-stop marker unconditionally."""
    try:
--- a/tests/gateway/test_planned_stop_watcher.py
+++ b/tests/gateway/test_planned_stop_watcher.py
@ -12,12 +12,33 @@ See issue #33778 for the original Windows session-loss bug report.
 """

 import asyncio
+import json
+import os
 import threading
 import time
 from unittest.mock import MagicMock


 from gateway.run import _run_planned_stop_watcher
+from gateway import status as status_mod
+
+
+def _write_self_marker(marker, *, stale: bool = False):
+    """Write a planned-stop marker that targets the CURRENT process.
+
+    The watcher only fires for markers naming our PID + start_time (the
+    fix for issue #34597), so tests that expect a fire must write a
+    self-targeting marker. Pass ``stale=True`` to backdate ``written_at``
+    past the TTL.
+    """
+    written_at = "2000-01-01T00:00:00+00:00" if stale else status_mod._utc_now_iso()
+    record = {
+        "target_pid": os.getpid(),
+        "target_start_time": status_mod._get_process_start_time(os.getpid()),
+        "stopper_pid": os.getpid(),
+        "written_at": written_at,
+    }
+    marker.write_text(json.dumps(record), encoding="utf-8")


 class _FakeRunner:
@ -41,11 +62,10 @@ def _make_loop_capturing_calls():


 def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch):
-    """When the marker file exists, the watcher must call the shutdown handler."""
+    """When a marker targeting THIS process exists, fire the shutdown handler."""
    marker = tmp_path / ".gateway-planned-stop.json"

    # Patch the marker-path resolver so the watcher polls our temp location.
-    from gateway import status as status_mod
    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)

    runner = _FakeRunner(running=True, draining=False)
@ -53,8 +73,8 @@ def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch):
    shutdown_handler = MagicMock(name="shutdown_signal_handler")
    stop_event = threading.Event()

-    # Drop the marker before the thread starts.
-    marker.write_text('{"target_pid": 1234}', encoding="utf-8")
+    # Drop a self-targeting marker before the thread starts.
+    _write_self_marker(marker)

    watcher = threading.Thread(
        target=_run_planned_stop_watcher,
@ -114,9 +134,8 @@ def test_watcher_skips_when_runner_already_draining(tmp_path, monkeypatch):
    so the watcher backs off once any shutdown is in flight.
    """
    marker = tmp_path / ".gateway-planned-stop.json"
-    marker.write_text('{"target_pid": 1234}', encoding="utf-8")
+    _write_self_marker(marker)

-    from gateway import status as status_mod
    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)

    # Already draining — watcher should be a no-op.
@ -204,9 +223,8 @@ def test_watcher_fires_only_once_when_marker_persists(tmp_path, monkeypatch):
    times before the gateway actually shuts down.
    """
    marker = tmp_path / ".gateway-planned-stop.json"
-    marker.write_text('{"target_pid": 1234}', encoding="utf-8")
+    _write_self_marker(marker)

-    from gateway import status as status_mod
    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)

    runner = _FakeRunner(running=True, draining=False)
@ -263,3 +281,113 @@ def test_watcher_tolerates_marker_path_resolution_errors(tmp_path, monkeypatch,
    assert not watcher.is_alive(), "Watcher should still honour stop_event after errors"
    # No shutdown fired because the marker never reported existence.
    assert loop._captured == []
+
+
+# ---------------------------------------------------------------------------
+# Regression coverage for issue #34597:
+# A marker left behind by a PREVIOUS gateway instance (different PID, or
+# past its TTL) must NOT crash the freshly booted gateway. The watcher
+# only fires when the marker targets the current process, and self-heals
+# by cleaning up stale/malformed markers.
+# ---------------------------------------------------------------------------
+
+
+def test_watcher_does_not_fire_for_foreign_pid_marker(tmp_path, monkeypatch):
+    """A marker naming a DIFFERENT process must not trigger our shutdown.
+
+    This is the core #34597 regression: a stale marker from a prior
+    gateway instance was firing the handler, driving the new gateway into
+    a false "Received UNKNOWN" shutdown and a watchdog crash loop.
+    """
+    marker = tmp_path / ".gateway-planned-stop.json"
+    # Foreign PID + a start_time that cannot match ours, freshly written
+    # so the TTL does NOT remove it — the watcher must still decline.
+    record = {
+        "target_pid": os.getpid() + 1,
+        "target_start_time": -1,
+        "stopper_pid": os.getpid() + 1,
+        "written_at": status_mod._utc_now_iso(),
+    }
+    marker.write_text(json.dumps(record), encoding="utf-8")
+
+    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
+
+    runner = _FakeRunner(running=True, draining=False)
+    loop = _make_loop_capturing_calls()
+    shutdown_handler = MagicMock(name="shutdown_signal_handler")
+    stop_event = threading.Event()
+
+    watcher = threading.Thread(
+        target=_run_planned_stop_watcher,
+        args=(stop_event, runner, loop, shutdown_handler),
+        kwargs={"poll_interval": 0.05},
+        daemon=True,
+    )
+    watcher.start()
+    time.sleep(0.3)  # several poll cycles
+    stop_event.set()
+    watcher.join(timeout=2.0)
+
+    assert not watcher.is_alive()
+    assert loop._captured == [], (
+        f"Watcher fired on a foreign-PID marker (#34597 regression): {loop._captured}"
+    )
+    shutdown_handler.assert_not_called()
+    # Foreign (but live) marker is left in place — it may still belong to
+    # the process it names.
+    assert marker.exists()
+
+
+def test_watcher_cleans_up_stale_marker_and_keeps_running(tmp_path, monkeypatch):
+    """A marker older than the TTL is unlinked and never fires shutdown."""
+    marker = tmp_path / ".gateway-planned-stop.json"
+    # Self-targeting but backdated past the TTL: must be treated as dead.
+    _write_self_marker(marker, stale=True)
+
+    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
+
+    runner = _FakeRunner(running=True, draining=False)
+    loop = _make_loop_capturing_calls()
+    shutdown_handler = MagicMock(name="shutdown_signal_handler")
+    stop_event = threading.Event()
+
+    watcher = threading.Thread(
+        target=_run_planned_stop_watcher,
+        args=(stop_event, runner, loop, shutdown_handler),
+        kwargs={"poll_interval": 0.05},
+        daemon=True,
+    )
+    watcher.start()
+    time.sleep(0.3)
+    stop_event.set()
+    watcher.join(timeout=2.0)
+
+    assert not watcher.is_alive()
+    assert loop._captured == [], "Stale marker must not fire shutdown"
+    shutdown_handler.assert_not_called()
+    assert not marker.exists(), "Stale marker should have been cleaned up"
+
+
+def test_planned_stop_marker_targets_self_probe_is_non_destructive(tmp_path, monkeypatch):
+    """The probe returns True for a self-marker WITHOUT unlinking it.
+
+    The shutdown handler performs the authoritative consume on its own
+    thread, so the watcher's probe must leave a matching marker intact.
+    """
+    marker = tmp_path / ".gateway-planned-stop.json"
+    _write_self_marker(marker)
+    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
+
+    assert status_mod.planned_stop_marker_targets_self() is True
+    assert marker.exists(), "Probe must not consume a matching marker"
+    # Idempotent: still True on a second call.
+    assert status_mod.planned_stop_marker_targets_self() is True
+
+
+def test_planned_stop_marker_targets_self_drops_malformed(tmp_path, monkeypatch):
+    """A malformed marker reports False and is cleaned up."""
+    marker = tmp_path / ".gateway-planned-stop.json"
+    marker.write_text("{not valid json", encoding="utf-8")
+    monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
+
+    assert status_mod.planned_stop_marker_targets_self() is False
--- a/tests/gateway/test_status.py
+++ b/tests/gateway/test_status.py
@ -707,6 +707,33 @@ class TestTakeoverMarker:

        assert result is False

+    def test_consume_returns_true_on_windows_when_start_time_unavailable(
+        self, tmp_path, monkeypatch
+    ):
+        """Takeover consume must also recognise a self-marker on platforms
+        without ``/proc`` (macOS / native Windows).
+
+        ``consume_takeover_marker_for_self`` shares ``_consume_pid_marker_for_self``
+        with the planned-stop path, so the same start_time fallback applies:
+        a ``--replace`` SIGTERM on Windows (where start_time is None on both
+        sides) must be recognised as a planned takeover and exit 0, not be
+        misclassified as an unexpected UNKNOWN exit. With start_time
+        unavailable we fall back to PID equality alone, bounded by the TTL.
+        """
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        # Simulate Windows: no start_time available for any PID.
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
+
+        ok = status.write_takeover_marker(target_pid=os.getpid())
+        assert ok is True
+        payload = json.loads((tmp_path / ".gateway-takeover.json").read_text())
+        assert payload["target_start_time"] is None
+
+        result = status.consume_takeover_marker_for_self()
+
+        assert result is True
+        assert not (tmp_path / ".gateway-takeover.json").exists()
+
    def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch):
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))

@ -899,6 +926,74 @@ class TestPlannedStopMarker:

        assert ok is False

+    def test_consume_returns_true_on_windows_when_start_time_unavailable(
+        self, tmp_path, monkeypatch
+    ):
+        """Regression for #34597: a legitimate stop must be recognised on
+        platforms without ``/proc``.
+
+        ``_get_process_start_time`` returns None on macOS / native Windows
+        (no ``/proc/<pid>/stat``). The planned-stop watcher only runs there,
+        so if the authoritative consume required a non-None start_time match
+        it would always return False — and ``hermes gateway stop`` would be
+        misclassified as an unexpected ``UNKNOWN`` exit, exit 1, and revived
+        by the service manager (the very crash loop #34597 set out to fix).
+        With start_time unavailable on BOTH sides we fall back to PID
+        equality alone, bounded by the marker TTL.
+        """
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        # Simulate Windows: no start_time available for any PID.
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
+
+        ok = status.write_planned_stop_marker(target_pid=os.getpid())
+        assert ok is True
+        # Marker carries a null start_time, exactly as written on Windows.
+        payload = json.loads((tmp_path / ".gateway-planned-stop.json").read_text())
+        assert payload["target_start_time"] is None
+
+        result = status.consume_planned_stop_marker_for_self()
+
+        assert result is True
+        assert not (tmp_path / ".gateway-planned-stop.json").exists()
+
+    def test_consume_still_rejects_foreign_pid_when_start_time_unavailable(
+        self, tmp_path, monkeypatch
+    ):
+        """The PID-only fallback must NOT match a marker naming another PID.
+
+        Falling back to PID equality when start_time is unknown must remain
+        a PID check — a marker for a different process is never ours.
+        """
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
+
+        ok = status.write_planned_stop_marker(target_pid=os.getpid() + 9999)
+        assert ok is True
+
+        result = status.consume_planned_stop_marker_for_self()
+
+        assert result is False
+
+    def test_consume_still_rejects_start_time_mismatch_when_both_known(
+        self, tmp_path, monkeypatch
+    ):
+        """PID-reuse defence is preserved when BOTH start_times are present.
+
+        The Windows fallback only relaxes matching when a start_time is
+        unavailable. When both sides report one (Linux), a mismatch must
+        still reject — otherwise PID reuse could resurrect a stale marker.
+        """
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
+        status.write_planned_stop_marker(target_pid=os.getpid())
+
+        # Simulate PID reuse: same PID, different start_time.
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
+
+        result = status.consume_planned_stop_marker_for_self()
+
+        assert result is False
+

 class TestReadProcessCmdlinePsFallback:
    """Tests for _read_process_cmdline falling back to ps on non-Linux."""