From 7379f175567bd0f1d833eec3a3599d0665b2a491 Mon Sep 17 00:00:00 2001 From: kshitij <82637225+kshitijk4poor@users.noreply.github.com> Date: Fri, 29 May 2026 10:36:58 -0700 Subject: [PATCH] fix(gateway): only fire planned-stop watcher for self-targeting markers + fix Windows consume (#34749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gateway): only fire planned-stop watcher for markers targeting self Salvaged from #34599 — rebased onto current main. The planned-stop watcher now only fires shutdown for a marker that targets the current process, instead of any marker that exists on disk. Fixes the Windows crash loop (#34597) where a stale marker from a previous Gateway instance kills a freshly booted Gateway ~400ms after start with a false "Received UNKNOWN — initiating shutdown". Co-authored-by: Bartok9 * fix(gateway): match planned-stop/takeover markers by PID alone when start_time is unavailable Follow-up to the #34599 salvage. The watcher's non-destructive probe (planned_stop_marker_targets_self) already falls back to PID equality when a process start_time is unavailable, but the authoritative consume it gates (_consume_pid_marker_for_self) still required a non-None start_time match. _get_process_start_time reads /proc//stat and returns None on macOS and native Windows — the only platform the planned-stop watcher exists for. So on Windows the probe would fire the shutdown handler (PID matches) but the handler's consume_planned_stop_marker_for_self() would return False, and a legitimate 'hermes gateway stop' was still misclassified as an unexpected UNKNOWN exit (exit 1) and revived by the service manager — a residual half of the #34597 crash loop on the legitimate-stop path. Align the consume with the probe: when both start_times are known they must match (PID-reuse guard preserved on Linux); when either is unavailable, fall back to PID equality alone, bounded by the existing short marker TTL. This also fixes the parallel --replace takeover consume on Windows, which shares the same helper. Adds regression tests for the Windows (None start_time) path, the foreign-PID rejection under that fallback, and confirmation the start_time-mismatch guard still rejects when both are known. --------- Co-authored-by: Bartok9 --- gateway/run.py | 25 +++- gateway/status.py | 86 +++++++++++- tests/gateway/test_planned_stop_watcher.py | 144 +++++++++++++++++++-- tests/gateway/test_status.py | 95 ++++++++++++++ 4 files changed, 335 insertions(+), 15 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 20d0c2a4ea2..0549d7150a1 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -18442,7 +18442,10 @@ def _run_planned_stop_watcher( poll_interval: seconds between marker checks. 0.5s gives a responsive shutdown without burning CPU. """ - from gateway.status import _get_planned_stop_marker_path + from gateway.status import ( + _get_planned_stop_marker_path, + planned_stop_marker_targets_self, + ) marker_path = _get_planned_stop_marker_path() while not stop_event.is_set(): try: @@ -18451,6 +18454,26 @@ def _run_planned_stop_watcher( and not getattr(runner, "_draining", False) and getattr(runner, "_running", False) ): + # A marker existing is NOT sufficient — it may have been + # written for a PREVIOUS gateway instance (different PID) + # and left behind because that process exited before the + # CLI's stop() could clean it up. Firing the handler on a + # stale/foreign marker drives the gateway into shutdown, + # then consume_planned_stop_marker_for_self() correctly + # reports a PID mismatch — but by then we're already + # stopping, so it's logged as an unexpected "UNKNOWN" exit + # and the watchdog crash-loops the gateway (issue #34597, + # a regression from PR #33798 which added this watcher + # without the PID check). + # + # Only fire when the marker actually targets us. The probe + # is non-destructive on a match (the handler does the + # authoritative consume on the loop thread) and self-heals + # by unlinking stale/malformed markers so they cannot wedge + # a freshly booted gateway. + if not planned_stop_marker_targets_self(): + stop_event.wait(poll_interval) + continue # Drive the same path as a real signal handler. # Pass signal=None — the handler tolerates that and consumes # the marker via consume_planned_stop_marker_for_self, diff --git a/gateway/status.py b/gateway/status.py index 516ea8f385e..935758b904b 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -816,12 +816,24 @@ def _consume_pid_marker_for_self( our_pid = os.getpid() our_start_time = _get_process_start_time(our_pid) - matches = ( - target_pid == our_pid - and target_start_time is not None - and our_start_time is not None - and target_start_time == our_start_time - ) + # Start-time is a PID-reuse guard. It is only meaningful when both + # sides actually have it: ``_get_process_start_time`` returns None on + # platforms without ``/proc`` (macOS, native Windows — the very + # platform the planned-stop watcher exists for). Requiring a non-None + # match there would make every consume return False, so a legitimate + # ``hermes gateway stop`` on Windows would be misclassified as an + # unexpected ``UNKNOWN`` exit (exit 1) and revived by the service + # manager. So: when both start_times are known they must match; when + # either is unknown, fall back to PID equality alone (bounded by the + # marker's short TTL). This mirrors ``planned_stop_marker_targets_self`` + # so the watcher's non-destructive probe and this authoritative + # consume agree on every platform (issue #34597). + if target_pid != our_pid: + matches = False + elif target_start_time is not None and our_start_time is not None: + matches = target_start_time == our_start_time + else: + matches = True try: path.unlink(missing_ok=True) @@ -914,6 +926,68 @@ def consume_planned_stop_marker_for_self() -> bool: ) +def planned_stop_marker_targets_self() -> bool: + """Return True only when a live planned-stop marker names the current process. + + This is a **non-destructive** probe used by the watcher thread + (``gateway/run.py:_run_planned_stop_watcher``) to decide whether to + trigger shutdown. Unlike :func:`consume_planned_stop_marker_for_self`, + it never unlinks a marker that matches us — the shutdown handler does + the authoritative consume on its own thread. + + It *does* clean up markers that can never apply to this process: + malformed markers and markers older than the TTL are unlinked so a + stale file left behind by a previous gateway instance cannot wedge + the new one. Markers naming a different PID/start_time are left in + place (they may still be consumed legitimately by the process they + name) but report False here. + + Returns False (without raising) on any read/parse error. + """ + path = _get_planned_stop_marker_path() + record = _read_json_file(path) + if not record: + return False + + try: + target_pid = int(record["target_pid"]) + target_start_time = record.get("target_start_time") + written_at = record.get("written_at") or "" + except (KeyError, TypeError, ValueError): + # Malformed marker can never match anyone — drop it. + try: + path.unlink(missing_ok=True) + except OSError: + pass + return False + + if _marker_is_stale(written_at, _PLANNED_STOP_MARKER_TTL_S): + # A marker this old is past its useful life regardless of target — + # clean it up so it cannot crash-loop a freshly booted gateway. + try: + path.unlink(missing_ok=True) + except OSError: + pass + return False + + our_pid = os.getpid() + if target_pid != our_pid: + return False + + # Start-time is a PID-reuse guard. It is only meaningful when both + # sides actually have it: ``_get_process_start_time`` returns None on + # platforms without ``/proc`` (macOS, native Windows — the very + # platform this watcher exists for). Requiring a non-None match there + # would make the watcher never fire and re-break the #33778 Windows + # session-resume path. So: when both start_times are known they must + # match; when either is unknown, fall back to PID equality alone + # (the marker is short-lived under a 60s TTL, bounding reuse risk). + our_start_time = _get_process_start_time(our_pid) + if target_start_time is not None and our_start_time is not None: + return target_start_time == our_start_time + return True + + def clear_planned_stop_marker() -> None: """Remove the planned-stop marker unconditionally.""" try: diff --git a/tests/gateway/test_planned_stop_watcher.py b/tests/gateway/test_planned_stop_watcher.py index bee5802b803..451a3d8f8a7 100644 --- a/tests/gateway/test_planned_stop_watcher.py +++ b/tests/gateway/test_planned_stop_watcher.py @@ -12,12 +12,33 @@ See issue #33778 for the original Windows session-loss bug report. """ import asyncio +import json +import os import threading import time from unittest.mock import MagicMock from gateway.run import _run_planned_stop_watcher +from gateway import status as status_mod + + +def _write_self_marker(marker, *, stale: bool = False): + """Write a planned-stop marker that targets the CURRENT process. + + The watcher only fires for markers naming our PID + start_time (the + fix for issue #34597), so tests that expect a fire must write a + self-targeting marker. Pass ``stale=True`` to backdate ``written_at`` + past the TTL. + """ + written_at = "2000-01-01T00:00:00+00:00" if stale else status_mod._utc_now_iso() + record = { + "target_pid": os.getpid(), + "target_start_time": status_mod._get_process_start_time(os.getpid()), + "stopper_pid": os.getpid(), + "written_at": written_at, + } + marker.write_text(json.dumps(record), encoding="utf-8") class _FakeRunner: @@ -41,11 +62,10 @@ def _make_loop_capturing_calls(): def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch): - """When the marker file exists, the watcher must call the shutdown handler.""" + """When a marker targeting THIS process exists, fire the shutdown handler.""" marker = tmp_path / ".gateway-planned-stop.json" # Patch the marker-path resolver so the watcher polls our temp location. - from gateway import status as status_mod monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) runner = _FakeRunner(running=True, draining=False) @@ -53,8 +73,8 @@ def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch): shutdown_handler = MagicMock(name="shutdown_signal_handler") stop_event = threading.Event() - # Drop the marker before the thread starts. - marker.write_text('{"target_pid": 1234}', encoding="utf-8") + # Drop a self-targeting marker before the thread starts. + _write_self_marker(marker) watcher = threading.Thread( target=_run_planned_stop_watcher, @@ -114,9 +134,8 @@ def test_watcher_skips_when_runner_already_draining(tmp_path, monkeypatch): so the watcher backs off once any shutdown is in flight. """ marker = tmp_path / ".gateway-planned-stop.json" - marker.write_text('{"target_pid": 1234}', encoding="utf-8") + _write_self_marker(marker) - from gateway import status as status_mod monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) # Already draining — watcher should be a no-op. @@ -204,9 +223,8 @@ def test_watcher_fires_only_once_when_marker_persists(tmp_path, monkeypatch): times before the gateway actually shuts down. """ marker = tmp_path / ".gateway-planned-stop.json" - marker.write_text('{"target_pid": 1234}', encoding="utf-8") + _write_self_marker(marker) - from gateway import status as status_mod monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) runner = _FakeRunner(running=True, draining=False) @@ -263,3 +281,113 @@ def test_watcher_tolerates_marker_path_resolution_errors(tmp_path, monkeypatch, assert not watcher.is_alive(), "Watcher should still honour stop_event after errors" # No shutdown fired because the marker never reported existence. assert loop._captured == [] + + +# --------------------------------------------------------------------------- +# Regression coverage for issue #34597: +# A marker left behind by a PREVIOUS gateway instance (different PID, or +# past its TTL) must NOT crash the freshly booted gateway. The watcher +# only fires when the marker targets the current process, and self-heals +# by cleaning up stale/malformed markers. +# --------------------------------------------------------------------------- + + +def test_watcher_does_not_fire_for_foreign_pid_marker(tmp_path, monkeypatch): + """A marker naming a DIFFERENT process must not trigger our shutdown. + + This is the core #34597 regression: a stale marker from a prior + gateway instance was firing the handler, driving the new gateway into + a false "Received UNKNOWN" shutdown and a watchdog crash loop. + """ + marker = tmp_path / ".gateway-planned-stop.json" + # Foreign PID + a start_time that cannot match ours, freshly written + # so the TTL does NOT remove it — the watcher must still decline. + record = { + "target_pid": os.getpid() + 1, + "target_start_time": -1, + "stopper_pid": os.getpid() + 1, + "written_at": status_mod._utc_now_iso(), + } + marker.write_text(json.dumps(record), encoding="utf-8") + + monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) + + runner = _FakeRunner(running=True, draining=False) + loop = _make_loop_capturing_calls() + shutdown_handler = MagicMock(name="shutdown_signal_handler") + stop_event = threading.Event() + + watcher = threading.Thread( + target=_run_planned_stop_watcher, + args=(stop_event, runner, loop, shutdown_handler), + kwargs={"poll_interval": 0.05}, + daemon=True, + ) + watcher.start() + time.sleep(0.3) # several poll cycles + stop_event.set() + watcher.join(timeout=2.0) + + assert not watcher.is_alive() + assert loop._captured == [], ( + f"Watcher fired on a foreign-PID marker (#34597 regression): {loop._captured}" + ) + shutdown_handler.assert_not_called() + # Foreign (but live) marker is left in place — it may still belong to + # the process it names. + assert marker.exists() + + +def test_watcher_cleans_up_stale_marker_and_keeps_running(tmp_path, monkeypatch): + """A marker older than the TTL is unlinked and never fires shutdown.""" + marker = tmp_path / ".gateway-planned-stop.json" + # Self-targeting but backdated past the TTL: must be treated as dead. + _write_self_marker(marker, stale=True) + + monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) + + runner = _FakeRunner(running=True, draining=False) + loop = _make_loop_capturing_calls() + shutdown_handler = MagicMock(name="shutdown_signal_handler") + stop_event = threading.Event() + + watcher = threading.Thread( + target=_run_planned_stop_watcher, + args=(stop_event, runner, loop, shutdown_handler), + kwargs={"poll_interval": 0.05}, + daemon=True, + ) + watcher.start() + time.sleep(0.3) + stop_event.set() + watcher.join(timeout=2.0) + + assert not watcher.is_alive() + assert loop._captured == [], "Stale marker must not fire shutdown" + shutdown_handler.assert_not_called() + assert not marker.exists(), "Stale marker should have been cleaned up" + + +def test_planned_stop_marker_targets_self_probe_is_non_destructive(tmp_path, monkeypatch): + """The probe returns True for a self-marker WITHOUT unlinking it. + + The shutdown handler performs the authoritative consume on its own + thread, so the watcher's probe must leave a matching marker intact. + """ + marker = tmp_path / ".gateway-planned-stop.json" + _write_self_marker(marker) + monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) + + assert status_mod.planned_stop_marker_targets_self() is True + assert marker.exists(), "Probe must not consume a matching marker" + # Idempotent: still True on a second call. + assert status_mod.planned_stop_marker_targets_self() is True + + +def test_planned_stop_marker_targets_self_drops_malformed(tmp_path, monkeypatch): + """A malformed marker reports False and is cleaned up.""" + marker = tmp_path / ".gateway-planned-stop.json" + marker.write_text("{not valid json", encoding="utf-8") + monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker) + + assert status_mod.planned_stop_marker_targets_self() is False diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index b92c0cd4d11..ae378e0b753 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -707,6 +707,33 @@ class TestTakeoverMarker: assert result is False + def test_consume_returns_true_on_windows_when_start_time_unavailable( + self, tmp_path, monkeypatch + ): + """Takeover consume must also recognise a self-marker on platforms + without ``/proc`` (macOS / native Windows). + + ``consume_takeover_marker_for_self`` shares ``_consume_pid_marker_for_self`` + with the planned-stop path, so the same start_time fallback applies: + a ``--replace`` SIGTERM on Windows (where start_time is None on both + sides) must be recognised as a planned takeover and exit 0, not be + misclassified as an unexpected UNKNOWN exit. With start_time + unavailable we fall back to PID equality alone, bounded by the TTL. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + # Simulate Windows: no start_time available for any PID. + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + + ok = status.write_takeover_marker(target_pid=os.getpid()) + assert ok is True + payload = json.loads((tmp_path / ".gateway-takeover.json").read_text()) + assert payload["target_start_time"] is None + + result = status.consume_takeover_marker_for_self() + + assert result is True + assert not (tmp_path / ".gateway-takeover.json").exists() + def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch): monkeypatch.setenv("HERMES_HOME", str(tmp_path)) @@ -899,6 +926,74 @@ class TestPlannedStopMarker: assert ok is False + def test_consume_returns_true_on_windows_when_start_time_unavailable( + self, tmp_path, monkeypatch + ): + """Regression for #34597: a legitimate stop must be recognised on + platforms without ``/proc``. + + ``_get_process_start_time`` returns None on macOS / native Windows + (no ``/proc//stat``). The planned-stop watcher only runs there, + so if the authoritative consume required a non-None start_time match + it would always return False — and ``hermes gateway stop`` would be + misclassified as an unexpected ``UNKNOWN`` exit, exit 1, and revived + by the service manager (the very crash loop #34597 set out to fix). + With start_time unavailable on BOTH sides we fall back to PID + equality alone, bounded by the marker TTL. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + # Simulate Windows: no start_time available for any PID. + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + + ok = status.write_planned_stop_marker(target_pid=os.getpid()) + assert ok is True + # Marker carries a null start_time, exactly as written on Windows. + payload = json.loads((tmp_path / ".gateway-planned-stop.json").read_text()) + assert payload["target_start_time"] is None + + result = status.consume_planned_stop_marker_for_self() + + assert result is True + assert not (tmp_path / ".gateway-planned-stop.json").exists() + + def test_consume_still_rejects_foreign_pid_when_start_time_unavailable( + self, tmp_path, monkeypatch + ): + """The PID-only fallback must NOT match a marker naming another PID. + + Falling back to PID equality when start_time is unknown must remain + a PID check — a marker for a different process is never ours. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + + ok = status.write_planned_stop_marker(target_pid=os.getpid() + 9999) + assert ok is True + + result = status.consume_planned_stop_marker_for_self() + + assert result is False + + def test_consume_still_rejects_start_time_mismatch_when_both_known( + self, tmp_path, monkeypatch + ): + """PID-reuse defence is preserved when BOTH start_times are present. + + The Windows fallback only relaxes matching when a start_time is + unavailable. When both sides report one (Linux), a mismatch must + still reject — otherwise PID reuse could resurrect a stale marker. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100) + status.write_planned_stop_marker(target_pid=os.getpid()) + + # Simulate PID reuse: same PID, different start_time. + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999) + + result = status.consume_planned_stop_marker_for_self() + + assert result is False + class TestReadProcessCmdlinePsFallback: """Tests for _read_process_cmdline falling back to ps on non-Linux."""