mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(gateway): only fire planned-stop watcher for self-targeting markers + fix Windows consume (#34749)
* fix(gateway): only fire planned-stop watcher for markers targeting self Salvaged from #34599 — rebased onto current main. The planned-stop watcher now only fires shutdown for a marker that targets the current process, instead of any marker that exists on disk. Fixes the Windows crash loop (#34597) where a stale marker from a previous Gateway instance kills a freshly booted Gateway ~400ms after start with a false "Received UNKNOWN — initiating shutdown". Co-authored-by: Bartok9 <danielrpike9@gmail.com> * fix(gateway): match planned-stop/takeover markers by PID alone when start_time is unavailable Follow-up to the #34599 salvage. The watcher's non-destructive probe (planned_stop_marker_targets_self) already falls back to PID equality when a process start_time is unavailable, but the authoritative consume it gates (_consume_pid_marker_for_self) still required a non-None start_time match. _get_process_start_time reads /proc/<pid>/stat and returns None on macOS and native Windows — the only platform the planned-stop watcher exists for. So on Windows the probe would fire the shutdown handler (PID matches) but the handler's consume_planned_stop_marker_for_self() would return False, and a legitimate 'hermes gateway stop' was still misclassified as an unexpected UNKNOWN exit (exit 1) and revived by the service manager — a residual half of the #34597 crash loop on the legitimate-stop path. Align the consume with the probe: when both start_times are known they must match (PID-reuse guard preserved on Linux); when either is unavailable, fall back to PID equality alone, bounded by the existing short marker TTL. This also fixes the parallel --replace takeover consume on Windows, which shares the same helper. Adds regression tests for the Windows (None start_time) path, the foreign-PID rejection under that fallback, and confirmation the start_time-mismatch guard still rejects when both are known. --------- Co-authored-by: Bartok9 <danielrpike9@gmail.com>
This commit is contained in:
parent
0563ab0652
commit
7379f17556
4 changed files with 335 additions and 15 deletions
|
|
@ -18442,7 +18442,10 @@ def _run_planned_stop_watcher(
|
|||
poll_interval: seconds between marker checks. 0.5s gives a
|
||||
responsive shutdown without burning CPU.
|
||||
"""
|
||||
from gateway.status import _get_planned_stop_marker_path
|
||||
from gateway.status import (
|
||||
_get_planned_stop_marker_path,
|
||||
planned_stop_marker_targets_self,
|
||||
)
|
||||
marker_path = _get_planned_stop_marker_path()
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
|
|
@ -18451,6 +18454,26 @@ def _run_planned_stop_watcher(
|
|||
and not getattr(runner, "_draining", False)
|
||||
and getattr(runner, "_running", False)
|
||||
):
|
||||
# A marker existing is NOT sufficient — it may have been
|
||||
# written for a PREVIOUS gateway instance (different PID)
|
||||
# and left behind because that process exited before the
|
||||
# CLI's stop() could clean it up. Firing the handler on a
|
||||
# stale/foreign marker drives the gateway into shutdown,
|
||||
# then consume_planned_stop_marker_for_self() correctly
|
||||
# reports a PID mismatch — but by then we're already
|
||||
# stopping, so it's logged as an unexpected "UNKNOWN" exit
|
||||
# and the watchdog crash-loops the gateway (issue #34597,
|
||||
# a regression from PR #33798 which added this watcher
|
||||
# without the PID check).
|
||||
#
|
||||
# Only fire when the marker actually targets us. The probe
|
||||
# is non-destructive on a match (the handler does the
|
||||
# authoritative consume on the loop thread) and self-heals
|
||||
# by unlinking stale/malformed markers so they cannot wedge
|
||||
# a freshly booted gateway.
|
||||
if not planned_stop_marker_targets_self():
|
||||
stop_event.wait(poll_interval)
|
||||
continue
|
||||
# Drive the same path as a real signal handler.
|
||||
# Pass signal=None — the handler tolerates that and consumes
|
||||
# the marker via consume_planned_stop_marker_for_self,
|
||||
|
|
|
|||
|
|
@ -816,12 +816,24 @@ def _consume_pid_marker_for_self(
|
|||
|
||||
our_pid = os.getpid()
|
||||
our_start_time = _get_process_start_time(our_pid)
|
||||
matches = (
|
||||
target_pid == our_pid
|
||||
and target_start_time is not None
|
||||
and our_start_time is not None
|
||||
and target_start_time == our_start_time
|
||||
)
|
||||
# Start-time is a PID-reuse guard. It is only meaningful when both
|
||||
# sides actually have it: ``_get_process_start_time`` returns None on
|
||||
# platforms without ``/proc`` (macOS, native Windows — the very
|
||||
# platform the planned-stop watcher exists for). Requiring a non-None
|
||||
# match there would make every consume return False, so a legitimate
|
||||
# ``hermes gateway stop`` on Windows would be misclassified as an
|
||||
# unexpected ``UNKNOWN`` exit (exit 1) and revived by the service
|
||||
# manager. So: when both start_times are known they must match; when
|
||||
# either is unknown, fall back to PID equality alone (bounded by the
|
||||
# marker's short TTL). This mirrors ``planned_stop_marker_targets_self``
|
||||
# so the watcher's non-destructive probe and this authoritative
|
||||
# consume agree on every platform (issue #34597).
|
||||
if target_pid != our_pid:
|
||||
matches = False
|
||||
elif target_start_time is not None and our_start_time is not None:
|
||||
matches = target_start_time == our_start_time
|
||||
else:
|
||||
matches = True
|
||||
|
||||
try:
|
||||
path.unlink(missing_ok=True)
|
||||
|
|
@ -914,6 +926,68 @@ def consume_planned_stop_marker_for_self() -> bool:
|
|||
)
|
||||
|
||||
|
||||
def planned_stop_marker_targets_self() -> bool:
|
||||
"""Return True only when a live planned-stop marker names the current process.
|
||||
|
||||
This is a **non-destructive** probe used by the watcher thread
|
||||
(``gateway/run.py:_run_planned_stop_watcher``) to decide whether to
|
||||
trigger shutdown. Unlike :func:`consume_planned_stop_marker_for_self`,
|
||||
it never unlinks a marker that matches us — the shutdown handler does
|
||||
the authoritative consume on its own thread.
|
||||
|
||||
It *does* clean up markers that can never apply to this process:
|
||||
malformed markers and markers older than the TTL are unlinked so a
|
||||
stale file left behind by a previous gateway instance cannot wedge
|
||||
the new one. Markers naming a different PID/start_time are left in
|
||||
place (they may still be consumed legitimately by the process they
|
||||
name) but report False here.
|
||||
|
||||
Returns False (without raising) on any read/parse error.
|
||||
"""
|
||||
path = _get_planned_stop_marker_path()
|
||||
record = _read_json_file(path)
|
||||
if not record:
|
||||
return False
|
||||
|
||||
try:
|
||||
target_pid = int(record["target_pid"])
|
||||
target_start_time = record.get("target_start_time")
|
||||
written_at = record.get("written_at") or ""
|
||||
except (KeyError, TypeError, ValueError):
|
||||
# Malformed marker can never match anyone — drop it.
|
||||
try:
|
||||
path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
return False
|
||||
|
||||
if _marker_is_stale(written_at, _PLANNED_STOP_MARKER_TTL_S):
|
||||
# A marker this old is past its useful life regardless of target —
|
||||
# clean it up so it cannot crash-loop a freshly booted gateway.
|
||||
try:
|
||||
path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
return False
|
||||
|
||||
our_pid = os.getpid()
|
||||
if target_pid != our_pid:
|
||||
return False
|
||||
|
||||
# Start-time is a PID-reuse guard. It is only meaningful when both
|
||||
# sides actually have it: ``_get_process_start_time`` returns None on
|
||||
# platforms without ``/proc`` (macOS, native Windows — the very
|
||||
# platform this watcher exists for). Requiring a non-None match there
|
||||
# would make the watcher never fire and re-break the #33778 Windows
|
||||
# session-resume path. So: when both start_times are known they must
|
||||
# match; when either is unknown, fall back to PID equality alone
|
||||
# (the marker is short-lived under a 60s TTL, bounding reuse risk).
|
||||
our_start_time = _get_process_start_time(our_pid)
|
||||
if target_start_time is not None and our_start_time is not None:
|
||||
return target_start_time == our_start_time
|
||||
return True
|
||||
|
||||
|
||||
def clear_planned_stop_marker() -> None:
|
||||
"""Remove the planned-stop marker unconditionally."""
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -12,12 +12,33 @@ See issue #33778 for the original Windows session-loss bug report.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
from gateway.run import _run_planned_stop_watcher
|
||||
from gateway import status as status_mod
|
||||
|
||||
|
||||
def _write_self_marker(marker, *, stale: bool = False):
|
||||
"""Write a planned-stop marker that targets the CURRENT process.
|
||||
|
||||
The watcher only fires for markers naming our PID + start_time (the
|
||||
fix for issue #34597), so tests that expect a fire must write a
|
||||
self-targeting marker. Pass ``stale=True`` to backdate ``written_at``
|
||||
past the TTL.
|
||||
"""
|
||||
written_at = "2000-01-01T00:00:00+00:00" if stale else status_mod._utc_now_iso()
|
||||
record = {
|
||||
"target_pid": os.getpid(),
|
||||
"target_start_time": status_mod._get_process_start_time(os.getpid()),
|
||||
"stopper_pid": os.getpid(),
|
||||
"written_at": written_at,
|
||||
}
|
||||
marker.write_text(json.dumps(record), encoding="utf-8")
|
||||
|
||||
|
||||
class _FakeRunner:
|
||||
|
|
@ -41,11 +62,10 @@ def _make_loop_capturing_calls():
|
|||
|
||||
|
||||
def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch):
|
||||
"""When the marker file exists, the watcher must call the shutdown handler."""
|
||||
"""When a marker targeting THIS process exists, fire the shutdown handler."""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
|
||||
# Patch the marker-path resolver so the watcher polls our temp location.
|
||||
from gateway import status as status_mod
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
runner = _FakeRunner(running=True, draining=False)
|
||||
|
|
@ -53,8 +73,8 @@ def test_watcher_fires_shutdown_when_marker_appears(tmp_path, monkeypatch):
|
|||
shutdown_handler = MagicMock(name="shutdown_signal_handler")
|
||||
stop_event = threading.Event()
|
||||
|
||||
# Drop the marker before the thread starts.
|
||||
marker.write_text('{"target_pid": 1234}', encoding="utf-8")
|
||||
# Drop a self-targeting marker before the thread starts.
|
||||
_write_self_marker(marker)
|
||||
|
||||
watcher = threading.Thread(
|
||||
target=_run_planned_stop_watcher,
|
||||
|
|
@ -114,9 +134,8 @@ def test_watcher_skips_when_runner_already_draining(tmp_path, monkeypatch):
|
|||
so the watcher backs off once any shutdown is in flight.
|
||||
"""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
marker.write_text('{"target_pid": 1234}', encoding="utf-8")
|
||||
_write_self_marker(marker)
|
||||
|
||||
from gateway import status as status_mod
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
# Already draining — watcher should be a no-op.
|
||||
|
|
@ -204,9 +223,8 @@ def test_watcher_fires_only_once_when_marker_persists(tmp_path, monkeypatch):
|
|||
times before the gateway actually shuts down.
|
||||
"""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
marker.write_text('{"target_pid": 1234}', encoding="utf-8")
|
||||
_write_self_marker(marker)
|
||||
|
||||
from gateway import status as status_mod
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
runner = _FakeRunner(running=True, draining=False)
|
||||
|
|
@ -263,3 +281,113 @@ def test_watcher_tolerates_marker_path_resolution_errors(tmp_path, monkeypatch,
|
|||
assert not watcher.is_alive(), "Watcher should still honour stop_event after errors"
|
||||
# No shutdown fired because the marker never reported existence.
|
||||
assert loop._captured == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regression coverage for issue #34597:
|
||||
# A marker left behind by a PREVIOUS gateway instance (different PID, or
|
||||
# past its TTL) must NOT crash the freshly booted gateway. The watcher
|
||||
# only fires when the marker targets the current process, and self-heals
|
||||
# by cleaning up stale/malformed markers.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_watcher_does_not_fire_for_foreign_pid_marker(tmp_path, monkeypatch):
|
||||
"""A marker naming a DIFFERENT process must not trigger our shutdown.
|
||||
|
||||
This is the core #34597 regression: a stale marker from a prior
|
||||
gateway instance was firing the handler, driving the new gateway into
|
||||
a false "Received UNKNOWN" shutdown and a watchdog crash loop.
|
||||
"""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
# Foreign PID + a start_time that cannot match ours, freshly written
|
||||
# so the TTL does NOT remove it — the watcher must still decline.
|
||||
record = {
|
||||
"target_pid": os.getpid() + 1,
|
||||
"target_start_time": -1,
|
||||
"stopper_pid": os.getpid() + 1,
|
||||
"written_at": status_mod._utc_now_iso(),
|
||||
}
|
||||
marker.write_text(json.dumps(record), encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
runner = _FakeRunner(running=True, draining=False)
|
||||
loop = _make_loop_capturing_calls()
|
||||
shutdown_handler = MagicMock(name="shutdown_signal_handler")
|
||||
stop_event = threading.Event()
|
||||
|
||||
watcher = threading.Thread(
|
||||
target=_run_planned_stop_watcher,
|
||||
args=(stop_event, runner, loop, shutdown_handler),
|
||||
kwargs={"poll_interval": 0.05},
|
||||
daemon=True,
|
||||
)
|
||||
watcher.start()
|
||||
time.sleep(0.3) # several poll cycles
|
||||
stop_event.set()
|
||||
watcher.join(timeout=2.0)
|
||||
|
||||
assert not watcher.is_alive()
|
||||
assert loop._captured == [], (
|
||||
f"Watcher fired on a foreign-PID marker (#34597 regression): {loop._captured}"
|
||||
)
|
||||
shutdown_handler.assert_not_called()
|
||||
# Foreign (but live) marker is left in place — it may still belong to
|
||||
# the process it names.
|
||||
assert marker.exists()
|
||||
|
||||
|
||||
def test_watcher_cleans_up_stale_marker_and_keeps_running(tmp_path, monkeypatch):
|
||||
"""A marker older than the TTL is unlinked and never fires shutdown."""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
# Self-targeting but backdated past the TTL: must be treated as dead.
|
||||
_write_self_marker(marker, stale=True)
|
||||
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
runner = _FakeRunner(running=True, draining=False)
|
||||
loop = _make_loop_capturing_calls()
|
||||
shutdown_handler = MagicMock(name="shutdown_signal_handler")
|
||||
stop_event = threading.Event()
|
||||
|
||||
watcher = threading.Thread(
|
||||
target=_run_planned_stop_watcher,
|
||||
args=(stop_event, runner, loop, shutdown_handler),
|
||||
kwargs={"poll_interval": 0.05},
|
||||
daemon=True,
|
||||
)
|
||||
watcher.start()
|
||||
time.sleep(0.3)
|
||||
stop_event.set()
|
||||
watcher.join(timeout=2.0)
|
||||
|
||||
assert not watcher.is_alive()
|
||||
assert loop._captured == [], "Stale marker must not fire shutdown"
|
||||
shutdown_handler.assert_not_called()
|
||||
assert not marker.exists(), "Stale marker should have been cleaned up"
|
||||
|
||||
|
||||
def test_planned_stop_marker_targets_self_probe_is_non_destructive(tmp_path, monkeypatch):
|
||||
"""The probe returns True for a self-marker WITHOUT unlinking it.
|
||||
|
||||
The shutdown handler performs the authoritative consume on its own
|
||||
thread, so the watcher's probe must leave a matching marker intact.
|
||||
"""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
_write_self_marker(marker)
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
assert status_mod.planned_stop_marker_targets_self() is True
|
||||
assert marker.exists(), "Probe must not consume a matching marker"
|
||||
# Idempotent: still True on a second call.
|
||||
assert status_mod.planned_stop_marker_targets_self() is True
|
||||
|
||||
|
||||
def test_planned_stop_marker_targets_self_drops_malformed(tmp_path, monkeypatch):
|
||||
"""A malformed marker reports False and is cleaned up."""
|
||||
marker = tmp_path / ".gateway-planned-stop.json"
|
||||
marker.write_text("{not valid json", encoding="utf-8")
|
||||
monkeypatch.setattr(status_mod, "_get_planned_stop_marker_path", lambda: marker)
|
||||
|
||||
assert status_mod.planned_stop_marker_targets_self() is False
|
||||
|
|
|
|||
|
|
@ -707,6 +707,33 @@ class TestTakeoverMarker:
|
|||
|
||||
assert result is False
|
||||
|
||||
def test_consume_returns_true_on_windows_when_start_time_unavailable(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""Takeover consume must also recognise a self-marker on platforms
|
||||
without ``/proc`` (macOS / native Windows).
|
||||
|
||||
``consume_takeover_marker_for_self`` shares ``_consume_pid_marker_for_self``
|
||||
with the planned-stop path, so the same start_time fallback applies:
|
||||
a ``--replace`` SIGTERM on Windows (where start_time is None on both
|
||||
sides) must be recognised as a planned takeover and exit 0, not be
|
||||
misclassified as an unexpected UNKNOWN exit. With start_time
|
||||
unavailable we fall back to PID equality alone, bounded by the TTL.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
# Simulate Windows: no start_time available for any PID.
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||
|
||||
ok = status.write_takeover_marker(target_pid=os.getpid())
|
||||
assert ok is True
|
||||
payload = json.loads((tmp_path / ".gateway-takeover.json").read_text())
|
||||
assert payload["target_start_time"] is None
|
||||
|
||||
result = status.consume_takeover_marker_for_self()
|
||||
|
||||
assert result is True
|
||||
assert not (tmp_path / ".gateway-takeover.json").exists()
|
||||
|
||||
def test_consume_returns_false_when_marker_missing(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
|
|
@ -899,6 +926,74 @@ class TestPlannedStopMarker:
|
|||
|
||||
assert ok is False
|
||||
|
||||
def test_consume_returns_true_on_windows_when_start_time_unavailable(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""Regression for #34597: a legitimate stop must be recognised on
|
||||
platforms without ``/proc``.
|
||||
|
||||
``_get_process_start_time`` returns None on macOS / native Windows
|
||||
(no ``/proc/<pid>/stat``). The planned-stop watcher only runs there,
|
||||
so if the authoritative consume required a non-None start_time match
|
||||
it would always return False — and ``hermes gateway stop`` would be
|
||||
misclassified as an unexpected ``UNKNOWN`` exit, exit 1, and revived
|
||||
by the service manager (the very crash loop #34597 set out to fix).
|
||||
With start_time unavailable on BOTH sides we fall back to PID
|
||||
equality alone, bounded by the marker TTL.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
# Simulate Windows: no start_time available for any PID.
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||
|
||||
ok = status.write_planned_stop_marker(target_pid=os.getpid())
|
||||
assert ok is True
|
||||
# Marker carries a null start_time, exactly as written on Windows.
|
||||
payload = json.loads((tmp_path / ".gateway-planned-stop.json").read_text())
|
||||
assert payload["target_start_time"] is None
|
||||
|
||||
result = status.consume_planned_stop_marker_for_self()
|
||||
|
||||
assert result is True
|
||||
assert not (tmp_path / ".gateway-planned-stop.json").exists()
|
||||
|
||||
def test_consume_still_rejects_foreign_pid_when_start_time_unavailable(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""The PID-only fallback must NOT match a marker naming another PID.
|
||||
|
||||
Falling back to PID equality when start_time is unknown must remain
|
||||
a PID check — a marker for a different process is never ours.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||
|
||||
ok = status.write_planned_stop_marker(target_pid=os.getpid() + 9999)
|
||||
assert ok is True
|
||||
|
||||
result = status.consume_planned_stop_marker_for_self()
|
||||
|
||||
assert result is False
|
||||
|
||||
def test_consume_still_rejects_start_time_mismatch_when_both_known(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""PID-reuse defence is preserved when BOTH start_times are present.
|
||||
|
||||
The Windows fallback only relaxes matching when a start_time is
|
||||
unavailable. When both sides report one (Linux), a mismatch must
|
||||
still reject — otherwise PID reuse could resurrect a stale marker.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
||||
status.write_planned_stop_marker(target_pid=os.getpid())
|
||||
|
||||
# Simulate PID reuse: same PID, different start_time.
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
|
||||
|
||||
result = status.consume_planned_stop_marker_for_self()
|
||||
|
||||
assert result is False
|
||||
|
||||
|
||||
class TestReadProcessCmdlinePsFallback:
|
||||
"""Tests for _read_process_cmdline falling back to ps on non-Linux."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue