mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-08 08:11:38 +00:00
fix(gateway): drain on Windows hermes gateway stop so sessions survive restart (#33798)
Sessions now survive `hermes gateway stop` / `restart` on native Windows. Previously the gateway died on schtasks `/End` + os.kill SIGTERM without ever running the drain loop, so the v0.13.0 session-resume feature (#21192) silently broke on Windows: `resume_pending=True` was never written, and the next boot started with a blank conversation history (issue #33778). The root cause is twofold and the reporter only identified half of it: 1. `hermes_cli/gateway_windows.py::stop()` did not write the `planned_stop_marker` before signalling. The reporter caught this. 2. The bigger reason: `loop.add_signal_handler` raises NotImplementedError for SIGTERM/SIGINT on Windows, so even if the marker had been written, the gateway's existing SIGTERM handler (which is what calls `runner.stop()` and the `mark_resume_pending` loop) was never invoked. Writing the marker would have been necessary-but-insufficient. The fix has two parts: * gateway/run.py: new `_run_planned_stop_watcher` daemon thread polls for the planned-stop marker file every 0.5s. When the marker appears it calls `loop.call_soon_threadsafe(shutdown_signal_handler, None)`, driving the same shutdown path a real SIGTERM would have driven, including the pre-drain `mark_resume_pending` writes (run.py:5977) and graceful drain wait. The existing signal handler already accepts `received_signal=None` and falls through to `consume_planned_stop_marker_for_self()`, so no handler changes were needed. Runs on every platform as cheap belt-and-suspenders. * hermes_cli/gateway_windows.py: `stop()` now writes the marker for the running gateway PID and waits up to `agent.restart_drain_timeout` (default 30s) for the PID to exit cleanly. On clean drain, the kill sweep is non-forceful; on timeout, escalates to `kill_gateway_processes(force=True)` which routes to taskkill /T /F per `references/windows-native-support.md`.
Validation: * 7 new tests in tests/gateway/test_planned_stop_watcher.py covering: marker→handler dispatch, no-marker idle, already-draining skip, not-yet-running skip, stop_event responsiveness, fire-once semantics, error tolerance. * 8 new tests in tests/hermes_cli/test_gateway_windows.py covering: marker-before-kill ordering, clean-drain skips force-kill, drain-timeout escalates to force=True, no-pid-skips-drain, invalid-pid handling, fast-exit success, timeout failure, marker-write-failure tolerance. * E2E (Linux, detached orphan): write_planned_stop_marker(pid) + `_drain_gateway_pid(pid, 5.0)` returns True in 0.5s after the victim sees the marker and exits. Tested with a double-forked subprocess so the test parent isn't holding it as a zombie. * Targeted: tests/gateway/{restart_drain,restart_resume_pending, signal,signal_format,status,shutdown_forensics,approve_deny_commands, planned_stop_watcher} + tests/hermes_cli/{gateway_windows, gateway_service} → 519/519. What was wrong with the reporter's claim (for future archaeology): they described the symptom as "no `resume_pending=True` written to `sessions.json`" — but Hermes uses `state.db` (SQLite), not `sessions.json`, and `mark_resume_pending` is called regardless of the marker (the marker only affects exit code 0 vs 1 for systemd revival semantics). The real session-loss path is the missing drain on Windows, not a missing marker. Both halves are fixed here. Closes #33778.
This commit is contained in:
parent
f8896dedc8
commit
10ee4a729b
4 changed files with 649 additions and 9 deletions
|
|
@ -1014,12 +1014,70 @@ def start() -> None:
|
|||
_report_gateway_start(f"direct spawn (PID {pid})")
|
||||
|
||||
|
||||
def stop() -> None:
|
||||
"""Stop the gateway. Tries /End on the scheduled task, then kills any stragglers."""
|
||||
_assert_windows()
|
||||
from hermes_cli.gateway import kill_gateway_processes
|
||||
def _drain_gateway_pid(pid: int, drain_timeout: float) -> bool:
|
||||
"""Write the planned-stop marker and wait for the gateway PID to exit.
|
||||
|
||||
stopped_any = False
|
||||
Windows cannot deliver POSIX signals to a Python asyncio loop
|
||||
(``loop.add_signal_handler`` raises NotImplementedError), so writing
|
||||
the marker is the ONLY way to ask a running gateway to drain
|
||||
in-flight agents and persist ``resume_pending`` before exit. The
|
||||
gateway's planned-stop watcher thread (gateway/run.py) polls for
|
||||
the marker and drives the same shutdown path the SIGTERM handler
|
||||
would have on POSIX.
|
||||
|
||||
Returns True if the PID exited within the timeout, False if it
|
||||
didn't (caller should escalate to schtasks /End + taskkill).
|
||||
"""
|
||||
if pid <= 0:
|
||||
return False
|
||||
try:
|
||||
from gateway.status import write_planned_stop_marker, _pid_exists
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
try:
|
||||
write_planned_stop_marker(pid)
|
||||
except Exception:
|
||||
# Best-effort: if the marker can't be written, we have no choice
|
||||
# but to fall through to a hard kill. Caller decides escalation.
|
||||
pass
|
||||
|
||||
deadline = time.monotonic() + max(drain_timeout, 1.0)
|
||||
while time.monotonic() < deadline:
|
||||
if not _pid_exists(pid):
|
||||
return True
|
||||
time.sleep(0.5)
|
||||
return False
|
||||
|
||||
|
||||
def stop() -> None:
|
||||
"""Stop the gateway.
|
||||
|
||||
Writes the planned-stop marker first so the gateway can drain
|
||||
in-flight agents and persist ``resume_pending`` before exit (the
|
||||
gateway's marker-watcher thread picks this up — Windows asyncio
|
||||
can't deliver SIGTERM to the loop, so the marker is our only IPC).
|
||||
Then escalates: ``schtasks /End`` (kills the scheduled-task tree)
|
||||
+ ``kill_gateway_processes(force=True)`` for any strays.
|
||||
"""
|
||||
_assert_windows()
|
||||
from hermes_cli.gateway import kill_gateway_processes, _get_restart_drain_timeout
|
||||
from gateway.status import get_running_pid
|
||||
|
||||
# Phase 1: ask the running gateway (if any) to drain itself by writing
|
||||
# the planned-stop marker, then wait briefly for it to exit cleanly.
|
||||
# On clean exit, sessions land with resume_pending=True and the next
|
||||
# boot will auto-resume them.
|
||||
pid = get_running_pid()
|
||||
drained = False
|
||||
if pid is not None:
|
||||
try:
|
||||
drain_timeout = float(_get_restart_drain_timeout() or 30.0)
|
||||
except Exception:
|
||||
drain_timeout = 30.0
|
||||
drained = _drain_gateway_pid(pid, drain_timeout)
|
||||
|
||||
stopped_any = drained
|
||||
if is_task_registered():
|
||||
code, _out, err = _exec_schtasks(["/End", "/TN", get_task_name()])
|
||||
# schtasks returns nonzero when the task isn't currently running — don't treat that as an error.
|
||||
|
|
@ -1028,12 +1086,19 @@ def stop() -> None:
|
|||
elif "not running" not in (err or "").lower():
|
||||
print(f"⚠ schtasks /End returned code {code}: {err.strip()}")
|
||||
|
||||
killed = kill_gateway_processes(all_profiles=False)
|
||||
# Phase 3: hard-kill any strays. When drain succeeded this is a no-op;
|
||||
# when drain timed out this is the escalation that ensures the PID
|
||||
# actually exits. Use force=True on Windows so taskkill /T /F walks
|
||||
# the descendant tree (browser helpers, etc.).
|
||||
killed = kill_gateway_processes(all_profiles=False, force=not drained)
|
||||
if killed:
|
||||
stopped_any = True
|
||||
print(f"✓ Killed {killed} gateway process(es)")
|
||||
if stopped_any:
|
||||
print("✓ Gateway stopped")
|
||||
if drained:
|
||||
print("✓ Gateway stopped (drained cleanly)")
|
||||
else:
|
||||
print("✓ Gateway stopped")
|
||||
else:
|
||||
print("✗ No gateway was running")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue