mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
When the gateway shuts down gracefully (hermes update, gateway restart, /restart), it now writes a .clean_shutdown marker file. On the next startup, if this marker exists, suspend_recently_active() is skipped and the marker is cleaned up. Previously, suspend_recently_active() fired on EVERY startup — including planned restarts from hermes update or hermes gateway restart. This caused users to lose their conversation history unexpectedly: the session would be marked as suspended, and the next message would trigger an auto-reset with a notification the user never asked for. The original purpose of suspend_recently_active() is crash recovery — preventing stuck sessions that were mid-processing when the gateway died unexpectedly. Graceful shutdowns already drain active agents via _drain_active_agents(), so there is no stuck-session risk. After a crash (no marker written), suspension still fires as before. Fixes the scenario where a user asks the agent to run hermes update, the gateway restarts, and the user's next message gets an unwanted 'Session automatically reset' notification with their history cleared.
226 lines
8.8 KiB
Python
226 lines
8.8 KiB
Python
"""Tests for the clean shutdown marker that prevents unwanted session auto-resets.
|
|
|
|
When the gateway shuts down gracefully (hermes update, gateway restart, /restart),
|
|
it writes a .clean_shutdown marker. On the next startup, if the marker exists,
|
|
suspend_recently_active() is skipped so users don't lose their sessions.
|
|
|
|
After a crash (no marker), suspension still fires as a safety net for stuck sessions.
|
|
"""
|
|
|
|
import os
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig, SessionResetPolicy
|
|
from gateway.session import SessionEntry, SessionSource, SessionStore
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_source(platform=Platform.TELEGRAM, chat_id="123", user_id="u1"):
    """Build a SessionSource for tests from the given platform, chat id and user id."""
    source_fields = {
        "platform": platform,
        "chat_id": chat_id,
        "user_id": user_id,
    }
    return SessionSource(**source_fields)
|
|
|
|
|
|
def _make_store(tmp_path, policy=None):
    """Create a SessionStore whose sessions directory is ``tmp_path``.

    A fresh GatewayConfig is used; when ``policy`` is truthy it is assigned to
    the config's ``default_reset_policy`` before the store is constructed.
    """
    gateway_config = GatewayConfig()
    if policy:
        gateway_config.default_reset_policy = policy
    store_kwargs = {"sessions_dir": tmp_path, "config": gateway_config}
    return SessionStore(**store_kwargs)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SessionStore.suspend_recently_active
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestSuspendRecentlyActive:
    """Exercise SessionStore.suspend_recently_active on fresh and backdated sessions."""

    def test_suspends_recently_active_sessions(self, tmp_path):
        """A just-created session is counted and the next fetch reports an auto-reset."""
        session_store = _make_store(tmp_path)
        src = _make_source()
        created = session_store.get_or_create_session(src)
        assert not created.suspended

        suspended_total = session_store.suspend_recently_active()
        assert suspended_total == 1

        # Fetching the same source again after suspension should be flagged
        # as an automatic reset.
        after_suspend = session_store.get_or_create_session(src)
        assert after_suspend.was_auto_reset

    def test_does_not_suspend_old_sessions(self, tmp_path):
        """A session last updated 300s ago is outside a 120s window and is not counted."""
        session_store = _make_store(tmp_path)
        stale = session_store.get_or_create_session(_make_source())

        # Push updated_at into the past, beyond the cutoff used below, and
        # persist the change while holding the store lock.
        with session_store._lock:
            stale.updated_at = datetime.now() - timedelta(seconds=300)
            session_store._save()

        assert session_store.suspend_recently_active(max_age_seconds=120) == 0

    def test_already_suspended_not_double_counted(self, tmp_path):
        """Each suspension pass counts exactly one session for the same source."""
        session_store = _make_store(tmp_path)
        src = _make_source()
        session_store.get_or_create_session(src)

        # First pass suspends the only session.
        first_pass = session_store.suspend_recently_active()
        assert first_pass == 1

        # Accessing the source again creates a new session (the old one got
        # reset on this access).
        session_store.get_or_create_session(src)

        # Second pass: the new session is recent but not yet suspended.
        second_pass = session_store.suspend_recently_active()
        assert second_pass == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Clean shutdown marker integration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCleanShutdownMarker:
    """Test that the .clean_shutdown marker file controls session suspension on startup."""

    @staticmethod
    def _make_runner():
        """Return a GatewayRunner created without running ``__init__``.

        ``object.__new__`` bypasses the real constructor, so the attributes
        these tests need for ``stop()`` are assigned by hand below.
        """
        # Function-scope import, matching how the tests imported it before.
        from gateway.run import GatewayRunner

        runner = object.__new__(GatewayRunner)
        runner._restart_requested = False
        runner._restart_detached = False
        runner._restart_via_service = False
        runner._restart_task_started = False
        runner._running = True
        runner._draining = False
        runner._stop_task = None
        runner._running_agents = {}
        runner._pending_messages = {}
        runner._pending_approvals = {}
        runner._background_tasks = set()
        runner._shutdown_event = MagicMock()
        runner._restart_drain_timeout = 5
        runner._exit_code = None
        runner._exit_reason = None
        runner.adapters = {}
        runner.config = GatewayConfig()
        return runner

    @staticmethod
    def _run_stop(runner, **stop_kwargs):
        """Run ``runner.stop(**stop_kwargs)`` to completion with heavy dependencies mocked.

        The coroutine is driven on a private event loop that is closed
        afterwards. This avoids ``asyncio.get_event_loop()``, whose use
        without a running loop is deprecated, and keeps the test independent
        of whatever loop state other tests left behind.
        """
        import asyncio

        with patch("gateway.run.GatewayRunner._drain_active_agents", new_callable=AsyncMock, return_value=([], False)), \
             patch("gateway.run.GatewayRunner._finalize_shutdown_agents"), \
             patch("gateway.run.GatewayRunner._update_runtime_status"), \
             patch("gateway.status.remove_pid_file"), \
             patch("tools.process_registry.process_registry") as mock_proc_reg, \
             patch("tools.terminal_tool.cleanup_all_environments"), \
             patch("tools.browser_tool.cleanup_all_browsers"):
            mock_proc_reg.kill_all = MagicMock()

            loop = asyncio.new_event_loop()
            try:
                loop.run_until_complete(runner.stop(**stop_kwargs))
            finally:
                # Always release the loop, even if stop() raises.
                loop.close()

    @staticmethod
    def _simulate_startup(marker, store):
        """Apply the marker check these tests model for gateway startup.

        If the marker exists it is removed and sessions are left alone;
        otherwise ``store.suspend_recently_active()`` is called.

        NOTE(review): this mirrors the expected start() logic instead of
        calling GatewayRunner.start(), so it cannot detect a regression in
        start() itself.
        """
        if marker.exists():
            # Clean shutdown: consume the marker, do NOT suspend sessions.
            marker.unlink()
        else:
            # No marker (crash): fall back to suspending recent sessions.
            store.suspend_recently_active()

    def test_marker_written_on_graceful_stop(self, tmp_path, monkeypatch):
        """stop() should write .clean_shutdown marker."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
        marker = tmp_path / ".clean_shutdown"
        assert not marker.exists()

        # Build a minimal runner and call the shutdown logic directly.
        self._run_stop(self._make_runner())

        assert marker.exists(), ".clean_shutdown marker should exist after graceful stop"

    def test_marker_skips_suspension_on_startup(self, tmp_path, monkeypatch):
        """If .clean_shutdown exists, suspend_recently_active should NOT be called."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)

        # Create the marker.
        marker = tmp_path / ".clean_shutdown"
        marker.touch()

        # Create a store with a recently active session.
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)
        assert not entry.suspended

        self._simulate_startup(marker, store)

        # Session should NOT be suspended.
        with store._lock:
            store._ensure_loaded_locked()
            for e in store._entries.values():
                assert not e.suspended, "Session should NOT be suspended after clean shutdown"

        assert not marker.exists(), "Marker should be cleaned up"

    def test_no_marker_triggers_suspension(self, tmp_path, monkeypatch):
        """Without .clean_shutdown marker (crash), suspension should fire."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)

        marker = tmp_path / ".clean_shutdown"
        assert not marker.exists()

        # Create a store with a recently active session.
        store = _make_store(tmp_path)
        source = _make_source()
        entry = store.get_or_create_session(source)
        assert not entry.suspended

        self._simulate_startup(marker, store)

        # Session SHOULD be suspended (crash recovery).
        with store._lock:
            store._ensure_loaded_locked()
            suspended_count = sum(1 for e in store._entries.values() if e.suspended)
        assert suspended_count == 1, "Session should be suspended after crash (no marker)"

    def test_marker_written_on_restart_stop(self, tmp_path, monkeypatch):
        """stop(restart=True) should also write the marker."""
        monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
        marker = tmp_path / ".clean_shutdown"
        # Same precondition as the graceful-stop test: the marker must be
        # produced by stop(), not left over from elsewhere.
        assert not marker.exists()

        self._run_stop(self._make_runner(), restart=True)

        assert marker.exists(), ".clean_shutdown marker should exist after restart-stop too"
|