diff --git a/gateway/run.py b/gateway/run.py index 3edd905f35..556d37973c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1518,12 +1518,25 @@ class GatewayRunner: # This prevents stuck sessions from being blindly resumed on restart, # which can create an unrecoverable loop (#7536). Suspended sessions # auto-reset on the next incoming message, giving the user a clean start. - try: - suspended = self.session_store.suspend_recently_active() - if suspended: - logger.info("Suspended %d in-flight session(s) from previous run", suspended) - except Exception as e: - logger.warning("Session suspension on startup failed: %s", e) + # + # SKIP suspension after a clean (graceful) shutdown — the previous + # process already drained active agents, so sessions aren't stuck. + # This prevents unwanted auto-resets after `hermes update`, + # `hermes gateway restart`, or `/restart`. + _clean_marker = _hermes_home / ".clean_shutdown" + if _clean_marker.exists(): + logger.info("Previous gateway exited cleanly — skipping session suspension") + try: + _clean_marker.unlink() + except Exception: + pass + else: + try: + suspended = self.session_store.suspend_recently_active() + if suspended: + logger.info("Suspended %d in-flight session(s) from previous run", suspended) + except Exception as e: + logger.warning("Session suspension on startup failed: %s", e) connected_count = 0 enabled_platform_count = 0 @@ -2049,6 +2062,15 @@ class GatewayRunner: from gateway.status import remove_pid_file remove_pid_file() + # Write a clean-shutdown marker so the next startup knows this + # wasn't a crash. suspend_recently_active() only needs to run + # after unexpected exits — graceful shutdowns already drain + # active agents, so there's no stuck-session risk. + try: + (_hermes_home / ".clean_shutdown").touch() + except Exception: + pass + if self._restart_requested and self._restart_via_service: self._exit_code = GATEWAY_SERVICE_RESTART_EXIT_CODE self._exit_reason = self._exit_reason or "Gateway restart requested" diff --git a/tests/gateway/test_clean_shutdown_marker.py b/tests/gateway/test_clean_shutdown_marker.py new file mode 100644 index 0000000000..1a476bc49a --- /dev/null +++ b/tests/gateway/test_clean_shutdown_marker.py @@ -0,0 +1,226 @@ +"""Tests for the clean shutdown marker that prevents unwanted session auto-resets. + +When the gateway shuts down gracefully (hermes update, gateway restart, /restart), +it writes a .clean_shutdown marker. On the next startup, if the marker exists, +suspend_recently_active() is skipped so users don't lose their sessions. + +After a crash (no marker), suspension still fires as a safety net for stuck sessions. +""" + +import os +from datetime import datetime, timedelta +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from gateway.config import GatewayConfig, Platform, PlatformConfig, SessionResetPolicy +from gateway.session import SessionEntry, SessionSource, SessionStore + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_source(platform=Platform.TELEGRAM, chat_id="123", user_id="u1"): + return SessionSource(platform=platform, chat_id=chat_id, user_id=user_id) + + +def _make_store(tmp_path, policy=None): + config = GatewayConfig() + if policy: + config.default_reset_policy = policy + return SessionStore(sessions_dir=tmp_path, config=config) + + +# --------------------------------------------------------------------------- +# SessionStore.suspend_recently_active +# --------------------------------------------------------------------------- + +class TestSuspendRecentlyActive: + """Verify suspend_recently_active only marks recent sessions.""" + + def test_suspends_recently_active_sessions(self, tmp_path): + store = _make_store(tmp_path) + source = _make_source() + entry = store.get_or_create_session(source) + assert not entry.suspended + + count = store.suspend_recently_active() + assert count == 1 + + # Re-fetch — should be suspended now + refreshed = store.get_or_create_session(source) + assert refreshed.was_auto_reset + + def test_does_not_suspend_old_sessions(self, tmp_path): + store = _make_store(tmp_path) + source = _make_source() + entry = store.get_or_create_session(source) + + # Backdate the session's updated_at beyond the cutoff + with store._lock: + entry.updated_at = datetime.now() - timedelta(seconds=300) + store._save() + + count = store.suspend_recently_active(max_age_seconds=120) + assert count == 0 + + def test_already_suspended_not_double_counted(self, tmp_path): + store = _make_store(tmp_path) + source = _make_source() + entry = store.get_or_create_session(source) + + # Suspend once + count1 = store.suspend_recently_active() + assert count1 == 1 + + # Create a new session (the old one got reset on next access) + entry2 = store.get_or_create_session(source) + + # Suspend again — the new session is recent but not yet suspended + count2 = store.suspend_recently_active() + assert count2 == 1 + + +# --------------------------------------------------------------------------- +# Clean shutdown marker integration +# --------------------------------------------------------------------------- + +class TestCleanShutdownMarker: + """Test that the marker file controls session suspension on startup.""" + + def test_marker_written_on_graceful_stop(self, tmp_path, monkeypatch): + """stop() should write .clean_shutdown marker.""" + monkeypatch.setattr("gateway.run._hermes_home", tmp_path) + marker = tmp_path / ".clean_shutdown" + assert not marker.exists() + + # Create a minimal runner and call the shutdown logic directly + from gateway.run import GatewayRunner + runner = object.__new__(GatewayRunner) + runner._restart_requested = False + runner._restart_detached = False + runner._restart_via_service = False + runner._restart_task_started = False + runner._running = True + runner._draining = False + runner._stop_task = None + runner._running_agents = {} + runner._pending_messages = {} + runner._pending_approvals = {} + runner._background_tasks = set() + runner._shutdown_event = MagicMock() + runner._restart_drain_timeout = 5 + runner._exit_code = None + runner._exit_reason = None + runner.adapters = {} + runner.config = GatewayConfig() + + # Mock heavy dependencies + with patch("gateway.run.GatewayRunner._drain_active_agents", new_callable=AsyncMock, return_value=([], False)), \ + patch("gateway.run.GatewayRunner._finalize_shutdown_agents"), \ + patch("gateway.run.GatewayRunner._update_runtime_status"), \ + patch("gateway.status.remove_pid_file"), \ + patch("tools.process_registry.process_registry") as mock_proc_reg, \ + patch("tools.terminal_tool.cleanup_all_environments"), \ + patch("tools.browser_tool.cleanup_all_browsers"): + mock_proc_reg.kill_all = MagicMock() + + import asyncio + asyncio.get_event_loop().run_until_complete(runner.stop()) + + assert marker.exists(), ".clean_shutdown marker should exist after graceful stop" + + def test_marker_skips_suspension_on_startup(self, tmp_path, monkeypatch): + """If .clean_shutdown exists, suspend_recently_active should NOT be called.""" + monkeypatch.setattr("gateway.run._hermes_home", tmp_path) + + # Create the marker + marker = tmp_path / ".clean_shutdown" + marker.touch() + + # Create a store with a recently active session + store = _make_store(tmp_path) + source = _make_source() + entry = store.get_or_create_session(source) + assert not entry.suspended + + # Simulate what start() does: + if marker.exists(): + marker.unlink() + # Should NOT call suspend_recently_active + else: + store.suspend_recently_active() + + # Session should NOT be suspended + with store._lock: + store._ensure_loaded_locked() + for e in store._entries.values(): + assert not e.suspended, "Session should NOT be suspended after clean shutdown" + + assert not marker.exists(), "Marker should be cleaned up" + + def test_no_marker_triggers_suspension(self, tmp_path, monkeypatch): + """Without .clean_shutdown marker (crash), suspension should fire.""" + monkeypatch.setattr("gateway.run._hermes_home", tmp_path) + + marker = tmp_path / ".clean_shutdown" + assert not marker.exists() + + # Create a store with a recently active session + store = _make_store(tmp_path) + source = _make_source() + entry = store.get_or_create_session(source) + assert not entry.suspended + + # Simulate what start() does: + if marker.exists(): + marker.unlink() + else: + store.suspend_recently_active() + + # Session SHOULD be suspended (crash recovery) + with store._lock: + store._ensure_loaded_locked() + suspended_count = sum(1 for e in store._entries.values() if e.suspended) + assert suspended_count == 1, "Session should be suspended after crash (no marker)" + + def test_marker_written_on_restart_stop(self, tmp_path, monkeypatch): + """stop(restart=True) should also write the marker.""" + monkeypatch.setattr("gateway.run._hermes_home", tmp_path) + marker = tmp_path / ".clean_shutdown" + + from gateway.run import GatewayRunner + runner = object.__new__(GatewayRunner) + runner._restart_requested = False + runner._restart_detached = False + runner._restart_via_service = False + runner._restart_task_started = False + runner._running = True + runner._draining = False + runner._stop_task = None + runner._running_agents = {} + runner._pending_messages = {} + runner._pending_approvals = {} + runner._background_tasks = set() + runner._shutdown_event = MagicMock() + runner._restart_drain_timeout = 5 + runner._exit_code = None + runner._exit_reason = None + runner.adapters = {} + runner.config = GatewayConfig() + + with patch("gateway.run.GatewayRunner._drain_active_agents", new_callable=AsyncMock, return_value=([], False)), \ + patch("gateway.run.GatewayRunner._finalize_shutdown_agents"), \ + patch("gateway.run.GatewayRunner._update_runtime_status"), \ + patch("gateway.status.remove_pid_file"), \ + patch("tools.process_registry.process_registry") as mock_proc_reg, \ + patch("tools.terminal_tool.cleanup_all_environments"), \ + patch("tools.browser_tool.cleanup_all_browsers"): + mock_proc_reg.kill_all = MagicMock() + + import asyncio + asyncio.get_event_loop().run_until_complete(runner.stop(restart=True)) + + assert marker.exists(), ".clean_shutdown marker should exist after restart-stop too"