"""Tests for the gateway platform reconnection watcher.""" import asyncio import time from unittest.mock import AsyncMock, MagicMock, patch import pytest from gateway.config import GatewayConfig, Platform, PlatformConfig from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult from gateway.run import GatewayRunner class StubAdapter(BasePlatformAdapter): """Adapter whose connect() result can be controlled.""" def __init__( self, *, platform=Platform.TELEGRAM, succeed=True, fatal_error=None, fatal_retryable=True, ): super().__init__(PlatformConfig(enabled=True, token="test"), platform) self._succeed = succeed self._fatal_error = fatal_error self._fatal_retryable = fatal_retryable async def connect(self): if self._fatal_error: self._set_fatal_error("test_error", self._fatal_error, retryable=self._fatal_retryable) return False return self._succeed async def disconnect(self): return None async def send(self, chat_id, content, reply_to=None, metadata=None): return SendResult(success=True, message_id="1") async def send_typing(self, chat_id, metadata=None): return None async def get_chat_info(self, chat_id): return {"id": chat_id} def _make_runner(): """Create a minimal GatewayRunner via object.__new__ to skip __init__.""" runner = object.__new__(GatewayRunner) runner.config = GatewayConfig( platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="test")} ) runner._running = True runner._shutdown_event = asyncio.Event() runner._exit_reason = None runner._exit_with_failure = False runner._exit_cleanly = False runner._failed_platforms = {} runner.adapters = {} runner.delivery_router = MagicMock() runner._running_agents = {} runner._pending_messages = {} runner._pending_approvals = {} runner._honcho_managers = {} runner._honcho_configs = {} runner._shutdown_all_gateway_honcho = lambda: None runner.session_store = MagicMock() return runner # --- Startup queueing --- class TestStartupPlatformIsolation: """Verify one blocked platform cannot prevent later platforms from starting.""" @pytest.mark.asyncio async def test_start_continues_after_platform_connect_timeout(self, tmp_path): """A timeout on Telegram should queue it and still connect Feishu.""" runner = _make_runner() runner.config = GatewayConfig( platforms={ Platform.TELEGRAM: PlatformConfig(enabled=True, token="test"), Platform.FEISHU: PlatformConfig(enabled=True, token="test"), }, sessions_dir=tmp_path, ) runner.hooks = MagicMock() runner.hooks.loaded_hooks = [] runner.hooks.emit = AsyncMock() runner._suspend_stuck_loop_sessions = MagicMock(return_value=0) runner._update_runtime_status = MagicMock() runner._update_platform_runtime_status = MagicMock() runner._sync_voice_mode_state_to_adapter = MagicMock() runner._send_update_notification = AsyncMock(return_value=True) runner._send_restart_notification = AsyncMock() adapters = { Platform.TELEGRAM: StubAdapter(platform=Platform.TELEGRAM), Platform.FEISHU: StubAdapter(platform=Platform.FEISHU), } runner._create_adapter = MagicMock( side_effect=lambda platform, _config: adapters[platform] ) runner._connect_adapter_with_timeout = AsyncMock( side_effect=[ TimeoutError("telegram connect timed out after 30s"), True, ] ) def fake_create_task(coro): coro.close() return MagicMock() with patch("gateway.status.write_runtime_status"): with patch("hermes_cli.plugins.discover_plugins"): with patch("hermes_cli.config.load_config", return_value={}): with patch("agent.shell_hooks.register_from_config"): with patch( "tools.process_registry.process_registry.recover_from_checkpoint", return_value=0, ): with patch( "gateway.channel_directory.build_channel_directory", new=AsyncMock(return_value={"platforms": {}}), ): with patch("gateway.run.asyncio.create_task", side_effect=fake_create_task): assert await runner.start() is True assert Platform.TELEGRAM in runner._failed_platforms assert Platform.FEISHU in runner.adapters assert Platform.TELEGRAM not in runner.adapters assert runner._create_adapter.call_count == 2 @pytest.mark.asyncio async def test_connect_adapter_timeout_raises_retryable_exception(self, monkeypatch): """The timeout helper turns a hanging connect into a caught startup error.""" runner = _make_runner() adapter = StubAdapter() async def hang(): await asyncio.sleep(60) return True adapter.connect = hang monkeypatch.setenv("HERMES_GATEWAY_PLATFORM_CONNECT_TIMEOUT", "0.001") with pytest.raises(TimeoutError, match="telegram connect timed out"): await runner._connect_adapter_with_timeout(adapter, Platform.TELEGRAM) class TestStartupFailureQueuing: """Verify that failed platforms are queued during startup.""" def test_failed_platform_queued_on_connect_failure(self): """When adapter.connect() returns False without fatal error, queue for retry.""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 1, "next_retry": time.monotonic() + 30, } assert Platform.TELEGRAM in runner._failed_platforms assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 1 def test_failed_platform_not_queued_for_nonretryable(self): """Non-retryable errors should not be in the retry queue.""" runner = _make_runner() # Simulate: adapter had a non-retryable error, wasn't queued assert Platform.TELEGRAM not in runner._failed_platforms # --- Reconnect watcher --- class TestPlatformReconnectWatcher: """Test the _platform_reconnect_watcher background task.""" @pytest.mark.asyncio async def test_reconnect_succeeds_on_retry(self): """Watcher should reconnect a failed platform when connect() succeeds.""" runner = _make_runner() runner._sync_voice_mode_state_to_adapter = MagicMock() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 1, "next_retry": time.monotonic() - 1, # Already past retry time } succeed_adapter = StubAdapter(succeed=True) real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter", return_value=succeed_adapter): with patch("gateway.run.build_channel_directory", create=True): # Run one iteration of the watcher then stop async def run_one_iteration(): runner._running = True # Patch the sleep to exit after first check call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() assert Platform.TELEGRAM not in runner._failed_platforms assert Platform.TELEGRAM in runner.adapters @pytest.mark.asyncio async def test_reconnect_nonretryable_removed_from_queue(self): """Non-retryable errors should remove the platform from the retry queue.""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 1, "next_retry": time.monotonic() - 1, } fail_adapter = StubAdapter( succeed=False, fatal_error="bad token", fatal_retryable=False ) real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter", return_value=fail_adapter): async def run_one_iteration(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() assert Platform.TELEGRAM not in runner._failed_platforms assert Platform.TELEGRAM not in runner.adapters @pytest.mark.asyncio async def test_reconnect_retryable_stays_in_queue(self): """Retryable failures should remain in the queue with incremented attempts.""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 1, "next_retry": time.monotonic() - 1, } fail_adapter = StubAdapter( succeed=False, fatal_error="DNS failure", fatal_retryable=True ) real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter", return_value=fail_adapter): async def run_one_iteration(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() assert Platform.TELEGRAM in runner._failed_platforms assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2 @pytest.mark.asyncio async def test_reconnect_pauses_after_circuit_breaker_threshold(self): """After enough consecutive retryable failures, the watcher should *pause* the platform (keep it in the queue but stop hammering it), not drop it. The user resumes via /platform resume. """ runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") # 9 prior attempts — the next failure will be the 10th and should # trip the circuit breaker. runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 9, "next_retry": time.monotonic() - 1, } fail_adapter = StubAdapter( succeed=False, fatal_error="DNS failure", fatal_retryable=True ) real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter", return_value=fail_adapter): async def run_one_iteration(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() # Platform stays in queue — paused, not dropped assert Platform.TELEGRAM in runner._failed_platforms info = runner._failed_platforms[Platform.TELEGRAM] assert info["paused"] is True assert info["attempts"] == 10 assert "pause_reason" in info @pytest.mark.asyncio async def test_reconnect_skips_paused_platforms(self): """A paused platform should not be retried by the watcher tick.""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 10, "next_retry": time.monotonic() - 1, # would normally retry now "paused": True, "pause_reason": "paused via /platform pause", } real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter") as mock_create: async def run_one_iteration(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() # Paused platform stays queued and was never touched assert Platform.TELEGRAM in runner._failed_platforms assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True mock_create.assert_not_called() @pytest.mark.asyncio async def test_reconnect_skips_when_not_time_yet(self): """Watcher should skip platforms whose next_retry is in the future.""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 1, "next_retry": time.monotonic() + 9999, # Far in the future } real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter") as mock_create: async def run_one_iteration(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() assert Platform.TELEGRAM in runner._failed_platforms mock_create.assert_not_called() @pytest.mark.asyncio async def test_no_failed_platforms_watcher_idles(self): """When no platforms are failed, watcher should just idle.""" runner = _make_runner() # No failed platforms real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter") as mock_create: async def run_briefly(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 2: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_briefly() mock_create.assert_not_called() @pytest.mark.asyncio async def test_adapter_create_returns_none(self): """If _create_adapter returns None, remove from queue (missing deps).""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, "attempts": 1, "next_retry": time.monotonic() - 1, } real_sleep = asyncio.sleep with patch.object(runner, "_create_adapter", return_value=None): async def run_one_iteration(): runner._running = True call_count = 0 async def fake_sleep(n): nonlocal call_count call_count += 1 if call_count > 1: runner._running = False await real_sleep(0) with patch("asyncio.sleep", side_effect=fake_sleep): await runner._platform_reconnect_watcher() await run_one_iteration() assert Platform.TELEGRAM not in runner._failed_platforms # --- Runtime disconnection queueing --- class TestRuntimeDisconnectQueuing: """Test that _handle_adapter_fatal_error queues retryable disconnections.""" @pytest.mark.asyncio async def test_retryable_runtime_error_queued_for_reconnect(self): """Retryable runtime errors should add the platform to _failed_platforms.""" runner = _make_runner() runner.stop = AsyncMock() adapter = StubAdapter(succeed=True) adapter._set_fatal_error("network_error", "DNS failure", retryable=True) runner.adapters[Platform.TELEGRAM] = adapter await runner._handle_adapter_fatal_error(adapter) assert Platform.TELEGRAM in runner._failed_platforms assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 0 @pytest.mark.asyncio async def test_nonretryable_runtime_error_not_queued(self): """Non-retryable runtime errors should not be queued for reconnection.""" runner = _make_runner() adapter = StubAdapter(succeed=True) adapter._set_fatal_error("auth_error", "bad token", retryable=False) runner.adapters[Platform.TELEGRAM] = adapter # Need to prevent stop() from running fully runner.stop = AsyncMock() await runner._handle_adapter_fatal_error(adapter) assert Platform.TELEGRAM not in runner._failed_platforms @pytest.mark.asyncio async def test_retryable_error_keeps_gateway_alive_when_all_down(self): """When all adapters fail at runtime with retryable errors, the gateway should stay alive and let the reconnect watcher recover them in the background. (Previously this exited-with-failure to trigger a systemd restart — that converted transient outages into infinite restart loops and killed in-process state.) """ runner = _make_runner() runner.stop = AsyncMock() adapter = StubAdapter(succeed=True) adapter._set_fatal_error("network_error", "DNS failure", retryable=True) runner.adapters[Platform.TELEGRAM] = adapter await runner._handle_adapter_fatal_error(adapter) # stop() should NOT be called — gateway stays alive for the watcher runner.stop.assert_not_called() assert runner._exit_with_failure is False assert Platform.TELEGRAM in runner._failed_platforms @pytest.mark.asyncio async def test_retryable_error_no_exit_when_other_adapters_still_connected(self): """Gateway should NOT exit if some adapters are still connected.""" runner = _make_runner() runner.stop = AsyncMock() failing_adapter = StubAdapter(succeed=True) failing_adapter._set_fatal_error("network_error", "DNS failure", retryable=True) runner.adapters[Platform.TELEGRAM] = failing_adapter # Another adapter is still connected healthy_adapter = StubAdapter(succeed=True) runner.adapters[Platform.DISCORD] = healthy_adapter await runner._handle_adapter_fatal_error(failing_adapter) # stop() should NOT have been called — Discord is still up runner.stop.assert_not_called() assert Platform.TELEGRAM in runner._failed_platforms @pytest.mark.asyncio async def test_nonretryable_error_triggers_shutdown(self): """Gateway should shut down when no adapters remain and nothing is queued.""" runner = _make_runner() runner.stop = AsyncMock() adapter = StubAdapter(succeed=True) adapter._set_fatal_error("auth_error", "bad token", retryable=False) runner.adapters[Platform.TELEGRAM] = adapter await runner._handle_adapter_fatal_error(adapter) runner.stop.assert_called_once() # --- Pause / resume circuit breaker --- class TestPauseResume: """Test the per-platform pause/resume helpers and slash command.""" def test_pause_marks_platform_paused(self): runner = _make_runner() runner._failed_platforms[Platform.TELEGRAM] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 3, "next_retry": time.monotonic() + 30, } runner._pause_failed_platform(Platform.TELEGRAM, reason="manual") info = runner._failed_platforms[Platform.TELEGRAM] assert info["paused"] is True assert info["pause_reason"] == "manual" assert info["next_retry"] == float("inf") def test_pause_is_idempotent(self): runner = _make_runner() runner._failed_platforms[Platform.TELEGRAM] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 3, "next_retry": time.monotonic() + 30, "paused": True, "pause_reason": "first reason", } runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason") # Reason should not be overwritten on a second pause call. assert ( runner._failed_platforms[Platform.TELEGRAM]["pause_reason"] == "first reason" ) def test_pause_no_op_when_platform_not_queued(self): runner = _make_runner() # No exception even when the platform isn't in _failed_platforms. runner._pause_failed_platform(Platform.TELEGRAM, reason="x") assert Platform.TELEGRAM not in runner._failed_platforms def test_resume_clears_paused_and_resets_attempts(self): runner = _make_runner() runner._failed_platforms[Platform.TELEGRAM] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 10, "next_retry": float("inf"), "paused": True, "pause_reason": "auto-paused", } assert runner._resume_paused_platform(Platform.TELEGRAM) is True info = runner._failed_platforms[Platform.TELEGRAM] assert info["paused"] is False assert info["attempts"] == 0 assert info["next_retry"] != float("inf") assert "pause_reason" not in info def test_resume_returns_false_when_not_paused(self): runner = _make_runner() runner._failed_platforms[Platform.TELEGRAM] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 1, "next_retry": time.monotonic() + 30, } assert runner._resume_paused_platform(Platform.TELEGRAM) is False def test_resume_returns_false_when_not_queued(self): runner = _make_runner() assert runner._resume_paused_platform(Platform.TELEGRAM) is False class TestPlatformSlashCommand: """Test the /platform list|pause|resume slash command handler.""" def _make_event(self, content: str): ev = MagicMock() ev.content = content return ev @pytest.mark.asyncio async def test_list_shows_connected_and_paused(self): runner = _make_runner() runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD) runner._failed_platforms[Platform.WHATSAPP] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 10, "next_retry": float("inf"), "paused": True, "pause_reason": "not paired", } out = await runner._handle_platform_command(self._make_event("/platform list")) assert "discord" in out assert "whatsapp" in out assert "PAUSED" in out assert "not paired" in out @pytest.mark.asyncio async def test_pause_command_pauses_queued_platform(self): runner = _make_runner() runner._failed_platforms[Platform.WHATSAPP] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 2, "next_retry": time.monotonic() + 30, } out = await runner._handle_platform_command( self._make_event("/platform pause whatsapp") ) assert "paused" in out.lower() assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True @pytest.mark.asyncio async def test_pause_rejects_unqueued_platform(self): runner = _make_runner() out = await runner._handle_platform_command( self._make_event("/platform pause whatsapp") ) assert "not in the retry queue" in out @pytest.mark.asyncio async def test_resume_command_resumes_paused_platform(self): runner = _make_runner() runner._failed_platforms[Platform.WHATSAPP] = { "config": PlatformConfig(enabled=True, token="t"), "attempts": 10, "next_retry": float("inf"), "paused": True, "pause_reason": "x", } out = await runner._handle_platform_command( self._make_event("/platform resume whatsapp") ) assert "resumed" in out.lower() assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False @pytest.mark.asyncio async def test_unknown_platform_name(self): runner = _make_runner() out = await runner._handle_platform_command( self._make_event("/platform pause notarealplatform") ) assert "Unknown platform" in out @pytest.mark.asyncio async def test_bare_platform_shows_usage_with_list(self): # An empty /platform call defaults to "list". runner = _make_runner() out = await runner._handle_platform_command(self._make_event("/platform")) assert "Gateway platforms" in out