mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Stop the gateway from exiting (or systemd-restart-looping) when a single
messaging adapter fails at startup or runtime. A misconfigured WhatsApp
(npm install timeout, unpaired bridge, missing creds.json) used to take
the entire gateway down, killing cron jobs and any other connected
platforms with it.
Changes:
• Startup (gateway/run.py): when connected_count==0 but the only
errors are retryable, log a degraded-state warning and keep the
gateway alive instead of returning False. Reconnect watcher then
recovers platforms as their underlying problem clears.
• Runtime (gateway/run.py _handle_adapter_fatal_error): when the last
adapter goes down with a retryable error and is queued for
reconnection, stay alive instead of exit-with-failure. Previously
this triggered systemd Restart=on-failure, which created infinite
restart loops on persistent retryable failures (proxy outage,
repeated bridge crashes).
• Reconnect watcher (gateway/run.py _platform_reconnect_watcher):
replace the 20-attempt hard drop with a circuit-breaker pause.
After _PAUSE_AFTER_FAILURES (10) consecutive retryable failures, the
platform stays in _failed_platforms with paused=True so the watcher
skips it but the operator can still see and resume it. Non-retryable
errors still drop out of the queue immediately. Resolves #17063
(gateway giving up on Telegram after 20 attempts).
• WhatsApp preflight (gateway/platforms/whatsapp.py): refuse to start
the Node bridge when creds.json is missing. Sets a non-retryable
whatsapp_not_paired fatal error so the watcher drops it cleanly
with a single 'run hermes whatsapp' log line instead of paying the
30s bridge bootstrap timeout on every gateway start.
• WhatsApp setup ordering (hermes_cli/main.py cmd_whatsapp): only set
WHATSAPP_ENABLED=true once pairing actually succeeds. Previously
the wizard wrote the env var at step 2 (before npm install and QR
pairing), so any Ctrl+C left .env claiming WhatsApp was ready when
the bridge had no creds.json. Also propagate the env var when the
user keeps an existing pairing on a re-run.
• /platform slash command (hermes_cli/commands.py + gateway/run.py):
new gateway-only command for manual circuit-breaker control.
/platform list — show connected + failed/paused platforms
/platform pause <name> — silence a known-broken platform
/platform resume <name> — re-queue a paused platform
Tests:
• New: pause/resume helpers, /platform list|pause|resume command,
WhatsApp creds.json preflight, WhatsApp setup ordering.
• Updated: stale assertions that codified the old 'exit and let
systemd restart' behavior in test_runner_fatal_adapter.py,
test_runner_startup_failures.py, and test_platform_reconnect.py
(the 20-attempt give-up test became a circuit-breaker pause test).
5488 tests pass in tests/gateway/.
716 lines
26 KiB
Python
716 lines
26 KiB
Python
"""Tests for the gateway platform reconnection watcher."""
|
|
|
|
import asyncio
|
|
import time
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.base import BasePlatformAdapter, MessageEvent, SendResult
|
|
from gateway.run import GatewayRunner
|
|
|
|
|
|
class StubAdapter(BasePlatformAdapter):
|
|
"""Adapter whose connect() result can be controlled."""
|
|
|
|
def __init__(
|
|
self,
|
|
*,
|
|
platform=Platform.TELEGRAM,
|
|
succeed=True,
|
|
fatal_error=None,
|
|
fatal_retryable=True,
|
|
):
|
|
super().__init__(PlatformConfig(enabled=True, token="test"), platform)
|
|
self._succeed = succeed
|
|
self._fatal_error = fatal_error
|
|
self._fatal_retryable = fatal_retryable
|
|
|
|
async def connect(self):
|
|
if self._fatal_error:
|
|
self._set_fatal_error("test_error", self._fatal_error, retryable=self._fatal_retryable)
|
|
return False
|
|
return self._succeed
|
|
|
|
async def disconnect(self):
|
|
return None
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
return SendResult(success=True, message_id="1")
|
|
|
|
async def send_typing(self, chat_id, metadata=None):
|
|
return None
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
|
|
def _make_runner():
|
|
"""Create a minimal GatewayRunner via object.__new__ to skip __init__."""
|
|
runner = object.__new__(GatewayRunner)
|
|
runner.config = GatewayConfig(
|
|
platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="test")}
|
|
)
|
|
runner._running = True
|
|
runner._shutdown_event = asyncio.Event()
|
|
runner._exit_reason = None
|
|
runner._exit_with_failure = False
|
|
runner._exit_cleanly = False
|
|
runner._failed_platforms = {}
|
|
runner.adapters = {}
|
|
runner.delivery_router = MagicMock()
|
|
runner._running_agents = {}
|
|
runner._pending_messages = {}
|
|
runner._pending_approvals = {}
|
|
runner._honcho_managers = {}
|
|
runner._honcho_configs = {}
|
|
runner._shutdown_all_gateway_honcho = lambda: None
|
|
runner.session_store = MagicMock()
|
|
return runner
|
|
|
|
|
|
# --- Startup queueing ---
|
|
|
|
class TestStartupPlatformIsolation:
|
|
"""Verify one blocked platform cannot prevent later platforms from starting."""
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_start_continues_after_platform_connect_timeout(self, tmp_path):
|
|
"""A timeout on Telegram should queue it and still connect Feishu."""
|
|
runner = _make_runner()
|
|
runner.config = GatewayConfig(
|
|
platforms={
|
|
Platform.TELEGRAM: PlatformConfig(enabled=True, token="test"),
|
|
Platform.FEISHU: PlatformConfig(enabled=True, token="test"),
|
|
},
|
|
sessions_dir=tmp_path,
|
|
)
|
|
runner.hooks = MagicMock()
|
|
runner.hooks.loaded_hooks = []
|
|
runner.hooks.emit = AsyncMock()
|
|
runner._suspend_stuck_loop_sessions = MagicMock(return_value=0)
|
|
runner._update_runtime_status = MagicMock()
|
|
runner._update_platform_runtime_status = MagicMock()
|
|
runner._sync_voice_mode_state_to_adapter = MagicMock()
|
|
runner._send_update_notification = AsyncMock(return_value=True)
|
|
runner._send_restart_notification = AsyncMock()
|
|
|
|
adapters = {
|
|
Platform.TELEGRAM: StubAdapter(platform=Platform.TELEGRAM),
|
|
Platform.FEISHU: StubAdapter(platform=Platform.FEISHU),
|
|
}
|
|
runner._create_adapter = MagicMock(
|
|
side_effect=lambda platform, _config: adapters[platform]
|
|
)
|
|
runner._connect_adapter_with_timeout = AsyncMock(
|
|
side_effect=[
|
|
TimeoutError("telegram connect timed out after 30s"),
|
|
True,
|
|
]
|
|
)
|
|
|
|
def fake_create_task(coro):
|
|
coro.close()
|
|
return MagicMock()
|
|
|
|
with patch("gateway.status.write_runtime_status"):
|
|
with patch("hermes_cli.plugins.discover_plugins"):
|
|
with patch("hermes_cli.config.load_config", return_value={}):
|
|
with patch("agent.shell_hooks.register_from_config"):
|
|
with patch(
|
|
"tools.process_registry.process_registry.recover_from_checkpoint",
|
|
return_value=0,
|
|
):
|
|
with patch(
|
|
"gateway.channel_directory.build_channel_directory",
|
|
new=AsyncMock(return_value={"platforms": {}}),
|
|
):
|
|
with patch("gateway.run.asyncio.create_task", side_effect=fake_create_task):
|
|
assert await runner.start() is True
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert Platform.FEISHU in runner.adapters
|
|
assert Platform.TELEGRAM not in runner.adapters
|
|
assert runner._create_adapter.call_count == 2
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_connect_adapter_timeout_raises_retryable_exception(self, monkeypatch):
|
|
"""The timeout helper turns a hanging connect into a caught startup error."""
|
|
runner = _make_runner()
|
|
adapter = StubAdapter()
|
|
|
|
async def hang():
|
|
await asyncio.sleep(60)
|
|
return True
|
|
|
|
adapter.connect = hang
|
|
monkeypatch.setenv("HERMES_GATEWAY_PLATFORM_CONNECT_TIMEOUT", "0.001")
|
|
|
|
with pytest.raises(TimeoutError, match="telegram connect timed out"):
|
|
await runner._connect_adapter_with_timeout(adapter, Platform.TELEGRAM)
|
|
|
|
|
|
class TestStartupFailureQueuing:
|
|
"""Verify that failed platforms are queued during startup."""
|
|
|
|
def test_failed_platform_queued_on_connect_failure(self):
|
|
"""When adapter.connect() returns False without fatal error, queue for retry."""
|
|
runner = _make_runner()
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() + 30,
|
|
}
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 1
|
|
|
|
def test_failed_platform_not_queued_for_nonretryable(self):
|
|
"""Non-retryable errors should not be in the retry queue."""
|
|
runner = _make_runner()
|
|
# Simulate: adapter had a non-retryable error, wasn't queued
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
|
|
# --- Reconnect watcher ---
|
|
|
|
class TestPlatformReconnectWatcher:
|
|
"""Test the _platform_reconnect_watcher background task."""
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_succeeds_on_retry(self):
|
|
"""Watcher should reconnect a failed platform when connect() succeeds."""
|
|
runner = _make_runner()
|
|
runner._sync_voice_mode_state_to_adapter = MagicMock()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1, # Already past retry time
|
|
}
|
|
|
|
succeed_adapter = StubAdapter(succeed=True)
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=succeed_adapter):
|
|
with patch("gateway.run.build_channel_directory", create=True):
|
|
# Run one iteration of the watcher then stop
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
# Patch the sleep to exit after first check
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
assert Platform.TELEGRAM in runner.adapters
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_nonretryable_removed_from_queue(self):
|
|
"""Non-retryable errors should remove the platform from the retry queue."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
fail_adapter = StubAdapter(
|
|
succeed=False, fatal_error="bad token", fatal_retryable=False
|
|
)
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
assert Platform.TELEGRAM not in runner.adapters
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_retryable_stays_in_queue(self):
|
|
"""Retryable failures should remain in the queue with incremented attempts."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
fail_adapter = StubAdapter(
|
|
succeed=False, fatal_error="DNS failure", fatal_retryable=True
|
|
)
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
|
|
"""After enough consecutive retryable failures, the watcher should
|
|
*pause* the platform (keep it in the queue but stop hammering it),
|
|
not drop it. The user resumes via /platform resume.
|
|
"""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
# 9 prior attempts — the next failure will be the 10th and should
|
|
# trip the circuit breaker.
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 9,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
fail_adapter = StubAdapter(
|
|
succeed=False, fatal_error="DNS failure", fatal_retryable=True
|
|
)
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
# Platform stays in queue — paused, not dropped
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
info = runner._failed_platforms[Platform.TELEGRAM]
|
|
assert info["paused"] is True
|
|
assert info["attempts"] == 10
|
|
assert "pause_reason" in info
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_skips_paused_platforms(self):
|
|
"""A paused platform should not be retried by the watcher tick."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 10,
|
|
"next_retry": time.monotonic() - 1, # would normally retry now
|
|
"paused": True,
|
|
"pause_reason": "paused via /platform pause",
|
|
}
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter") as mock_create:
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
# Paused platform stays queued and was never touched
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True
|
|
mock_create.assert_not_called()
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_reconnect_skips_when_not_time_yet(self):
|
|
"""Watcher should skip platforms whose next_retry is in the future."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() + 9999, # Far in the future
|
|
}
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter") as mock_create:
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
mock_create.assert_not_called()
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_no_failed_platforms_watcher_idles(self):
|
|
"""When no platforms are failed, watcher should just idle."""
|
|
runner = _make_runner()
|
|
# No failed platforms
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter") as mock_create:
|
|
async def run_briefly():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 2:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_briefly()
|
|
|
|
mock_create.assert_not_called()
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_adapter_create_returns_none(self):
|
|
"""If _create_adapter returns None, remove from queue (missing deps)."""
|
|
runner = _make_runner()
|
|
|
|
platform_config = PlatformConfig(enabled=True, token="test")
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": platform_config,
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() - 1,
|
|
}
|
|
|
|
real_sleep = asyncio.sleep
|
|
|
|
with patch.object(runner, "_create_adapter", return_value=None):
|
|
async def run_one_iteration():
|
|
runner._running = True
|
|
call_count = 0
|
|
|
|
async def fake_sleep(n):
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 1:
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
await run_one_iteration()
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
|
|
# --- Runtime disconnection queueing ---
|
|
|
|
class TestRuntimeDisconnectQueuing:
|
|
"""Test that _handle_adapter_fatal_error queues retryable disconnections."""
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_retryable_runtime_error_queued_for_reconnect(self):
|
|
"""Retryable runtime errors should add the platform to _failed_platforms."""
|
|
runner = _make_runner()
|
|
runner.stop = AsyncMock()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 0
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_nonretryable_runtime_error_not_queued(self):
|
|
"""Non-retryable runtime errors should not be queued for reconnection."""
|
|
runner = _make_runner()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("auth_error", "bad token", retryable=False)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
# Need to prevent stop() from running fully
|
|
runner.stop = AsyncMock()
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_retryable_error_keeps_gateway_alive_when_all_down(self):
|
|
"""When all adapters fail at runtime with retryable errors, the
|
|
gateway should stay alive and let the reconnect watcher recover them
|
|
in the background. (Previously this exited-with-failure to trigger
|
|
a systemd restart — that converted transient outages into infinite
|
|
restart loops and killed in-process state.)
|
|
"""
|
|
runner = _make_runner()
|
|
runner.stop = AsyncMock()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
# stop() should NOT be called — gateway stays alive for the watcher
|
|
runner.stop.assert_not_called()
|
|
assert runner._exit_with_failure is False
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_retryable_error_no_exit_when_other_adapters_still_connected(self):
|
|
"""Gateway should NOT exit if some adapters are still connected."""
|
|
runner = _make_runner()
|
|
runner.stop = AsyncMock()
|
|
|
|
failing_adapter = StubAdapter(succeed=True)
|
|
failing_adapter._set_fatal_error("network_error", "DNS failure", retryable=True)
|
|
runner.adapters[Platform.TELEGRAM] = failing_adapter
|
|
|
|
# Another adapter is still connected
|
|
healthy_adapter = StubAdapter(succeed=True)
|
|
runner.adapters[Platform.DISCORD] = healthy_adapter
|
|
|
|
await runner._handle_adapter_fatal_error(failing_adapter)
|
|
|
|
# stop() should NOT have been called — Discord is still up
|
|
runner.stop.assert_not_called()
|
|
assert Platform.TELEGRAM in runner._failed_platforms
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_nonretryable_error_triggers_shutdown(self):
|
|
"""Gateway should shut down when no adapters remain and nothing is queued."""
|
|
runner = _make_runner()
|
|
runner.stop = AsyncMock()
|
|
|
|
adapter = StubAdapter(succeed=True)
|
|
adapter._set_fatal_error("auth_error", "bad token", retryable=False)
|
|
runner.adapters[Platform.TELEGRAM] = adapter
|
|
|
|
await runner._handle_adapter_fatal_error(adapter)
|
|
|
|
runner.stop.assert_called_once()
|
|
|
|
|
|
# --- Pause / resume circuit breaker ---
|
|
|
|
|
|
class TestPauseResume:
|
|
"""Test the per-platform pause/resume helpers and slash command."""
|
|
|
|
def test_pause_marks_platform_paused(self):
|
|
runner = _make_runner()
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 3,
|
|
"next_retry": time.monotonic() + 30,
|
|
}
|
|
runner._pause_failed_platform(Platform.TELEGRAM, reason="manual")
|
|
info = runner._failed_platforms[Platform.TELEGRAM]
|
|
assert info["paused"] is True
|
|
assert info["pause_reason"] == "manual"
|
|
assert info["next_retry"] == float("inf")
|
|
|
|
def test_pause_is_idempotent(self):
|
|
runner = _make_runner()
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 3,
|
|
"next_retry": time.monotonic() + 30,
|
|
"paused": True,
|
|
"pause_reason": "first reason",
|
|
}
|
|
runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason")
|
|
# Reason should not be overwritten on a second pause call.
|
|
assert (
|
|
runner._failed_platforms[Platform.TELEGRAM]["pause_reason"]
|
|
== "first reason"
|
|
)
|
|
|
|
def test_pause_no_op_when_platform_not_queued(self):
|
|
runner = _make_runner()
|
|
# No exception even when the platform isn't in _failed_platforms.
|
|
runner._pause_failed_platform(Platform.TELEGRAM, reason="x")
|
|
assert Platform.TELEGRAM not in runner._failed_platforms
|
|
|
|
def test_resume_clears_paused_and_resets_attempts(self):
|
|
runner = _make_runner()
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 10,
|
|
"next_retry": float("inf"),
|
|
"paused": True,
|
|
"pause_reason": "auto-paused",
|
|
}
|
|
assert runner._resume_paused_platform(Platform.TELEGRAM) is True
|
|
info = runner._failed_platforms[Platform.TELEGRAM]
|
|
assert info["paused"] is False
|
|
assert info["attempts"] == 0
|
|
assert info["next_retry"] != float("inf")
|
|
assert "pause_reason" not in info
|
|
|
|
def test_resume_returns_false_when_not_paused(self):
|
|
runner = _make_runner()
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 1,
|
|
"next_retry": time.monotonic() + 30,
|
|
}
|
|
assert runner._resume_paused_platform(Platform.TELEGRAM) is False
|
|
|
|
def test_resume_returns_false_when_not_queued(self):
|
|
runner = _make_runner()
|
|
assert runner._resume_paused_platform(Platform.TELEGRAM) is False
|
|
|
|
|
|
class TestPlatformSlashCommand:
|
|
"""Test the /platform list|pause|resume slash command handler."""
|
|
|
|
def _make_event(self, content: str):
|
|
ev = MagicMock()
|
|
ev.content = content
|
|
return ev
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_list_shows_connected_and_paused(self):
|
|
runner = _make_runner()
|
|
runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD)
|
|
runner._failed_platforms[Platform.WHATSAPP] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 10,
|
|
"next_retry": float("inf"),
|
|
"paused": True,
|
|
"pause_reason": "not paired",
|
|
}
|
|
out = await runner._handle_platform_command(self._make_event("/platform list"))
|
|
assert "discord" in out
|
|
assert "whatsapp" in out
|
|
assert "PAUSED" in out
|
|
assert "not paired" in out
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_pause_command_pauses_queued_platform(self):
|
|
runner = _make_runner()
|
|
runner._failed_platforms[Platform.WHATSAPP] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 2,
|
|
"next_retry": time.monotonic() + 30,
|
|
}
|
|
out = await runner._handle_platform_command(
|
|
self._make_event("/platform pause whatsapp")
|
|
)
|
|
assert "paused" in out.lower()
|
|
assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_pause_rejects_unqueued_platform(self):
|
|
runner = _make_runner()
|
|
out = await runner._handle_platform_command(
|
|
self._make_event("/platform pause whatsapp")
|
|
)
|
|
assert "not in the retry queue" in out
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_resume_command_resumes_paused_platform(self):
|
|
runner = _make_runner()
|
|
runner._failed_platforms[Platform.WHATSAPP] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 10,
|
|
"next_retry": float("inf"),
|
|
"paused": True,
|
|
"pause_reason": "x",
|
|
}
|
|
out = await runner._handle_platform_command(
|
|
self._make_event("/platform resume whatsapp")
|
|
)
|
|
assert "resumed" in out.lower()
|
|
assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_unknown_platform_name(self):
|
|
runner = _make_runner()
|
|
out = await runner._handle_platform_command(
|
|
self._make_event("/platform pause notarealplatform")
|
|
)
|
|
assert "Unknown platform" in out
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_bare_platform_shows_usage_with_list(self):
|
|
# An empty /platform call defaults to "list".
|
|
runner = _make_runner()
|
|
out = await runner._handle_platform_command(self._make_event("/platform"))
|
|
assert "Gateway platforms" in out
|
|
|