fix(gateway): keep running when platforms fail; add per-platform circuit breaker + /platform (#26600)

Stop the gateway from exiting (or systemd-restart-looping) when a single
messaging adapter fails at startup or runtime.  A misconfigured WhatsApp
(npm install timeout, unpaired bridge, missing creds.json) used to take
the entire gateway down, killing cron jobs and any other connected
platforms with it.

Changes:

  • Startup (gateway/run.py): when connected_count==0 but the only
    errors are retryable, log a degraded-state warning and keep the
    gateway alive instead of returning False.  Reconnect watcher then
    recovers platforms as their underlying problem clears.

  • Runtime (gateway/run.py _handle_adapter_fatal_error): when the last
    adapter goes down with a retryable error and is queued for
    reconnection, stay alive instead of exit-with-failure.  Previously
    this triggered systemd Restart=on-failure, which created infinite
    restart loops on persistent retryable failures (proxy outage,
    repeated bridge crashes).

  • Reconnect watcher (gateway/run.py _platform_reconnect_watcher):
    replace the 20-attempt hard drop with a circuit-breaker pause.
    After _PAUSE_AFTER_FAILURES (10) consecutive retryable failures, the
    platform stays in _failed_platforms with paused=True so the watcher
    skips it but the operator can still see and resume it.  Non-retryable
    errors still drop out of the queue immediately.  Resolves #17063
    (gateway giving up on Telegram after 20 attempts).

  • WhatsApp preflight (gateway/platforms/whatsapp.py): refuse to start
    the Node bridge when creds.json is missing.  Sets a non-retryable
    whatsapp_not_paired fatal error so the watcher drops it cleanly
    with a single 'run hermes whatsapp' log line instead of paying the
    30s bridge bootstrap timeout on every gateway start.

  • WhatsApp setup ordering (hermes_cli/main.py cmd_whatsapp): only set
    WHATSAPP_ENABLED=true once pairing actually succeeds.  Previously
    the wizard wrote the env var at step 2 (before npm install and QR
    pairing), so any Ctrl+C left .env claiming WhatsApp was ready when
    the bridge had no creds.json.  Also propagate the env var when the
    user keeps an existing pairing on a re-run.

  • /platform slash command (hermes_cli/commands.py + gateway/run.py):
    new gateway-only command for manual circuit-breaker control.
      /platform list           — show connected + failed/paused platforms
      /platform pause <name>   — silence a known-broken platform
      /platform resume <name>  — re-queue a paused platform

Tests:

  • New: pause/resume helpers, /platform list|pause|resume command,
    WhatsApp creds.json preflight, WhatsApp setup ordering.
  • Updated: stale assertions that codified the old 'exit and let
    systemd restart' behavior in test_runner_fatal_adapter.py,
    test_runner_startup_failures.py, and test_platform_reconnect.py
    (the 20-attempt give-up test became a circuit-breaker pause test).

5488 tests pass in tests/gateway/.
This commit is contained in:
Teknium 2026-05-15 14:32:14 -07:00 committed by GitHub
parent 3b9368a0c4
commit 518f39557b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 745 additions and 62 deletions

View file

@ -294,15 +294,63 @@ class TestPlatformReconnectWatcher:
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
@pytest.mark.asyncio
async def test_reconnect_gives_up_after_max_attempts(self):
"""After max attempts, platform should be removed from retry queue."""
async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
"""After enough consecutive retryable failures, the watcher should
*pause* the platform (keep it in the queue but stop hammering it),
not drop it. The user resumes via /platform resume.
"""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
# 9 prior attempts — the next failure will be the 10th and should
# trip the circuit breaker.
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 9,
"next_retry": time.monotonic() - 1,
}
fail_adapter = StubAdapter(
succeed=False, fatal_error="DNS failure", fatal_retryable=True
)
real_sleep = asyncio.sleep
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
async def run_one_iteration():
runner._running = True
call_count = 0
async def fake_sleep(n):
nonlocal call_count
call_count += 1
if call_count > 1:
runner._running = False
await real_sleep(0)
with patch("asyncio.sleep", side_effect=fake_sleep):
await runner._platform_reconnect_watcher()
await run_one_iteration()
# Platform stays in queue — paused, not dropped
assert Platform.TELEGRAM in runner._failed_platforms
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is True
assert info["attempts"] == 10
assert "pause_reason" in info
@pytest.mark.asyncio
async def test_reconnect_skips_paused_platforms(self):
"""A paused platform should not be retried by the watcher tick."""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 20, # At max
"next_retry": time.monotonic() - 1,
"attempts": 10,
"next_retry": time.monotonic() - 1, # would normally retry now
"paused": True,
"pause_reason": "paused via /platform pause",
}
real_sleep = asyncio.sleep
@ -324,8 +372,10 @@ class TestPlatformReconnectWatcher:
await run_one_iteration()
assert Platform.TELEGRAM not in runner._failed_platforms
mock_create.assert_not_called() # Should give up without trying
# Paused platform stays queued and was never touched
assert Platform.TELEGRAM in runner._failed_platforms
assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True
mock_create.assert_not_called()
@pytest.mark.asyncio
async def test_reconnect_skips_when_not_time_yet(self):
@ -459,11 +509,12 @@ class TestRuntimeDisconnectQueuing:
assert Platform.TELEGRAM not in runner._failed_platforms
@pytest.mark.asyncio
async def test_retryable_error_exits_for_service_restart_when_all_down(self):
"""Gateway should exit with failure when all platforms fail with retryable errors.
This lets systemd Restart=on-failure restart the process, which is more
reliable than in-process background reconnection after exhausted retries.
async def test_retryable_error_keeps_gateway_alive_when_all_down(self):
"""When all adapters fail at runtime with retryable errors, the
gateway should stay alive and let the reconnect watcher recover them
in the background. (Previously this exited-with-failure to trigger
a systemd restart that converted transient outages into infinite
restart loops and killed in-process state.)
"""
runner = _make_runner()
runner.stop = AsyncMock()
@ -474,9 +525,9 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
# stop() SHOULD be called — gateway exits for systemd restart
runner.stop.assert_called_once()
assert runner._exit_with_failure is True
# stop() should NOT be called — gateway stays alive for the watcher
runner.stop.assert_not_called()
assert runner._exit_with_failure is False
assert Platform.TELEGRAM in runner._failed_platforms
@pytest.mark.asyncio
@ -512,3 +563,154 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
runner.stop.assert_called_once()
# --- Pause / resume circuit breaker ---
class TestPauseResume:
"""Test the per-platform pause/resume helpers and slash command."""
def test_pause_marks_platform_paused(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 3,
"next_retry": time.monotonic() + 30,
}
runner._pause_failed_platform(Platform.TELEGRAM, reason="manual")
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is True
assert info["pause_reason"] == "manual"
assert info["next_retry"] == float("inf")
def test_pause_is_idempotent(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 3,
"next_retry": time.monotonic() + 30,
"paused": True,
"pause_reason": "first reason",
}
runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason")
# Reason should not be overwritten on a second pause call.
assert (
runner._failed_platforms[Platform.TELEGRAM]["pause_reason"]
== "first reason"
)
def test_pause_no_op_when_platform_not_queued(self):
runner = _make_runner()
# No exception even when the platform isn't in _failed_platforms.
runner._pause_failed_platform(Platform.TELEGRAM, reason="x")
assert Platform.TELEGRAM not in runner._failed_platforms
def test_resume_clears_paused_and_resets_attempts(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 10,
"next_retry": float("inf"),
"paused": True,
"pause_reason": "auto-paused",
}
assert runner._resume_paused_platform(Platform.TELEGRAM) is True
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is False
assert info["attempts"] == 0
assert info["next_retry"] != float("inf")
assert "pause_reason" not in info
def test_resume_returns_false_when_not_paused(self):
runner = _make_runner()
runner._failed_platforms[Platform.TELEGRAM] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 1,
"next_retry": time.monotonic() + 30,
}
assert runner._resume_paused_platform(Platform.TELEGRAM) is False
def test_resume_returns_false_when_not_queued(self):
runner = _make_runner()
assert runner._resume_paused_platform(Platform.TELEGRAM) is False
class TestPlatformSlashCommand:
"""Test the /platform list|pause|resume slash command handler."""
def _make_event(self, content: str):
ev = MagicMock()
ev.content = content
return ev
@pytest.mark.asyncio
async def test_list_shows_connected_and_paused(self):
runner = _make_runner()
runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD)
runner._failed_platforms[Platform.WHATSAPP] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 10,
"next_retry": float("inf"),
"paused": True,
"pause_reason": "not paired",
}
out = await runner._handle_platform_command(self._make_event("/platform list"))
assert "discord" in out
assert "whatsapp" in out
assert "PAUSED" in out
assert "not paired" in out
@pytest.mark.asyncio
async def test_pause_command_pauses_queued_platform(self):
runner = _make_runner()
runner._failed_platforms[Platform.WHATSAPP] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 2,
"next_retry": time.monotonic() + 30,
}
out = await runner._handle_platform_command(
self._make_event("/platform pause whatsapp")
)
assert "paused" in out.lower()
assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True
@pytest.mark.asyncio
async def test_pause_rejects_unqueued_platform(self):
runner = _make_runner()
out = await runner._handle_platform_command(
self._make_event("/platform pause whatsapp")
)
assert "not in the retry queue" in out
@pytest.mark.asyncio
async def test_resume_command_resumes_paused_platform(self):
runner = _make_runner()
runner._failed_platforms[Platform.WHATSAPP] = {
"config": PlatformConfig(enabled=True, token="t"),
"attempts": 10,
"next_retry": float("inf"),
"paused": True,
"pause_reason": "x",
}
out = await runner._handle_platform_command(
self._make_event("/platform resume whatsapp")
)
assert "resumed" in out.lower()
assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False
@pytest.mark.asyncio
async def test_unknown_platform_name(self):
runner = _make_runner()
out = await runner._handle_platform_command(
self._make_event("/platform pause notarealplatform")
)
assert "Unknown platform" in out
@pytest.mark.asyncio
async def test_bare_platform_shows_usage_with_list(self):
# An empty /platform call defaults to "list".
runner = _make_runner()
out = await runner._handle_platform_command(self._make_event("/platform"))
assert "Gateway platforms" in out