fix(gateway): keep running when platforms fail; add per-platform circuit breaker + /platform (#26600)

Stop the gateway from exiting (or systemd-restart-looping) when a single
messaging adapter fails at startup or runtime.  A misconfigured WhatsApp
(npm install timeout, unpaired bridge, missing creds.json) used to take
the entire gateway down, killing cron jobs and any other connected
platforms with it.

Changes:

  • Startup (gateway/run.py): when connected_count==0 but the only
    errors are retryable, log a degraded-state warning and keep the
    gateway alive instead of returning False.  Reconnect watcher then
    recovers platforms as their underlying problem clears.

  • Runtime (gateway/run.py _handle_adapter_fatal_error): when the last
    adapter goes down with a retryable error and is queued for
    reconnection, stay alive instead of exit-with-failure.  Previously
    this triggered systemd Restart=on-failure, which created infinite
    restart loops on persistent retryable failures (proxy outage,
    repeated bridge crashes).

  • Reconnect watcher (gateway/run.py _platform_reconnect_watcher):
    replace the 20-attempt hard drop with a circuit-breaker pause.
    After _PAUSE_AFTER_FAILURES (10) consecutive retryable failures, the
    platform stays in _failed_platforms with paused=True so the watcher
    skips it but the operator can still see and resume it.  Non-retryable
    errors still drop out of the queue immediately.  Resolves #17063
    (gateway giving up on Telegram after 20 attempts).

  • WhatsApp preflight (gateway/platforms/whatsapp.py): refuse to start
    the Node bridge when creds.json is missing.  Sets a non-retryable
    whatsapp_not_paired fatal error so the watcher drops it cleanly
    with a single 'run hermes whatsapp' log line instead of paying the
    30s bridge bootstrap timeout on every gateway start.

  • WhatsApp setup ordering (hermes_cli/main.py cmd_whatsapp): only set
    WHATSAPP_ENABLED=true once pairing actually succeeds.  Previously
    the wizard wrote the env var at step 2 (before npm install and QR
    pairing), so any Ctrl+C left .env claiming WhatsApp was ready when
    the bridge had no creds.json.  Also propagate the env var when the
    user keeps an existing pairing on a re-run.

  • /platform slash command (hermes_cli/commands.py + gateway/run.py):
    new gateway-only command for manual circuit-breaker control.
      /platform list           — show connected + failed/paused platforms
      /platform pause <name>   — silence a known-broken platform
      /platform resume <name>  — re-queue a paused platform

Tests:

  • New: pause/resume helpers, /platform list|pause|resume command,
    WhatsApp creds.json preflight, WhatsApp setup ordering.
  • Updated: stale assertions that codified the old 'exit and let
    systemd restart' behavior in test_runner_fatal_adapter.py,
    test_runner_startup_failures.py, and test_platform_reconnect.py
    (the 20-attempt give-up test became a circuit-breaker pause test).

5488 tests pass in tests/gateway/.
This commit is contained in:
Teknium 2026-05-15 14:32:14 -07:00 committed by GitHub
parent 3b9368a0c4
commit 518f39557b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 745 additions and 62 deletions

View file

@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""
if not check_whatsapp_requirements():
logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name)
self._set_fatal_error(
"whatsapp_node_missing",
"Node.js is not installed — install Node.js and re-run `hermes gateway`.",
retryable=False,
)
return False
bridge_path = Path(self._bridge_script)
if not bridge_path.exists():
logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path)
self._set_fatal_error(
"whatsapp_bridge_missing",
f"WhatsApp bridge script missing at {bridge_path}.",
retryable=False,
)
return False
# Pre-flight: skip the 30s bridge bootstrap entirely if the user
# never finished pairing. Without creds.json the bridge prints
# QR codes to its log file and never reaches status:connected,
# so every gateway restart paid the 30s timeout + queued WhatsApp
# for indefinite retries. Mark non-retryable so the user gets a
# clear "run hermes whatsapp" message instead of the watcher
# silently hammering an unconfigured platform.
creds_path = self._session_path / "creds.json"
if not creds_path.exists():
logger.warning(
"[%s] WhatsApp is enabled but not paired (no creds.json at %s). "
"Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from "
"your .env to disable.",
self.name, creds_path,
)
self._set_fatal_error(
"whatsapp_not_paired",
"WhatsApp enabled but not paired — run `hermes whatsapp` to pair.",
retryable=False,
)
return False
logger.info("[%s] Bridge found at %s", self.name, bridge_path)
# Acquire scoped lock to prevent duplicate sessions

View file

@ -1990,21 +1990,21 @@ class GatewayRunner:
await self.stop()
elif not self.adapters and self._failed_platforms:
# All platforms are down and queued for background reconnection.
# If the error is retryable, exit with failure so systemd Restart=on-failure
# can restart the process. Otherwise stay alive and keep retrying in background.
if adapter.fatal_error_retryable:
self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
self._exit_with_failure = True
logger.error(
"All messaging platforms failed with retryable errors. "
"Shutting down gateway for service restart (systemd will retry)."
)
await self.stop()
else:
logger.warning(
"No connected messaging platforms remain, but %d platform(s) queued for reconnection",
len(self._failed_platforms),
)
# Keep the gateway alive so:
# • cron jobs still run
# • the reconnect watcher can recover platforms when the
# underlying problem clears (proxy comes back, user runs
# `hermes whatsapp`, etc.)
# We used to exit-with-failure here to trigger systemd restart,
# but that converted a transient outage into a restart loop and
# killed in-process state every time. The reconnect watcher
# already handles long-running recovery — let it do its job.
logger.warning(
"No connected messaging platforms remain, but %d platform(s) "
"queued for reconnection — gateway staying alive, watcher will "
"retry in background.",
len(self._failed_platforms),
)
def _request_clean_exit(self, reason: str) -> None:
self._exit_cleanly = True
@ -2180,6 +2180,73 @@ class GatewayRunner:
except Exception:
pass
# ------------------------------------------------------------------
# Per-platform circuit breaker (pause/resume) — used by the reconnect
# watcher when a retryable failure recurs past a threshold, and by the
# /platform pause|resume slash command for manual control.
# ------------------------------------------------------------------
def _pause_failed_platform(self, platform, *, reason: str = "") -> None:
    """Mark a queued platform as paused without removing it from the queue.

    The entry stays in ``_failed_platforms`` so it can still be listed and
    resumed, but the reconnect watcher skips it. Called by the circuit
    breaker after ``_PAUSE_AFTER_FAILURES`` consecutive retryable failures
    and by ``/platform pause <name>`` for manual intervention.

    Args:
        platform: Key into ``_failed_platforms``; its ``.value`` is used
            for status updates and log lines.
        reason: Human-readable pause reason. Falls back to a generic
            "auto-paused" message when empty.

    This is a no-op when the platform is not queued or is already paused,
    so an existing ``pause_reason`` is never overwritten.
    """
    # ``or {}`` also covers a runner whose queue attribute is None.
    info = (getattr(self, "_failed_platforms", None) or {}).get(platform)
    if info is None or info.get("paused"):
        return

    info["paused"] = True
    info["pause_reason"] = reason or "auto-paused after repeated failures"
    # Push next_retry far enough out that even if "paused" is missed
    # by a stale code path, the watcher won't fire on it.
    info["next_retry"] = float("inf")

    try:
        self._update_platform_runtime_status(
            platform.value,
            platform_state="paused",
            error_code=None,
            error_message=info["pause_reason"],
        )
    except Exception:
        # Status reporting is best-effort: never let it block the pause,
        # but leave a trace instead of swallowing the failure silently.
        logger.debug(
            "Could not record paused runtime status for %s",
            platform.value,
            exc_info=True,
        )

    logger.warning(
        "%s paused after %d consecutive failures (%s) — "
        "fix the underlying issue then run `/platform resume %s` "
        "to retry, or `hermes gateway restart` to restart the gateway.",
        platform.value, info.get("attempts", 0),
        info["pause_reason"], platform.value,
    )
def _resume_paused_platform(self, platform) -> bool:
    """Unpause a platform and schedule an immediate reconnect attempt.

    Clears the pause flag and reason, resets the attempt counter to zero
    and sets ``next_retry`` to the current monotonic time so the entry is
    due on the next watcher tick.

    Args:
        platform: Key into ``_failed_platforms``; its ``.value`` is used
            for status updates and log lines.

    Returns:
        True if the platform was paused and is now queued for retry;
        False if it was not paused or is not in the queue at all.
    """
    # ``or {}`` also covers a runner whose queue attribute is None.
    info = (getattr(self, "_failed_platforms", None) or {}).get(platform)
    if info is None or not info.get("paused"):
        return False

    info["paused"] = False
    info.pop("pause_reason", None)
    info["attempts"] = 0
    info["next_retry"] = time.monotonic()  # retry on next watcher tick

    try:
        self._update_platform_runtime_status(
            platform.value,
            platform_state="retrying",
            error_code=None,
            error_message=None,
        )
    except Exception:
        # Status reporting is best-effort: never let it block the resume,
        # but leave a trace instead of swallowing the failure silently.
        logger.debug(
            "Could not record retrying runtime status for %s",
            platform.value,
            exc_info=True,
        )

    logger.info("%s resumed — retrying on next watcher tick", platform.value)
    return True
@staticmethod
def _load_prefill_messages() -> List[Dict[str, Any]]:
"""Load ephemeral prefill messages from config or env var.
@ -3613,16 +3680,32 @@ class GatewayRunner:
return True
if enabled_platform_count > 0:
if startup_retryable_errors:
# At least one platform attempted a connection and failed —
# this is a real startup error that should block the gateway.
# All enabled platforms hit retryable failures (network
# blip, bridge not paired, npm install timeout, etc.).
# Keep the gateway alive so:
# • cron jobs still run
# • the reconnect watcher gets a chance to recover the
# failing platforms once the underlying problem is
# fixed (e.g. user runs `hermes whatsapp`, fixes
# proxy, etc.)
# Exiting here used to convert a single misconfigured
# platform into an infinite systemd restart loop.
reason = "; ".join(startup_retryable_errors)
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
logger.warning(
"Gateway started with no connected platforms — "
"%d platform(s) queued for retry: %s",
len(self._failed_platforms), reason,
)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
write_runtime_status(
gateway_state="degraded",
exit_reason=None,
)
except Exception:
pass
return False
# Fall through to the normal "running" state — reconnect
# watcher takes it from here.
# All enabled platforms had no adapter (missing library or credentials).
# In fleet deployments the same config.yaml is shared across nodes that
# may only have credentials for a subset of platforms. Rather than
@ -4737,11 +4820,15 @@ class GatewayRunner:
"""Background task that periodically retries connecting failed platforms.
Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
Stops retrying a platform after 20 failed attempts or if the error
is non-retryable (e.g. bad auth token).
Retryable failures keep retrying at the backoff cap indefinitely —
but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
without ever succeeding, it is *paused*: kept in the retry queue
but no longer hammered. The user surfaces it with ``/platform list``
and resumes it with ``/platform resume <name>``. Non-retryable
failures (bad auth, etc.) still drop out of the queue immediately.
"""
_MAX_ATTEMPTS = 20
_BACKOFF_CAP = 300 # 5 minutes max between retries
_PAUSE_AFTER_FAILURES = 10 # circuit-breaker threshold
await asyncio.sleep(10) # initial delay — let startup finish
while self._running:
@ -4758,22 +4845,18 @@ class GatewayRunner:
if not self._running:
return
info = self._failed_platforms[platform]
# Skip paused platforms entirely — they need explicit
# /platform resume to come back.
if info.get("paused"):
continue
if now < info["next_retry"]:
continue # not time yet
if info["attempts"] >= _MAX_ATTEMPTS:
logger.warning(
"Giving up reconnecting %s after %d attempts",
platform.value, info["attempts"],
)
del self._failed_platforms[platform]
continue
platform_config = info["config"]
attempt = info["attempts"] + 1
logger.info(
"Reconnecting %s (attempt %d/%d)...",
platform.value, attempt, _MAX_ATTEMPTS,
"Reconnecting %s (attempt %d)...",
platform.value, attempt,
)
try:
@ -4838,6 +4921,14 @@ class GatewayRunner:
"Reconnect %s failed, next retry in %ds",
platform.value, backoff,
)
if attempt >= _PAUSE_AFTER_FAILURES:
self._pause_failed_platform(
platform,
reason=(
adapter.fatal_error_message
or "failed to reconnect"
),
)
except Exception as e:
self._update_platform_runtime_status(
platform.value,
@ -4852,6 +4943,8 @@ class GatewayRunner:
"Reconnect %s error: %s, next retry in %ds",
platform.value, e, backoff,
)
if attempt >= _PAUSE_AFTER_FAILURES:
self._pause_failed_platform(platform, reason=str(e))
# Check every 10 seconds for platforms that need reconnection
for _ in range(10):
@ -6451,6 +6544,9 @@ class GatewayRunner:
if canonical == "agents":
return await self._handle_agents_command(event)
if canonical == "platform":
return await self._handle_platform_command(event)
if canonical == "restart":
return await self._handle_restart_command(event)
@ -8698,6 +8794,99 @@ class GatewayRunner:
else:
return t("gateway.stop.no_active")
async def _handle_platform_command(self, event: MessageEvent) -> str:
    """Handle ``/platform list|pause|resume [name]``.

    Surfaces and manually controls failed/paused gateway adapters.

    Examples:
        ``/platform list``             show connected + failed/paused platforms
        ``/platform pause whatsapp``   stop the reconnect watcher retrying whatsapp
        ``/platform resume whatsapp``  re-queue a paused platform for retry

    Args:
        event: Incoming message; only its ``content`` attribute is read.

    Returns:
        The reply text. A missing action defaults to ``list``; an
        unrecognised action returns the usage summary.
    """
    text = (getattr(event, "content", "") or "").strip()
    # Split on every whitespace run (no maxsplit) so trailing words are
    # never glued onto the platform name — "resume whatsapp now" must
    # resolve "whatsapp", not "whatsapp now".
    parts = text.split()
    # Strip the leading "/platform" (or "/PLATFORM") token if present.
    if parts and parts[0].lower().lstrip("/").startswith("platform"):
        parts = parts[1:]
    action = parts[0].lower() if parts else "list"
    target = parts[1].lower() if len(parts) > 1 else ""
    failed = getattr(self, "_failed_platforms", None) or {}

    def _resolve_platform(name: str):
        """Return the Platform whose value matches *name* (already lowercased)."""
        if not name:
            return None
        return next(
            (p for p in Platform.__members__.values() if p.value.lower() == name),
            None,
        )

    if action == "list":
        lines = ["**Gateway platforms**"]
        connected = sorted(p.value for p in self.adapters)
        lines.append("Connected: " + (", ".join(connected) if connected else "(none)"))
        if not failed:
            lines.append("Failed/paused: (none)")
        else:
            # Give the queue its own header so entries are not read as
            # sub-items of the "Connected:" line above.
            lines.append("Failed/paused:")
            for p, info in failed.items():
                if info.get("paused"):
                    reason = info.get("pause_reason") or "paused"
                    lines.append(
                        f" · {p.value} — PAUSED ({reason}). "
                        f"Resume with `/platform resume {p.value}`."
                    )
                else:
                    attempts = info.get("attempts", 0)
                    lines.append(
                        f" · {p.value} — retrying (attempt {attempts})"
                    )
        return "\n".join(lines)

    if action not in ("pause", "resume"):
        return (
            "Usage: /platform <list|pause|resume> [name]\n"
            " /platform list — show platform status\n"
            " /platform pause <name> — stop retrying a failing platform\n"
            " /platform resume <name> — re-queue a paused platform"
        )

    if not target:
        return f"Usage: /platform {action} <name>"
    platform = _resolve_platform(target)
    if platform is None:
        return f"Unknown platform: {target}"

    if action == "pause":
        if platform not in failed:
            return (
                f"{platform.value} is not in the retry queue "
                f"(it's either connected or not enabled)."
            )
        if failed[platform].get("paused"):
            return f"{platform.value} is already paused."
        self._pause_failed_platform(platform, reason="paused via /platform pause")
        return (
            f"{platform.value} paused. "
            f"Resume with `/platform resume {platform.value}` or "
            f"`hermes gateway restart` to reset."
        )

    # action == "resume"
    if platform not in failed:
        return (
            f"{platform.value} is not in the retry queue — "
            f"nothing to resume."
        )
    if not failed[platform].get("paused"):
        return (
            f"{platform.value} is already retrying — "
            f"no resume needed."
        )
    self._resume_paused_platform(platform)
    return f"{platform.value} resumed — retrying on next watcher tick."
async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]:
"""Handle /restart command - drain active work, then restart the gateway."""
# Defensive idempotency check: if the previous gateway process

View file

@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
args_hint="[days]"),
CommandDef("platforms", "Show gateway/messaging platform status", "Info",
cli_only=True, aliases=("gateway",)),
CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info",
gateway_only=True, args_hint="<pause|resume|list> [name]"),
CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
cli_only=True, args_hint="[number]"),
CommandDef("paste", "Attach clipboard image from your clipboard", "Info",

View file

@ -1522,14 +1522,18 @@ def cmd_whatsapp(args):
)
print(f"\n✓ Mode: {mode_label}")
# ── Step 2: Enable WhatsApp ──────────────────────────────────────────
# ── Step 2: Mode is selected, will enable WhatsApp only after pairing ──
# We intentionally don't write WHATSAPP_ENABLED=true here. If the user
# aborts the wizard later (Ctrl+C, failed npm install, missed QR scan),
# we'd otherwise leave .env claiming WhatsApp is ready when the bridge
# has no creds.json. Every subsequent `hermes gateway` then paid a 30s
# bridge-bootstrap timeout and queued WhatsApp for indefinite retries.
# Now: aborted setup leaves WHATSAPP_ENABLED unset → gateway skips it.
# Re-runs that already have WHATSAPP_ENABLED=true (from a prior
# successful pairing) stay enabled — we just don't write it pre-emptively.
print()
current = get_env_value("WHATSAPP_ENABLED")
if current and current.lower() == "true":
if (get_env_value("WHATSAPP_ENABLED") or "").lower() == "true":
print("✓ WhatsApp is already enabled")
else:
save_env_value("WHATSAPP_ENABLED", "true")
print("✓ WhatsApp enabled")
# ── Step 3: Allowed users ────────────────────────────────────────────
current_users = get_env_value("WHATSAPP_ALLOWED_USERS") or ""
@ -1619,6 +1623,12 @@ def cmd_whatsapp(args):
session_dir.mkdir(parents=True, exist_ok=True)
print(" ✓ Session cleared")
else:
# Existing pairing — ensure WHATSAPP_ENABLED reflects that.
# (Older installs may have lost the env var; covers re-runs
# where the user picked "no, keep my session" but the var
# was never set or got removed.)
if (get_env_value("WHATSAPP_ENABLED") or "").lower() != "true":
save_env_value("WHATSAPP_ENABLED", "true")
print("\n✓ WhatsApp is configured and paired!")
print(" Start the gateway with: hermes gateway")
return
@ -1647,6 +1657,11 @@ def cmd_whatsapp(args):
# ── Step 7: Post-pairing ─────────────────────────────────────────────
print()
if (session_dir / "creds.json").exists():
# Only enable WhatsApp now that pairing actually succeeded. If the
# user Ctrl+C'd at any earlier step, WHATSAPP_ENABLED stays unset
# and `hermes gateway` skips it cleanly instead of paying a 30s
# bridge timeout + queueing the platform for indefinite retries.
save_env_value("WHATSAPP_ENABLED", "true")
print("✓ WhatsApp paired successfully!")
print()
if wa_mode == "bot":

View file

@ -294,15 +294,63 @@ class TestPlatformReconnectWatcher:
assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
@pytest.mark.asyncio
async def test_reconnect_gives_up_after_max_attempts(self):
"""After max attempts, platform should be removed from retry queue."""
async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
"""After enough consecutive retryable failures, the watcher should
*pause* the platform (keep it in the queue but stop hammering it),
not drop it. The user resumes via /platform resume.
"""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
# 9 prior attempts — the next failure will be the 10th and should
# trip the circuit breaker.
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 9,
"next_retry": time.monotonic() - 1,
}
fail_adapter = StubAdapter(
succeed=False, fatal_error="DNS failure", fatal_retryable=True
)
real_sleep = asyncio.sleep
with patch.object(runner, "_create_adapter", return_value=fail_adapter):
async def run_one_iteration():
runner._running = True
call_count = 0
async def fake_sleep(n):
nonlocal call_count
call_count += 1
if call_count > 1:
runner._running = False
await real_sleep(0)
with patch("asyncio.sleep", side_effect=fake_sleep):
await runner._platform_reconnect_watcher()
await run_one_iteration()
# Platform stays in queue — paused, not dropped
assert Platform.TELEGRAM in runner._failed_platforms
info = runner._failed_platforms[Platform.TELEGRAM]
assert info["paused"] is True
assert info["attempts"] == 10
assert "pause_reason" in info
@pytest.mark.asyncio
async def test_reconnect_skips_paused_platforms(self):
"""A paused platform should not be retried by the watcher tick."""
runner = _make_runner()
platform_config = PlatformConfig(enabled=True, token="test")
runner._failed_platforms[Platform.TELEGRAM] = {
"config": platform_config,
"attempts": 20, # At max
"next_retry": time.monotonic() - 1,
"attempts": 10,
"next_retry": time.monotonic() - 1, # would normally retry now
"paused": True,
"pause_reason": "paused via /platform pause",
}
real_sleep = asyncio.sleep
@ -324,8 +372,10 @@ class TestPlatformReconnectWatcher:
await run_one_iteration()
assert Platform.TELEGRAM not in runner._failed_platforms
mock_create.assert_not_called() # Should give up without trying
# Paused platform stays queued and was never touched
assert Platform.TELEGRAM in runner._failed_platforms
assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True
mock_create.assert_not_called()
@pytest.mark.asyncio
async def test_reconnect_skips_when_not_time_yet(self):
@ -459,11 +509,12 @@ class TestRuntimeDisconnectQueuing:
assert Platform.TELEGRAM not in runner._failed_platforms
@pytest.mark.asyncio
async def test_retryable_error_exits_for_service_restart_when_all_down(self):
"""Gateway should exit with failure when all platforms fail with retryable errors.
This lets systemd Restart=on-failure restart the process, which is more
reliable than in-process background reconnection after exhausted retries.
async def test_retryable_error_keeps_gateway_alive_when_all_down(self):
"""When all adapters fail at runtime with retryable errors, the
gateway should stay alive and let the reconnect watcher recover them
in the background. (Previously this exited-with-failure to trigger
a systemd restart that converted transient outages into infinite
restart loops and killed in-process state.)
"""
runner = _make_runner()
runner.stop = AsyncMock()
@ -474,9 +525,9 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
# stop() SHOULD be called — gateway exits for systemd restart
runner.stop.assert_called_once()
assert runner._exit_with_failure is True
# stop() should NOT be called — gateway stays alive for the watcher
runner.stop.assert_not_called()
assert runner._exit_with_failure is False
assert Platform.TELEGRAM in runner._failed_platforms
@pytest.mark.asyncio
@ -512,3 +563,154 @@ class TestRuntimeDisconnectQueuing:
await runner._handle_adapter_fatal_error(adapter)
runner.stop.assert_called_once()
# --- Pause / resume circuit breaker ---
class TestPauseResume:
    """Unit tests for the runner's per-platform pause and resume helpers."""

    @staticmethod
    def _queue_telegram(runner, **fields):
        """Insert a Telegram retry-queue entry built from *fields*."""
        runner._failed_platforms[Platform.TELEGRAM] = {
            "config": PlatformConfig(enabled=True, token="t"),
            **fields,
        }

    def test_pause_marks_platform_paused(self):
        runner = _make_runner()
        self._queue_telegram(
            runner, attempts=3, next_retry=time.monotonic() + 30
        )

        runner._pause_failed_platform(Platform.TELEGRAM, reason="manual")

        state = runner._failed_platforms[Platform.TELEGRAM]
        assert state["paused"] is True
        assert state["pause_reason"] == "manual"
        assert state["next_retry"] == float("inf")

    def test_pause_is_idempotent(self):
        runner = _make_runner()
        self._queue_telegram(
            runner,
            attempts=3,
            next_retry=time.monotonic() + 30,
            paused=True,
            pause_reason="first reason",
        )

        runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason")

        # A second pause must leave the original reason untouched.
        state = runner._failed_platforms[Platform.TELEGRAM]
        assert state["pause_reason"] == "first reason"

    def test_pause_no_op_when_platform_not_queued(self):
        runner = _make_runner()

        # Pausing something that was never queued must not raise.
        runner._pause_failed_platform(Platform.TELEGRAM, reason="x")

        assert Platform.TELEGRAM not in runner._failed_platforms

    def test_resume_clears_paused_and_resets_attempts(self):
        runner = _make_runner()
        self._queue_telegram(
            runner,
            attempts=10,
            next_retry=float("inf"),
            paused=True,
            pause_reason="auto-paused",
        )

        resumed = runner._resume_paused_platform(Platform.TELEGRAM)

        assert resumed is True
        state = runner._failed_platforms[Platform.TELEGRAM]
        assert state["paused"] is False
        assert state["attempts"] == 0
        assert state["next_retry"] != float("inf")
        assert "pause_reason" not in state

    def test_resume_returns_false_when_not_paused(self):
        runner = _make_runner()
        self._queue_telegram(
            runner, attempts=1, next_retry=time.monotonic() + 30
        )

        assert runner._resume_paused_platform(Platform.TELEGRAM) is False

    def test_resume_returns_false_when_not_queued(self):
        runner = _make_runner()

        assert runner._resume_paused_platform(Platform.TELEGRAM) is False
class TestPlatformSlashCommand:
    """Tests for the ``/platform list|pause|resume`` command handler."""

    def _make_event(self, content: str):
        """Build a stand-in message event carrying *content*."""
        return MagicMock(content=content)

    async def _send(self, runner, text: str) -> str:
        """Run the /platform handler on *text* and return its reply."""
        return await runner._handle_platform_command(self._make_event(text))

    @staticmethod
    def _queue_whatsapp(runner, **fields):
        """Insert a WhatsApp retry-queue entry built from *fields*."""
        runner._failed_platforms[Platform.WHATSAPP] = {
            "config": PlatformConfig(enabled=True, token="t"),
            **fields,
        }

    @pytest.mark.asyncio
    async def test_list_shows_connected_and_paused(self):
        runner = _make_runner()
        runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD)
        self._queue_whatsapp(
            runner,
            attempts=10,
            next_retry=float("inf"),
            paused=True,
            pause_reason="not paired",
        )

        reply = await self._send(runner, "/platform list")

        for fragment in ("discord", "whatsapp", "PAUSED", "not paired"):
            assert fragment in reply

    @pytest.mark.asyncio
    async def test_pause_command_pauses_queued_platform(self):
        runner = _make_runner()
        self._queue_whatsapp(
            runner, attempts=2, next_retry=time.monotonic() + 30
        )

        reply = await self._send(runner, "/platform pause whatsapp")

        assert "paused" in reply.lower()
        assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True

    @pytest.mark.asyncio
    async def test_pause_rejects_unqueued_platform(self):
        runner = _make_runner()

        reply = await self._send(runner, "/platform pause whatsapp")

        assert "not in the retry queue" in reply

    @pytest.mark.asyncio
    async def test_resume_command_resumes_paused_platform(self):
        runner = _make_runner()
        self._queue_whatsapp(
            runner,
            attempts=10,
            next_retry=float("inf"),
            paused=True,
            pause_reason="x",
        )

        reply = await self._send(runner, "/platform resume whatsapp")

        assert "resumed" in reply.lower()
        assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False

    @pytest.mark.asyncio
    async def test_unknown_platform_name(self):
        runner = _make_runner()

        reply = await self._send(runner, "/platform pause notarealplatform")

        assert "Unknown platform" in reply

    @pytest.mark.asyncio
    async def test_bare_platform_shows_usage_with_list(self):
        # With no action given, /platform behaves like "/platform list".
        runner = _make_runner()

        reply = await self._send(runner, "/platform")

        assert "Gateway platforms" in reply

View file

@ -68,7 +68,11 @@ async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monk
@pytest.mark.asyncio
async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path):
"""Retryable runtime fatal errors queue the platform for reconnection
instead of shutting down the gateway."""
AND keep the gateway alive — the background reconnect watcher recovers
the platform when the underlying issue clears. (Previously this
exited-with-failure to trigger a systemd restart; that converted
transient failures into infinite restart loops.)
"""
config = GatewayConfig(
platforms={
Platform.WHATSAPP: PlatformConfig(enabled=True, token="token")
@ -89,8 +93,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc
await runner._handle_adapter_fatal_error(adapter)
# Should shut down with failure — systemd Restart=on-failure will restart
runner.stop.assert_awaited_once()
assert runner._exit_with_failure is True
# Gateway stays alive — watcher will retry in background
runner.stop.assert_not_awaited()
assert runner._exit_with_failure is False
assert Platform.WHATSAPP in runner._failed_platforms
assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0

View file

@ -64,7 +64,14 @@ class _SuccessfulAdapter(BasePlatformAdapter):
@pytest.mark.asyncio
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
async def test_runner_stays_alive_for_retryable_startup_errors(monkeypatch, tmp_path):
"""Retryable startup errors should leave the gateway running in
degraded mode so the reconnect watcher can recover the platform when
the underlying problem clears. Previously this returned False from
``start()`` and exited the process, which converted a single broken
platform (e.g. unpaired WhatsApp, DNS blip on Telegram) into a
systemd restart loop and killed cron jobs in the meantime.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
@ -78,11 +85,13 @@ async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch,
ok = await runner.start()
assert ok is False
# Gateway stays alive in degraded mode; reconnect watcher takes over.
assert ok is True
assert runner.should_exit_cleanly is False
state = read_runtime_status()
assert state["gateway_state"] == "startup_failed"
assert "temporary DNS resolution failure" in state["exit_reason"]
assert state["gateway_state"] in {"degraded", "running"}
# Telegram was queued for retry, not given up on.
assert Platform.TELEGRAM in runner._failed_platforms
assert state["platforms"]["telegram"]["state"] == "retrying"
assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"

View file

@ -611,3 +611,93 @@ class TestHttpSessionLifecycle:
mock_task.cancel.assert_not_called()
assert adapter._poll_task is None
# ---------------------------------------------------------------------------
# Pre-flight: refuse to start the bridge when creds.json is missing
# ---------------------------------------------------------------------------
class TestNoCredsPreflight:
    """Check that ``connect()`` fast-fails when WhatsApp was never paired.

    The scenario is WhatsApp enabled with no ``creds.json`` in the session
    directory. Without the guard, every gateway boot:

      - spawned the bridge subprocess (npm install if needed)
      - waited 30s for status:connected (never happens without creds)
      - queued WhatsApp for indefinite retries that would just repeat

    With the guard, ``connect()`` returns False immediately and records a
    non-retryable fatal error, so the reconnect watcher drops the platform
    and the user gets one clear log line pointing at ``hermes whatsapp``.
    """

    @staticmethod
    def _bare_adapter(tmp_path, port, *, paired):
        """Build a WhatsAppAdapter without running ``__init__``.

        The bridge script is a real stub file so the earlier bridge-missing
        check passes; ``creds.json`` is written only when *paired* is true.
        """
        from gateway.platforms.whatsapp import WhatsAppAdapter

        bridge_script = tmp_path / "bridge.js"
        bridge_script.write_text("// stub")
        session_path = tmp_path / "session"
        session_path.mkdir()
        if paired:
            (session_path / "creds.json").write_text("{}")

        adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
        adapter.platform = Platform.WHATSAPP
        adapter.config = MagicMock()
        adapter._bridge_port = port
        adapter._bridge_script = str(bridge_script)
        adapter._session_path = session_path
        adapter._bridge_log_fh = None
        adapter._fatal_error_code = None
        adapter._fatal_error_message = None
        adapter._fatal_error_retryable = True
        return adapter

    @staticmethod
    async def _connect(adapter):
        """Call ``connect()`` with the Node.js requirement check forced to pass."""
        with patch(
            "gateway.platforms.whatsapp.check_whatsapp_requirements",
            return_value=True,
        ):
            return await adapter.connect()

    @pytest.mark.asyncio
    async def test_connect_returns_false_when_no_creds(self, tmp_path):
        adapter = self._bare_adapter(tmp_path, 19876, paired=False)

        outcome = await self._connect(adapter)

        assert outcome is False
        # Non-retryable, so the reconnect watcher drops it cleanly.
        assert adapter._fatal_error_code == "whatsapp_not_paired"
        assert adapter._fatal_error_retryable is False

    @pytest.mark.asyncio
    async def test_connect_proceeds_when_creds_present(self, tmp_path):
        """With creds.json present the preflight must not fast-fail.

        The bridge itself is not simulated — the test only verifies that
        ``connect()`` gets past the pairing check.
        """
        adapter = self._bare_adapter(tmp_path, 19877, paired=True)
        # Make lock acquisition fail so connect() bails out *after* the
        # preflight, without spawning any subprocess.
        adapter._acquire_platform_lock = MagicMock(return_value=False)

        outcome = await self._connect(adapter)

        # connect() still returns False (the faked lock refusal), but the
        # recorded fatal-error code is not the "not paired" one.
        assert outcome is False
        assert adapter._fatal_error_code != "whatsapp_not_paired"

View file

@ -0,0 +1,140 @@
"""Regression tests for ``cmd_whatsapp`` env-var write ordering.
Before the fix, ``hermes whatsapp`` wrote ``WHATSAPP_ENABLED=true`` at
step 2, before npm install (step 4) and before QR pairing (step 6).
If the user Ctrl+C'd at any later step, ``.env`` claimed WhatsApp was
ready when the bridge still had no ``creds.json``. Every subsequent
``hermes gateway`` then paid a 30s bridge-bootstrap timeout and queued
WhatsApp for indefinite retries, which looked like "the gateway is broken."
The fix: only set ``WHATSAPP_ENABLED=true`` once pairing actually
succeeds (creds.json exists). Aborted setup leaves no enabled state.
"""
from __future__ import annotations
import io
import os
from contextlib import redirect_stdout
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture
def isolated_home(tmp_path, monkeypatch):
    """Provide a throwaway ``.hermes`` directory as the Hermes home.

    ``Path.home`` is redirected to ``<tmp_path>/home``, ``HERMES_HOME`` is
    pointed at ``<tmp_path>/home/.hermes``, and every ``WHATSAPP_*``
    variable is removed from the environment for the duration of the test.

    Returns:
        The freshly created ``.hermes`` directory.
    """
    fake_home = tmp_path / "home"
    hermes_dir = fake_home / ".hermes"
    hermes_dir.mkdir(parents=True)

    monkeypatch.setattr(Path, "home", lambda: fake_home)
    monkeypatch.setenv("HERMES_HOME", str(hermes_dir))

    # Clear inherited WhatsApp settings so env lookups don't see stale
    # state. Names are snapshotted first because delenv mutates os.environ.
    inherited = [name for name in os.environ if name.startswith("WHATSAPP_")]
    for name in inherited:
        monkeypatch.delenv(name, raising=False)

    return hermes_dir
def _env_value(hermes_home: Path, key: str) -> str | None:
env_file = hermes_home / ".env"
if not env_file.exists():
return None
for line in env_file.read_text().splitlines():
if "=" not in line:
continue
k, _, v = line.partition("=")
if k.strip() == key:
return v.strip().strip('"').strip("'")
return None
def test_aborted_setup_does_not_enable_whatsapp(isolated_home, monkeypatch):
    """Aborting setup before pairing must leave WhatsApp disabled.

    The first prompt is answered with mode ``1``; every later prompt
    raises ``KeyboardInterrupt`` to simulate the user pressing Ctrl+C.
    ``WHATSAPP_ENABLED`` must NOT be present in ``.env`` afterwards.
    """
    from hermes_cli.main import cmd_whatsapp

    # One scripted answer (the mode choice); once it is consumed, the
    # next input() call simulates the abort.
    scripted = iter(["1"])
    exhausted = object()

    def fake_input(_prompt=""):
        reply = next(scripted, exhausted)
        if reply is exhausted:
            raise KeyboardInterrupt
        return reply

    monkeypatch.setattr("builtins.input", fake_input)
    # Neutralise the TTY requirement so the command runs under pytest.
    monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)

    # No node and no bridge script are provided: the abort is expected to
    # happen before those steps are reached.
    with redirect_stdout(io.StringIO()):
        try:
            cmd_whatsapp(MagicMock())
        except KeyboardInterrupt:
            pass

    env_file = isolated_home / ".env"
    assert _env_value(isolated_home, "WHATSAPP_ENABLED") is None, (
        "Setup aborted before pairing — WHATSAPP_ENABLED must not be set. "
        f"Got .env: {env_file.read_text() if env_file.exists() else '(missing)'}"
    )
def test_existing_pairing_skip_branch_enables_whatsapp(isolated_home, monkeypatch):
    """Keeping an already-paired session must (re-)enable WhatsApp.

    The user runs ``hermes whatsapp`` with an existing paired session and
    answers "no, keep my session" at the re-pair prompt. The env var
    should be (re-)written to ``true`` so the gateway picks WhatsApp back
    up, even if the var was lost since the original pairing.
    """
    from hermes_cli.main import cmd_whatsapp

    # Pre-create a paired session WITHOUT WHATSAPP_ENABLED in .env.
    session = isolated_home / "whatsapp" / "session"
    session.mkdir(parents=True)
    (session / "creds.json").write_text("{}")
    monkeypatch.setenv("WHATSAPP_MODE", "bot")
    monkeypatch.setenv("WHATSAPP_ALLOWED_USERS", "15551234567")

    # Mode already set -> no mode prompt; users already set -> decline the
    # update prompt; pairing exists -> "no, keep session". Every prompt
    # that does appear is therefore answered with "n".
    monkeypatch.setattr("builtins.input", lambda _prompt="": "n")
    monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)

    # This test is about setup ordering, not bridge bootstrapping: if an
    # npm install were reached, pretend it succeeds silently.
    monkeypatch.setattr(
        "subprocess.run",
        lambda *_a, **_kw: MagicMock(returncode=0, stderr=""),
    )
    # Accept the optional mode/path arguments of the real shutil.which so
    # the stub cannot fail on a caller that passes them.
    monkeypatch.setattr(
        "shutil.which", lambda _name, *_a, **_kw: "/usr/bin/npm"
    )

    # The repo tree can't be written to from a test, so report any
    # ``node_modules`` path as existing and defer everything else
    # (including the creds.json written above) to the real check.
    # Arguments are forwarded because Path.exists accepts
    # ``follow_symlinks`` on Python 3.12+.
    real_exists = Path.exists

    def _stub_exists(self, *args, **kwargs):
        if self.name == "node_modules":
            return True
        return real_exists(self, *args, **kwargs)

    monkeypatch.setattr(Path, "exists", _stub_exists)

    with redirect_stdout(io.StringIO()):
        cmd_whatsapp(MagicMock())

    # The keep-session (skip re-pair) branch should have set the env var
    # on its way out.
    assert _env_value(isolated_home, "WHATSAPP_ENABLED") == "true"