fix(gateway): keep running when platforms fail; add per-platform circuit breaker + /platform (#26600)

Stop the gateway from exiting (or systemd-restart-looping) when a single messaging adapter fails at startup or runtime. A misconfigured WhatsApp (npm install timeout, unpaired bridge, missing creds.json) used to take the entire gateway down, killing cron jobs and any other connected platforms with it.

Changes:

• Startup (gateway/run.py): when connected_count==0 but the only errors are retryable, log a degraded-state warning and keep the gateway alive instead of returning False. The reconnect watcher then recovers platforms as their underlying problem clears.
• Runtime (gateway/run.py _handle_adapter_fatal_error): when the last adapter goes down with a retryable error and is queued for reconnection, stay alive instead of exit-with-failure. Previously this triggered systemd Restart=on-failure, which created infinite restart loops on persistent retryable failures (proxy outage, repeated bridge crashes).
• Reconnect watcher (gateway/run.py _platform_reconnect_watcher): replace the 20-attempt hard drop with a circuit-breaker pause. After _PAUSE_AFTER_FAILURES (10) consecutive retryable failures, the platform stays in _failed_platforms with paused=True so the watcher skips it but the operator can still see and resume it. Non-retryable errors still drop out of the queue immediately. Resolves #17063 (gateway giving up on Telegram after 20 attempts).
• WhatsApp preflight (gateway/platforms/whatsapp.py): refuse to start the Node bridge when creds.json is missing. Sets a non-retryable whatsapp_not_paired fatal error so the watcher drops it cleanly with a single 'run hermes whatsapp' log line instead of paying the 30s bridge bootstrap timeout on every gateway start.
• WhatsApp setup ordering (hermes_cli/main.py cmd_whatsapp): only set WHATSAPP_ENABLED=true once pairing actually succeeds. Previously the wizard wrote the env var at step 2 (before npm install and QR pairing), so any Ctrl+C left .env claiming WhatsApp was ready when the bridge had no creds.json. Also propagate the env var when the user keeps an existing pairing on a re-run.
• /platform slash command (hermes_cli/commands.py + gateway/run.py): new gateway-only command for manual circuit-breaker control.
    /platform list — show connected + failed/paused platforms
    /platform pause <name> — silence a known-broken platform
    /platform resume <name> — re-queue a paused platform

Tests:

• New: pause/resume helpers, /platform list|pause|resume command, WhatsApp creds.json preflight, WhatsApp setup ordering.
• Updated: stale assertions that codified the old 'exit and let systemd restart' behavior in test_runner_fatal_adapter.py, test_runner_startup_failures.py, and test_platform_reconnect.py (the 20-attempt give-up test became a circuit-breaker pause test).

5488 tests pass in tests/gateway/.
2026-05-18 04:41:56 +00:00 · 2026-05-15 14:32:14 -07:00 · 2026-05-15 14:32:14 -07:00 · 518f39557b
commit 518f39557b
parent 3b9368a0c4
9 changed files with 745 additions and 62 deletions
--- a/gateway/platforms/whatsapp.py
+++ b/gateway/platforms/whatsapp.py
@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter):
        """
        if not check_whatsapp_requirements():
            logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name)
+            self._set_fatal_error(
+                "whatsapp_node_missing",
+                "Node.js is not installed — install Node.js and re-run `hermes gateway`.",
+                retryable=False,
+            )
            return False
        
        bridge_path = Path(self._bridge_script)
        if not bridge_path.exists():
            logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path)
+            self._set_fatal_error(
+                "whatsapp_bridge_missing",
+                f"WhatsApp bridge script missing at {bridge_path}.",
+                retryable=False,
+            )
            return False
-        
+
+        # Pre-flight: skip the 30s bridge bootstrap entirely if the user
+        # never finished pairing.  Without creds.json the bridge prints
+        # QR codes to its log file and never reaches status:connected,
+        # so every gateway restart paid the 30s timeout + queued WhatsApp
+        # for indefinite retries.  Mark non-retryable so the user gets a
+        # clear "run hermes whatsapp" message instead of the watcher
+        # silently hammering an unconfigured platform.
+        creds_path = self._session_path / "creds.json"
+        if not creds_path.exists():
+            logger.warning(
+                "[%s] WhatsApp is enabled but not paired (no creds.json at %s). "
+                "Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from "
+                "your .env to disable.",
+                self.name, creds_path,
+            )
+            self._set_fatal_error(
+                "whatsapp_not_paired",
+                "WhatsApp enabled but not paired — run `hermes whatsapp` to pair.",
+                retryable=False,
+            )
+            return False
+
        logger.info("[%s] Bridge found at %s", self.name, bridge_path)
        
        # Acquire scoped lock to prevent duplicate sessions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -1990,21 +1990,21 @@ class GatewayRunner:
            await self.stop()
        elif not self.adapters and self._failed_platforms:
            # All platforms are down and queued for background reconnection.
-            # If the error is retryable, exit with failure so systemd Restart=on-failure
-            # can restart the process. Otherwise stay alive and keep retrying in background.
-            if adapter.fatal_error_retryable:
-                self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
-                self._exit_with_failure = True
-                logger.error(
-                    "All messaging platforms failed with retryable errors. "
-                    "Shutting down gateway for service restart (systemd will retry)."
-                )
-                await self.stop()
-            else:
-                logger.warning(
-                    "No connected messaging platforms remain, but %d platform(s) queued for reconnection",
-                    len(self._failed_platforms),
-                )
+            # Keep the gateway alive so:
+            #   • cron jobs still run
+            #   • the reconnect watcher can recover platforms when the
+            #     underlying problem clears (proxy comes back, user runs
+            #     `hermes whatsapp`, etc.)
+            # We used to exit-with-failure here to trigger systemd restart,
+            # but that converted a transient outage into a restart loop and
+            # killed in-process state every time. The reconnect watcher
+            # already handles long-running recovery — let it do its job.
+            logger.warning(
+                "No connected messaging platforms remain, but %d platform(s) "
+                "queued for reconnection — gateway staying alive, watcher will "
+                "retry in background.",
+                len(self._failed_platforms),
+            )

    def _request_clean_exit(self, reason: str) -> None:
        self._exit_cleanly = True
@ -2180,6 +2180,73 @@ class GatewayRunner:
        except Exception:
            pass

+    # ------------------------------------------------------------------
+    # Per-platform circuit breaker (pause/resume) — used by the reconnect
+    # watcher when a retryable failure recurs past a threshold, and by the
+    # /platform pause|resume slash command for manual control.
+    # ------------------------------------------------------------------
+    def _pause_failed_platform(self, platform, *, reason: str = "") -> None:
+        """Mark a queued platform as paused — keep it in ``_failed_platforms``
+        but stop the reconnect watcher from hammering it.
+
+        Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
+        retryable failures, and by ``/platform pause <name>`` for manual
+        intervention.  Paused platforms are surfaced in ``/platform list``
+        and resumed with ``/platform resume <name>``.
+        """
+        info = getattr(self, "_failed_platforms", {}).get(platform)
+        if info is None:
+            return
+        if info.get("paused"):
+            return
+        info["paused"] = True
+        info["pause_reason"] = reason or "auto-paused after repeated failures"
+        # Push next_retry far enough out that even if "paused" is missed
+        # by a stale code path, the watcher won't fire on it.
+        info["next_retry"] = float("inf")
+        try:
+            self._update_platform_runtime_status(
+                platform.value,
+                platform_state="paused",
+                error_code=None,
+                error_message=info["pause_reason"],
+            )
+        except Exception:
+            pass
+        logger.warning(
+            "%s paused after %d consecutive failures (%s) — "
+            "fix the underlying issue then run `/platform resume %s` "
+            "to retry, or `hermes gateway restart` to restart the gateway.",
+            platform.value, info.get("attempts", 0),
+            info["pause_reason"], platform.value,
+        )
+
+    def _resume_paused_platform(self, platform) -> bool:
+        """Unpause a platform — reset its attempt counter and schedule an
+        immediate retry.  Returns True if the platform was paused and is
+        now queued; False if it wasn't paused (or wasn't in the queue).
+        """
+        info = getattr(self, "_failed_platforms", {}).get(platform)
+        if info is None:
+            return False
+        if not info.get("paused"):
+            return False
+        info["paused"] = False
+        info.pop("pause_reason", None)
+        info["attempts"] = 0
+        info["next_retry"] = time.monotonic()  # retry on next watcher tick
+        try:
+            self._update_platform_runtime_status(
+                platform.value,
+                platform_state="retrying",
+                error_code=None,
+                error_message=None,
+            )
+        except Exception:
+            pass
+        logger.info("%s resumed — retrying on next watcher tick", platform.value)
+        return True
+
    @staticmethod
    def _load_prefill_messages() -> List[Dict[str, Any]]:
        """Load ephemeral prefill messages from config or env var.
@ -3613,16 +3680,32 @@ class GatewayRunner:
                return True
            if enabled_platform_count > 0:
                if startup_retryable_errors:
-                    # At least one platform attempted a connection and failed —
-                    # this is a real startup error that should block the gateway.
+                    # All enabled platforms hit retryable failures (network
+                    # blip, proxy outage, npm install timeout, etc.).
+                    # Keep the gateway alive so:
+                    #   • cron jobs still run
+                    #   • the reconnect watcher gets a chance to recover the
+                    #     failing platforms once the underlying problem is
+                    #     fixed (e.g. user runs `hermes whatsapp`, fixes
+                    #     proxy, etc.)
+                    # Exiting here used to convert a single misconfigured
+                    # platform into an infinite systemd restart loop.
                    reason = "; ".join(startup_retryable_errors)
-                    logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
+                    logger.warning(
+                        "Gateway started with no connected platforms — "
+                        "%d platform(s) queued for retry: %s",
+                        len(self._failed_platforms), reason,
+                    )
                    try:
                        from gateway.status import write_runtime_status
-                        write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
+                        write_runtime_status(
+                            gateway_state="degraded",
+                            exit_reason=None,
+                        )
                    except Exception:
                        pass
-                    return False
+                    # Fall through to the normal "running" state — reconnect
+                    # watcher takes it from here.
                # All enabled platforms had no adapter (missing library or credentials).
                # In fleet deployments the same config.yaml is shared across nodes that
                # may only have credentials for a subset of platforms.  Rather than
@ -4737,11 +4820,15 @@ class GatewayRunner:
        """Background task that periodically retries connecting failed platforms.

        Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
-        Stops retrying a platform after 20 failed attempts or if the error
-        is non-retryable (e.g. bad auth token).
+        Retryable failures are retried with that backoff until the platform
+        has failed ``_PAUSE_AFTER_FAILURES`` times in a row without ever
+        succeeding; it is then *paused*: kept in the retry queue but no
+        longer hammered.  The user surfaces it with ``/platform list``
+        and resumes it with ``/platform resume <name>``.  Non-retryable
+        failures (bad auth, etc.) still drop out of the queue immediately.
        """
-        _MAX_ATTEMPTS = 20
        _BACKOFF_CAP = 300  # 5 minutes max between retries
+        _PAUSE_AFTER_FAILURES = 10  # circuit-breaker threshold

        await asyncio.sleep(10)  # initial delay — let startup finish
        while self._running:
@ -4758,22 +4845,18 @@ class GatewayRunner:
                if not self._running:
                    return
                info = self._failed_platforms[platform]
+                # Skip paused platforms entirely — they need explicit
+                # /platform resume to come back.
+                if info.get("paused"):
+                    continue
                if now < info["next_retry"]:
                    continue  # not time yet

-                if info["attempts"] >= _MAX_ATTEMPTS:
-                    logger.warning(
-                        "Giving up reconnecting %s after %d attempts",
-                        platform.value, info["attempts"],
-                    )
-                    del self._failed_platforms[platform]
-                    continue
-
                platform_config = info["config"]
                attempt = info["attempts"] + 1
                logger.info(
-                    "Reconnecting %s (attempt %d/%d)...",
-                    platform.value, attempt, _MAX_ATTEMPTS,
+                    "Reconnecting %s (attempt %d)...",
+                    platform.value, attempt,
                )

                try:
@ -4838,6 +4921,14 @@ class GatewayRunner:
                            "Reconnect %s failed, next retry in %ds",
                            platform.value, backoff,
                        )
+                        if attempt >= _PAUSE_AFTER_FAILURES:
+                            self._pause_failed_platform(
+                                platform,
+                                reason=(
+                                    adapter.fatal_error_message
+                                    or "failed to reconnect"
+                                ),
+                            )
                except Exception as e:
                    self._update_platform_runtime_status(
                        platform.value,
@ -4852,6 +4943,8 @@ class GatewayRunner:
                        "Reconnect %s error: %s, next retry in %ds",
                        platform.value, e, backoff,
                    )
+                    if attempt >= _PAUSE_AFTER_FAILURES:
+                        self._pause_failed_platform(platform, reason=str(e))

            # Check every 10 seconds for platforms that need reconnection
            for _ in range(10):
@ -6451,6 +6544,9 @@ class GatewayRunner:
        if canonical == "agents":
            return await self._handle_agents_command(event)

+        if canonical == "platform":
+            return await self._handle_platform_command(event)
+
        if canonical == "restart":
            return await self._handle_restart_command(event)
        
@ -8698,6 +8794,99 @@ class GatewayRunner:
        else:
            return t("gateway.stop.no_active")

+    async def _handle_platform_command(self, event: MessageEvent) -> str:
+        """Handle ``/platform list|pause|resume [name]`` — surface and
+        manually control failed/paused gateway adapters.
+
+        Examples:
+            ``/platform list``           — show connected + failed/paused platforms
+            ``/platform pause whatsapp`` — stop the reconnect watcher hammering whatsapp
+            ``/platform resume whatsapp`` — re-queue a paused platform for retry
+        """
+        text = (getattr(event, "content", "") or "").strip()
+        # Strip the leading "/platform" (or "/PLATFORM") token if present
+        parts = text.split(maxsplit=2)
+        if parts and parts[0].lower().lstrip("/").startswith("platform"):
+            parts = parts[1:]
+        action = (parts[0] if parts else "list").lower()
+        target = parts[1].lower() if len(parts) > 1 else ""
+
+        # Resolve platform name (case-insensitive, value match)
+        def _resolve_platform(name: str):
+            if not name:
+                return None
+            for p in Platform.__members__.values():
+                if p.value.lower() == name:
+                    return p
+            return None
+
+        if action == "list":
+            lines = ["**Gateway platforms**"]
+            connected = sorted(p.value for p in self.adapters.keys())
+            if connected:
+                lines.append("Connected: " + ", ".join(connected))
+            else:
+                lines.append("Connected: (none)")
+            failed = getattr(self, "_failed_platforms", {}) or {}
+            if failed:
+                for p, info in failed.items():
+                    if info.get("paused"):
+                        reason = info.get("pause_reason") or "paused"
+                        lines.append(
+                            f"  · {p.value} — PAUSED ({reason}). "
+                            f"Resume with `/platform resume {p.value}`."
+                        )
+                    else:
+                        attempts = info.get("attempts", 0)
+                        lines.append(
+                            f"  · {p.value} — retrying (attempt {attempts})"
+                        )
+            else:
+                lines.append("Failed/paused: (none)")
+            return "\n".join(lines)
+
+        if action in ("pause", "resume"):
+            if not target:
+                return f"Usage: /platform {action} <name>"
+            platform = _resolve_platform(target)
+            if platform is None:
+                return f"Unknown platform: {target}"
+            failed = getattr(self, "_failed_platforms", {}) or {}
+            if action == "pause":
+                if platform not in failed:
+                    return (
+                        f"{platform.value} is not in the retry queue "
+                        f"(it's either connected or not enabled)."
+                    )
+                if failed[platform].get("paused"):
+                    return f"{platform.value} is already paused."
+                self._pause_failed_platform(platform, reason="paused via /platform pause")
+                return (
+                    f"✓ {platform.value} paused. "
+                    f"Resume with `/platform resume {platform.value}` or "
+                    f"`hermes gateway restart` to reset."
+                )
+            # action == "resume"
+            if platform not in failed:
+                return (
+                    f"{platform.value} is not in the retry queue — "
+                    f"nothing to resume."
+                )
+            if not failed[platform].get("paused"):
+                return (
+                    f"{platform.value} is already retrying — "
+                    f"no resume needed."
+                )
+            self._resume_paused_platform(platform)
+            return f"✓ {platform.value} resumed — retrying on next watcher tick."
+
+        return (
+            "Usage: /platform <list|pause|resume> [name]\n"
+            "  /platform list — show platform status\n"
+            "  /platform pause <name> — stop retrying a failing platform\n"
+            "  /platform resume <name> — re-queue a paused platform"
+        )
+
    async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]:
        """Handle /restart command - drain active work, then restart the gateway."""
        # Defensive idempotency check: if the previous gateway process
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
               args_hint="[days]"),
    CommandDef("platforms", "Show gateway/messaging platform status", "Info",
               cli_only=True, aliases=("gateway",)),
+    CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info",
+               gateway_only=True, args_hint="<pause|resume|list> [name]"),
    CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
               cli_only=True, args_hint="[number]"),
    CommandDef("paste", "Attach clipboard image from your clipboard", "Info",
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -1522,14 +1522,18 @@ def cmd_whatsapp(args):
        )
        print(f"\n✓ Mode: {mode_label}")

-    # ── Step 2: Enable WhatsApp ──────────────────────────────────────────
+    # ── Step 2: Mode is selected, will enable WhatsApp only after pairing ──
+    # We intentionally don't write WHATSAPP_ENABLED=true here.  If the user
+    # aborts the wizard later (Ctrl+C, failed npm install, missed QR scan),
+    # we'd otherwise leave .env claiming WhatsApp is ready when the bridge
+    # has no creds.json.  Every subsequent `hermes gateway` then paid a 30s
+    # bridge-bootstrap timeout and queued WhatsApp for indefinite retries.
+    # Now: aborted setup leaves WHATSAPP_ENABLED unset → gateway skips it.
+    # Re-runs that already have WHATSAPP_ENABLED=true (from a prior
+    # successful pairing) stay enabled — we just don't write it pre-emptively.
    print()
-    current = get_env_value("WHATSAPP_ENABLED")
-    if current and current.lower() == "true":
+    if (get_env_value("WHATSAPP_ENABLED") or "").lower() == "true":
        print("✓ WhatsApp is already enabled")
-    else:
-        save_env_value("WHATSAPP_ENABLED", "true")
-        print("✓ WhatsApp enabled")

    # ── Step 3: Allowed users ────────────────────────────────────────────
    current_users = get_env_value("WHATSAPP_ALLOWED_USERS") or ""
@ -1619,6 +1623,12 @@ def cmd_whatsapp(args):
            session_dir.mkdir(parents=True, exist_ok=True)
            print("  ✓ Session cleared")
        else:
+            # Existing pairing — ensure WHATSAPP_ENABLED reflects that.
+            # (Older installs may have lost the env var; covers re-runs
+            # where the user picked "no, keep my session" but the var
+            # was never set or got removed.)
+            if (get_env_value("WHATSAPP_ENABLED") or "").lower() != "true":
+                save_env_value("WHATSAPP_ENABLED", "true")
            print("\n✓ WhatsApp is configured and paired!")
            print("  Start the gateway with: hermes gateway")
            return
@ -1647,6 +1657,11 @@ def cmd_whatsapp(args):
    # ── Step 7: Post-pairing ─────────────────────────────────────────────
    print()
    if (session_dir / "creds.json").exists():
+        # Only enable WhatsApp now that pairing actually succeeded.  If the
+        # user Ctrl+C'd at any earlier step, WHATSAPP_ENABLED stays unset
+        # and `hermes gateway` skips it cleanly instead of paying a 30s
+        # bridge timeout + queueing the platform for indefinite retries.
+        save_env_value("WHATSAPP_ENABLED", "true")
        print("✓ WhatsApp paired successfully!")
        print()
        if wa_mode == "bot":
--- a/tests/gateway/test_platform_reconnect.py
+++ b/tests/gateway/test_platform_reconnect.py
@ -294,15 +294,63 @@ class TestPlatformReconnectWatcher:
        assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2

    @pytest.mark.asyncio
-    async def test_reconnect_gives_up_after_max_attempts(self):
-        """After max attempts, platform should be removed from retry queue."""
+    async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
+        """After enough consecutive retryable failures, the watcher should
+        *pause* the platform (keep it in the queue but stop hammering it),
+        not drop it. The user resumes via /platform resume.
+        """
+        runner = _make_runner()
+
+        platform_config = PlatformConfig(enabled=True, token="test")
+        # 9 prior attempts — the next failure will be the 10th and should
+        # trip the circuit breaker.
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": platform_config,
+            "attempts": 9,
+            "next_retry": time.monotonic() - 1,
+        }
+
+        fail_adapter = StubAdapter(
+            succeed=False, fatal_error="DNS failure", fatal_retryable=True
+        )
+        real_sleep = asyncio.sleep
+
+        with patch.object(runner, "_create_adapter", return_value=fail_adapter):
+            async def run_one_iteration():
+                runner._running = True
+                call_count = 0
+
+                async def fake_sleep(n):
+                    nonlocal call_count
+                    call_count += 1
+                    if call_count > 1:
+                        runner._running = False
+                    await real_sleep(0)
+
+                with patch("asyncio.sleep", side_effect=fake_sleep):
+                    await runner._platform_reconnect_watcher()
+
+            await run_one_iteration()
+
+        # Platform stays in queue — paused, not dropped
+        assert Platform.TELEGRAM in runner._failed_platforms
+        info = runner._failed_platforms[Platform.TELEGRAM]
+        assert info["paused"] is True
+        assert info["attempts"] == 10
+        assert "pause_reason" in info
+
+    @pytest.mark.asyncio
+    async def test_reconnect_skips_paused_platforms(self):
+        """A paused platform should not be retried by the watcher tick."""
        runner = _make_runner()

        platform_config = PlatformConfig(enabled=True, token="test")
        runner._failed_platforms[Platform.TELEGRAM] = {
            "config": platform_config,
-            "attempts": 20,  # At max
-            "next_retry": time.monotonic() - 1,
+            "attempts": 10,
+            "next_retry": time.monotonic() - 1,  # would normally retry now
+            "paused": True,
+            "pause_reason": "paused via /platform pause",
        }

        real_sleep = asyncio.sleep
@ -324,8 +372,10 @@ class TestPlatformReconnectWatcher:

            await run_one_iteration()

-        assert Platform.TELEGRAM not in runner._failed_platforms
-        mock_create.assert_not_called()  # Should give up without trying
+        # Paused platform stays queued and was never touched
+        assert Platform.TELEGRAM in runner._failed_platforms
+        assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True
+        mock_create.assert_not_called()

    @pytest.mark.asyncio
    async def test_reconnect_skips_when_not_time_yet(self):
@ -459,11 +509,12 @@ class TestRuntimeDisconnectQueuing:
        assert Platform.TELEGRAM not in runner._failed_platforms

    @pytest.mark.asyncio
-    async def test_retryable_error_exits_for_service_restart_when_all_down(self):
-        """Gateway should exit with failure when all platforms fail with retryable errors.
-
-        This lets systemd Restart=on-failure restart the process, which is more
-        reliable than in-process background reconnection after exhausted retries.
+    async def test_retryable_error_keeps_gateway_alive_when_all_down(self):
+        """When all adapters fail at runtime with retryable errors, the
+        gateway should stay alive and let the reconnect watcher recover them
+        in the background.  (Previously this exited-with-failure to trigger
+        a systemd restart — that converted transient outages into infinite
+        restart loops and killed in-process state.)
        """
        runner = _make_runner()
        runner.stop = AsyncMock()
@ -474,9 +525,9 @@ class TestRuntimeDisconnectQueuing:

        await runner._handle_adapter_fatal_error(adapter)

-        # stop() SHOULD be called — gateway exits for systemd restart
-        runner.stop.assert_called_once()
-        assert runner._exit_with_failure is True
+        # stop() should NOT be called — gateway stays alive for the watcher
+        runner.stop.assert_not_called()
+        assert runner._exit_with_failure is False
        assert Platform.TELEGRAM in runner._failed_platforms

    @pytest.mark.asyncio
@ -512,3 +563,154 @@ class TestRuntimeDisconnectQueuing:
        await runner._handle_adapter_fatal_error(adapter)

        runner.stop.assert_called_once()
+
+
+# --- Pause / resume circuit breaker ---
+
+
+class TestPauseResume:
+    """Test the per-platform pause/resume helpers."""
+
+    def test_pause_marks_platform_paused(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 3,
+            "next_retry": time.monotonic() + 30,
+        }
+        runner._pause_failed_platform(Platform.TELEGRAM, reason="manual")
+        info = runner._failed_platforms[Platform.TELEGRAM]
+        assert info["paused"] is True
+        assert info["pause_reason"] == "manual"
+        assert info["next_retry"] == float("inf")
+
+    def test_pause_is_idempotent(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 3,
+            "next_retry": time.monotonic() + 30,
+            "paused": True,
+            "pause_reason": "first reason",
+        }
+        runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason")
+        # Reason should not be overwritten on a second pause call.
+        assert (
+            runner._failed_platforms[Platform.TELEGRAM]["pause_reason"]
+            == "first reason"
+        )
+
+    def test_pause_no_op_when_platform_not_queued(self):
+        runner = _make_runner()
+        # No exception even when the platform isn't in _failed_platforms.
+        runner._pause_failed_platform(Platform.TELEGRAM, reason="x")
+        assert Platform.TELEGRAM not in runner._failed_platforms
+
+    def test_resume_clears_paused_and_resets_attempts(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 10,
+            "next_retry": float("inf"),
+            "paused": True,
+            "pause_reason": "auto-paused",
+        }
+        assert runner._resume_paused_platform(Platform.TELEGRAM) is True
+        info = runner._failed_platforms[Platform.TELEGRAM]
+        assert info["paused"] is False
+        assert info["attempts"] == 0
+        assert info["next_retry"] != float("inf")
+        assert "pause_reason" not in info
+
+    def test_resume_returns_false_when_not_paused(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 1,
+            "next_retry": time.monotonic() + 30,
+        }
+        assert runner._resume_paused_platform(Platform.TELEGRAM) is False
+
+    def test_resume_returns_false_when_not_queued(self):
+        runner = _make_runner()
+        assert runner._resume_paused_platform(Platform.TELEGRAM) is False
+
+
+class TestPlatformSlashCommand:
+    """Test the /platform list|pause|resume slash command handler."""
+
+    def _make_event(self, content: str):
+        ev = MagicMock()
+        ev.content = content
+        return ev
+
+    @pytest.mark.asyncio
+    async def test_list_shows_connected_and_paused(self):
+        runner = _make_runner()
+        runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD)
+        runner._failed_platforms[Platform.WHATSAPP] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 10,
+            "next_retry": float("inf"),
+            "paused": True,
+            "pause_reason": "not paired",
+        }
+        out = await runner._handle_platform_command(self._make_event("/platform list"))
+        assert "discord" in out
+        assert "whatsapp" in out
+        assert "PAUSED" in out
+        assert "not paired" in out
+
+    @pytest.mark.asyncio
+    async def test_pause_command_pauses_queued_platform(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.WHATSAPP] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 2,
+            "next_retry": time.monotonic() + 30,
+        }
+        out = await runner._handle_platform_command(
+            self._make_event("/platform pause whatsapp")
+        )
+        assert "paused" in out.lower()
+        assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True
+
+    @pytest.mark.asyncio
+    async def test_pause_rejects_unqueued_platform(self):
+        runner = _make_runner()
+        out = await runner._handle_platform_command(
+            self._make_event("/platform pause whatsapp")
+        )
+        assert "not in the retry queue" in out
+
+    @pytest.mark.asyncio
+    async def test_resume_command_resumes_paused_platform(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.WHATSAPP] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 10,
+            "next_retry": float("inf"),
+            "paused": True,
+            "pause_reason": "x",
+        }
+        out = await runner._handle_platform_command(
+            self._make_event("/platform resume whatsapp")
+        )
+        assert "resumed" in out.lower()
+        assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False
+
+    @pytest.mark.asyncio
+    async def test_unknown_platform_name(self):
+        runner = _make_runner()
+        out = await runner._handle_platform_command(
+            self._make_event("/platform pause notarealplatform")
+        )
+        assert "Unknown platform" in out
+
+    @pytest.mark.asyncio
+    async def test_bare_platform_shows_usage_with_list(self):
+        # An empty /platform call defaults to "list".
+        runner = _make_runner()
+        out = await runner._handle_platform_command(self._make_event("/platform"))
+        assert "Gateway platforms" in out
+
--- a/tests/gateway/test_runner_fatal_adapter.py
+++ b/tests/gateway/test_runner_fatal_adapter.py
@ -68,7 +68,11 @@ async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monk
@pytest.mark.asyncio
 async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path):
    """Retryable runtime fatal errors queue the platform for reconnection
-    instead of shutting down the gateway."""
+    AND keep the gateway alive — the background reconnect watcher recovers
+    the platform when the underlying issue clears.  (Previously this
+    exited-with-failure to trigger a systemd restart; that converted
+    transient failures into infinite restart loops.)
+    """
    config = GatewayConfig(
        platforms={
            Platform.WHATSAPP: PlatformConfig(enabled=True, token="token")
@ -89,8 +93,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc

    await runner._handle_adapter_fatal_error(adapter)

-    # Should shut down with failure — systemd Restart=on-failure will restart
-    runner.stop.assert_awaited_once()
-    assert runner._exit_with_failure is True
+    # Gateway stays alive — watcher will retry in background
+    runner.stop.assert_not_awaited()
+    assert runner._exit_with_failure is False
    assert Platform.WHATSAPP in runner._failed_platforms
    assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0
--- a/tests/gateway/test_runner_startup_failures.py
+++ b/tests/gateway/test_runner_startup_failures.py
@ -64,7 +64,14 @@ class _SuccessfulAdapter(BasePlatformAdapter):


@pytest.mark.asyncio
-async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
+async def test_runner_stays_alive_for_retryable_startup_errors(monkeypatch, tmp_path):
+    """Retryable startup errors should leave the gateway running in
+    degraded mode so the reconnect watcher can recover the platform when
+    the underlying problem clears.  Previously this returned False from
+    ``start()`` and exited the process, which converted a single broken
+    platform (e.g. unpaired WhatsApp, DNS blip on Telegram) into a
+    systemd restart loop and killed cron jobs in the meantime.
+    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config = GatewayConfig(
        platforms={
@ -78,11 +85,13 @@ async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch,

    ok = await runner.start()

-    assert ok is False
+    # Gateway stays alive in degraded mode; reconnect watcher takes over.
+    assert ok is True
    assert runner.should_exit_cleanly is False
    state = read_runtime_status()
-    assert state["gateway_state"] == "startup_failed"
-    assert "temporary DNS resolution failure" in state["exit_reason"]
+    assert state["gateway_state"] in {"degraded", "running"}
+    # Telegram was queued for retry, not given up on.
+    assert Platform.TELEGRAM in runner._failed_platforms
    assert state["platforms"]["telegram"]["state"] == "retrying"
    assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"

--- a/tests/gateway/test_whatsapp_connect.py
+++ b/tests/gateway/test_whatsapp_connect.py
@ -611,3 +611,93 @@ class TestHttpSessionLifecycle:

        mock_task.cancel.assert_not_called()
        assert adapter._poll_task is None
+
+
+# ---------------------------------------------------------------------------
+# Pre-flight: refuse to start the bridge when creds.json is missing
+# ---------------------------------------------------------------------------
+
+
+class TestNoCredsPreflight:
+    """Verify ``connect()`` fast-fails as non-retryable when WhatsApp is
+    enabled but the user never finished pairing (no ``creds.json``).
+
+    Without this guard, every gateway boot:
+      • spawned the bridge subprocess (npm install if needed)
+      • waited 30s for status:connected (never happens without creds)
+      • queued WhatsApp for indefinite retries that would just repeat
+    With the guard, ``connect()`` returns False immediately with a
+    non-retryable fatal error so the reconnect watcher drops the platform
+    and the gateway gets a single clear log line telling the user to run
+    ``hermes whatsapp``.
+    """
+
+    @pytest.mark.asyncio
+    async def test_connect_returns_false_when_no_creds(self, tmp_path):
+        from gateway.platforms.whatsapp import WhatsAppAdapter
+
+        adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
+        adapter.platform = Platform.WHATSAPP
+        adapter.config = MagicMock()
+        adapter._bridge_port = 19876
+        # Point bridge_script at a real existing file so the earlier
+        # bridge-missing check doesn't trip — we want to exercise the
+        # creds.json check specifically.
+        bridge = tmp_path / "bridge.js"
+        bridge.write_text("// stub")
+        adapter._bridge_script = str(bridge)
+        adapter._session_path = tmp_path / "session"  # no creds.json inside
+        adapter._session_path.mkdir()
+        adapter._bridge_log_fh = None
+        adapter._fatal_error_code = None
+        adapter._fatal_error_message = None
+        adapter._fatal_error_retryable = True
+
+        with patch(
+            "gateway.platforms.whatsapp.check_whatsapp_requirements",
+            return_value=True,
+        ):
+            result = await adapter.connect()
+
+        assert result is False
+        # Non-retryable so the reconnect watcher drops it cleanly
+        assert adapter._fatal_error_code == "whatsapp_not_paired"
+        assert adapter._fatal_error_retryable is False
+
+    @pytest.mark.asyncio
+    async def test_connect_proceeds_when_creds_present(self, tmp_path):
+        """When creds.json exists, the preflight check is bypassed and
+        connect() proceeds to the bridge bootstrap path. We don't fully
+        simulate the bridge here — we just verify no fast-fail occurs.
+        """
+        from gateway.platforms.whatsapp import WhatsAppAdapter
+
+        adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
+        adapter.platform = Platform.WHATSAPP
+        adapter.config = MagicMock()
+        adapter._bridge_port = 19877
+        bridge = tmp_path / "bridge.js"
+        bridge.write_text("// stub")
+        adapter._bridge_script = str(bridge)
+        session_dir = tmp_path / "session"
+        session_dir.mkdir()
+        (session_dir / "creds.json").write_text("{}")
+        adapter._session_path = session_dir
+        adapter._bridge_log_fh = None
+        adapter._fatal_error_code = None
+        adapter._fatal_error_message = None
+        adapter._fatal_error_retryable = True
+        # Stub _acquire_platform_lock to return False so connect() exits
+        # cleanly *after* the preflight, without spawning subprocesses.
+        adapter._acquire_platform_lock = MagicMock(return_value=False)
+
+        with patch(
+            "gateway.platforms.whatsapp.check_whatsapp_requirements",
+            return_value=True,
+        ):
+            result = await adapter.connect()
+
+        # Preflight passed — connect() still returns False because the stubbed
+        # lock acquisition fails, but the error code is NOT the "not paired" one.
+        assert result is False
+        assert adapter._fatal_error_code != "whatsapp_not_paired"
--- a/tests/hermes_cli/test_whatsapp_setup_ordering.py
+++ b/tests/hermes_cli/test_whatsapp_setup_ordering.py
@ -0,0 +1,140 @@
+"""Regression tests for ``cmd_whatsapp`` env-var write ordering.
+
+Before the fix, ``hermes whatsapp`` wrote ``WHATSAPP_ENABLED=true`` at
+step 2 — before npm install (step 4) and before QR pairing (step 6).
+If the user Ctrl+C'd at any later step, ``.env`` claimed WhatsApp was
+ready when the bridge still had no ``creds.json``.  Every subsequent
+``hermes gateway`` then paid a 30s bridge-bootstrap timeout and queued
+WhatsApp for indefinite retries — looking like "the gateway is broken."
+
+The fix: only set ``WHATSAPP_ENABLED=true`` once pairing actually
+succeeds (creds.json exists).  Aborted setup leaves no enabled state.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+from contextlib import redirect_stdout
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+@pytest.fixture
+def isolated_home(tmp_path, monkeypatch):
+    home = tmp_path / "home"
+    hermes = home / ".hermes"
+    hermes.mkdir(parents=True)
+    monkeypatch.setattr(Path, "home", lambda: home)
+    monkeypatch.setenv("HERMES_HOME", str(hermes))
+    # Drop inherited WHATSAPP_* variables so get_env_value can't see stale state.
+    for key in list(os.environ):
+        if key.startswith("WHATSAPP_"):
+            monkeypatch.delenv(key, raising=False)
+    return hermes
+
+
+def _env_value(hermes_home: Path, key: str) -> str | None:
+    env_file = hermes_home / ".env"
+    if not env_file.exists():
+        return None
+    for line in env_file.read_text().splitlines():
+        if "=" not in line:
+            continue
+        k, _, v = line.partition("=")
+        if k.strip() == key:
+            return v.strip().strip('"').strip("'")
+    return None
+
+
+def test_aborted_setup_does_not_enable_whatsapp(isolated_home, monkeypatch):
+    """User picks mode 1, then Ctrl+C's at the allowed-users prompt.
+
+    WHATSAPP_ENABLED must NOT be present in .env after abort.
+    """
+    from hermes_cli.main import cmd_whatsapp
+
+    # First input() = mode choice, second input() = allowed-users prompt.
+    # We raise KeyboardInterrupt on the second call to simulate abort.
+    inputs = iter(["1"])
+
+    def fake_input(_prompt=""):
+        try:
+            return next(inputs)
+        except StopIteration:
+            raise KeyboardInterrupt
+
+    monkeypatch.setattr("builtins.input", fake_input)
+    # _require_tty calls sys.stdin.isatty — make it pass.
+    monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)
+    # No node, no bridge script — we shouldn't reach those steps anyway.
+
+    buf = io.StringIO()
+    with redirect_stdout(buf):
+        try:
+            cmd_whatsapp(MagicMock())
+        except KeyboardInterrupt:
+            pass
+
+    assert _env_value(isolated_home, "WHATSAPP_ENABLED") is None, (
+        "Setup aborted before pairing — WHATSAPP_ENABLED must not be set. "
+        f"Got .env: {(isolated_home / '.env').read_text() if (isolated_home / '.env').exists() else '(missing)'}"
+    )
+
+
+def test_existing_pairing_skip_branch_enables_whatsapp(isolated_home, monkeypatch):
+    """User runs ``hermes whatsapp`` with an existing paired session and
+    chooses "no, keep my session" at the re-pair prompt.  The env var
+    should be (re-)written to true so the gateway picks WhatsApp back up,
+    even if the var was lost since the original pairing.
+    """
+    from hermes_cli.main import cmd_whatsapp
+
+    # Pre-create a paired session WITHOUT WHATSAPP_ENABLED in .env.
+    session = isolated_home / "whatsapp" / "session"
+    session.mkdir(parents=True)
+    (session / "creds.json").write_text("{}")
+    monkeypatch.setenv("WHATSAPP_MODE", "bot")
+    monkeypatch.setenv("WHATSAPP_ALLOWED_USERS", "15551234567")
+
+    # mode already set → skip mode prompt; users already set → skip update
+    # prompt with "no"; pairing exists → "no, keep session" → return.
+    inputs = iter(["n", "n"])
+
+    def fake_input(_prompt=""):
+        try:
+            return next(inputs)
+        except StopIteration:
+            return "n"
+
+    monkeypatch.setattr("builtins.input", fake_input)
+    monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)
+    # Skip the bridge npm install — we're testing setup ordering, not bridge
+    # bootstrapping.  The node_modules existence check is stubbed further
+    # down; as a second line of defence, make npm install succeed silently
+    # if it is reached anyway.
+    monkeypatch.setattr(
+        "subprocess.run",
+        lambda *_a, **_kw: MagicMock(returncode=0, stderr=""),
+    )
+    monkeypatch.setattr("shutil.which", lambda _name: "/usr/bin/npm")
+    # Make (bridge_dir / "node_modules").exists() report True.  Pre-creating
+    # the directory is not an option because the test must not write into
+    # the repo, so stub Path.exists for any path named "node_modules" and
+    # defer to the real implementation otherwise — the creds.json check in
+    # the same function still sees the file this test wrote.
+    _orig_exists = Path.exists
+    def _stub_exists(self):
+        if self.name == "node_modules":
+            return True
+        return _orig_exists(self)
+    monkeypatch.setattr(Path, "exists", _stub_exists)
+
+    buf = io.StringIO()
+    with redirect_stdout(buf):
+        cmd_whatsapp(MagicMock())
+
+    # The skip-re-pair branch should have set the env var on its way out.
+    assert _env_value(isolated_home, "WHATSAPP_ENABLED") == "true"