diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py
index 5239df3b5ae..0ca3d41fabb 100644
--- a/gateway/platforms/whatsapp.py
+++ b/gateway/platforms/whatsapp.py
@@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter):
         """
         if not check_whatsapp_requirements():
             logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name)
+            self._set_fatal_error(
+                "whatsapp_node_missing",
+                "Node.js is not installed — install Node.js and re-run `hermes gateway`.",
+                retryable=False,
+            )
             return False
         
         bridge_path = Path(self._bridge_script)
         if not bridge_path.exists():
             logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path)
+            self._set_fatal_error(
+                "whatsapp_bridge_missing",
+                f"WhatsApp bridge script missing at {bridge_path}.",
+                retryable=False,
+            )
             return False
-        
+
+        # Pre-flight: skip the 30s bridge bootstrap entirely if the user
+        # never finished pairing.  Without creds.json the bridge prints
+        # QR codes to its log file and never reaches status:connected,
+        # so every gateway restart paid the 30s timeout + queued WhatsApp
+        # for indefinite retries.  Mark non-retryable so the user gets a
+        # clear "run hermes whatsapp" message instead of the watcher
+        # silently hammering an unconfigured platform.
+        creds_path = self._session_path / "creds.json"
+        if not creds_path.exists():
+            logger.warning(
+                "[%s] WhatsApp is enabled but not paired (no creds.json at %s). "
+                "Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from "
+                "your .env to disable.",
+                self.name, creds_path,
+            )
+            self._set_fatal_error(
+                "whatsapp_not_paired",
+                "WhatsApp enabled but not paired — run `hermes whatsapp` to pair.",
+                retryable=False,
+            )
+            return False
+
         logger.info("[%s] Bridge found at %s", self.name, bridge_path)
         
         # Acquire scoped lock to prevent duplicate sessions
diff --git a/gateway/run.py b/gateway/run.py
index f41357673f7..f9a282a413f 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1990,21 +1990,21 @@ class GatewayRunner:
             await self.stop()
         elif not self.adapters and self._failed_platforms:
             # All platforms are down and queued for background reconnection.
-            # If the error is retryable, exit with failure so systemd Restart=on-failure
-            # can restart the process. Otherwise stay alive and keep retrying in background.
-            if adapter.fatal_error_retryable:
-                self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors"
-                self._exit_with_failure = True
-                logger.error(
-                    "All messaging platforms failed with retryable errors. "
-                    "Shutting down gateway for service restart (systemd will retry)."
-                )
-                await self.stop()
-            else:
-                logger.warning(
-                    "No connected messaging platforms remain, but %d platform(s) queued for reconnection",
-                    len(self._failed_platforms),
-                )
+            # Keep the gateway alive so:
+            #   • cron jobs still run
+            #   • the reconnect watcher can recover platforms when the
+            #     underlying problem clears (proxy comes back, user runs
+            #     `hermes whatsapp`, etc.)
+            # We used to exit-with-failure here to trigger systemd restart,
+            # but that converted a transient outage into a restart loop and
+            # killed in-process state every time. The reconnect watcher
+            # already handles long-running recovery — let it do its job.
+            logger.warning(
+                "No connected messaging platforms remain, but %d platform(s) "
+                "queued for reconnection — gateway staying alive, watcher will "
+                "retry in background.",
+                len(self._failed_platforms),
+            )
 
     def _request_clean_exit(self, reason: str) -> None:
         self._exit_cleanly = True
@@ -2180,6 +2180,73 @@ class GatewayRunner:
         except Exception:
             pass
 
+    # ------------------------------------------------------------------
+    # Per-platform circuit breaker (pause/resume) — used by the reconnect
+    # watcher when a retryable failure recurs past a threshold, and by the
+    # /platform pause|resume slash command for manual control.
+    # ------------------------------------------------------------------
+    def _pause_failed_platform(self, platform, *, reason: str = "") -> None:
+        """Mark a queued platform as paused — keep it in ``_failed_platforms``
+        but stop the reconnect watcher from hammering it.
+
+        Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive
+        retryable failures, and by ``/platform pause <name>`` for manual
+        intervention.  Paused platforms are surfaced in ``/platform list``
+        and resumed with ``/platform resume <name>``.
+        """
+        info = getattr(self, "_failed_platforms", {}).get(platform)
+        if info is None:
+            return
+        if info.get("paused"):
+            return
+        info["paused"] = True
+        info["pause_reason"] = reason or "auto-paused after repeated failures"
+        # Push next_retry far enough out that even if "paused" is missed
+        # by a stale code path, the watcher won't fire on it.
+        info["next_retry"] = float("inf")
+        try:
+            self._update_platform_runtime_status(
+                platform.value,
+                platform_state="paused",
+                error_code=None,
+                error_message=info["pause_reason"],
+            )
+        except Exception:
+            pass
+        logger.warning(
+            "%s paused after %d consecutive failures (%s) — "
+            "fix the underlying issue then run `/platform resume %s` "
+            "to retry, or `hermes gateway restart` to restart the gateway.",
+            platform.value, info.get("attempts", 0),
+            info["pause_reason"], platform.value,
+        )
+
+    def _resume_paused_platform(self, platform) -> bool:
+        """Unpause a platform — reset its attempt counter and schedule an
+        immediate retry.  Returns True if the platform was paused and is
+        now queued; False if it wasn't paused (or wasn't in the queue).
+        """
+        info = getattr(self, "_failed_platforms", {}).get(platform)
+        if info is None:
+            return False
+        if not info.get("paused"):
+            return False
+        info["paused"] = False
+        info.pop("pause_reason", None)
+        info["attempts"] = 0
+        info["next_retry"] = time.monotonic()  # retry on next watcher tick
+        try:
+            self._update_platform_runtime_status(
+                platform.value,
+                platform_state="retrying",
+                error_code=None,
+                error_message=None,
+            )
+        except Exception:
+            pass
+        logger.info("%s resumed — retrying on next watcher tick", platform.value)
+        return True
+
     @staticmethod
     def _load_prefill_messages() -> List[Dict[str, Any]]:
         """Load ephemeral prefill messages from config or env var.
@@ -3613,16 +3680,32 @@ class GatewayRunner:
                 return True
             if enabled_platform_count > 0:
                 if startup_retryable_errors:
-                    # At least one platform attempted a connection and failed —
-                    # this is a real startup error that should block the gateway.
+                    # All enabled platforms hit retryable failures (network
+                    # blip, bridge not paired, npm install timeout, etc.).
+                    # Keep the gateway alive so:
+                    #   • cron jobs still run
+                    #   • the reconnect watcher gets a chance to recover the
+                    #     failing platforms once the underlying problem is
+                    #     fixed (e.g. user runs `hermes whatsapp`, fixes
+                    #     proxy, etc.)
+                    # Exiting here used to convert a single misconfigured
+                    # platform into an infinite systemd restart loop.
                     reason = "; ".join(startup_retryable_errors)
-                    logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
+                    logger.warning(
+                        "Gateway started with no connected platforms — "
+                        "%d platform(s) queued for retry: %s",
+                        len(self._failed_platforms), reason,
+                    )
                     try:
                         from gateway.status import write_runtime_status
-                        write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
+                        write_runtime_status(
+                            gateway_state="degraded",
+                            exit_reason=None,
+                        )
                     except Exception:
                         pass
-                    return False
+                    # Fall through to the normal "running" state — reconnect
+                    # watcher takes it from here.
                 # All enabled platforms had no adapter (missing library or credentials).
                 # In fleet deployments the same config.yaml is shared across nodes that
                 # may only have credentials for a subset of platforms.  Rather than
@@ -4737,11 +4820,15 @@ class GatewayRunner:
         """Background task that periodically retries connecting failed platforms.
 
         Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap).
-        Stops retrying a platform after 20 failed attempts or if the error
-        is non-retryable (e.g. bad auth token).
+        Retryable failures keep retrying at the backoff cap indefinitely
+        — but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row
+        without ever succeeding, it is *paused*: kept in the retry queue
+        but no longer hammered.  The user surfaces it with ``/platform list``
+        and resumes it with ``/platform resume <name>``.  Non-retryable
+        failures (bad auth, etc.) still drop out of the queue immediately.
         """
-        _MAX_ATTEMPTS = 20
         _BACKOFF_CAP = 300  # 5 minutes max between retries
+        _PAUSE_AFTER_FAILURES = 10  # circuit-breaker threshold
 
         await asyncio.sleep(10)  # initial delay — let startup finish
         while self._running:
@@ -4758,22 +4845,18 @@ class GatewayRunner:
                 if not self._running:
                     return
                 info = self._failed_platforms[platform]
+                # Skip paused platforms entirely — they need explicit
+                # /platform resume to come back.
+                if info.get("paused"):
+                    continue
                 if now < info["next_retry"]:
                     continue  # not time yet
 
-                if info["attempts"] >= _MAX_ATTEMPTS:
-                    logger.warning(
-                        "Giving up reconnecting %s after %d attempts",
-                        platform.value, info["attempts"],
-                    )
-                    del self._failed_platforms[platform]
-                    continue
-
                 platform_config = info["config"]
                 attempt = info["attempts"] + 1
                 logger.info(
-                    "Reconnecting %s (attempt %d/%d)...",
-                    platform.value, attempt, _MAX_ATTEMPTS,
+                    "Reconnecting %s (attempt %d)...",
+                    platform.value, attempt,
                 )
 
                 try:
@@ -4838,6 +4921,14 @@ class GatewayRunner:
                             "Reconnect %s failed, next retry in %ds",
                             platform.value, backoff,
                         )
+                        if attempt >= _PAUSE_AFTER_FAILURES:
+                            self._pause_failed_platform(
+                                platform,
+                                reason=(
+                                    adapter.fatal_error_message
+                                    or "failed to reconnect"
+                                ),
+                            )
                 except Exception as e:
                     self._update_platform_runtime_status(
                         platform.value,
@@ -4852,6 +4943,8 @@ class GatewayRunner:
                         "Reconnect %s error: %s, next retry in %ds",
                         platform.value, e, backoff,
                     )
+                    if attempt >= _PAUSE_AFTER_FAILURES:
+                        self._pause_failed_platform(platform, reason=str(e))
 
             # Check every 10 seconds for platforms that need reconnection
             for _ in range(10):
@@ -6451,6 +6544,9 @@ class GatewayRunner:
         if canonical == "agents":
             return await self._handle_agents_command(event)
 
+        if canonical == "platform":
+            return await self._handle_platform_command(event)
+
         if canonical == "restart":
             return await self._handle_restart_command(event)
         
@@ -8698,6 +8794,99 @@ class GatewayRunner:
         else:
             return t("gateway.stop.no_active")
 
+    async def _handle_platform_command(self, event: MessageEvent) -> str:
+        """Handle ``/platform list|pause|resume [name]`` — surface and
+        manually control failed/paused gateway adapters.
+
+        Examples:
+            ``/platform list``           — show connected + failed/paused platforms
+            ``/platform pause whatsapp`` — stop the reconnect watcher hammering whatsapp
+            ``/platform resume whatsapp`` — re-queue a paused platform for retry
+        """
+        text = (getattr(event, "content", "") or "").strip()
+        # Strip the leading "/platform" (or "/PLATFORM") token if present
+        parts = text.split(maxsplit=2)
+        if parts and parts[0].lower().lstrip("/").startswith("platform"):
+            parts = parts[1:]
+        action = (parts[0] if parts else "list").lower()
+        target = parts[1].lower() if len(parts) > 1 else ""
+
+        # Resolve platform name (case-insensitive, value match)
+        def _resolve_platform(name: str):
+            if not name:
+                return None
+            for p in Platform.__members__.values():
+                if p.value.lower() == name:
+                    return p
+            return None
+
+        if action == "list":
+            lines = ["**Gateway platforms**"]
+            connected = sorted(p.value for p in self.adapters.keys())
+            if connected:
+                lines.append("Connected: " + ", ".join(connected))
+            else:
+                lines.append("Connected: (none)")
+            failed = getattr(self, "_failed_platforms", {}) or {}
+            if failed:
+                for p, info in failed.items():
+                    if info.get("paused"):
+                        reason = info.get("pause_reason") or "paused"
+                        lines.append(
+                            f"  · {p.value} — PAUSED ({reason}). "
+                            f"Resume with `/platform resume {p.value}`."
+                        )
+                    else:
+                        attempts = info.get("attempts", 0)
+                        lines.append(
+                            f"  · {p.value} — retrying (attempt {attempts})"
+                        )
+            else:
+                lines.append("Failed/paused: (none)")
+            return "\n".join(lines)
+
+        if action in ("pause", "resume"):
+            if not target:
+                return f"Usage: /platform {action} <name>"
+            platform = _resolve_platform(target)
+            if platform is None:
+                return f"Unknown platform: {target}"
+            failed = getattr(self, "_failed_platforms", {}) or {}
+            if action == "pause":
+                if platform not in failed:
+                    return (
+                        f"{platform.value} is not in the retry queue "
+                        f"(it's either connected or not enabled)."
+                    )
+                if failed[platform].get("paused"):
+                    return f"{platform.value} is already paused."
+                self._pause_failed_platform(platform, reason="paused via /platform pause")
+                return (
+                    f"✓ {platform.value} paused. "
+                    f"Resume with `/platform resume {platform.value}` or "
+                    f"`hermes gateway restart` to reset."
+                )
+            # action == "resume"
+            if platform not in failed:
+                return (
+                    f"{platform.value} is not in the retry queue — "
+                    f"nothing to resume."
+                )
+            if not failed[platform].get("paused"):
+                return (
+                    f"{platform.value} is already retrying — "
+                    f"no resume needed."
+                )
+            self._resume_paused_platform(platform)
+            return f"✓ {platform.value} resumed — retrying on next watcher tick."
+
+        return (
+            "Usage: /platform <list|pause|resume> [name]\n"
+            "  /platform list — show platform status\n"
+            "  /platform pause <name> — stop retrying a failing platform\n"
+            "  /platform resume <name> — re-queue a paused platform"
+        )
+
     async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]:
         """Handle /restart command - drain active work, then restart the gateway."""
         # Defensive idempotency check: if the previous gateway process
diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py
index b3556d3932d..83d86c4a3a9 100644
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
                args_hint="[days]"),
     CommandDef("platforms", "Show gateway/messaging platform status", "Info",
                cli_only=True, aliases=("gateway",)),
+    CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info",
+               gateway_only=True, args_hint="<pause|resume|list> [name]"),
     CommandDef("copy", "Copy the last assistant response to clipboard", "Info",
                cli_only=True, args_hint="[number]"),
     CommandDef("paste", "Attach clipboard image from your clipboard", "Info",
diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index c2c8a6880d2..7eedc3fd322 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -1522,14 +1522,18 @@ def cmd_whatsapp(args):
         )
         print(f"\n✓ Mode: {mode_label}")
 
-    # ── Step 2: Enable WhatsApp ──────────────────────────────────────────
+    # ── Step 2: Mode is selected, will enable WhatsApp only after pairing ──
+    # We intentionally don't write WHATSAPP_ENABLED=true here.  If the user
+    # aborts the wizard later (Ctrl+C, failed npm install, missed QR scan),
+    # we'd otherwise leave .env claiming WhatsApp is ready when the bridge
+    # has no creds.json.  Every subsequent `hermes gateway` then paid a 30s
+    # bridge-bootstrap timeout and queued WhatsApp for indefinite retries.
+    # Now: aborted setup leaves WHATSAPP_ENABLED unset → gateway skips it.
+    # Re-runs that already have WHATSAPP_ENABLED=true (from a prior
+    # successful pairing) stay enabled — we just don't write it pre-emptively.
     print()
-    current = get_env_value("WHATSAPP_ENABLED")
-    if current and current.lower() == "true":
+    if (get_env_value("WHATSAPP_ENABLED") or "").lower() == "true":
         print("✓ WhatsApp is already enabled")
-    else:
-        save_env_value("WHATSAPP_ENABLED", "true")
-        print("✓ WhatsApp enabled")
 
     # ── Step 3: Allowed users ────────────────────────────────────────────
     current_users = get_env_value("WHATSAPP_ALLOWED_USERS") or ""
@@ -1619,6 +1623,12 @@ def cmd_whatsapp(args):
             session_dir.mkdir(parents=True, exist_ok=True)
             print("  ✓ Session cleared")
         else:
+            # Existing pairing — ensure WHATSAPP_ENABLED reflects that.
+            # (Older installs may have lost the env var; covers re-runs
+            # where the user picked "no, keep my session" but the var
+            # was never set or got removed.)
+            if (get_env_value("WHATSAPP_ENABLED") or "").lower() != "true":
+                save_env_value("WHATSAPP_ENABLED", "true")
             print("\n✓ WhatsApp is configured and paired!")
             print("  Start the gateway with: hermes gateway")
             return
@@ -1647,6 +1657,11 @@ def cmd_whatsapp(args):
     # ── Step 7: Post-pairing ─────────────────────────────────────────────
     print()
     if (session_dir / "creds.json").exists():
+        # Only enable WhatsApp now that pairing actually succeeded.  If the
+        # user Ctrl+C'd at any earlier step, WHATSAPP_ENABLED stays unset
+        # and `hermes gateway` skips it cleanly instead of paying a 30s
+        # bridge timeout + queueing the platform for indefinite retries.
+        save_env_value("WHATSAPP_ENABLED", "true")
         print("✓ WhatsApp paired successfully!")
         print()
         if wa_mode == "bot":
diff --git a/tests/gateway/test_platform_reconnect.py b/tests/gateway/test_platform_reconnect.py
index a0bd7ab9eec..e4362a02562 100644
--- a/tests/gateway/test_platform_reconnect.py
+++ b/tests/gateway/test_platform_reconnect.py
@@ -294,15 +294,63 @@ class TestPlatformReconnectWatcher:
         assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2
 
     @pytest.mark.asyncio
-    async def test_reconnect_gives_up_after_max_attempts(self):
-        """After max attempts, platform should be removed from retry queue."""
+    async def test_reconnect_pauses_after_circuit_breaker_threshold(self):
+        """After enough consecutive retryable failures, the watcher should
+        *pause* the platform (keep it in the queue but stop hammering it),
+        not drop it. The user resumes via /platform resume.
+        """
+        runner = _make_runner()
+
+        platform_config = PlatformConfig(enabled=True, token="test")
+        # 9 prior attempts — the next failure will be the 10th and should
+        # trip the circuit breaker.
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": platform_config,
+            "attempts": 9,
+            "next_retry": time.monotonic() - 1,
+        }
+
+        fail_adapter = StubAdapter(
+            succeed=False, fatal_error="DNS failure", fatal_retryable=True
+        )
+        real_sleep = asyncio.sleep
+
+        with patch.object(runner, "_create_adapter", return_value=fail_adapter):
+            async def run_one_iteration():
+                runner._running = True
+                call_count = 0
+
+                async def fake_sleep(n):
+                    nonlocal call_count
+                    call_count += 1
+                    if call_count > 1:
+                        runner._running = False
+                    await real_sleep(0)
+
+                with patch("asyncio.sleep", side_effect=fake_sleep):
+                    await runner._platform_reconnect_watcher()
+
+            await run_one_iteration()
+
+        # Platform stays in queue — paused, not dropped
+        assert Platform.TELEGRAM in runner._failed_platforms
+        info = runner._failed_platforms[Platform.TELEGRAM]
+        assert info["paused"] is True
+        assert info["attempts"] == 10
+        assert "pause_reason" in info
+
+    @pytest.mark.asyncio
+    async def test_reconnect_skips_paused_platforms(self):
+        """A paused platform should not be retried by the watcher tick."""
         runner = _make_runner()
 
         platform_config = PlatformConfig(enabled=True, token="test")
         runner._failed_platforms[Platform.TELEGRAM] = {
             "config": platform_config,
-            "attempts": 20,  # At max
-            "next_retry": time.monotonic() - 1,
+            "attempts": 10,
+            "next_retry": time.monotonic() - 1,  # would normally retry now
+            "paused": True,
+            "pause_reason": "paused via /platform pause",
         }
 
         real_sleep = asyncio.sleep
@@ -324,8 +372,10 @@ class TestPlatformReconnectWatcher:
 
             await run_one_iteration()
 
-        assert Platform.TELEGRAM not in runner._failed_platforms
-        mock_create.assert_not_called()  # Should give up without trying
+        # Paused platform stays queued and was never touched
+        assert Platform.TELEGRAM in runner._failed_platforms
+        assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True
+        mock_create.assert_not_called()
 
     @pytest.mark.asyncio
     async def test_reconnect_skips_when_not_time_yet(self):
@@ -459,11 +509,12 @@ class TestRuntimeDisconnectQueuing:
         assert Platform.TELEGRAM not in runner._failed_platforms
 
     @pytest.mark.asyncio
-    async def test_retryable_error_exits_for_service_restart_when_all_down(self):
-        """Gateway should exit with failure when all platforms fail with retryable errors.
-
-        This lets systemd Restart=on-failure restart the process, which is more
-        reliable than in-process background reconnection after exhausted retries.
+    async def test_retryable_error_keeps_gateway_alive_when_all_down(self):
+        """When all adapters fail at runtime with retryable errors, the
+        gateway should stay alive and let the reconnect watcher recover them
+        in the background.  (Previously this exited-with-failure to trigger
+        a systemd restart — that converted transient outages into infinite
+        restart loops and killed in-process state.)
         """
         runner = _make_runner()
         runner.stop = AsyncMock()
@@ -474,9 +525,9 @@ class TestRuntimeDisconnectQueuing:
 
         await runner._handle_adapter_fatal_error(adapter)
 
-        # stop() SHOULD be called — gateway exits for systemd restart
-        runner.stop.assert_called_once()
-        assert runner._exit_with_failure is True
+        # stop() should NOT be called — gateway stays alive for the watcher
+        runner.stop.assert_not_called()
+        assert runner._exit_with_failure is False
         assert Platform.TELEGRAM in runner._failed_platforms
 
     @pytest.mark.asyncio
@@ -512,3 +563,154 @@ class TestRuntimeDisconnectQueuing:
         await runner._handle_adapter_fatal_error(adapter)
 
         runner.stop.assert_called_once()
+
+
+# --- Pause / resume circuit breaker ---
+
+
+class TestPauseResume:
+    """Test the per-platform pause/resume helpers and slash command."""
+
+    def test_pause_marks_platform_paused(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 3,
+            "next_retry": time.monotonic() + 30,
+        }
+        runner._pause_failed_platform(Platform.TELEGRAM, reason="manual")
+        info = runner._failed_platforms[Platform.TELEGRAM]
+        assert info["paused"] is True
+        assert info["pause_reason"] == "manual"
+        assert info["next_retry"] == float("inf")
+
+    def test_pause_is_idempotent(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 3,
+            "next_retry": time.monotonic() + 30,
+            "paused": True,
+            "pause_reason": "first reason",
+        }
+        runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason")
+        # Reason should not be overwritten on a second pause call.
+        assert (
+            runner._failed_platforms[Platform.TELEGRAM]["pause_reason"]
+            == "first reason"
+        )
+
+    def test_pause_no_op_when_platform_not_queued(self):
+        runner = _make_runner()
+        # No exception even when the platform isn't in _failed_platforms.
+        runner._pause_failed_platform(Platform.TELEGRAM, reason="x")
+        assert Platform.TELEGRAM not in runner._failed_platforms
+
+    def test_resume_clears_paused_and_resets_attempts(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 10,
+            "next_retry": float("inf"),
+            "paused": True,
+            "pause_reason": "auto-paused",
+        }
+        assert runner._resume_paused_platform(Platform.TELEGRAM) is True
+        info = runner._failed_platforms[Platform.TELEGRAM]
+        assert info["paused"] is False
+        assert info["attempts"] == 0
+        assert info["next_retry"] != float("inf")
+        assert "pause_reason" not in info
+
+    def test_resume_returns_false_when_not_paused(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.TELEGRAM] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 1,
+            "next_retry": time.monotonic() + 30,
+        }
+        assert runner._resume_paused_platform(Platform.TELEGRAM) is False
+
+    def test_resume_returns_false_when_not_queued(self):
+        runner = _make_runner()
+        assert runner._resume_paused_platform(Platform.TELEGRAM) is False
+
+
+class TestPlatformSlashCommand:
+    """Test the /platform list|pause|resume slash command handler."""
+
+    def _make_event(self, content: str):
+        ev = MagicMock()
+        ev.content = content
+        return ev
+
+    @pytest.mark.asyncio
+    async def test_list_shows_connected_and_paused(self):
+        runner = _make_runner()
+        runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD)
+        runner._failed_platforms[Platform.WHATSAPP] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 10,
+            "next_retry": float("inf"),
+            "paused": True,
+            "pause_reason": "not paired",
+        }
+        out = await runner._handle_platform_command(self._make_event("/platform list"))
+        assert "discord" in out
+        assert "whatsapp" in out
+        assert "PAUSED" in out
+        assert "not paired" in out
+
+    @pytest.mark.asyncio
+    async def test_pause_command_pauses_queued_platform(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.WHATSAPP] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 2,
+            "next_retry": time.monotonic() + 30,
+        }
+        out = await runner._handle_platform_command(
+            self._make_event("/platform pause whatsapp")
+        )
+        assert "paused" in out.lower()
+        assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True
+
+    @pytest.mark.asyncio
+    async def test_pause_rejects_unqueued_platform(self):
+        runner = _make_runner()
+        out = await runner._handle_platform_command(
+            self._make_event("/platform pause whatsapp")
+        )
+        assert "not in the retry queue" in out
+
+    @pytest.mark.asyncio
+    async def test_resume_command_resumes_paused_platform(self):
+        runner = _make_runner()
+        runner._failed_platforms[Platform.WHATSAPP] = {
+            "config": PlatformConfig(enabled=True, token="t"),
+            "attempts": 10,
+            "next_retry": float("inf"),
+            "paused": True,
+            "pause_reason": "x",
+        }
+        out = await runner._handle_platform_command(
+            self._make_event("/platform resume whatsapp")
+        )
+        assert "resumed" in out.lower()
+        assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False
+
+    @pytest.mark.asyncio
+    async def test_unknown_platform_name(self):
+        runner = _make_runner()
+        out = await runner._handle_platform_command(
+            self._make_event("/platform pause notarealplatform")
+        )
+        assert "Unknown platform" in out
+
+    @pytest.mark.asyncio
+    async def test_bare_platform_shows_usage_with_list(self):
+        # An empty /platform call defaults to "list".
+        runner = _make_runner()
+        out = await runner._handle_platform_command(self._make_event("/platform"))
+        assert "Gateway platforms" in out
+
diff --git a/tests/gateway/test_runner_fatal_adapter.py b/tests/gateway/test_runner_fatal_adapter.py
index 13b9a7d99e8..706514f1ae6 100644
--- a/tests/gateway/test_runner_fatal_adapter.py
+++ b/tests/gateway/test_runner_fatal_adapter.py
@@ -68,7 +68,11 @@ async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monk
 @pytest.mark.asyncio
 async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path):
     """Retryable runtime fatal errors queue the platform for reconnection
-    instead of shutting down the gateway."""
+    AND keep the gateway alive — the background reconnect watcher recovers
+    the platform when the underlying issue clears.  (Previously this
+    exited-with-failure to trigger a systemd restart; that converted
+    transient failures into infinite restart loops.)
+    """
     config = GatewayConfig(
         platforms={
             Platform.WHATSAPP: PlatformConfig(enabled=True, token="token")
@@ -89,8 +93,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc
 
     await runner._handle_adapter_fatal_error(adapter)
 
-    # Should shut down with failure — systemd Restart=on-failure will restart
-    runner.stop.assert_awaited_once()
-    assert runner._exit_with_failure is True
+    # Gateway stays alive — watcher will retry in background
+    runner.stop.assert_not_awaited()
+    assert runner._exit_with_failure is False
     assert Platform.WHATSAPP in runner._failed_platforms
     assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0
diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py
index fc5c775a779..438553f34ed 100644
--- a/tests/gateway/test_runner_startup_failures.py
+++ b/tests/gateway/test_runner_startup_failures.py
@@ -64,7 +64,14 @@ class _SuccessfulAdapter(BasePlatformAdapter):
 
 
 @pytest.mark.asyncio
-async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
+async def test_runner_stays_alive_for_retryable_startup_errors(monkeypatch, tmp_path):
+    """Retryable startup errors should leave the gateway running in
+    degraded mode so the reconnect watcher can recover the platform when
+    the underlying problem clears.  Previously this returned False from
+    ``start()`` and exited the process, which converted a single broken
+    platform (e.g. unpaired WhatsApp, DNS blip on Telegram) into a
+    systemd restart loop and killed cron jobs in the meantime.
+    """
     monkeypatch.setenv("HERMES_HOME", str(tmp_path))
     config = GatewayConfig(
         platforms={
@@ -78,11 +85,13 @@ async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch,
 
     ok = await runner.start()
 
-    assert ok is False
+    # Gateway stays alive in degraded mode; reconnect watcher takes over.
+    assert ok is True
     assert runner.should_exit_cleanly is False
     state = read_runtime_status()
-    assert state["gateway_state"] == "startup_failed"
-    assert "temporary DNS resolution failure" in state["exit_reason"]
+    assert state["gateway_state"] in {"degraded", "running"}
+    # Telegram was queued for retry, not given up on.
+    assert Platform.TELEGRAM in runner._failed_platforms
     assert state["platforms"]["telegram"]["state"] == "retrying"
     assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"
 
diff --git a/tests/gateway/test_whatsapp_connect.py b/tests/gateway/test_whatsapp_connect.py
index 0a359fb7511..9d7807734bb 100644
--- a/tests/gateway/test_whatsapp_connect.py
+++ b/tests/gateway/test_whatsapp_connect.py
@@ -611,3 +611,93 @@ class TestHttpSessionLifecycle:
 
         mock_task.cancel.assert_not_called()
         assert adapter._poll_task is None
+
+
+# ---------------------------------------------------------------------------
+# Pre-flight: refuse to start the bridge when creds.json is missing
+# ---------------------------------------------------------------------------
+
+
+class TestNoCredsPreflight:
+    """Verify ``connect()`` fast-fails as non-retryable when WhatsApp is
+    enabled but the user never finished pairing (no ``creds.json``).
+
+    Without this guard, every gateway boot:
+      • spawned the bridge subprocess (npm install if needed)
+      • waited 30s for status:connected (never happens without creds)
+      • queued WhatsApp for indefinite retries that would just repeat
+    With the guard, ``connect()`` returns False immediately with a
+    non-retryable fatal error so the reconnect watcher drops the platform
+    and the gateway gets a single clear log line telling the user to run
+    ``hermes whatsapp``.
+    """
+
+    @pytest.mark.asyncio
+    async def test_connect_returns_false_when_no_creds(self, tmp_path):
+        from gateway.platforms.whatsapp import WhatsAppAdapter
+
+        adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
+        adapter.platform = Platform.WHATSAPP
+        adapter.config = MagicMock()
+        adapter._bridge_port = 19876
+        # Point bridge_script at a real existing file so the earlier
+        # bridge-missing check doesn't trip — we want to exercise the
+        # creds.json check specifically.
+        bridge = tmp_path / "bridge.js"
+        bridge.write_text("// stub")
+        adapter._bridge_script = str(bridge)
+        adapter._session_path = tmp_path / "session"  # no creds.json inside
+        adapter._session_path.mkdir()
+        adapter._bridge_log_fh = None
+        adapter._fatal_error_code = None
+        adapter._fatal_error_message = None
+        adapter._fatal_error_retryable = True
+
+        with patch(
+            "gateway.platforms.whatsapp.check_whatsapp_requirements",
+            return_value=True,
+        ):
+            result = await adapter.connect()
+
+        assert result is False
+        # Non-retryable so the reconnect watcher drops it cleanly
+        assert adapter._fatal_error_code == "whatsapp_not_paired"
+        assert adapter._fatal_error_retryable is False
+
+    @pytest.mark.asyncio
+    async def test_connect_proceeds_when_creds_present(self, tmp_path):
+        """When creds.json exists, the preflight check is bypassed and
+        connect() proceeds to the bridge bootstrap path. We don't fully
+        simulate the bridge here — we just verify no fast-fail occurs.
+        """
+        from gateway.platforms.whatsapp import WhatsAppAdapter
+
+        adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
+        adapter.platform = Platform.WHATSAPP
+        adapter.config = MagicMock()
+        adapter._bridge_port = 19877
+        bridge = tmp_path / "bridge.js"
+        bridge.write_text("// stub")
+        adapter._bridge_script = str(bridge)
+        session_dir = tmp_path / "session"
+        session_dir.mkdir()
+        (session_dir / "creds.json").write_text("{}")
+        adapter._session_path = session_dir
+        adapter._bridge_log_fh = None
+        adapter._fatal_error_code = None
+        adapter._fatal_error_message = None
+        adapter._fatal_error_retryable = True
+        # Stub _acquire_platform_lock to return False so connect() exits
+        # cleanly *after* the preflight, without spawning subprocesses.
+        adapter._acquire_platform_lock = MagicMock(return_value=False)
+
+        with patch(
+            "gateway.platforms.whatsapp.check_whatsapp_requirements",
+            return_value=True,
+        ):
+            result = await adapter.connect()
+
+        # Preflight passed — exits because we faked lock acquisition,
+        # but the fatal-error code is NOT the "not paired" one.
+        assert result is False
+        assert adapter._fatal_error_code != "whatsapp_not_paired"
diff --git a/tests/hermes_cli/test_whatsapp_setup_ordering.py b/tests/hermes_cli/test_whatsapp_setup_ordering.py
new file mode 100644
index 00000000000..47952bcc796
--- /dev/null
+++ b/tests/hermes_cli/test_whatsapp_setup_ordering.py
@@ -0,0 +1,140 @@
+"""Regression tests for ``cmd_whatsapp`` env-var write ordering.
+
+Before the fix, ``hermes whatsapp`` wrote ``WHATSAPP_ENABLED=true`` at
+step 2 — before npm install (step 4) and before QR pairing (step 6).
+If the user Ctrl+C'd at any later step, ``.env`` claimed WhatsApp was
+ready when the bridge still had no ``creds.json``.  Every subsequent
+``hermes gateway`` then paid a 30s bridge-bootstrap timeout and queued
+WhatsApp for indefinite retries — looking like "the gateway is broken."
+
+The fix: only set ``WHATSAPP_ENABLED=true`` once pairing actually
+succeeds (creds.json exists).  Aborted setup leaves no enabled state.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+from contextlib import redirect_stdout
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+@pytest.fixture
+def isolated_home(tmp_path, monkeypatch):
+    home = tmp_path / "home"
+    hermes = home / ".hermes"
+    hermes.mkdir(parents=True)
+    monkeypatch.setattr(Path, "home", lambda: home)
+    monkeypatch.setenv("HERMES_HOME", str(hermes))
+    # Ensure get_env_value cache doesn't carry stale state.
+    for key in list(os.environ):
+        if key.startswith("WHATSAPP_"):
+            monkeypatch.delenv(key, raising=False)
+    return hermes
+
+
+def _env_value(hermes_home: Path, key: str) -> str | None:
+    env_file = hermes_home / ".env"
+    if not env_file.exists():
+        return None
+    for line in env_file.read_text().splitlines():
+        if "=" not in line:
+            continue
+        k, _, v = line.partition("=")
+        if k.strip() == key:
+            return v.strip().strip('"').strip("'")
+    return None
+
+
+def test_aborted_setup_does_not_enable_whatsapp(isolated_home, monkeypatch):
+    """User picks mode 1, then Ctrl+C's at the allowed-users prompt.
+
+    WHATSAPP_ENABLED must NOT be present in .env after abort.
+    """
+    from hermes_cli.main import cmd_whatsapp
+
+    # First input() = mode choice, second input() = allowed-users prompt
+    # We raise KeyboardInterrupt on the second call to simulate abort.
+    inputs = iter(["1"])
+
+    def fake_input(_prompt=""):
+        try:
+            return next(inputs)
+        except StopIteration:
+            raise KeyboardInterrupt
+
+    monkeypatch.setattr("builtins.input", fake_input)
+    # _require_tty calls sys.stdin.isatty — make it pass.
+    monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)
+    # No node, no bridge script — we shouldn't reach those steps anyway.
+
+    buf = io.StringIO()
+    with redirect_stdout(buf):
+        try:
+            cmd_whatsapp(MagicMock())
+        except KeyboardInterrupt:
+            pass
+
+    assert _env_value(isolated_home, "WHATSAPP_ENABLED") is None, (
+        "Setup aborted before pairing — WHATSAPP_ENABLED must not be set. "
+        f"Got .env: {(isolated_home / '.env').read_text() if (isolated_home / '.env').exists() else '(missing)'}"
+    )
+
+
+def test_existing_pairing_skip_branch_enables_whatsapp(isolated_home, monkeypatch):
+    """User runs ``hermes whatsapp`` with an existing paired session and
+    chooses "no, keep my session" at the re-pair prompt.  The env var
+    should be (re-)written to true so the gateway picks WhatsApp back up,
+    even if the var was lost since the original pairing.
+    """
+    from hermes_cli.main import cmd_whatsapp
+
+    # Pre-create a paired session WITHOUT WHATSAPP_ENABLED in .env.
+    session = isolated_home / "whatsapp" / "session"
+    session.mkdir(parents=True)
+    (session / "creds.json").write_text("{}")
+    monkeypatch.setenv("WHATSAPP_MODE", "bot")
+    monkeypatch.setenv("WHATSAPP_ALLOWED_USERS", "15551234567")
+
+    # mode already set → skip mode prompt; users already set → skip update
+    # prompt with "no"; pairing exists → "no, keep session" → return.
+    inputs = iter(["n", "n"])
+
+    def fake_input(_prompt=""):
+        try:
+            return next(inputs)
+        except StopIteration:
+            return "n"
+
+    monkeypatch.setattr("builtins.input", fake_input)
+    monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None)
+    # Skip the bridge npm install — we're testing setup-ordering, not bridge
+    # bootstrapping.  Pretend node_modules exists (Path.exists -> True for that
+    # specific check is hard to scope, so instead pretend npm install would
+    # succeed silently if reached).
+    monkeypatch.setattr(
+        "subprocess.run",
+        lambda *_a, **_kw: MagicMock(returncode=0, stderr=""),
+    )
+    monkeypatch.setattr("shutil.which", lambda _name: "/usr/bin/npm")
+    # Patch (bridge_dir / "node_modules").exists() by stubbing Path.exists
+    # to True for that one specific subpath.  Easier: pre-create it as a
+    # symlink to /tmp.  But we can't write to the repo.  Instead, stub
+    # Path.exists wholesale to True for node_modules; the creds.json check
+    # in the same function still works because we wrote it ourselves.
+    _orig_exists = Path.exists
+    def _stub_exists(self):
+        if self.name == "node_modules":
+            return True
+        return _orig_exists(self)
+    monkeypatch.setattr(Path, "exists", _stub_exists)
+
+    buf = io.StringIO()
+    with redirect_stdout(buf):
+        cmd_whatsapp(MagicMock())
+
+    # The skip-rebar branch should have set the env var on its way out.
+    assert _env_value(isolated_home, "WHATSAPP_ENABLED") == "true"