diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index 5239df3b5ae..0ca3d41fabb 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -493,13 +493,45 @@ class WhatsAppAdapter(BasePlatformAdapter): """ if not check_whatsapp_requirements(): logger.warning("[%s] Node.js not found. WhatsApp requires Node.js.", self.name) + self._set_fatal_error( + "whatsapp_node_missing", + "Node.js is not installed — install Node.js and re-run `hermes gateway`.", + retryable=False, + ) return False bridge_path = Path(self._bridge_script) if not bridge_path.exists(): logger.warning("[%s] Bridge script not found: %s", self.name, bridge_path) + self._set_fatal_error( + "whatsapp_bridge_missing", + f"WhatsApp bridge script missing at {bridge_path}.", + retryable=False, + ) return False - + + # Pre-flight: skip the 30s bridge bootstrap entirely if the user + # never finished pairing. Without creds.json the bridge prints + # QR codes to its log file and never reaches status:connected, + # so every gateway restart paid the 30s timeout + queued WhatsApp + # for indefinite retries. Mark non-retryable so the user gets a + # clear "run hermes whatsapp" message instead of the watcher + # silently hammering an unconfigured platform. + creds_path = self._session_path / "creds.json" + if not creds_path.exists(): + logger.warning( + "[%s] WhatsApp is enabled but not paired (no creds.json at %s). " + "Run `hermes whatsapp` to pair, or remove WHATSAPP_ENABLED from " + "your .env to disable.", + self.name, creds_path, + ) + self._set_fatal_error( + "whatsapp_not_paired", + "WhatsApp enabled but not paired — run `hermes whatsapp` to pair.", + retryable=False, + ) + return False + logger.info("[%s] Bridge found at %s", self.name, bridge_path) # Acquire scoped lock to prevent duplicate sessions diff --git a/gateway/run.py b/gateway/run.py index f41357673f7..f9a282a413f 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1990,21 +1990,21 @@ class GatewayRunner: await self.stop() elif not self.adapters and self._failed_platforms: # All platforms are down and queued for background reconnection. - # If the error is retryable, exit with failure so systemd Restart=on-failure - # can restart the process. Otherwise stay alive and keep retrying in background. - if adapter.fatal_error_retryable: - self._exit_reason = adapter.fatal_error_message or "All messaging platforms failed with retryable errors" - self._exit_with_failure = True - logger.error( - "All messaging platforms failed with retryable errors. " - "Shutting down gateway for service restart (systemd will retry)." - ) - await self.stop() - else: - logger.warning( - "No connected messaging platforms remain, but %d platform(s) queued for reconnection", - len(self._failed_platforms), - ) + # Keep the gateway alive so: + # • cron jobs still run + # • the reconnect watcher can recover platforms when the + # underlying problem clears (proxy comes back, user runs + # `hermes whatsapp`, etc.) + # We used to exit-with-failure here to trigger systemd restart, + # but that converted a transient outage into a restart loop and + # killed in-process state every time. The reconnect watcher + # already handles long-running recovery — let it do its job. + logger.warning( + "No connected messaging platforms remain, but %d platform(s) " + "queued for reconnection — gateway staying alive, watcher will " + "retry in background.", + len(self._failed_platforms), + ) def _request_clean_exit(self, reason: str) -> None: self._exit_cleanly = True @@ -2180,6 +2180,73 @@ class GatewayRunner: except Exception: pass + # ------------------------------------------------------------------ + # Per-platform circuit breaker (pause/resume) — used by the reconnect + # watcher when a retryable failure recurs past a threshold, and by the + # /platform pause|resume slash command for manual control. + # ------------------------------------------------------------------ + def _pause_failed_platform(self, platform, *, reason: str = "") -> None: + """Mark a queued platform as paused — keep it in ``_failed_platforms`` + but stop the reconnect watcher from hammering it. + + Used by the circuit breaker after ``_PAUSE_AFTER_FAILURES`` consecutive + retryable failures, and by ``/platform pause `` for manual + intervention. Paused platforms are surfaced in ``/platform list`` + and resumed with ``/platform resume ``. + """ + info = getattr(self, "_failed_platforms", {}).get(platform) + if info is None: + return + if info.get("paused"): + return + info["paused"] = True + info["pause_reason"] = reason or "auto-paused after repeated failures" + # Push next_retry far enough out that even if "paused" is missed + # by a stale code path, the watcher won't fire on it. + info["next_retry"] = float("inf") + try: + self._update_platform_runtime_status( + platform.value, + platform_state="paused", + error_code=None, + error_message=info["pause_reason"], + ) + except Exception: + pass + logger.warning( + "%s paused after %d consecutive failures (%s) — " + "fix the underlying issue then run `/platform resume %s` " + "to retry, or `hermes gateway restart` to restart the gateway.", + platform.value, info.get("attempts", 0), + info["pause_reason"], platform.value, + ) + + def _resume_paused_platform(self, platform) -> bool: + """Unpause a platform — reset its attempt counter and schedule an + immediate retry. Returns True if the platform was paused and is + now queued; False if it wasn't paused (or wasn't in the queue). + """ + info = getattr(self, "_failed_platforms", {}).get(platform) + if info is None: + return False + if not info.get("paused"): + return False + info["paused"] = False + info.pop("pause_reason", None) + info["attempts"] = 0 + info["next_retry"] = time.monotonic() # retry on next watcher tick + try: + self._update_platform_runtime_status( + platform.value, + platform_state="retrying", + error_code=None, + error_message=None, + ) + except Exception: + pass + logger.info("%s resumed — retrying on next watcher tick", platform.value) + return True + @staticmethod def _load_prefill_messages() -> List[Dict[str, Any]]: """Load ephemeral prefill messages from config or env var. @@ -3613,16 +3680,32 @@ class GatewayRunner: return True if enabled_platform_count > 0: if startup_retryable_errors: - # At least one platform attempted a connection and failed — - # this is a real startup error that should block the gateway. + # All enabled platforms hit retryable failures (network + # blip, bridge not paired, npm install timeout, etc.). + # Keep the gateway alive so: + # • cron jobs still run + # • the reconnect watcher gets a chance to recover the + # failing platforms once the underlying problem is + # fixed (e.g. user runs `hermes whatsapp`, fixes + # proxy, etc.) + # Exiting here used to convert a single misconfigured + # platform into an infinite systemd restart loop. reason = "; ".join(startup_retryable_errors) - logger.error("Gateway failed to connect any configured messaging platform: %s", reason) + logger.warning( + "Gateway started with no connected platforms — " + "%d platform(s) queued for retry: %s", + len(self._failed_platforms), reason, + ) try: from gateway.status import write_runtime_status - write_runtime_status(gateway_state="startup_failed", exit_reason=reason) + write_runtime_status( + gateway_state="degraded", + exit_reason=None, + ) except Exception: pass - return False + # Fall through to the normal "running" state — reconnect + # watcher takes it from here. # All enabled platforms had no adapter (missing library or credentials). # In fleet deployments the same config.yaml is shared across nodes that # may only have credentials for a subset of platforms. Rather than @@ -4737,11 +4820,15 @@ class GatewayRunner: """Background task that periodically retries connecting failed platforms. Uses exponential backoff: 30s → 60s → 120s → 240s → 300s (cap). - Stops retrying a platform after 20 failed attempts or if the error - is non-retryable (e.g. bad auth token). + Retryable failures keep retrying at the backoff cap indefinitely + — but if a platform fails ``_PAUSE_AFTER_FAILURES`` times in a row + without ever succeeding, it is *paused*: kept in the retry queue + but no longer hammered. The user surfaces it with ``/platform list`` + and resumes it with ``/platform resume ``. Non-retryable + failures (bad auth, etc.) still drop out of the queue immediately. """ - _MAX_ATTEMPTS = 20 _BACKOFF_CAP = 300 # 5 minutes max between retries + _PAUSE_AFTER_FAILURES = 10 # circuit-breaker threshold await asyncio.sleep(10) # initial delay — let startup finish while self._running: @@ -4758,22 +4845,18 @@ class GatewayRunner: if not self._running: return info = self._failed_platforms[platform] + # Skip paused platforms entirely — they need explicit + # /platform resume to come back. + if info.get("paused"): + continue if now < info["next_retry"]: continue # not time yet - if info["attempts"] >= _MAX_ATTEMPTS: - logger.warning( - "Giving up reconnecting %s after %d attempts", - platform.value, info["attempts"], - ) - del self._failed_platforms[platform] - continue - platform_config = info["config"] attempt = info["attempts"] + 1 logger.info( - "Reconnecting %s (attempt %d/%d)...", - platform.value, attempt, _MAX_ATTEMPTS, + "Reconnecting %s (attempt %d)...", + platform.value, attempt, ) try: @@ -4838,6 +4921,14 @@ class GatewayRunner: "Reconnect %s failed, next retry in %ds", platform.value, backoff, ) + if attempt >= _PAUSE_AFTER_FAILURES: + self._pause_failed_platform( + platform, + reason=( + adapter.fatal_error_message + or "failed to reconnect" + ), + ) except Exception as e: self._update_platform_runtime_status( platform.value, @@ -4852,6 +4943,8 @@ class GatewayRunner: "Reconnect %s error: %s, next retry in %ds", platform.value, e, backoff, ) + if attempt >= _PAUSE_AFTER_FAILURES: + self._pause_failed_platform(platform, reason=str(e)) # Check every 10 seconds for platforms that need reconnection for _ in range(10): @@ -6451,6 +6544,9 @@ class GatewayRunner: if canonical == "agents": return await self._handle_agents_command(event) + if canonical == "platform": + return await self._handle_platform_command(event) + if canonical == "restart": return await self._handle_restart_command(event) @@ -8698,6 +8794,99 @@ class GatewayRunner: else: return t("gateway.stop.no_active") + async def _handle_platform_command(self, event: MessageEvent) -> str: + """Handle ``/platform list|pause|resume [name]`` — surface and + manually control failed/paused gateway adapters. + + Examples: + ``/platform list`` — show connected + failed/paused platforms + ``/platform pause whatsapp`` — stop the reconnect watcher hammering whatsapp + ``/platform resume whatsapp`` — re-queue a paused platform for retry + """ + text = (getattr(event, "content", "") or "").strip() + # Strip the leading "/platform" (or "/PLATFORM") token if present + parts = text.split(maxsplit=2) + if parts and parts[0].lower().lstrip("/").startswith("platform"): + parts = parts[1:] + action = (parts[0] if parts else "list").lower() + target = parts[1].lower() if len(parts) > 1 else "" + + # Resolve platform name (case-insensitive, value match) + def _resolve_platform(name: str): + if not name: + return None + for p in Platform.__members__.values(): + if p.value.lower() == name: + return p + return None + + if action == "list": + lines = ["**Gateway platforms**"] + connected = sorted(p.value for p in self.adapters.keys()) + if connected: + lines.append("Connected: " + ", ".join(connected)) + else: + lines.append("Connected: (none)") + failed = getattr(self, "_failed_platforms", {}) or {} + if failed: + for p, info in failed.items(): + if info.get("paused"): + reason = info.get("pause_reason") or "paused" + lines.append( + f" · {p.value} — PAUSED ({reason}). " + f"Resume with `/platform resume {p.value}`." + ) + else: + attempts = info.get("attempts", 0) + lines.append( + f" · {p.value} — retrying (attempt {attempts})" + ) + else: + lines.append("Failed/paused: (none)") + return "\n".join(lines) + + if action in ("pause", "resume"): + if not target: + return f"Usage: /platform {action} " + platform = _resolve_platform(target) + if platform is None: + return f"Unknown platform: {target}" + failed = getattr(self, "_failed_platforms", {}) or {} + if action == "pause": + if platform not in failed: + return ( + f"{platform.value} is not in the retry queue " + f"(it's either connected or not enabled)." + ) + if failed[platform].get("paused"): + return f"{platform.value} is already paused." + self._pause_failed_platform(platform, reason="paused via /platform pause") + return ( + f"✓ {platform.value} paused. " + f"Resume with `/platform resume {platform.value}` or " + f"`hermes gateway restart` to reset." + ) + # action == "resume" + if platform not in failed: + return ( + f"{platform.value} is not in the retry queue — " + f"nothing to resume." + ) + if not failed[platform].get("paused"): + return ( + f"{platform.value} is already retrying — " + f"no resume needed." + ) + self._resume_paused_platform(platform) + return f"✓ {platform.value} resumed — retrying on next watcher tick." + + return ( + "Usage: /platform [name]\n" + " /platform list — show platform status\n" + " /platform pause — stop retrying a failing platform\n" + " /platform resume — re-queue a paused platform" + ) + async def _handle_restart_command(self, event: MessageEvent) -> Union[str, EphemeralReply]: """Handle /restart command - drain active work, then restart the gateway.""" # Defensive idempotency check: if the previous gateway process diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index b3556d3932d..83d86c4a3a9 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -198,6 +198,8 @@ COMMAND_REGISTRY: list[CommandDef] = [ args_hint="[days]"), CommandDef("platforms", "Show gateway/messaging platform status", "Info", cli_only=True, aliases=("gateway",)), + CommandDef("platform", "Pause, resume, or list a failing gateway platform", "Info", + gateway_only=True, args_hint=" [name]"), CommandDef("copy", "Copy the last assistant response to clipboard", "Info", cli_only=True, args_hint="[number]"), CommandDef("paste", "Attach clipboard image from your clipboard", "Info", diff --git a/hermes_cli/main.py b/hermes_cli/main.py index c2c8a6880d2..7eedc3fd322 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -1522,14 +1522,18 @@ def cmd_whatsapp(args): ) print(f"\n✓ Mode: {mode_label}") - # ── Step 2: Enable WhatsApp ────────────────────────────────────────── + # ── Step 2: Mode is selected, will enable WhatsApp only after pairing ── + # We intentionally don't write WHATSAPP_ENABLED=true here. If the user + # aborts the wizard later (Ctrl+C, failed npm install, missed QR scan), + # we'd otherwise leave .env claiming WhatsApp is ready when the bridge + # has no creds.json. Every subsequent `hermes gateway` then paid a 30s + # bridge-bootstrap timeout and queued WhatsApp for indefinite retries. + # Now: aborted setup leaves WHATSAPP_ENABLED unset → gateway skips it. + # Re-runs that already have WHATSAPP_ENABLED=true (from a prior + # successful pairing) stay enabled — we just don't write it pre-emptively. print() - current = get_env_value("WHATSAPP_ENABLED") - if current and current.lower() == "true": + if (get_env_value("WHATSAPP_ENABLED") or "").lower() == "true": print("✓ WhatsApp is already enabled") - else: - save_env_value("WHATSAPP_ENABLED", "true") - print("✓ WhatsApp enabled") # ── Step 3: Allowed users ──────────────────────────────────────────── current_users = get_env_value("WHATSAPP_ALLOWED_USERS") or "" @@ -1619,6 +1623,12 @@ def cmd_whatsapp(args): session_dir.mkdir(parents=True, exist_ok=True) print(" ✓ Session cleared") else: + # Existing pairing — ensure WHATSAPP_ENABLED reflects that. + # (Older installs may have lost the env var; covers re-runs + # where the user picked "no, keep my session" but the var + # was never set or got removed.) + if (get_env_value("WHATSAPP_ENABLED") or "").lower() != "true": + save_env_value("WHATSAPP_ENABLED", "true") print("\n✓ WhatsApp is configured and paired!") print(" Start the gateway with: hermes gateway") return @@ -1647,6 +1657,11 @@ def cmd_whatsapp(args): # ── Step 7: Post-pairing ───────────────────────────────────────────── print() if (session_dir / "creds.json").exists(): + # Only enable WhatsApp now that pairing actually succeeded. If the + # user Ctrl+C'd at any earlier step, WHATSAPP_ENABLED stays unset + # and `hermes gateway` skips it cleanly instead of paying a 30s + # bridge timeout + queueing the platform for indefinite retries. + save_env_value("WHATSAPP_ENABLED", "true") print("✓ WhatsApp paired successfully!") print() if wa_mode == "bot": diff --git a/tests/gateway/test_platform_reconnect.py b/tests/gateway/test_platform_reconnect.py index a0bd7ab9eec..e4362a02562 100644 --- a/tests/gateway/test_platform_reconnect.py +++ b/tests/gateway/test_platform_reconnect.py @@ -294,15 +294,63 @@ class TestPlatformReconnectWatcher: assert runner._failed_platforms[Platform.TELEGRAM]["attempts"] == 2 @pytest.mark.asyncio - async def test_reconnect_gives_up_after_max_attempts(self): - """After max attempts, platform should be removed from retry queue.""" + async def test_reconnect_pauses_after_circuit_breaker_threshold(self): + """After enough consecutive retryable failures, the watcher should + *pause* the platform (keep it in the queue but stop hammering it), + not drop it. The user resumes via /platform resume. + """ + runner = _make_runner() + + platform_config = PlatformConfig(enabled=True, token="test") + # 9 prior attempts — the next failure will be the 10th and should + # trip the circuit breaker. + runner._failed_platforms[Platform.TELEGRAM] = { + "config": platform_config, + "attempts": 9, + "next_retry": time.monotonic() - 1, + } + + fail_adapter = StubAdapter( + succeed=False, fatal_error="DNS failure", fatal_retryable=True + ) + real_sleep = asyncio.sleep + + with patch.object(runner, "_create_adapter", return_value=fail_adapter): + async def run_one_iteration(): + runner._running = True + call_count = 0 + + async def fake_sleep(n): + nonlocal call_count + call_count += 1 + if call_count > 1: + runner._running = False + await real_sleep(0) + + with patch("asyncio.sleep", side_effect=fake_sleep): + await runner._platform_reconnect_watcher() + + await run_one_iteration() + + # Platform stays in queue — paused, not dropped + assert Platform.TELEGRAM in runner._failed_platforms + info = runner._failed_platforms[Platform.TELEGRAM] + assert info["paused"] is True + assert info["attempts"] == 10 + assert "pause_reason" in info + + @pytest.mark.asyncio + async def test_reconnect_skips_paused_platforms(self): + """A paused platform should not be retried by the watcher tick.""" runner = _make_runner() platform_config = PlatformConfig(enabled=True, token="test") runner._failed_platforms[Platform.TELEGRAM] = { "config": platform_config, - "attempts": 20, # At max - "next_retry": time.monotonic() - 1, + "attempts": 10, + "next_retry": time.monotonic() - 1, # would normally retry now + "paused": True, + "pause_reason": "paused via /platform pause", } real_sleep = asyncio.sleep @@ -324,8 +372,10 @@ class TestPlatformReconnectWatcher: await run_one_iteration() - assert Platform.TELEGRAM not in runner._failed_platforms - mock_create.assert_not_called() # Should give up without trying + # Paused platform stays queued and was never touched + assert Platform.TELEGRAM in runner._failed_platforms + assert runner._failed_platforms[Platform.TELEGRAM]["paused"] is True + mock_create.assert_not_called() @pytest.mark.asyncio async def test_reconnect_skips_when_not_time_yet(self): @@ -459,11 +509,12 @@ class TestRuntimeDisconnectQueuing: assert Platform.TELEGRAM not in runner._failed_platforms @pytest.mark.asyncio - async def test_retryable_error_exits_for_service_restart_when_all_down(self): - """Gateway should exit with failure when all platforms fail with retryable errors. - - This lets systemd Restart=on-failure restart the process, which is more - reliable than in-process background reconnection after exhausted retries. + async def test_retryable_error_keeps_gateway_alive_when_all_down(self): + """When all adapters fail at runtime with retryable errors, the + gateway should stay alive and let the reconnect watcher recover them + in the background. (Previously this exited-with-failure to trigger + a systemd restart — that converted transient outages into infinite + restart loops and killed in-process state.) """ runner = _make_runner() runner.stop = AsyncMock() @@ -474,9 +525,9 @@ class TestRuntimeDisconnectQueuing: await runner._handle_adapter_fatal_error(adapter) - # stop() SHOULD be called — gateway exits for systemd restart - runner.stop.assert_called_once() - assert runner._exit_with_failure is True + # stop() should NOT be called — gateway stays alive for the watcher + runner.stop.assert_not_called() + assert runner._exit_with_failure is False assert Platform.TELEGRAM in runner._failed_platforms @pytest.mark.asyncio @@ -512,3 +563,154 @@ class TestRuntimeDisconnectQueuing: await runner._handle_adapter_fatal_error(adapter) runner.stop.assert_called_once() + + +# --- Pause / resume circuit breaker --- + + +class TestPauseResume: + """Test the per-platform pause/resume helpers and slash command.""" + + def test_pause_marks_platform_paused(self): + runner = _make_runner() + runner._failed_platforms[Platform.TELEGRAM] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 3, + "next_retry": time.monotonic() + 30, + } + runner._pause_failed_platform(Platform.TELEGRAM, reason="manual") + info = runner._failed_platforms[Platform.TELEGRAM] + assert info["paused"] is True + assert info["pause_reason"] == "manual" + assert info["next_retry"] == float("inf") + + def test_pause_is_idempotent(self): + runner = _make_runner() + runner._failed_platforms[Platform.TELEGRAM] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 3, + "next_retry": time.monotonic() + 30, + "paused": True, + "pause_reason": "first reason", + } + runner._pause_failed_platform(Platform.TELEGRAM, reason="second reason") + # Reason should not be overwritten on a second pause call. + assert ( + runner._failed_platforms[Platform.TELEGRAM]["pause_reason"] + == "first reason" + ) + + def test_pause_no_op_when_platform_not_queued(self): + runner = _make_runner() + # No exception even when the platform isn't in _failed_platforms. + runner._pause_failed_platform(Platform.TELEGRAM, reason="x") + assert Platform.TELEGRAM not in runner._failed_platforms + + def test_resume_clears_paused_and_resets_attempts(self): + runner = _make_runner() + runner._failed_platforms[Platform.TELEGRAM] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 10, + "next_retry": float("inf"), + "paused": True, + "pause_reason": "auto-paused", + } + assert runner._resume_paused_platform(Platform.TELEGRAM) is True + info = runner._failed_platforms[Platform.TELEGRAM] + assert info["paused"] is False + assert info["attempts"] == 0 + assert info["next_retry"] != float("inf") + assert "pause_reason" not in info + + def test_resume_returns_false_when_not_paused(self): + runner = _make_runner() + runner._failed_platforms[Platform.TELEGRAM] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 1, + "next_retry": time.monotonic() + 30, + } + assert runner._resume_paused_platform(Platform.TELEGRAM) is False + + def test_resume_returns_false_when_not_queued(self): + runner = _make_runner() + assert runner._resume_paused_platform(Platform.TELEGRAM) is False + + +class TestPlatformSlashCommand: + """Test the /platform list|pause|resume slash command handler.""" + + def _make_event(self, content: str): + ev = MagicMock() + ev.content = content + return ev + + @pytest.mark.asyncio + async def test_list_shows_connected_and_paused(self): + runner = _make_runner() + runner.adapters[Platform.DISCORD] = StubAdapter(platform=Platform.DISCORD) + runner._failed_platforms[Platform.WHATSAPP] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 10, + "next_retry": float("inf"), + "paused": True, + "pause_reason": "not paired", + } + out = await runner._handle_platform_command(self._make_event("/platform list")) + assert "discord" in out + assert "whatsapp" in out + assert "PAUSED" in out + assert "not paired" in out + + @pytest.mark.asyncio + async def test_pause_command_pauses_queued_platform(self): + runner = _make_runner() + runner._failed_platforms[Platform.WHATSAPP] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 2, + "next_retry": time.monotonic() + 30, + } + out = await runner._handle_platform_command( + self._make_event("/platform pause whatsapp") + ) + assert "paused" in out.lower() + assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is True + + @pytest.mark.asyncio + async def test_pause_rejects_unqueued_platform(self): + runner = _make_runner() + out = await runner._handle_platform_command( + self._make_event("/platform pause whatsapp") + ) + assert "not in the retry queue" in out + + @pytest.mark.asyncio + async def test_resume_command_resumes_paused_platform(self): + runner = _make_runner() + runner._failed_platforms[Platform.WHATSAPP] = { + "config": PlatformConfig(enabled=True, token="t"), + "attempts": 10, + "next_retry": float("inf"), + "paused": True, + "pause_reason": "x", + } + out = await runner._handle_platform_command( + self._make_event("/platform resume whatsapp") + ) + assert "resumed" in out.lower() + assert runner._failed_platforms[Platform.WHATSAPP]["paused"] is False + + @pytest.mark.asyncio + async def test_unknown_platform_name(self): + runner = _make_runner() + out = await runner._handle_platform_command( + self._make_event("/platform pause notarealplatform") + ) + assert "Unknown platform" in out + + @pytest.mark.asyncio + async def test_bare_platform_shows_usage_with_list(self): + # An empty /platform call defaults to "list". + runner = _make_runner() + out = await runner._handle_platform_command(self._make_event("/platform")) + assert "Gateway platforms" in out + diff --git a/tests/gateway/test_runner_fatal_adapter.py b/tests/gateway/test_runner_fatal_adapter.py index 13b9a7d99e8..706514f1ae6 100644 --- a/tests/gateway/test_runner_fatal_adapter.py +++ b/tests/gateway/test_runner_fatal_adapter.py @@ -68,7 +68,11 @@ async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monk @pytest.mark.asyncio async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path): """Retryable runtime fatal errors queue the platform for reconnection - instead of shutting down the gateway.""" + AND keep the gateway alive — the background reconnect watcher recovers + the platform when the underlying issue clears. (Previously this + exited-with-failure to trigger a systemd restart; that converted + transient failures into infinite restart loops.) + """ config = GatewayConfig( platforms={ Platform.WHATSAPP: PlatformConfig(enabled=True, token="token") @@ -89,8 +93,8 @@ async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatc await runner._handle_adapter_fatal_error(adapter) - # Should shut down with failure — systemd Restart=on-failure will restart - runner.stop.assert_awaited_once() - assert runner._exit_with_failure is True + # Gateway stays alive — watcher will retry in background + runner.stop.assert_not_awaited() + assert runner._exit_with_failure is False assert Platform.WHATSAPP in runner._failed_platforms assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0 diff --git a/tests/gateway/test_runner_startup_failures.py b/tests/gateway/test_runner_startup_failures.py index fc5c775a779..438553f34ed 100644 --- a/tests/gateway/test_runner_startup_failures.py +++ b/tests/gateway/test_runner_startup_failures.py @@ -64,7 +64,14 @@ class _SuccessfulAdapter(BasePlatformAdapter): @pytest.mark.asyncio -async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path): +async def test_runner_stays_alive_for_retryable_startup_errors(monkeypatch, tmp_path): + """Retryable startup errors should leave the gateway running in + degraded mode so the reconnect watcher can recover the platform when + the underlying problem clears. Previously this returned False from + ``start()`` and exited the process, which converted a single broken + platform (e.g. unpaired WhatsApp, DNS blip on Telegram) into a + systemd restart loop and killed cron jobs in the meantime. + """ monkeypatch.setenv("HERMES_HOME", str(tmp_path)) config = GatewayConfig( platforms={ @@ -78,11 +85,13 @@ async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, ok = await runner.start() - assert ok is False + # Gateway stays alive in degraded mode; reconnect watcher takes over. + assert ok is True assert runner.should_exit_cleanly is False state = read_runtime_status() - assert state["gateway_state"] == "startup_failed" - assert "temporary DNS resolution failure" in state["exit_reason"] + assert state["gateway_state"] in {"degraded", "running"} + # Telegram was queued for retry, not given up on. + assert Platform.TELEGRAM in runner._failed_platforms assert state["platforms"]["telegram"]["state"] == "retrying" assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error" diff --git a/tests/gateway/test_whatsapp_connect.py b/tests/gateway/test_whatsapp_connect.py index 0a359fb7511..9d7807734bb 100644 --- a/tests/gateway/test_whatsapp_connect.py +++ b/tests/gateway/test_whatsapp_connect.py @@ -611,3 +611,93 @@ class TestHttpSessionLifecycle: mock_task.cancel.assert_not_called() assert adapter._poll_task is None + + +# --------------------------------------------------------------------------- +# Pre-flight: refuse to start the bridge when creds.json is missing +# --------------------------------------------------------------------------- + + +class TestNoCredsPreflight: + """Verify ``connect()`` fast-fails as non-retryable when WhatsApp is + enabled but the user never finished pairing (no ``creds.json``). + + Without this guard, every gateway boot: + • spawned the bridge subprocess (npm install if needed) + • waited 30s for status:connected (never happens without creds) + • queued WhatsApp for indefinite retries that would just repeat + With the guard, ``connect()`` returns False immediately with a + non-retryable fatal error so the reconnect watcher drops the platform + and the gateway gets a single clear log line telling the user to run + ``hermes whatsapp``. + """ + + @pytest.mark.asyncio + async def test_connect_returns_false_when_no_creds(self, tmp_path): + from gateway.platforms.whatsapp import WhatsAppAdapter + + adapter = WhatsAppAdapter.__new__(WhatsAppAdapter) + adapter.platform = Platform.WHATSAPP + adapter.config = MagicMock() + adapter._bridge_port = 19876 + # Point bridge_script at a real existing file so the earlier + # bridge-missing check doesn't trip — we want to exercise the + # creds.json check specifically. + bridge = tmp_path / "bridge.js" + bridge.write_text("// stub") + adapter._bridge_script = str(bridge) + adapter._session_path = tmp_path / "session" # no creds.json inside + adapter._session_path.mkdir() + adapter._bridge_log_fh = None + adapter._fatal_error_code = None + adapter._fatal_error_message = None + adapter._fatal_error_retryable = True + + with patch( + "gateway.platforms.whatsapp.check_whatsapp_requirements", + return_value=True, + ): + result = await adapter.connect() + + assert result is False + # Non-retryable so the reconnect watcher drops it cleanly + assert adapter._fatal_error_code == "whatsapp_not_paired" + assert adapter._fatal_error_retryable is False + + @pytest.mark.asyncio + async def test_connect_proceeds_when_creds_present(self, tmp_path): + """When creds.json exists, the preflight check is bypassed and + connect() proceeds to the bridge bootstrap path. We don't fully + simulate the bridge here — we just verify no fast-fail occurs. + """ + from gateway.platforms.whatsapp import WhatsAppAdapter + + adapter = WhatsAppAdapter.__new__(WhatsAppAdapter) + adapter.platform = Platform.WHATSAPP + adapter.config = MagicMock() + adapter._bridge_port = 19877 + bridge = tmp_path / "bridge.js" + bridge.write_text("// stub") + adapter._bridge_script = str(bridge) + session_dir = tmp_path / "session" + session_dir.mkdir() + (session_dir / "creds.json").write_text("{}") + adapter._session_path = session_dir + adapter._bridge_log_fh = None + adapter._fatal_error_code = None + adapter._fatal_error_message = None + adapter._fatal_error_retryable = True + # Stub _acquire_platform_lock to return False so connect() exits + # cleanly *after* the preflight, without spawning subprocesses. + adapter._acquire_platform_lock = MagicMock(return_value=False) + + with patch( + "gateway.platforms.whatsapp.check_whatsapp_requirements", + return_value=True, + ): + result = await adapter.connect() + + # Preflight passed — exits because we faked lock acquisition, + # but the fatal-error code is NOT the "not paired" one. + assert result is False + assert adapter._fatal_error_code != "whatsapp_not_paired" diff --git a/tests/hermes_cli/test_whatsapp_setup_ordering.py b/tests/hermes_cli/test_whatsapp_setup_ordering.py new file mode 100644 index 00000000000..47952bcc796 --- /dev/null +++ b/tests/hermes_cli/test_whatsapp_setup_ordering.py @@ -0,0 +1,140 @@ +"""Regression tests for ``cmd_whatsapp`` env-var write ordering. + +Before the fix, ``hermes whatsapp`` wrote ``WHATSAPP_ENABLED=true`` at +step 2 — before npm install (step 4) and before QR pairing (step 6). +If the user Ctrl+C'd at any later step, ``.env`` claimed WhatsApp was +ready when the bridge still had no ``creds.json``. Every subsequent +``hermes gateway`` then paid a 30s bridge-bootstrap timeout and queued +WhatsApp for indefinite retries — looking like "the gateway is broken." + +The fix: only set ``WHATSAPP_ENABLED=true`` once pairing actually +succeeds (creds.json exists). Aborted setup leaves no enabled state. +""" + +from __future__ import annotations + +import io +import os +from contextlib import redirect_stdout +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture +def isolated_home(tmp_path, monkeypatch): + home = tmp_path / "home" + hermes = home / ".hermes" + hermes.mkdir(parents=True) + monkeypatch.setattr(Path, "home", lambda: home) + monkeypatch.setenv("HERMES_HOME", str(hermes)) + # Ensure get_env_value cache doesn't carry stale state. + for key in list(os.environ): + if key.startswith("WHATSAPP_"): + monkeypatch.delenv(key, raising=False) + return hermes + + +def _env_value(hermes_home: Path, key: str) -> str | None: + env_file = hermes_home / ".env" + if not env_file.exists(): + return None + for line in env_file.read_text().splitlines(): + if "=" not in line: + continue + k, _, v = line.partition("=") + if k.strip() == key: + return v.strip().strip('"').strip("'") + return None + + +def test_aborted_setup_does_not_enable_whatsapp(isolated_home, monkeypatch): + """User picks mode 1, then Ctrl+C's at the allowed-users prompt. + + WHATSAPP_ENABLED must NOT be present in .env after abort. + """ + from hermes_cli.main import cmd_whatsapp + + # First input() = mode choice, second input() = allowed-users prompt + # We raise KeyboardInterrupt on the second call to simulate abort. + inputs = iter(["1"]) + + def fake_input(_prompt=""): + try: + return next(inputs) + except StopIteration: + raise KeyboardInterrupt + + monkeypatch.setattr("builtins.input", fake_input) + # _require_tty calls sys.stdin.isatty — make it pass. + monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None) + # No node, no bridge script — we shouldn't reach those steps anyway. + + buf = io.StringIO() + with redirect_stdout(buf): + try: + cmd_whatsapp(MagicMock()) + except KeyboardInterrupt: + pass + + assert _env_value(isolated_home, "WHATSAPP_ENABLED") is None, ( + "Setup aborted before pairing — WHATSAPP_ENABLED must not be set. " + f"Got .env: {(isolated_home / '.env').read_text() if (isolated_home / '.env').exists() else '(missing)'}" + ) + + +def test_existing_pairing_skip_branch_enables_whatsapp(isolated_home, monkeypatch): + """User runs ``hermes whatsapp`` with an existing paired session and + chooses "no, keep my session" at the re-pair prompt. The env var + should be (re-)written to true so the gateway picks WhatsApp back up, + even if the var was lost since the original pairing. + """ + from hermes_cli.main import cmd_whatsapp + + # Pre-create a paired session WITHOUT WHATSAPP_ENABLED in .env. + session = isolated_home / "whatsapp" / "session" + session.mkdir(parents=True) + (session / "creds.json").write_text("{}") + monkeypatch.setenv("WHATSAPP_MODE", "bot") + monkeypatch.setenv("WHATSAPP_ALLOWED_USERS", "15551234567") + + # mode already set → skip mode prompt; users already set → skip update + # prompt with "no"; pairing exists → "no, keep session" → return. + inputs = iter(["n", "n"]) + + def fake_input(_prompt=""): + try: + return next(inputs) + except StopIteration: + return "n" + + monkeypatch.setattr("builtins.input", fake_input) + monkeypatch.setattr("hermes_cli.main._require_tty", lambda *_a, **_kw: None) + # Skip the bridge npm install — we're testing setup-ordering, not bridge + # bootstrapping. Pretend node_modules exists (Path.exists -> True for that + # specific check is hard to scope, so instead pretend npm install would + # succeed silently if reached). + monkeypatch.setattr( + "subprocess.run", + lambda *_a, **_kw: MagicMock(returncode=0, stderr=""), + ) + monkeypatch.setattr("shutil.which", lambda _name: "/usr/bin/npm") + # Patch (bridge_dir / "node_modules").exists() by stubbing Path.exists + # to True for that one specific subpath. Easier: pre-create it as a + # symlink to /tmp. But we can't write to the repo. Instead, stub + # Path.exists wholesale to True for node_modules; the creds.json check + # in the same function still works because we wrote it ourselves. + _orig_exists = Path.exists + def _stub_exists(self): + if self.name == "node_modules": + return True + return _orig_exists(self) + monkeypatch.setattr(Path, "exists", _stub_exists) + + buf = io.StringIO() + with redirect_stdout(buf): + cmd_whatsapp(MagicMock()) + + # The skip-rebar branch should have set the env var on its way out. + assert _env_value(isolated_home, "WHATSAPP_ENABLED") == "true"