fix(desktop): keep composer usable during reconnect (#45488)

* feat(cli): add --safe-mode troubleshooting flag

  Inspired by Claude Code v2.1.169 (June 2026): run Hermes with all customizations disabled to isolate setup problems from product bugs. --safe-mode implies --ignore-user-config and --ignore-rules, and additionally skips plugin discovery (hermes_cli/plugins.py) and MCP server loading (tools/mcp_tool.py) via the internal HERMES_SAFE_MODE env bridge.

* fix(desktop): keep composer usable during reconnect
2026-06-20 10:11:58 +00:00 · 2026-06-13 02:36:09 -07:00 · 2026-06-13 02:36:09 -07:00 · 8cf9d8689d
commit 8cf9d8689d
parent b62e57b2f4
11 changed files with 246 additions and 26 deletions
--- a/apps/desktop/src/app/chat/composer/enter-submit-dom-race.test.tsx
+++ b/apps/desktop/src/app/chat/composer/enter-submit-dom-race.test.tsx
@ -24,6 +24,7 @@ afterEach(cleanup)
 // state stays stale while the DOM already holds the text.
 function Harness({
  busy = false,
+  disabled = false,
  queued = [],
  onSubmit,
  onQueue,
@ -31,6 +32,7 @@ function Harness({
  onDrain
 }: {
  busy?: boolean
+  disabled?: boolean
  queued?: readonly string[]
  onSubmit: (text: string) => void
  onQueue: (text: string) => void
@ -52,6 +54,10 @@ function Harness({
  }

  const submitDraft = () => {
+    if (disabled) {
+      return
+    }
+
    const editor = editorRef.current
    if (editor) {
      const domText = composerPlainText(editor)
@ -84,6 +90,10 @@ function Harness({
      const editorText = editorRef.current ? composerPlainText(editorRef.current) : draftRef.current
      const hasLivePayload = editorText.trim().length > 0 || attachments.length > 0

+      if (disabled) {
+        return
+      }
+
      if (!busy && !hasLivePayload && queued.length > 0) {
        onDrain()

@ -186,4 +196,23 @@ describe('composer Enter submit — live DOM vs stale composer state (#39630)',
    expect(onDrain).toHaveBeenCalledTimes(1)
    expect(onSubmit).not.toHaveBeenCalled()
  })
+
+  it('keeps reconnect drafts editable but blocks Enter submit until the gateway returns', async () => {
+    const onSubmit = vi.fn()
+    const onDrain = vi.fn()
+    const { getByTestId } = render(
+      <Harness disabled onCancel={vi.fn()} onDrain={onDrain} onQueue={vi.fn()} onSubmit={onSubmit} queued={['queued-1']} />
+    )
+    const editor = getByTestId('editor')
+
+    await act(async () => {
+      editor.textContent = 'draft while reconnecting'
+      fireEvent.input(editor)
+      fireEvent.keyDown(editor, { key: 'Enter' })
+    })
+
+    expect(editor.textContent).toBe('draft while reconnecting')
+    expect(onDrain).not.toHaveBeenCalled()
+    expect(onSubmit).not.toHaveBeenCalled()
+  })
 })
--- a/apps/desktop/src/app/chat/composer/index.tsx
+++ b/apps/desktop/src/app/chat/composer/index.tsx
@ -247,6 +247,8 @@ export function ChatBar({
  const gatewayState = useStore($gatewayState)
  const newSessionPlaceholders = t.composer.newSessionPlaceholders
  const followUpPlaceholders = t.composer.followUpPlaceholders
+  const reconnecting = gatewayState === 'closed' || gatewayState === 'error'
+  const inputDisabled = disabled && !reconnecting

  // Resting placeholder: a starter for brand-new sessions, a continuation for
  // existing ones. Picked once and only re-rolled when we genuinely move to a
@ -277,11 +279,13 @@ export function ChatBar({
    setRestingPlaceholder(pickPlaceholder(sessionId ? followUpPlaceholders : newSessionPlaceholders))
  }, [followUpPlaceholders, newSessionPlaceholders, sessionId])

-  // When the bar is disabled it's because the gateway isn't open. Distinguish a
-  // cold start ("Starting Hermes...") from a dropped connection we're trying to
-  // restore (e.g. after the Mac slept) so the stuck state reads as recoverable.
+  // When the transport is disabled it's because the gateway isn't open.
+  // Distinguish a cold start ("Starting Hermes...") from a dropped connection
+  // we're trying to restore. During reconnect, keep the textbox editable so a
+  // flaky network doesn't block drafting; only submit/backend actions stay
+  // disabled until the gateway is open again.
  const placeholder = disabled
-    ? gatewayState === 'closed' || gatewayState === 'error'
+    ? reconnecting
      ? t.composer.placeholderReconnecting
      : t.composer.placeholderStarting
    : restingPlaceholder
@ -323,13 +327,13 @@ export function ChatBar({
  )

  useEffect(() => {
-    if (!disabled) {
+    if (!inputDisabled) {
      focusInput()
    }
-  }, [disabled, focusInput, focusKey, focusRequestId])
+  }, [focusInput, focusKey, focusRequestId, inputDisabled])

  useEffect(() => {
-    if (disabled) {
+    if (inputDisabled) {
      return undefined
    }

@ -349,7 +353,7 @@ export function ChatBar({
      offFocus()
      offInsert()
    }
-  }, [appendExternalText, disabled])
+  }, [appendExternalText, inputDisabled])

  // Keep draftRef in sync with the assistant-ui composer state for callers
  // that read the latest text outside the React render cycle. We don't push
@ -934,6 +938,10 @@ export function ChatBar({
      const editorText = editorRef.current ? composerPlainText(editorRef.current) : draftRef.current
      const hasLivePayload = editorText.trim().length > 0 || attachments.length > 0

+      if (disabled) {
+        return
+      }
+
      if (!busy && !hasLivePayload && queuedPrompts.length > 0) {
        void drainNextQueued()

@ -1476,6 +1484,10 @@ export function ChatBar({
  }

  const submitDraft = () => {
+    if (disabled) {
+      return
+    }
+
    // Source the text from the DOM editor, not React state. The AUI composer
    // state (`draft`) and the derived `hasComposerPayload` lag the DOM by a
    // render, so on fast typing or IME composition the final keystroke(s) may
@ -1656,6 +1668,7 @@ export function ChatBar({
  const input = (
    <div className={cn('relative', stacked ? 'w-full' : 'min-w-(--composer-input-inline-min-width) flex-1')}>
      <div
+        aria-disabled={inputDisabled ? true : undefined}
        aria-label={t.composer.message}
        autoCapitalize="off"
        autoCorrect="off"
@ -1666,7 +1679,7 @@ export function ChatBar({
          stacked && 'pl-3',
          stacked ? 'w-full' : 'min-w-(--composer-input-inline-min-width) flex-1'
        )}
-        contentEditable={!disabled}
+        contentEditable={!inputDisabled}
        data-placeholder={placeholder}
        data-slot={RICH_INPUT_SLOT}
        onBlur={() => window.setTimeout(closeTrigger, 80)}
--- a/apps/desktop/src/components/gateway-connecting-overlay.test.tsx
+++ b/apps/desktop/src/components/gateway-connecting-overlay.test.tsx
@ -3,23 +3,23 @@ import { afterEach, beforeEach, describe, expect, it } from 'vitest'

 import { $desktopBoot } from '@/store/boot'
 import { $desktopOnboarding } from '@/store/onboarding'
-import { $gatewayState, setGatewayState } from '@/store/session'
+import { setGatewayState } from '@/store/session'

 import { BootFailureOverlay } from './boot-failure-overlay'
 import { GatewayConnectingOverlay } from './gateway-connecting-overlay'

 // Repro for the "remote gateway → stuck on CONNECTING, no way to settings"
-// report. The connecting overlay (z-1200, full-screen, pointer-events on) is
-// shown whenever `gatewayState !== 'open' && !boot.error`. The ONLY escape
+// report. The connecting overlay (z-1200, full-screen, pointer-events on) used
+// to be shown whenever `gatewayState !== 'open' && !boot.error`. The ONLY escape
 // hatch — BootFailureOverlay, which has "Use local gateway" / "Sign in" /
 // "Retry" — only renders when `boot.error` is set.
 //
 // useGatewayBoot only calls failDesktopBoot() (which sets boot.error) when the
 // INITIAL boot() throws. After the first successful connect (bootCompleted),
 // any later socket drop goes through scheduleReconnect(), which loops FOREVER
-// against the dead remote and never sets boot.error. So gatewayState sits at
-// 'closed'/'error' with boot.error null → CONNECTING forever, recovery overlay
-// never appears, settings unreachable.
+// against the dead remote. So gatewayState sits at 'closed'/'error' with
+// boot.error null. The fix keeps the initial-boot overlay out of post-boot
+// reconnects, leaving chat/settings usable while the reconnect loop runs.

 function resetStores() {
  setGatewayState('idle')
@ -75,7 +75,7 @@ describe('connecting overlay vs recovery surface', () => {
    expect(isConnectingShown()).toBe(false)
  })

-  it('REPRO: remote socket drops AFTER a successful boot → stuck on CONNECTING, no recovery, no settings', () => {
+  it('post-boot socket drops do not re-cover the app with the initial CONNECTING overlay', () => {
    // 1. Initial boot succeeded: gateway opened, boot completed (no error).
    setGatewayState('open')
    const { rerender } = render(
@ -97,14 +97,14 @@ describe('connecting overlay vs recovery surface', () => {
      </>
    )

-    // The connecting overlay reappears and latches...
-    expect(isConnectingShown()).toBe(true)
-    // ...with NO recovery surface, because boot.error was never set.
+    // The initial-boot connecting overlay stays out of the way, so settings and
+    // the composer remain reachable during the reconnect loop.
+    expect(isConnectingShown()).toBe(false)
    expect(isRecoveryShown()).toBe(false)

-    // 3. Reconnect loops forever against the dead remote: gatewayState bounces
-    //    closed → error → closed, boot.error never gets set. The user is
-    //    pinned on CONNECTING with no path to Settings indefinitely.
+    // 3. Reconnect loops against the dead remote: gatewayState bounces closed
+    //    → error → closed. Until the escalation path sets boot.error, the app
+    //    remains usable instead of modal-blocked.
    setGatewayState('error')
    rerender(
      <>
@ -113,7 +113,7 @@ describe('connecting overlay vs recovery surface', () => {
      </>
    )
    expect($desktopBoot.get().error).toBeNull()
-    expect(isConnectingShown()).toBe(true)
+    expect(isConnectingShown()).toBe(false)
    expect(isRecoveryShown()).toBe(false)
  })

--- a/apps/desktop/src/components/gateway-connecting-overlay.tsx
+++ b/apps/desktop/src/components/gateway-connecting-overlay.tsx
@ -52,7 +52,13 @@ export function GatewayConnectingOverlay() {
  const [tail, setTail] = useState(TAIL)
  const [phase, setPhase] = useState<Phase>('live')

-  const connecting = gatewayState !== 'open' && !boot.error
+  // The full-screen connecting overlay is for initial boot only. After a
+  // healthy boot, flaky networks / sleep-wake can drop the socket and flip the
+  // gateway state back to closed/error while the app reconnects. Do not cover
+  // the chat then — users should still be able to type drafts, open settings,
+  // and recover instead of staring at a modal CONNECTING screen.
+  const initialBootActive = boot.visible || boot.running || boot.progress < 100
+  const connecting = gatewayState !== 'open' && !boot.error && initialBootActive
  // Latches once we've actually shown the overlay, so the brief frame where
  // gatewayState flips to "open" (connecting -> false) before the exit phase
  // kicks in doesn't unmount us and cause a flash.
--- a/hermes_cli/_parser.py
+++ b/hermes_cli/_parser.py
@ -213,6 +213,13 @@ def build_top_level_parser():
        default=False,
        help="Skip auto-injection of AGENTS.md, SOUL.md, .cursorrules, memory, and preloaded skills",
    )
+    _inherited_flag(
+        parser,
+        "--safe-mode",
+        action="store_true",
+        default=False,
+        help="Troubleshooting mode: disable ALL customizations — user config, AGENTS.md/memory injection, plugins, and MCP servers (implies --ignore-user-config and --ignore-rules)",
+    )
    _inherited_flag(
        parser,
        "--tui",
@ -366,6 +373,13 @@ def build_top_level_parser():
        default=argparse.SUPPRESS,
        help="Skip auto-injection of AGENTS.md, SOUL.md, .cursorrules, memory, and preloaded skills. Combine with --ignore-user-config for a fully isolated run.",
    )
+    _inherited_flag(
+        chat_parser,
+        "--safe-mode",
+        action="store_true",
+        default=argparse.SUPPRESS,
+        help="Troubleshooting mode: disable ALL customizations — user config, AGENTS.md/memory injection, plugins, and MCP servers (implies --ignore-user-config and --ignore-rules). Use to isolate whether a problem comes from your setup or from Hermes itself.",
+    )
    chat_parser.add_argument(
        "--source",
        default=None,
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -2199,6 +2199,18 @@ def cmd_chat(args):
    if getattr(args, "yolo", False):
        os.environ["HERMES_YOLO_MODE"] = "1"

+    # --safe-mode: troubleshooting mode that disables ALL customizations.
+    # Inspired by Claude Code v2.1.169's --safe-mode (June 2026): run with a
+    # pristine environment to isolate whether a problem comes from the user's
+    # setup (config, rules files, plugins, MCP servers) or from Hermes itself.
+    # Implemented as a superset of --ignore-user-config + --ignore-rules plus
+    # plugin/MCP discovery suppression (HERMES_SAFE_MODE is checked by
+    # hermes_cli/plugins.py and tools/mcp_tool.py).
+    if getattr(args, "safe_mode", False):
+        os.environ["HERMES_SAFE_MODE"] = "1"
+        os.environ["HERMES_IGNORE_USER_CONFIG"] = "1"
+        os.environ["HERMES_IGNORE_RULES"] = "1"
+
    # --ignore-user-config: make load_cli_config() / load_config() skip the
    # user's ~/.hermes/config.yaml and return built-in defaults. Set BEFORE
    # importing cli (which runs `CLI_CONFIG = load_cli_config()` at module
@ -2256,8 +2268,8 @@ def cmd_chat(args):
        "checkpoints": getattr(args, "checkpoints", False),
        "pass_session_id": getattr(args, "pass_session_id", False),
        "max_turns": getattr(args, "max_turns", None),
-        "ignore_rules": getattr(args, "ignore_rules", False),
-        "ignore_user_config": getattr(args, "ignore_user_config", False),
+        "ignore_rules": getattr(args, "ignore_rules", False) or getattr(args, "safe_mode", False),
+        "ignore_user_config": getattr(args, "ignore_user_config", False) or getattr(args, "safe_mode", False),
        "compact": getattr(args, "compact", False),
    }
    # Filter out None values
--- a/hermes_cli/plugins.py
+++ b/hermes_cli/plugins.py
@ -1124,6 +1124,14 @@ class PluginManager:
        """
        if self._discovered and not force:
            return
+        # Safe mode (--safe-mode / HERMES_SAFE_MODE=1): troubleshooting run
+        # with all customizations disabled. Skip plugin discovery entirely so
+        # no third-party code (hooks, tools, platforms) loads. Mark as
+        # discovered so callers see a clean empty registry, not a retry loop.
+        if env_var_enabled("HERMES_SAFE_MODE"):
+            logger.info("HERMES_SAFE_MODE=1 — plugin discovery skipped")
+            self._discovered = True
+            return
        if force:
            self._plugins.clear()
            self._hooks.clear()
--- a/tests/hermes_cli/test_safe_mode.py
+++ b/tests/hermes_cli/test_safe_mode.py
@ -0,0 +1,130 @@
+"""Tests for `hermes chat --safe-mode` — pristine troubleshooting runs.
+
+Inspired by Claude Code v2.1.169's ``--safe-mode`` flag (June 2026), which
+disables all customizations (CLAUDE.md, plugins, skills, hooks, MCP) for
+troubleshooting. The Hermes equivalent:
+
+* implies ``--ignore-user-config`` (built-in config defaults)
+* implies ``--ignore-rules`` (no AGENTS.md/memory/preloaded-skill injection)
+* skips plugin discovery entirely (``hermes_cli.plugins``)
+* loads zero MCP servers (``tools.mcp_tool._load_mcp_config``)
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+
+_VARS = ("HERMES_SAFE_MODE", "HERMES_IGNORE_USER_CONFIG", "HERMES_IGNORE_RULES")
+
+
+@pytest.fixture(autouse=True)
+def _clean_env(monkeypatch):
+    for var in _VARS:
+        monkeypatch.delenv(var, raising=False)
+    yield
+    for var in _VARS:
+        os.environ.pop(var, None)
+
+
+class TestSafeModeEnvWiring:
+    """cmd_chat must translate --safe-mode into the three env gates."""
+
+    def test_safe_mode_sets_all_gates(self):
+        # Mirrors the cmd_chat logic in hermes_cli/main.py.
+        class Args:
+            safe_mode = True
+
+        args = Args()
+        if getattr(args, "safe_mode", False):
+            os.environ["HERMES_SAFE_MODE"] = "1"
+            os.environ["HERMES_IGNORE_USER_CONFIG"] = "1"
+            os.environ["HERMES_IGNORE_RULES"] = "1"
+
+        assert os.environ.get("HERMES_SAFE_MODE") == "1"
+        assert os.environ.get("HERMES_IGNORE_USER_CONFIG") == "1"
+        assert os.environ.get("HERMES_IGNORE_RULES") == "1"
+
+
+class TestSafeModePluginDiscovery:
+    """Plugin discovery must be a no-op under HERMES_SAFE_MODE=1."""
+
+    def test_discovery_skipped(self, monkeypatch):
+        monkeypatch.setenv("HERMES_SAFE_MODE", "1")
+        from hermes_cli.plugins import PluginManager
+
+        mgr = PluginManager()
+        called = []
+        monkeypatch.setattr(
+            mgr, "_discover_and_load_inner", lambda: called.append(True)
+        )
+        mgr.discover_and_load()
+        assert called == []          # inner sweep never ran
+        assert mgr._discovered is True  # registry settled as clean-empty
+        assert mgr._plugins == {}
+
+    def test_discovery_runs_without_safe_mode(self, monkeypatch):
+        monkeypatch.delenv("HERMES_SAFE_MODE", raising=False)
+        from hermes_cli.plugins import PluginManager
+
+        mgr = PluginManager()
+        called = []
+        monkeypatch.setattr(
+            mgr, "_discover_and_load_inner", lambda: called.append(True)
+        )
+        mgr.discover_and_load()
+        assert called == [True]
+
+
+class TestSafeModeMCP:
+    """_load_mcp_config must return no servers under HERMES_SAFE_MODE=1."""
+
+    def test_mcp_servers_empty(self, monkeypatch):
+        monkeypatch.setenv("HERMES_SAFE_MODE", "1")
+        from tools.mcp_tool import _load_mcp_config
+
+        with pytest.MonkeyPatch.context() as mp:
+            mp.setattr(
+                "hermes_cli.config.load_config",
+                lambda: {"mcp_servers": {"github": {"url": "https://example.com/mcp"}}},
+            )
+            assert _load_mcp_config() == {}
+
+    def test_mcp_servers_load_without_safe_mode(self, monkeypatch):
+        monkeypatch.delenv("HERMES_SAFE_MODE", raising=False)
+        from tools.mcp_tool import _load_mcp_config
+
+        with pytest.MonkeyPatch.context() as mp:
+            mp.setattr(
+                "hermes_cli.config.load_config",
+                lambda: {"mcp_servers": {"github": {"url": "https://example.com/mcp"}}},
+            )
+            servers = _load_mcp_config()
+            assert "github" in servers
+
+
+class TestSafeModeParser:
+    """--safe-mode must parse on both the root parser and `hermes chat`."""
+
+    def test_chat_subcommand_accepts_flag(self):
+        from hermes_cli._parser import build_top_level_parser
+
+        parser, _subparsers, _chat = build_top_level_parser()
+        args = parser.parse_args(["chat", "--safe-mode"])
+        assert getattr(args, "safe_mode", False) is True
+
+    def test_root_parser_accepts_flag(self):
+        from hermes_cli._parser import build_top_level_parser
+
+        parser, _subparsers, _chat = build_top_level_parser()
+        args = parser.parse_args(["--safe-mode"])
+        assert getattr(args, "safe_mode", False) is True
+
+    def test_default_is_off(self):
+        from hermes_cli._parser import build_top_level_parser
+
+        parser, _subparsers, _chat = build_top_level_parser()
+        args = parser.parse_args(["chat"])
+        assert getattr(args, "safe_mode", False) is False
--- a/tools/mcp_tool.py
+++ b/tools/mcp_tool.py
@ -2686,6 +2686,11 @@ def _load_mcp_config() -> Dict[str, dict]:
    """
    try:
        from hermes_cli.config import load_config
+        # Safe mode (--safe-mode / HERMES_SAFE_MODE=1): troubleshooting run
+        # with all customizations disabled — no MCP servers connect.
+        from utils import env_var_enabled as _env_enabled
+        if _env_enabled("HERMES_SAFE_MODE"):
+            return {}
        config = load_config()
        servers = config.get("mcp_servers")
        if not servers or not isinstance(servers, dict):
--- a/website/docs/reference/cli-commands.md
+++ b/website/docs/reference/cli-commands.md
@ -112,6 +112,7 @@ Common options:
 | `--pass-session-id` | Pass the session ID into the system prompt. |
 | `--ignore-user-config` | Ignore `~/.hermes/config.yaml` and use built-in defaults. Credentials in `.env` are still loaded. Useful for isolated CI runs, reproducible bug reports, and third-party integrations. |
 | `--ignore-rules` | Skip auto-injection of `AGENTS.md`, `SOUL.md`, `.cursorrules`, persistent memory, and preloaded skills. Combine with `--ignore-user-config` for a fully isolated run. |
+| `--safe-mode` | Troubleshooting mode: disable ALL customizations — user config, rules/memory injection, plugins, and MCP servers (implies `--ignore-user-config` and `--ignore-rules`). Use to isolate whether a problem comes from your setup or from Hermes itself. |
 | `--source <tag>` | Session source tag for filtering (default: `cli`). Use `tool` for third-party integrations that should not appear in user session lists. |
 | `--max-turns <N>` | Maximum tool-calling iterations per conversation turn (default: 90, or `agent.max_turns` in config). |

@ -125,6 +126,7 @@ hermes chat --toolsets web,terminal,skills
 hermes chat --quiet -q "Return only JSON"
 hermes chat --worktree -q "Review this repo and open a PR"
 hermes chat --ignore-user-config --ignore-rules -q "Repro without my personal setup"
+hermes chat --safe-mode -q "Is this bug mine or Hermes'?"
 ```

 ### `hermes -z <prompt>` — scripted one-shot
--- a/website/docs/reference/environment-variables.md
+++ b/website/docs/reference/environment-variables.md
@ -595,6 +595,7 @@ Advanced per-platform knobs for throttling the outbound message batcher. Most us
 | `HERMES_ACCEPT_HOOKS` | Auto-approve any unseen shell hooks declared in `config.yaml` without a TTY prompt. Equivalent to `--accept-hooks` or `hooks_auto_accept: true`. |
 | `HERMES_IGNORE_USER_CONFIG` | Skip `~/.hermes/config.yaml` and use built-in defaults (credentials in `.env` still load). Equivalent to `--ignore-user-config`. |
 | `HERMES_IGNORE_RULES` | Skip auto-injection of `AGENTS.md`, `SOUL.md`, `.cursorrules`, memory, and preloaded skills. Equivalent to `--ignore-rules`. |
+| `HERMES_SAFE_MODE` | Troubleshooting mode: skips plugin discovery and MCP server loading. Set automatically by `--safe-mode`, which also sets `HERMES_IGNORE_USER_CONFIG` and `HERMES_IGNORE_RULES` so that all customizations are disabled. |
 | `HERMES_MD_NAMES` | Comma-separated list of rules-file names to auto-inject (default: `AGENTS.md,CLAUDE.md,.cursorrules,SOUL.md`). |
 | `HERMES_TOOL_PROGRESS` | Deprecated compatibility variable for tool progress display. Prefer `display.tool_progress` in `config.yaml`. |
 | `HERMES_TOOL_PROGRESS_MODE` | Deprecated compatibility variable for tool progress mode. Prefer `display.tool_progress` in `config.yaml`. |