Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor

This commit is contained in:
Brooklyn Nicholson 2026-04-11 13:14:36 -05:00
commit bf6af95ff5
16 changed files with 644 additions and 75 deletions

View file

@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
# Extract capability flags (default to False if missing)
supports_tools = bool(entry.get("tool_call", False))
supports_vision = bool(entry.get("attachment", False))
# Vision: check both the `attachment` flag and `modalities.input` for "image".
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
input_mods = entry.get("modalities", {})
if isinstance(input_mods, dict):
input_mods = input_mods.get("input", [])
else:
input_mods = []
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
supports_reasoning = bool(entry.get("reasoning", False))
# Extract limits

View file

@ -190,7 +190,7 @@ class StreamingConfig:
"""Configuration for real-time token streaming to messaging platforms."""
enabled: bool = False
transport: str = "edit" # "edit" (progressive editMessageText) or "off"
edit_interval: float = 0.3 # Seconds between message edits
edit_interval: float = 1.0 # Seconds between message edits (Telegram rate-limits at ~1/s)
buffer_threshold: int = 40 # Chars before forcing an edit
cursor: str = "" # Cursor shown during streaming
@ -210,7 +210,7 @@ class StreamingConfig:
return cls(
enabled=data.get("enabled", False),
transport=data.get("transport", "edit"),
edit_interval=float(data.get("edit_interval", 0.3)),
edit_interval=float(data.get("edit_interval", 1.0)),
buffer_threshold=int(data.get("buffer_threshold", 40)),
cursor=data.get("cursor", ""),
)

View file

@ -352,7 +352,16 @@ class MatrixAdapter(BasePlatformAdapter):
from mautrix.crypto import OlmMachine
from mautrix.crypto.store import MemoryCryptoStore
crypto_store = MemoryCryptoStore()
# account_id and pickle_key are required by mautrix ≥0.21.
# Use the Matrix user ID as account_id for stable identity.
# pickle_key secures in-memory serialisation; derive from
# the same user_id:device_id pair used for the on-disk HMAC.
_acct_id = self._user_id or "hermes"
_pickle_key = f"{_acct_id}:{self._device_id}"
crypto_store = MemoryCryptoStore(
account_id=_acct_id,
pickle_key=_pickle_key,
)
# Restore persisted crypto state from a previous run.
# Uses HMAC to verify integrity before unpickling.
@ -418,6 +427,11 @@ class MatrixAdapter(BasePlatformAdapter):
if isinstance(sync_data, dict):
rooms_join = sync_data.get("rooms", {}).get("join", {})
self._joined_rooms = set(rooms_join.keys())
# Store the next_batch token so incremental syncs start
# from where the initial sync left off.
nb = sync_data.get("next_batch")
if nb:
await client.sync_store.put_next_batch(nb)
logger.info(
"Matrix: initial sync complete, joined %d rooms",
len(self._joined_rooms),
@ -809,19 +823,40 @@ class MatrixAdapter(BasePlatformAdapter):
async def _sync_loop(self) -> None:
"""Continuously sync with the homeserver."""
client = self._client
# Resume from the token stored during the initial sync.
next_batch = await client.sync_store.get_next_batch()
while not self._closing:
try:
sync_data = await self._client.sync(timeout=30000)
sync_data = await client.sync(
since=next_batch, timeout=30000,
)
if isinstance(sync_data, dict):
# Update joined rooms from sync response.
rooms_join = sync_data.get("rooms", {}).get("join", {})
if rooms_join:
self._joined_rooms.update(rooms_join.keys())
# Share keys periodically if E2EE is enabled.
if self._encryption and getattr(self._client, "crypto", None):
# Advance the sync token so the next request is
# incremental instead of a full initial sync.
nb = sync_data.get("next_batch")
if nb:
next_batch = nb
await client.sync_store.put_next_batch(nb)
# Dispatch events to registered handlers so that
# _on_room_message / _on_reaction / _on_invite fire.
try:
await self._client.crypto.share_keys()
tasks = client.handle_sync(sync_data)
if tasks:
await asyncio.gather(*tasks)
except Exception as exc:
logger.warning("Matrix: sync event dispatch error: %s", exc)
# Share keys periodically if E2EE is enabled.
if self._encryption and getattr(client, "crypto", None):
try:
await client.crypto.share_keys()
except Exception as exc:
logger.warning("Matrix: E2EE key share failed: %s", exc)

View file

@ -36,7 +36,7 @@ _NEW_SEGMENT = object()
@dataclass
class StreamConsumerConfig:
"""Runtime config for a single stream consumer instance."""
edit_interval: float = 0.3
edit_interval: float = 1.0
buffer_threshold: int = 40
cursor: str = ""
@ -56,6 +56,10 @@ class GatewayStreamConsumer:
await task # wait for final edit
"""
# After this many consecutive flood-control failures, permanently disable
# progressive edits for the remainder of the stream.
_MAX_FLOOD_STRIKES = 3
def __init__(
self,
adapter: Any,
@ -76,6 +80,8 @@ class GatewayStreamConsumer:
self._last_sent_text = "" # Track last-sent text to skip redundant edits
self._fallback_final_send = False
self._fallback_prefix = ""
self._flood_strikes = 0 # Consecutive flood-control edit failures
self._current_edit_interval = self.cfg.edit_interval # Adaptive backoff
@property
def already_sent(self) -> bool:
@ -129,7 +135,7 @@ class GatewayStreamConsumer:
should_edit = (
got_done
or got_segment_break
or (elapsed >= self.cfg.edit_interval
or (elapsed >= self._current_edit_interval
and self._accumulated)
or len(self._accumulated) >= self.cfg.buffer_threshold
)
@ -173,12 +179,13 @@ class GatewayStreamConsumer:
if split_at < _safe_limit // 2:
split_at = _safe_limit
chunk = self._accumulated[:split_at]
await self._send_or_edit(chunk)
if self._fallback_final_send:
# Edit failed while attempting to split an oversized
# message. Keep the full accumulated text intact so
# the fallback final-send path can deliver the
# remaining continuation without dropping content.
ok = await self._send_or_edit(chunk)
if self._fallback_final_send or not ok:
# Edit failed (or backed off due to flood control)
# while attempting to split an oversized message.
# Keep the full accumulated text intact so the
# fallback final-send path can deliver the remaining
# continuation without dropping content.
break
self._accumulated = self._accumulated[split_at:].lstrip("\n")
self._message_id = None
@ -322,7 +329,10 @@ class GatewayStreamConsumer:
return chunks
async def _send_fallback_final(self, text: str) -> None:
"""Send the final continuation after streaming edits stop working."""
"""Send the final continuation after streaming edits stop working.
Retries each chunk once on flood-control failures with a short delay.
"""
final_text = self._clean_for_display(text)
continuation = self._continuation_text(final_text)
self._fallback_final_send = False
@ -339,12 +349,25 @@ class GatewayStreamConsumer:
last_successful_chunk = ""
sent_any_chunk = False
for chunk in chunks:
result = await self.adapter.send(
chat_id=self.chat_id,
content=chunk,
metadata=self.metadata,
)
if not result.success:
# Try sending with one retry on flood-control errors.
result = None
for attempt in range(2):
result = await self.adapter.send(
chat_id=self.chat_id,
content=chunk,
metadata=self.metadata,
)
if result.success:
break
if attempt == 0 and self._is_flood_error(result):
logger.debug(
"Flood control on fallback send, retrying in 3s"
)
await asyncio.sleep(3.0)
else:
break # non-flood error or second attempt failed
if not result or not result.success:
if sent_any_chunk:
# Some continuation text already reached the user. Suppress
# the base gateway final-send path so we don't resend the
@ -370,20 +393,52 @@ class GatewayStreamConsumer:
self._last_sent_text = chunks[-1]
self._fallback_prefix = ""
async def _send_or_edit(self, text: str) -> None:
"""Send or edit the streaming message."""
def _is_flood_error(self, result) -> bool:
"""Check if a SendResult failure is due to flood control / rate limiting."""
err = getattr(result, "error", "") or ""
err_lower = err.lower()
return "flood" in err_lower or "retry after" in err_lower or "rate" in err_lower
async def _try_strip_cursor(self) -> None:
"""Best-effort edit to remove the cursor from the last visible message.
Called when entering fallback mode so the user doesn't see a stuck
cursor () in the partial message.
"""
if not self._message_id or self._message_id == "__no_edit__":
return
prefix = self._visible_prefix()
if not prefix or not prefix.strip():
return
try:
await self.adapter.edit_message(
chat_id=self.chat_id,
message_id=self._message_id,
content=prefix,
)
self._last_sent_text = prefix
except Exception:
pass # best-effort — don't let this block the fallback path
async def _send_or_edit(self, text: str) -> bool:
"""Send or edit the streaming message.
Returns True if the text was successfully delivered (sent or edited),
False otherwise. Callers like the overflow split loop use this to
decide whether to advance past the delivered chunk.
"""
# Strip MEDIA: directives so they don't appear as visible text.
# Media files are delivered as native attachments after the stream
# finishes (via _deliver_media_from_response in gateway/run.py).
text = self._clean_for_display(text)
if not text.strip():
return
return True # nothing to send is "success"
try:
if self._message_id is not None:
if self._edit_supported:
# Skip if text is identical to what we last sent
if text == self._last_sent_text:
return
return True
# Edit existing message
result = await self.adapter.edit_message(
chat_id=self.chat_id,
@ -393,19 +448,52 @@ class GatewayStreamConsumer:
if result.success:
self._already_sent = True
self._last_sent_text = text
# Successful edit — reset flood strike counter
self._flood_strikes = 0
return True
else:
# If an edit fails mid-stream (especially Telegram flood control),
# stop progressive edits and send only the missing tail once the
# Edit failed. If this looks like flood control / rate
# limiting, use adaptive backoff: double the edit interval
# and retry on the next cycle. Only permanently disable
# edits after _MAX_FLOOD_STRIKES consecutive failures.
if self._is_flood_error(result):
self._flood_strikes += 1
self._current_edit_interval = min(
self._current_edit_interval * 2, 10.0,
)
logger.debug(
"Flood control on edit (strike %d/%d), "
"backoff interval → %.1fs",
self._flood_strikes,
self._MAX_FLOOD_STRIKES,
self._current_edit_interval,
)
if self._flood_strikes < self._MAX_FLOOD_STRIKES:
# Don't disable edits yet — just slow down.
# Update _last_edit_time so the next edit
# respects the new interval.
self._last_edit_time = time.monotonic()
return False
# Non-flood error OR flood strikes exhausted: enter
# fallback mode — send only the missing tail once the
# final response is available.
logger.debug("Edit failed, disabling streaming for this adapter")
logger.debug(
"Edit failed (strikes=%d), entering fallback mode",
self._flood_strikes,
)
self._fallback_prefix = self._visible_prefix()
self._fallback_final_send = True
self._edit_supported = False
self._already_sent = True
# Best-effort: strip the cursor from the last visible
# message so the user doesn't see a stuck ▉.
await self._try_strip_cursor()
return False
else:
# Editing not supported — skip intermediate updates.
# The final response will be sent by the fallback path.
pass
return False
else:
# First message — send new
result = await self.adapter.send(
@ -417,6 +505,7 @@ class GatewayStreamConsumer:
self._message_id = result.message_id
self._already_sent = True
self._last_sent_text = text
return True
elif result.success:
# Platform accepted the message but returned no message_id
# (e.g. Signal). Can't edit without an ID — switch to
@ -428,8 +517,11 @@ class GatewayStreamConsumer:
self._fallback_final_send = True
# Sentinel prevents re-entering this branch on every delta
self._message_id = "__no_edit__"
return True # platform accepted, just can't edit
else:
# Initial send failed — disable streaming for this session
self._edit_supported = False
return False
except Exception as e:
logger.error("Stream send/edit error: %s", e)
return False

View file

@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
"base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider)
"api_key": "", # API key for base_url (falls back to OPENAI_API_KEY)
"timeout": 30, # seconds — LLM API call timeout; increase for slow local vision models
"timeout": 120, # seconds — LLM API call timeout; vision payloads need generous timeout
"download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections
},
"web_extract": {

View file

@ -56,6 +56,18 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
_openrouter_catalog_cache: list[tuple[str, str]] | None = None
def _codex_curated_models() -> list[str]:
    """Build the curated openai-codex model list from codex_models.py.

    ``DEFAULT_CODEX_MODELS`` plus the forward-compat synthesis is the single
    source of truth, so the gateway /model picker stays in sync with the CLI
    `hermes model` flow without a duplicated static list.
    """
    # Imported lazily so this module does not hard-depend on hermes_cli
    # at import time.
    from hermes_cli.codex_models import (
        DEFAULT_CODEX_MODELS,
        _add_forward_compat_models,
    )

    curated = list(DEFAULT_CODEX_MODELS)
    return _add_forward_compat_models(curated)
_PROVIDER_MODELS: dict[str, list[str]] = {
"nous": [
"anthropic/claude-opus-4.6",
@ -86,14 +98,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
"openai/gpt-5.4-pro",
"openai/gpt-5.4-nano",
],
"openai-codex": [
"gpt-5.4",
"gpt-5.4-mini",
"gpt-5.3-codex",
"gpt-5.2-codex",
"gpt-5.1-codex-mini",
"gpt-5.1-codex-max",
],
"openai-codex": _codex_curated_models(),
"copilot-acp": [
"copilot-acp",
],

View file

@ -700,10 +700,14 @@ class AIAgent:
except Exception:
pass
# Direct OpenAI sessions use the Responses API path. GPT-5.x tool
# calls with reasoning are rejected on /v1/chat/completions, and
# Hermes is a tool-using client by default.
if self.api_mode == "chat_completions" and self._is_direct_openai_url():
# GPT-5.x models require the Responses API path — they are rejected
# on /v1/chat/completions by both OpenAI and OpenRouter. Also
# auto-upgrade for direct OpenAI URLs (api.openai.com) since all
# newer tool-calling models prefer Responses there.
if self.api_mode == "chat_completions" and (
self._is_direct_openai_url()
or self._model_requires_responses_api(self.model)
):
self.api_mode = "codex_responses"
# Pre-warm OpenRouter model metadata cache in a background thread.
@ -1702,6 +1706,21 @@ class AIAgent:
"""Return True when the base URL targets OpenRouter."""
return "openrouter" in self._base_url_lower
@staticmethod
def _model_requires_responses_api(model: str) -> bool:
"""Return True for models that require the Responses API path.
GPT-5.x models are rejected on /v1/chat/completions by both
OpenAI and OpenRouter (error: ``unsupported_api_for_model``).
Detect these so the correct api_mode is set regardless of
which provider is serving the model.
"""
m = model.lower()
# Strip vendor prefix (e.g. "openai/gpt-5.4" → "gpt-5.4")
if "/" in m:
m = m.rsplit("/", 1)[-1]
return m.startswith("gpt-5")
def _max_tokens_param(self, value: int) -> dict:
"""Return the correct max tokens kwarg for the current provider.
@ -5252,7 +5271,7 @@ class AIAgent:
except Exception:
pass
# Determine api_mode from provider / base URL
# Determine api_mode from provider / base URL / model
fb_api_mode = "chat_completions"
fb_base_url = str(fb_client.base_url)
if fb_provider == "openai-codex":
@ -5261,6 +5280,10 @@ class AIAgent:
fb_api_mode = "anthropic_messages"
elif self._is_direct_openai_url(fb_base_url):
fb_api_mode = "codex_responses"
elif self._model_requires_responses_api(fb_model):
# GPT-5.x models need Responses API on every provider
# (OpenRouter, Copilot, direct OpenAI, etc.)
fb_api_mode = "codex_responses"
old_model = self.model
self.model = fb_model
@ -5349,8 +5372,8 @@ class AIAgent:
to the fallback provider for every subsequent turn. Calling this at
the top of ``run_conversation()`` makes fallback turn-scoped.
The gateway creates a fresh agent per message so this is a no-op
there (``_fallback_activated`` is always False at turn start).
The gateway caches agents across messages (``_agent_cache`` in
``gateway/run.py``), so this restoration IS needed there too.
"""
if not self._fallback_activated:
return False

View file

@ -7,6 +7,7 @@ from agent.models_dev import (
PROVIDER_TO_MODELS_DEV,
_extract_context,
fetch_models_dev,
get_model_capabilities,
lookup_models_dev_context,
)
@ -195,3 +196,88 @@ class TestFetchModelsDev:
result = fetch_models_dev()
mock_get.assert_not_called()
assert result == SAMPLE_REGISTRY
# ---------------------------------------------------------------------------
# get_model_capabilities — vision via modalities.input
# ---------------------------------------------------------------------------
# Shared registry fixture for the get_model_capabilities tests below.
# Mirrors the models.dev JSON shape: provider -> {"models": {name: entry}}.
CAPS_REGISTRY = {
    "google": {
        "id": "google",
        "models": {
            # Vision via modalities.input only (attachment=False) — the
            # exact case the modalities fallback was added to cover.
            "gemma-4-31b-it": {
                "id": "gemma-4-31b-it",
                "attachment": False,
                "tool_call": True,
                "modalities": {"input": ["text", "image"]},
                "limit": {"context": 128000, "output": 8192},
            },
            # Text-only model: no attachment flag, no modalities entry.
            "gemma-3-1b": {
                "id": "gemma-3-1b",
                "tool_call": True,
                "limit": {"context": 32000, "output": 8192},
            },
        },
    },
    "anthropic": {
        "id": "anthropic",
        "models": {
            # Vision via the classic attachment=True flag.
            "claude-sonnet-4": {
                "id": "claude-sonnet-4",
                "attachment": True,
                "tool_call": True,
                "limit": {"context": 200000, "output": 64000},
            },
        },
    },
}
class TestGetModelCapabilities:
    """Tests for get_model_capabilities vision detection."""

    def test_vision_from_attachment_flag(self):
        """Models with attachment=True should report supports_vision=True."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("anthropic", "claude-sonnet-4")
            assert caps is not None
            assert caps.supports_vision is True

    def test_vision_from_modalities_input_image(self):
        """Models with 'image' in modalities.input but attachment=False should
        still report supports_vision=True (the core fix in this PR)."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("google", "gemma-4-31b-it")
            assert caps is not None
            assert caps.supports_vision is True

    def test_no_vision_without_attachment_or_modalities(self):
        """Models with neither attachment nor image modality should be non-vision."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("google", "gemma-3-1b")
            assert caps is not None
            assert caps.supports_vision is False

    def test_modalities_non_dict_handled(self):
        """Non-dict modalities field should not crash."""
        registry = {
            "google": {"id": "google", "models": {
                "weird-model": {
                    "id": "weird-model",
                    "modalities": "text",  # not a dict
                    "limit": {"context": 200000, "output": 8192},
                },
            }},
        }
        # NOTE(review): provider "gemini" is presumably aliased to the
        # "google" registry entry via PROVIDER_TO_MODELS_DEV — confirm.
        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
            caps = get_model_capabilities("gemini", "weird-model")
            assert caps is not None
            assert caps.supports_vision is False

    def test_model_not_found_returns_none(self):
        """Unknown model should return None."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("anthropic", "nonexistent-model")
            assert caps is None

View file

@ -157,7 +157,9 @@ def _make_fake_mautrix():
mautrix_crypto_store = types.ModuleType("mautrix.crypto.store")
class MemoryCryptoStore:
pass
def __init__(self, account_id="", pickle_key=""):
self.account_id = account_id
self.pickle_key = pickle_key
mautrix_crypto_store.MemoryCryptoStore = MemoryCryptoStore
@ -1041,20 +1043,28 @@ class TestMatrixSyncLoop:
call_count += 1
if call_count >= 1:
adapter._closing = True
return {"rooms": {"join": {"!room:example.org": {}}}}
return {"rooms": {"join": {"!room:example.org": {}}}, "next_batch": "s1234"}
mock_crypto = MagicMock()
mock_crypto.share_keys = AsyncMock()
mock_sync_store = MagicMock()
mock_sync_store.get_next_batch = AsyncMock(return_value=None)
mock_sync_store.put_next_batch = AsyncMock()
fake_client = MagicMock()
fake_client.sync = AsyncMock(side_effect=_sync_once)
fake_client.crypto = mock_crypto
fake_client.sync_store = mock_sync_store
fake_client.handle_sync = MagicMock(return_value=[])
adapter._client = fake_client
await adapter._sync_loop()
fake_client.sync.assert_awaited_once()
mock_crypto.share_keys.assert_awaited_once()
fake_client.handle_sync.assert_called_once()
mock_sync_store.put_next_batch.assert_awaited_once_with("s1234")
class TestMatrixEncryptedSendFallback:

View file

@ -222,6 +222,12 @@ def test_api_mode_normalizes_provider_case(monkeypatch):
def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypatch):
"""GPT-5.x models need codex_responses even on OpenRouter.
OpenRouter rejects GPT-5 models on /v1/chat/completions with
``unsupported_api_for_model``. The model-level check overrides
the provider default.
"""
_patch_agent_bootstrap(monkeypatch)
agent = run_agent.AIAgent(
model="gpt-5-codex",
@ -233,7 +239,7 @@ def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypat
skip_context_files=True,
skip_memory=True,
)
assert agent.api_mode == "chat_completions"
assert agent.api_mode == "codex_responses"
assert agent.provider == "openrouter"

View file

@ -15,6 +15,10 @@ from tools.vision_tools import (
_handle_vision_analyze,
_determine_mime_type,
_image_to_base64_data_url,
_resize_image_for_vision,
_is_image_size_error,
_MAX_BASE64_BYTES,
_RESIZE_TARGET_BYTES,
vision_analyze_tool,
check_vision_requirements,
get_debug_session_info,
@ -590,11 +594,13 @@ class TestBase64SizeLimit:
@pytest.mark.asyncio
async def test_oversized_image_rejected_before_api_call(self, tmp_path):
"""Images exceeding 5 MB base64 should fail with a clear size error."""
"""Images exceeding the 20 MB hard limit should fail with a clear error."""
img = tmp_path / "huge.png"
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
# Patch the hard limit to a small value so the test runs fast.
with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
result = json.loads(await vision_analyze_tool(str(img), "describe this"))
assert result["success"] is False
@ -686,3 +692,124 @@ class TestVisionRegistration:
entry = registry._tools.get("vision_analyze")
assert callable(entry.handler)
# ---------------------------------------------------------------------------
# _resize_image_for_vision — auto-resize oversized images
# ---------------------------------------------------------------------------
class TestResizeImageForVision:
    """Tests for the auto-resize function."""

    def test_small_image_returned_as_is(self, tmp_path):
        """Images under the limit should be returned unchanged."""
        # Create a small 10x10 red PNG
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        img = Image.new("RGB", (10, 10), (255, 0, 0))
        path = tmp_path / "small.png"
        img.save(path, "PNG")
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        assert len(result) < _MAX_BASE64_BYTES

    def test_large_image_is_resized(self, tmp_path):
        """Images over the default target should be auto-resized to fit."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        # Create a large image that will exceed 5 MB in base64
        # A 4000x4000 uncompressed PNG will be large
        img = Image.new("RGB", (4000, 4000), (128, 200, 50))
        path = tmp_path / "large.png"
        img.save(path, "PNG")
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        # Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB)
        assert len(result) <= _RESIZE_TARGET_BYTES

    def test_custom_max_bytes(self, tmp_path):
        """The max_base64_bytes parameter should be respected."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        img = Image.new("RGB", (200, 200), (0, 128, 255))
        path = tmp_path / "medium.png"
        img.save(path, "PNG")
        # Set a very low limit to force resizing
        result = _resize_image_for_vision(path, max_base64_bytes=500)
        # Should still return a valid data URL even when the limit is
        # unreachable (best-effort candidate is returned).
        assert result.startswith("data:image/")

    def test_jpeg_output_for_non_png(self, tmp_path):
        """Non-PNG images should be resized as JPEG."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        img = Image.new("RGB", (2000, 2000), (255, 128, 0))
        path = tmp_path / "photo.jpg"
        img.save(path, "JPEG", quality=95)
        result = _resize_image_for_vision(path, mime_type="image/jpeg",
                                          max_base64_bytes=50_000)
        assert result.startswith("data:image/jpeg;base64,")

    def test_constants_sane(self):
        """Hard limit should be larger than resize target."""
        assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
        assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
        assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES

    def test_no_pillow_returns_original(self, tmp_path):
        """Without Pillow, oversized images should be returned as-is."""
        # Create a dummy file
        path = tmp_path / "test.png"
        # Write enough bytes to exceed a tiny limit
        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)
        with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64:
            # Simulate a large base64 result
            mock_b64.return_value = "data:image/png;base64," + "A" * 200
            # Blanking PIL in sys.modules makes `from PIL import Image`
            # raise ImportError inside the function under test.
            with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
                result = _resize_image_for_vision(path, max_base64_bytes=100)
        # Should return the original (oversized) data url
        assert len(result) > 100
# ---------------------------------------------------------------------------
# _is_image_size_error — detect size-related API errors
# ---------------------------------------------------------------------------
class TestIsImageSizeError:
    """Tests for the size-error detection helper.

    Positive cases cover the hint substrings ("too large", "413",
    "invalid_request", "exceeds"); negative cases pin that unrelated
    connection/auth errors and empty messages are not misclassified.
    """

    def test_too_large_message(self):
        assert _is_image_size_error(Exception("Request payload too large"))

    def test_413_status(self):
        assert _is_image_size_error(Exception("HTTP 413 Payload Too Large"))

    def test_invalid_request(self):
        assert _is_image_size_error(Exception("invalid_request_error: image too big"))

    def test_exceeds_limit(self):
        assert _is_image_size_error(Exception("Image exceeds maximum size"))

    def test_unrelated_error(self):
        assert not _is_image_size_error(Exception("Connection refused"))

    def test_auth_error(self):
        assert not _is_image_size_error(Exception("401 Unauthorized"))

    def test_empty_message(self):
        assert not _is_image_size_error(Exception(""))

View file

@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
),
}, ensure_ascii=False)
# Read and convert to base64
image_data = screenshot_path.read_bytes()
image_base64 = base64.b64encode(image_data).decode("ascii")
data_url = f"data:image/png;base64,{image_base64}"
# Convert screenshot to base64 at full resolution.
_screenshot_bytes = screenshot_path.read_bytes()
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
data_url = f"data:image/png;base64,{_screenshot_b64}"
vision_prompt = (
f"You are analyzing a screenshot of a web browser.\n\n"
@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
# Use the centralized LLM router
vision_model = _get_vision_model()
logger.debug("browser_vision: analysing screenshot (%d bytes)",
len(image_data))
len(_screenshot_bytes))
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
# Local vision models (llama.cpp, ollama) can take well over 30s for
@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
}
if vision_model:
call_kwargs["model"] = vision_model
response = call_llm(**call_kwargs)
# Try full-size screenshot; on size-related rejection, downscale and retry.
try:
response = call_llm(**call_kwargs)
except Exception as _api_err:
from tools.vision_tools import (
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
)
if (_is_image_size_error(_api_err)
and len(data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"Vision API rejected screenshot (%.1f MB); "
"auto-resizing to ~%.0f MB and retrying...",
len(data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
data_url = _resize_image_for_vision(
screenshot_path, mime_type="image/png")
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
response = call_llm(**call_kwargs)
else:
raise
analysis = (response.choices[0].message.content or "").strip()
# Redact secrets the vision LLM may have read from the screenshot.

View file

@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
return data_url
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit). Images above this are rejected.
_MAX_BASE64_BYTES = 20 * 1024 * 1024
# Target size when auto-resizing on API failure (5 MB). After a provider
# rejects an image, we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
def _is_image_size_error(error: Exception) -> bool:
"""Detect if an API error is related to image or payload size."""
err_str = str(error).lower()
return any(hint in err_str for hint in (
"too large", "payload", "413", "content_too_large",
"request_too_large", "image_url", "invalid_request",
"exceeds", "size limit",
))
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    Tries Pillow first to progressively downscale oversized images. If Pillow
    is not installed or resizing still exceeds the limit, falls back to the raw
    bytes and lets the caller handle the size check.

    Args:
        image_path: Path to the image file on disk.
        mime_type: Explicit MIME type; detected from the file when omitted.
        max_base64_bytes: Size budget for the final data URL string.

    Returns the base64 data URL string.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    if estimated_b64 <= max_base64_bytes:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
            return data_url
        # Estimate was optimistic: the encoded URL is still over budget,
        # so fall through to the Pillow resize path with data_url kept
        # as a last-resort return value.
    else:
        data_url = None  # defer full encode; try Pillow resize first
    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error
    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))
    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
    try:
        img = Image.open(image_path)
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # fall through to size-check in caller
    # Convert RGBA to RGB for JPEG output
    if pil_format == "JPEG" and img.mode in ("RGBA", "P"):
        img = img.convert("RGB")
    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    prev_dims = (img.width, img.height)
    candidate = None  # will be set on first loop iteration
    for attempt in range(5):
        if attempt > 0:
            # Halve, but never shrink below 64px on either axis.
            new_w = max(img.width // 2, 64)
            new_h = max(img.height // 2, 64)
            # Stop if dimensions can't shrink further
            if (new_w, new_h) == prev_dims:
                break
            img = img.resize((new_w, new_h), Image.LANCZOS)
            prev_dims = (new_w, new_h)
            logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)
        for q in quality_steps:
            buf = _io.BytesIO()
            save_kwargs = {"format": pil_format}
            if q is not None:
                save_kwargs["quality"] = q
            img.save(buf, **save_kwargs)
            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
            candidate = f"data:{out_mime};base64,{encoded}"
            if len(candidate) <= max_base64_bytes:
                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                            len(candidate) / (1024 * 1024), q,
                            img.width, img.height)
                return candidate
    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate
    # Shouldn't reach here, but fall back to full encode
    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
async def vision_analyze_tool(
image_url: str,
user_prompt: str,
@ -376,24 +490,27 @@ async def vision_analyze_tool(
if not detected_mime_type:
raise ValueError("Only real image files are supported for vision analysis.")
# Convert image to base64 data URL
# Convert image to base64 — send at full resolution first.
# If the provider rejects it as too large, we auto-resize and retry.
logger.info("Converting image to base64...")
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
# Calculate size in KB for better readability
data_size_kb = len(image_data_url) / 1024
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
# Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
# Reject early with a clear message instead of a cryptic provider 400.
_MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB
# The data URL includes the header (e.g. "data:image/jpeg;base64,") which
# is negligible, but measure the full string to be safe.
# Hard limit (20 MB) — no provider accepts payloads this large.
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
f"Resize or compress the image and try again."
)
# Try to resize down to 5 MB before giving up.
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB "
f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
f"even after resizing. "
f"Install Pillow (`pip install Pillow`) for better auto-resize, "
f"or compress the image manually."
)
debug_call_data["image_size_bytes"] = image_size_bytes
@ -442,7 +559,24 @@ async def vision_analyze_tool(
}
if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
# Try full-size image first; on size-related rejection, downscale and retry.
try:
response = await async_call_llm(**call_kwargs)
except Exception as _api_err:
if (_is_image_size_error(_api_err)
and len(image_data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"API rejected image (%.1f MB, likely too large); "
"auto-resizing to ~%.0f MB and retrying...",
len(image_data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
messages[0]["content"][1]["image_url"]["url"] = image_data_url
response = await async_call_llm(**call_kwargs)
else:
raise
# Extract the analysis — fall back to reasoning if content is empty
analysis = extract_content_or_reasoning(response)
@ -498,8 +632,8 @@ async def vision_analyze_tool(
elif "invalid_request" in err_str or "image_url" in err_str:
analysis = (
"The vision API rejected the image. This can happen when the "
"image is too large, in an unsupported format, or corrupted. "
"Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
"image is in an unsupported format, corrupted, or still too "
"large after auto-resize. Try a smaller JPEG/PNG and retry. "
f"Error: {e}"
)
else:

View file

@ -143,6 +143,10 @@ to find the parent assistant message, keeping groups intact.
### Phase 3: Generate Structured Summary
:::warning Summary model context length
The summary model must have a context window **at least as large** as the main agent model's, because the entire middle section is sent to it in a single `call_llm(task="compression")` call. If the summary model's context window is too small, the API returns a context-length error — `_generate_summary()` catches it, logs a warning, and returns `None`. The compressor then drops the middle turns **without a summary**, silently losing conversation context. This is the most common cause of degraded compaction quality.
:::
The middle turns are summarized using the auxiliary LLM with a structured
template:

View file

@ -480,7 +480,9 @@ Points at a custom OpenAI-compatible endpoint. Uses `OPENAI_API_KEY` for auth.
| `nous` / `openrouter` / etc. | not set | Force that provider, use its auth |
| any | set | Use the custom endpoint directly (provider ignored) |
The `summary_model` must support a context length at least as large as your main model's, since it receives the full middle section of the conversation for compression.
:::warning Summary model context length requirement
The `summary_model` **must** have a context window at least as large as your main agent model's. The compressor sends the full middle section of the conversation to the summary model — if that model's context window is smaller than the main model's, the summarization call will fail with a context length error. When this happens, the middle turns are **dropped without a summary**, losing conversation context silently. If you override `summary_model`, verify its context length meets or exceeds your main model's.
:::
## Context Engine

View file

@ -212,7 +212,24 @@ When users click buttons or interact with interactive cards sent by the bot, the
Card action events are dispatched with `MessageType.COMMAND`, so they flow through the normal command processing pipeline.
To use this feature, enable the **Interactive Card** event in your Feishu app's event subscriptions (`card.action.trigger`).
This is also how **command approval** works — when the agent needs to run a dangerous command, it sends an interactive card with Allow Once / Session / Always / Deny buttons. The user clicks a button, and the card action callback delivers the approval decision back to the agent.
### Required Feishu App Configuration
Interactive cards require **three** configuration steps in the Feishu Developer Console. Missing any of them causes error **200340** when users click card buttons.
1. **Subscribe to the card action event:**
In **Event Subscriptions**, add `card.action.trigger` to your subscribed events.
2. **Enable the Interactive Card capability:**
In **App Features > Bot**, ensure the **Interactive Card** toggle is enabled. This tells Feishu that your app can receive card action callbacks.
3. **Configure the Card Request URL (webhook mode only):**
In **App Features > Bot > Message Card Request URL**, set the URL to the same endpoint as your event webhook (e.g. `https://your-server:8765/feishu/webhook`). In WebSocket mode this is handled automatically by the SDK.
:::warning
Without all three steps, your app can still *send* interactive cards successfully (sending only requires the `im:message:send` permission), but clicking any button returns error 200340. The card appears to work — the error only surfaces when a user interacts with it.
:::
## Media Support
@ -412,6 +429,7 @@ WebSocket and per-group ACL settings are configured via `config.yaml` under `pla
| Post messages show as plain text | The Feishu API rejected the post payload; this is normal fallback behavior. Check logs for details. |
| Images/files not received by bot | Grant `im:message` and `im:resource` permission scopes to your Feishu app |
| Bot identity not auto-detected | Grant `admin:app.info:readonly` scope, or set `FEISHU_BOT_OPEN_ID` / `FEISHU_BOT_NAME` manually |
| Error 200340 when clicking approval buttons | Enable **Interactive Card** capability and configure **Card Request URL** in the Feishu Developer Console. See [Required Feishu App Configuration](#required-feishu-app-configuration) above. |
| `Webhook rate limit exceeded` | More than 120 requests/minute from the same IP. This is usually a misconfiguration or loop. |
## Toolset