diff --git a/agent/models_dev.py b/agent/models_dev.py index d3620733b..560e7cefe 100644 --- a/agent/models_dev.py +++ b/agent/models_dev.py @@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit # Extract capability flags (default to False if missing) supports_tools = bool(entry.get("tool_call", False)) - supports_vision = bool(entry.get("attachment", False)) + # Vision: check both the `attachment` flag and `modalities.input` for "image". + # Some models (e.g. gemma-4) list image in input modalities but not attachment. + input_mods = entry.get("modalities", {}) + if isinstance(input_mods, dict): + input_mods = input_mods.get("input", []) + else: + input_mods = [] + supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods supports_reasoning = bool(entry.get("reasoning", False)) # Extract limits diff --git a/gateway/config.py b/gateway/config.py index bde52eb55..d2dc45eae 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -190,7 +190,7 @@ class StreamingConfig: """Configuration for real-time token streaming to messaging platforms.""" enabled: bool = False transport: str = "edit" # "edit" (progressive editMessageText) or "off" - edit_interval: float = 0.3 # Seconds between message edits + edit_interval: float = 1.0 # Seconds between message edits (Telegram rate-limits at ~1/s) buffer_threshold: int = 40 # Chars before forcing an edit cursor: str = " ▉" # Cursor shown during streaming @@ -210,7 +210,7 @@ class StreamingConfig: return cls( enabled=data.get("enabled", False), transport=data.get("transport", "edit"), - edit_interval=float(data.get("edit_interval", 0.3)), + edit_interval=float(data.get("edit_interval", 1.0)), buffer_threshold=int(data.get("buffer_threshold", 40)), cursor=data.get("cursor", " ▉"), ) diff --git a/gateway/platforms/matrix.py b/gateway/platforms/matrix.py index 409d2d6e4..7daf2e70e 100644 --- a/gateway/platforms/matrix.py +++ b/gateway/platforms/matrix.py @@ -352,7 
+352,16 @@ class MatrixAdapter(BasePlatformAdapter): from mautrix.crypto import OlmMachine from mautrix.crypto.store import MemoryCryptoStore - crypto_store = MemoryCryptoStore() + # account_id and pickle_key are required by mautrix ≥0.21. + # Use the Matrix user ID as account_id for stable identity. + # pickle_key secures in-memory serialisation; derive from + # the same user_id:device_id pair used for the on-disk HMAC. + _acct_id = self._user_id or "hermes" + _pickle_key = f"{_acct_id}:{self._device_id}" + crypto_store = MemoryCryptoStore( + account_id=_acct_id, + pickle_key=_pickle_key, + ) # Restore persisted crypto state from a previous run. # Uses HMAC to verify integrity before unpickling. @@ -418,6 +427,11 @@ class MatrixAdapter(BasePlatformAdapter): if isinstance(sync_data, dict): rooms_join = sync_data.get("rooms", {}).get("join", {}) self._joined_rooms = set(rooms_join.keys()) + # Store the next_batch token so incremental syncs start + # from where the initial sync left off. + nb = sync_data.get("next_batch") + if nb: + await client.sync_store.put_next_batch(nb) logger.info( "Matrix: initial sync complete, joined %d rooms", len(self._joined_rooms), @@ -809,19 +823,40 @@ class MatrixAdapter(BasePlatformAdapter): async def _sync_loop(self) -> None: """Continuously sync with the homeserver.""" + client = self._client + # Resume from the token stored during the initial sync. + next_batch = await client.sync_store.get_next_batch() while not self._closing: try: - sync_data = await self._client.sync(timeout=30000) + sync_data = await client.sync( + since=next_batch, timeout=30000, + ) if isinstance(sync_data, dict): # Update joined rooms from sync response. rooms_join = sync_data.get("rooms", {}).get("join", {}) if rooms_join: self._joined_rooms.update(rooms_join.keys()) - # Share keys periodically if E2EE is enabled. 
- if self._encryption and getattr(self._client, "crypto", None): + # Advance the sync token so the next request is + # incremental instead of a full initial sync. + nb = sync_data.get("next_batch") + if nb: + next_batch = nb + await client.sync_store.put_next_batch(nb) + + # Dispatch events to registered handlers so that + # _on_room_message / _on_reaction / _on_invite fire. try: - await self._client.crypto.share_keys() + tasks = client.handle_sync(sync_data) + if tasks: + await asyncio.gather(*tasks) + except Exception as exc: + logger.warning("Matrix: sync event dispatch error: %s", exc) + + # Share keys periodically if E2EE is enabled. + if self._encryption and getattr(client, "crypto", None): + try: + await client.crypto.share_keys() except Exception as exc: logger.warning("Matrix: E2EE key share failed: %s", exc) diff --git a/gateway/stream_consumer.py b/gateway/stream_consumer.py index 5453df60e..de0a1453b 100644 --- a/gateway/stream_consumer.py +++ b/gateway/stream_consumer.py @@ -36,7 +36,7 @@ _NEW_SEGMENT = object() @dataclass class StreamConsumerConfig: """Runtime config for a single stream consumer instance.""" - edit_interval: float = 0.3 + edit_interval: float = 1.0 buffer_threshold: int = 40 cursor: str = " ▉" @@ -56,6 +56,10 @@ class GatewayStreamConsumer: await task # wait for final edit """ + # After this many consecutive flood-control failures, permanently disable + # progressive edits for the remainder of the stream. 
+ _MAX_FLOOD_STRIKES = 3 + def __init__( self, adapter: Any, @@ -76,6 +80,8 @@ class GatewayStreamConsumer: self._last_sent_text = "" # Track last-sent text to skip redundant edits self._fallback_final_send = False self._fallback_prefix = "" + self._flood_strikes = 0 # Consecutive flood-control edit failures + self._current_edit_interval = self.cfg.edit_interval # Adaptive backoff @property def already_sent(self) -> bool: @@ -129,7 +135,7 @@ class GatewayStreamConsumer: should_edit = ( got_done or got_segment_break - or (elapsed >= self.cfg.edit_interval + or (elapsed >= self._current_edit_interval and self._accumulated) or len(self._accumulated) >= self.cfg.buffer_threshold ) @@ -173,12 +179,13 @@ class GatewayStreamConsumer: if split_at < _safe_limit // 2: split_at = _safe_limit chunk = self._accumulated[:split_at] - await self._send_or_edit(chunk) - if self._fallback_final_send: - # Edit failed while attempting to split an oversized - # message. Keep the full accumulated text intact so - # the fallback final-send path can deliver the - # remaining continuation without dropping content. + ok = await self._send_or_edit(chunk) + if self._fallback_final_send or not ok: + # Edit failed (or backed off due to flood control) + # while attempting to split an oversized message. + # Keep the full accumulated text intact so the + # fallback final-send path can deliver the remaining + # continuation without dropping content. break self._accumulated = self._accumulated[split_at:].lstrip("\n") self._message_id = None @@ -322,7 +329,10 @@ class GatewayStreamConsumer: return chunks async def _send_fallback_final(self, text: str) -> None: - """Send the final continuation after streaming edits stop working.""" + """Send the final continuation after streaming edits stop working. + + Retries each chunk once on flood-control failures with a short delay. 
+ """ final_text = self._clean_for_display(text) continuation = self._continuation_text(final_text) self._fallback_final_send = False @@ -339,12 +349,25 @@ class GatewayStreamConsumer: last_successful_chunk = "" sent_any_chunk = False for chunk in chunks: - result = await self.adapter.send( - chat_id=self.chat_id, - content=chunk, - metadata=self.metadata, - ) - if not result.success: + # Try sending with one retry on flood-control errors. + result = None + for attempt in range(2): + result = await self.adapter.send( + chat_id=self.chat_id, + content=chunk, + metadata=self.metadata, + ) + if result.success: + break + if attempt == 0 and self._is_flood_error(result): + logger.debug( + "Flood control on fallback send, retrying in 3s" + ) + await asyncio.sleep(3.0) + else: + break # non-flood error or second attempt failed + + if not result or not result.success: if sent_any_chunk: # Some continuation text already reached the user. Suppress # the base gateway final-send path so we don't resend the @@ -370,20 +393,52 @@ class GatewayStreamConsumer: self._last_sent_text = chunks[-1] self._fallback_prefix = "" - async def _send_or_edit(self, text: str) -> None: - """Send or edit the streaming message.""" + def _is_flood_error(self, result) -> bool: + """Check if a SendResult failure is due to flood control / rate limiting.""" + err = getattr(result, "error", "") or "" + err_lower = err.lower() + return "flood" in err_lower or "retry after" in err_lower or "rate" in err_lower + + async def _try_strip_cursor(self) -> None: + """Best-effort edit to remove the cursor from the last visible message. + + Called when entering fallback mode so the user doesn't see a stuck + cursor (▉) in the partial message. 
+ """ + if not self._message_id or self._message_id == "__no_edit__": + return + prefix = self._visible_prefix() + if not prefix or not prefix.strip(): + return + try: + await self.adapter.edit_message( + chat_id=self.chat_id, + message_id=self._message_id, + content=prefix, + ) + self._last_sent_text = prefix + except Exception: + pass # best-effort — don't let this block the fallback path + + async def _send_or_edit(self, text: str) -> bool: + """Send or edit the streaming message. + + Returns True if the text was successfully delivered (sent or edited), + False otherwise. Callers like the overflow split loop use this to + decide whether to advance past the delivered chunk. + """ # Strip MEDIA: directives so they don't appear as visible text. # Media files are delivered as native attachments after the stream # finishes (via _deliver_media_from_response in gateway/run.py). text = self._clean_for_display(text) if not text.strip(): - return + return True # nothing to send is "success" try: if self._message_id is not None: if self._edit_supported: # Skip if text is identical to what we last sent if text == self._last_sent_text: - return + return True # Edit existing message result = await self.adapter.edit_message( chat_id=self.chat_id, @@ -393,19 +448,52 @@ class GatewayStreamConsumer: if result.success: self._already_sent = True self._last_sent_text = text + # Successful edit — reset flood strike counter + self._flood_strikes = 0 + return True else: - # If an edit fails mid-stream (especially Telegram flood control), - # stop progressive edits and send only the missing tail once the + # Edit failed. If this looks like flood control / rate + # limiting, use adaptive backoff: double the edit interval + # and retry on the next cycle. Only permanently disable + # edits after _MAX_FLOOD_STRIKES consecutive failures. 
+ if self._is_flood_error(result): + self._flood_strikes += 1 + self._current_edit_interval = min( + self._current_edit_interval * 2, 10.0, + ) + logger.debug( + "Flood control on edit (strike %d/%d), " + "backoff interval → %.1fs", + self._flood_strikes, + self._MAX_FLOOD_STRIKES, + self._current_edit_interval, + ) + if self._flood_strikes < self._MAX_FLOOD_STRIKES: + # Don't disable edits yet — just slow down. + # Update _last_edit_time so the next edit + # respects the new interval. + self._last_edit_time = time.monotonic() + return False + + # Non-flood error OR flood strikes exhausted: enter + # fallback mode — send only the missing tail once the # final response is available. - logger.debug("Edit failed, disabling streaming for this adapter") + logger.debug( + "Edit failed (strikes=%d), entering fallback mode", + self._flood_strikes, + ) self._fallback_prefix = self._visible_prefix() self._fallback_final_send = True self._edit_supported = False self._already_sent = True + # Best-effort: strip the cursor from the last visible + # message so the user doesn't see a stuck ▉. + await self._try_strip_cursor() + return False else: # Editing not supported — skip intermediate updates. # The final response will be sent by the fallback path. - pass + return False else: # First message — send new result = await self.adapter.send( @@ -417,6 +505,7 @@ class GatewayStreamConsumer: self._message_id = result.message_id self._already_sent = True self._last_sent_text = text + return True elif result.success: # Platform accepted the message but returned no message_id # (e.g. Signal). 
Can't edit without an ID — switch to @@ -428,8 +517,11 @@ class GatewayStreamConsumer: self._fallback_final_send = True # Sentinel prevents re-entering this branch on every delta self._message_id = "__no_edit__" + return True # platform accepted, just can't edit else: # Initial send failed — disable streaming for this session self._edit_supported = False + return False except Exception as e: logger.error("Stream send/edit error: %s", e) + return False diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 89606edc2..e088bdfdf 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -381,7 +381,7 @@ DEFAULT_CONFIG = { "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" "base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider) "api_key": "", # API key for base_url (falls back to OPENAI_API_KEY) - "timeout": 30, # seconds — LLM API call timeout; increase for slow local vision models + "timeout": 120, # seconds — LLM API call timeout; vision payloads need generous timeout "download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections }, "web_extract": { diff --git a/hermes_cli/models.py b/hermes_cli/models.py index a3cd389b4..5da9824f3 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -56,6 +56,18 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [ _openrouter_catalog_cache: list[tuple[str, str]] | None = None + +def _codex_curated_models() -> list[str]: + """Derive the openai-codex curated list from codex_models.py. + + Single source of truth: DEFAULT_CODEX_MODELS + forward-compat synthesis. + This keeps the gateway /model picker in sync with the CLI `hermes model` + flow without maintaining a separate static list. 
+ """ + from hermes_cli.codex_models import DEFAULT_CODEX_MODELS, _add_forward_compat_models + return _add_forward_compat_models(list(DEFAULT_CODEX_MODELS)) + + _PROVIDER_MODELS: dict[str, list[str]] = { "nous": [ "anthropic/claude-opus-4.6", @@ -86,14 +98,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = { "openai/gpt-5.4-pro", "openai/gpt-5.4-nano", ], - "openai-codex": [ - "gpt-5.4", - "gpt-5.4-mini", - "gpt-5.3-codex", - "gpt-5.2-codex", - "gpt-5.1-codex-mini", - "gpt-5.1-codex-max", - ], + "openai-codex": _codex_curated_models(), "copilot-acp": [ "copilot-acp", ], diff --git a/run_agent.py b/run_agent.py index 44f28cd76..36ddaa0a6 100644 --- a/run_agent.py +++ b/run_agent.py @@ -700,10 +700,14 @@ class AIAgent: except Exception: pass - # Direct OpenAI sessions use the Responses API path. GPT-5.x tool - # calls with reasoning are rejected on /v1/chat/completions, and - # Hermes is a tool-using client by default. - if self.api_mode == "chat_completions" and self._is_direct_openai_url(): + # GPT-5.x models require the Responses API path — they are rejected + # on /v1/chat/completions by both OpenAI and OpenRouter. Also + # auto-upgrade for direct OpenAI URLs (api.openai.com) since all + # newer tool-calling models prefer Responses there. + if self.api_mode == "chat_completions" and ( + self._is_direct_openai_url() + or self._model_requires_responses_api(self.model) + ): self.api_mode = "codex_responses" # Pre-warm OpenRouter model metadata cache in a background thread. @@ -1702,6 +1706,21 @@ class AIAgent: """Return True when the base URL targets OpenRouter.""" return "openrouter" in self._base_url_lower + @staticmethod + def _model_requires_responses_api(model: str) -> bool: + """Return True for models that require the Responses API path. + + GPT-5.x models are rejected on /v1/chat/completions by both + OpenAI and OpenRouter (error: ``unsupported_api_for_model``). + Detect these so the correct api_mode is set regardless of + which provider is serving the model. 
+ """ + m = model.lower() + # Strip vendor prefix (e.g. "openai/gpt-5.4" → "gpt-5.4") + if "/" in m: + m = m.rsplit("/", 1)[-1] + return m.startswith("gpt-5") + def _max_tokens_param(self, value: int) -> dict: """Return the correct max tokens kwarg for the current provider. @@ -5252,7 +5271,7 @@ class AIAgent: except Exception: pass - # Determine api_mode from provider / base URL + # Determine api_mode from provider / base URL / model fb_api_mode = "chat_completions" fb_base_url = str(fb_client.base_url) if fb_provider == "openai-codex": @@ -5261,6 +5280,10 @@ class AIAgent: fb_api_mode = "anthropic_messages" elif self._is_direct_openai_url(fb_base_url): fb_api_mode = "codex_responses" + elif self._model_requires_responses_api(fb_model): + # GPT-5.x models need Responses API on every provider + # (OpenRouter, Copilot, direct OpenAI, etc.) + fb_api_mode = "codex_responses" old_model = self.model self.model = fb_model @@ -5349,8 +5372,8 @@ class AIAgent: to the fallback provider for every subsequent turn. Calling this at the top of ``run_conversation()`` makes fallback turn-scoped. - The gateway creates a fresh agent per message so this is a no-op - there (``_fallback_activated`` is always False at turn start). + The gateway caches agents across messages (``_agent_cache`` in + ``gateway/run.py``), so this restoration IS needed there too. 
""" if not self._fallback_activated: return False diff --git a/tests/agent/test_models_dev.py b/tests/agent/test_models_dev.py index 1b6216c50..9f11d731e 100644 --- a/tests/agent/test_models_dev.py +++ b/tests/agent/test_models_dev.py @@ -7,6 +7,7 @@ from agent.models_dev import ( PROVIDER_TO_MODELS_DEV, _extract_context, fetch_models_dev, + get_model_capabilities, lookup_models_dev_context, ) @@ -195,3 +196,88 @@ class TestFetchModelsDev: result = fetch_models_dev() mock_get.assert_not_called() assert result == SAMPLE_REGISTRY + + +# --------------------------------------------------------------------------- +# get_model_capabilities — vision via modalities.input +# --------------------------------------------------------------------------- + + +CAPS_REGISTRY = { + "google": { + "id": "google", + "models": { + "gemma-4-31b-it": { + "id": "gemma-4-31b-it", + "attachment": False, + "tool_call": True, + "modalities": {"input": ["text", "image"]}, + "limit": {"context": 128000, "output": 8192}, + }, + "gemma-3-1b": { + "id": "gemma-3-1b", + "tool_call": True, + "limit": {"context": 32000, "output": 8192}, + }, + }, + }, + "anthropic": { + "id": "anthropic", + "models": { + "claude-sonnet-4": { + "id": "claude-sonnet-4", + "attachment": True, + "tool_call": True, + "limit": {"context": 200000, "output": 64000}, + }, + }, + }, +} + + +class TestGetModelCapabilities: + """Tests for get_model_capabilities vision detection.""" + + def test_vision_from_attachment_flag(self): + """Models with attachment=True should report supports_vision=True.""" + with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY): + caps = get_model_capabilities("anthropic", "claude-sonnet-4") + assert caps is not None + assert caps.supports_vision is True + + def test_vision_from_modalities_input_image(self): + """Models with 'image' in modalities.input but attachment=False should + still report supports_vision=True (the core fix in this PR).""" + with 
patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY): + caps = get_model_capabilities("google", "gemma-4-31b-it") + assert caps is not None + assert caps.supports_vision is True + + def test_no_vision_without_attachment_or_modalities(self): + """Models with neither attachment nor image modality should be non-vision.""" + with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY): + caps = get_model_capabilities("google", "gemma-3-1b") + assert caps is not None + assert caps.supports_vision is False + + def test_modalities_non_dict_handled(self): + """Non-dict modalities field should not crash.""" + registry = { + "google": {"id": "google", "models": { + "weird-model": { + "id": "weird-model", + "modalities": "text", # not a dict + "limit": {"context": 200000, "output": 8192}, + }, + }}, + } + with patch("agent.models_dev.fetch_models_dev", return_value=registry): + caps = get_model_capabilities("gemini", "weird-model") + assert caps is not None + assert caps.supports_vision is False + + def test_model_not_found_returns_none(self): + """Unknown model should return None.""" + with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY): + caps = get_model_capabilities("anthropic", "nonexistent-model") + assert caps is None diff --git a/tests/gateway/test_matrix.py b/tests/gateway/test_matrix.py index 469bae030..4bde50b63 100644 --- a/tests/gateway/test_matrix.py +++ b/tests/gateway/test_matrix.py @@ -157,7 +157,9 @@ def _make_fake_mautrix(): mautrix_crypto_store = types.ModuleType("mautrix.crypto.store") class MemoryCryptoStore: - pass + def __init__(self, account_id="", pickle_key=""): + self.account_id = account_id + self.pickle_key = pickle_key mautrix_crypto_store.MemoryCryptoStore = MemoryCryptoStore @@ -1041,20 +1043,28 @@ class TestMatrixSyncLoop: call_count += 1 if call_count >= 1: adapter._closing = True - return {"rooms": {"join": {"!room:example.org": {}}}} + return {"rooms": {"join": {"!room:example.org": 
{}}}, "next_batch": "s1234"} mock_crypto = MagicMock() mock_crypto.share_keys = AsyncMock() + mock_sync_store = MagicMock() + mock_sync_store.get_next_batch = AsyncMock(return_value=None) + mock_sync_store.put_next_batch = AsyncMock() + fake_client = MagicMock() fake_client.sync = AsyncMock(side_effect=_sync_once) fake_client.crypto = mock_crypto + fake_client.sync_store = mock_sync_store + fake_client.handle_sync = MagicMock(return_value=[]) adapter._client = fake_client await adapter._sync_loop() fake_client.sync.assert_awaited_once() mock_crypto.share_keys.assert_awaited_once() + fake_client.handle_sync.assert_called_once() + mock_sync_store.put_next_batch.assert_awaited_once_with("s1234") class TestMatrixEncryptedSendFallback: diff --git a/tests/run_agent/test_run_agent_codex_responses.py b/tests/run_agent/test_run_agent_codex_responses.py index 635c75fcf..6756ed6fd 100644 --- a/tests/run_agent/test_run_agent_codex_responses.py +++ b/tests/run_agent/test_run_agent_codex_responses.py @@ -222,6 +222,12 @@ def test_api_mode_normalizes_provider_case(monkeypatch): def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypatch): + """GPT-5.x models need codex_responses even on OpenRouter. + + OpenRouter rejects GPT-5 models on /v1/chat/completions with + ``unsupported_api_for_model``. The model-level check overrides + the provider default. 
+ """ _patch_agent_bootstrap(monkeypatch) agent = run_agent.AIAgent( model="gpt-5-codex", @@ -233,7 +239,7 @@ def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypat skip_context_files=True, skip_memory=True, ) - assert agent.api_mode == "chat_completions" + assert agent.api_mode == "codex_responses" assert agent.provider == "openrouter" diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index cd4009877..6e9a6034e 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -15,6 +15,10 @@ from tools.vision_tools import ( _handle_vision_analyze, _determine_mime_type, _image_to_base64_data_url, + _resize_image_for_vision, + _is_image_size_error, + _MAX_BASE64_BYTES, + _RESIZE_TARGET_BYTES, vision_analyze_tool, check_vision_requirements, get_debug_session_info, @@ -590,11 +594,13 @@ class TestBase64SizeLimit: @pytest.mark.asyncio async def test_oversized_image_rejected_before_api_call(self, tmp_path): - """Images exceeding 5 MB base64 should fail with a clear size error.""" + """Images exceeding the 20 MB hard limit should fail with a clear error.""" img = tmp_path / "huge.png" img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024)) - with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm: + # Patch the hard limit to a small value so the test runs fast. 
+ with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \ + patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm: result = json.loads(await vision_analyze_tool(str(img), "describe this")) assert result["success"] is False @@ -686,3 +692,124 @@ class TestVisionRegistration: entry = registry._tools.get("vision_analyze") assert callable(entry.handler) + + +# --------------------------------------------------------------------------- +# _resize_image_for_vision — auto-resize oversized images +# --------------------------------------------------------------------------- + + +class TestResizeImageForVision: + """Tests for the auto-resize function.""" + + def test_small_image_returned_as_is(self, tmp_path): + """Images under the limit should be returned unchanged.""" + # Create a small 10x10 red PNG + try: + from PIL import Image + except ImportError: + pytest.skip("Pillow not installed") + img = Image.new("RGB", (10, 10), (255, 0, 0)) + path = tmp_path / "small.png" + img.save(path, "PNG") + + result = _resize_image_for_vision(path, mime_type="image/png") + assert result.startswith("data:image/png;base64,") + assert len(result) < _MAX_BASE64_BYTES + + def test_large_image_is_resized(self, tmp_path): + """Images over the default target should be auto-resized to fit.""" + try: + from PIL import Image + except ImportError: + pytest.skip("Pillow not installed") + # Create a large image that will exceed 5 MB in base64 + # A 4000x4000 uncompressed PNG will be large + img = Image.new("RGB", (4000, 4000), (128, 200, 50)) + path = tmp_path / "large.png" + img.save(path, "PNG") + + result = _resize_image_for_vision(path, mime_type="image/png") + assert result.startswith("data:image/png;base64,") + # Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB) + assert len(result) <= _RESIZE_TARGET_BYTES + + def test_custom_max_bytes(self, tmp_path): + """The max_base64_bytes parameter should be respected.""" + try: + from PIL import 
Image + except ImportError: + pytest.skip("Pillow not installed") + img = Image.new("RGB", (200, 200), (0, 128, 255)) + path = tmp_path / "medium.png" + img.save(path, "PNG") + + # Set a very low limit to force resizing + result = _resize_image_for_vision(path, max_base64_bytes=500) + # Should still return a valid data URL + assert result.startswith("data:image/") + + def test_jpeg_output_for_non_png(self, tmp_path): + """Non-PNG images should be resized as JPEG.""" + try: + from PIL import Image + except ImportError: + pytest.skip("Pillow not installed") + img = Image.new("RGB", (2000, 2000), (255, 128, 0)) + path = tmp_path / "photo.jpg" + img.save(path, "JPEG", quality=95) + + result = _resize_image_for_vision(path, mime_type="image/jpeg", + max_base64_bytes=50_000) + assert result.startswith("data:image/jpeg;base64,") + + def test_constants_sane(self): + """Hard limit should be larger than resize target.""" + assert _MAX_BASE64_BYTES == 20 * 1024 * 1024 + assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024 + assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES + + def test_no_pillow_returns_original(self, tmp_path): + """Without Pillow, oversized images should be returned as-is.""" + # Create a dummy file + path = tmp_path / "test.png" + # Write enough bytes to exceed a tiny limit + path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000) + + with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64: + # Simulate a large base64 result + mock_b64.return_value = "data:image/png;base64," + "A" * 200 + with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}): + result = _resize_image_for_vision(path, max_base64_bytes=100) + # Should return the original (oversized) data url + assert len(result) > 100 + + +# --------------------------------------------------------------------------- +# _is_image_size_error — detect size-related API errors +# --------------------------------------------------------------------------- + + +class TestIsImageSizeError: + 
"""Tests for the size-error detection helper.""" + + def test_too_large_message(self): + assert _is_image_size_error(Exception("Request payload too large")) + + def test_413_status(self): + assert _is_image_size_error(Exception("HTTP 413 Payload Too Large")) + + def test_invalid_request(self): + assert _is_image_size_error(Exception("invalid_request_error: image too big")) + + def test_exceeds_limit(self): + assert _is_image_size_error(Exception("Image exceeds maximum size")) + + def test_unrelated_error(self): + assert not _is_image_size_error(Exception("Connection refused")) + + def test_auth_error(self): + assert not _is_image_size_error(Exception("401 Unauthorized")) + + def test_empty_message(self): + assert not _is_image_size_error(Exception("")) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index a3b408381..ed3cfbb9b 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] ), }, ensure_ascii=False) - # Read and convert to base64 - image_data = screenshot_path.read_bytes() - image_base64 = base64.b64encode(image_data).decode("ascii") - data_url = f"data:image/png;base64,{image_base64}" + # Convert screenshot to base64 at full resolution. + _screenshot_bytes = screenshot_path.read_bytes() + _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii") + data_url = f"data:image/png;base64,{_screenshot_b64}" vision_prompt = ( f"You are analyzing a screenshot of a web browser.\n\n" @@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] # Use the centralized LLM router vision_model = _get_vision_model() logger.debug("browser_vision: analysing screenshot (%d bytes)", - len(image_data)) + len(_screenshot_bytes)) # Read vision timeout from config (auxiliary.vision.timeout), default 120s. 
# Local vision models (llama.cpp, ollama) can take well over 30s for @@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] } if vision_model: call_kwargs["model"] = vision_model - response = call_llm(**call_kwargs) + # Try full-size screenshot; on size-related rejection, downscale and retry. + try: + response = call_llm(**call_kwargs) + except Exception as _api_err: + from tools.vision_tools import ( + _is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES, + ) + if (_is_image_size_error(_api_err) + and len(data_url) > _RESIZE_TARGET_BYTES): + logger.info( + "Vision API rejected screenshot (%.1f MB); " + "auto-resizing to ~%.0f MB and retrying...", + len(data_url) / (1024 * 1024), + _RESIZE_TARGET_BYTES / (1024 * 1024), + ) + data_url = _resize_image_for_vision( + screenshot_path, mime_type="image/png") + call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url + response = call_llm(**call_kwargs) + else: + raise analysis = (response.choices[0].message.content or "").strip() # Redact secrets the vision LLM may have read from the screenshot. diff --git a/tools/vision_tools.py b/tools/vision_tools.py index df8fa68c8..8242c7883 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) return data_url +# Hard limit for vision API payloads (20 MB) — matches the most restrictive +# major provider (Gemini inline data limit). Images above this are rejected. +_MAX_BASE64_BYTES = 20 * 1024 * 1024 + +# Target size when auto-resizing on API failure (5 MB). After a provider +# rejects an image, we downscale to this target and retry once. 
+_RESIZE_TARGET_BYTES = 5 * 1024 * 1024 + + +def _is_image_size_error(error: Exception) -> bool: + """Detect if an API error is related to image or payload size.""" + err_str = str(error).lower() + return any(hint in err_str for hint in ( + "too large", "payload", "413", "content_too_large", + "request_too_large", "image_url", "invalid_request", + "exceeds", "size limit", + )) + + +def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None, + max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str: + """Convert an image to a base64 data URL, auto-resizing if too large. + + Tries Pillow first to progressively downscale oversized images. If Pillow + is not installed or resizing still exceeds the limit, falls back to the raw + bytes and lets the caller handle the size check. + + Returns the base64 data URL string. + """ + # Quick file-size estimate: base64 expands by ~4/3, plus data URL header. + # Skip the expensive full-read + encode if Pillow can resize directly. + file_size = image_path.stat().st_size + estimated_b64 = (file_size * 4) // 3 + 100 # ~header overhead + if estimated_b64 <= max_base64_bytes: + # Small enough — just encode directly. 
+ data_url = _image_to_base64_data_url(image_path, mime_type=mime_type) + if len(data_url) <= max_base64_bytes: + return data_url + else: + data_url = None # defer full encode; try Pillow resize first + + # Attempt auto-resize with Pillow (soft dependency) + try: + from PIL import Image + import io as _io + except ImportError: + logger.info("Pillow not installed — cannot auto-resize oversized image") + if data_url is None: + data_url = _image_to_base64_data_url(image_path, mime_type=mime_type) + return data_url # caller will raise the size error + + logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...", + file_size / (1024 * 1024), estimated_b64 / (1024 * 1024), + max_base64_bytes / (1024 * 1024)) + + mime = mime_type or _determine_mime_type(image_path) + # Choose output format: JPEG for photos (smaller), PNG for transparency + pil_format = "PNG" if mime == "image/png" else "JPEG" + out_mime = "image/png" if pil_format == "PNG" else "image/jpeg" + + try: + img = Image.open(image_path) + except Exception as exc: + logger.info("Pillow cannot open image for resizing: %s", exc) + if data_url is None: + data_url = _image_to_base64_data_url(image_path, mime_type=mime_type) + return data_url # fall through to size-check in caller + # Convert RGBA to RGB for JPEG output + if pil_format == "JPEG" and img.mode in ("RGBA", "P"): + img = img.convert("RGB") + + # Strategy: halve dimensions until base64 fits, up to 4 rounds. + # For JPEG, also try reducing quality at each size step. + # For PNG, quality is irrelevant — only dimension reduction helps. 
+ quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,) + prev_dims = (img.width, img.height) + candidate = None # will be set on first loop iteration + + for attempt in range(5): + if attempt > 0: + new_w = max(img.width // 2, 64) + new_h = max(img.height // 2, 64) + # Stop if dimensions can't shrink further + if (new_w, new_h) == prev_dims: + break + img = img.resize((new_w, new_h), Image.LANCZOS) + prev_dims = (new_w, new_h) + logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt) + + for q in quality_steps: + buf = _io.BytesIO() + save_kwargs = {"format": pil_format} + if q is not None: + save_kwargs["quality"] = q + img.save(buf, **save_kwargs) + encoded = base64.b64encode(buf.getvalue()).decode("ascii") + candidate = f"data:{out_mime};base64,{encoded}" + if len(candidate) <= max_base64_bytes: + logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)", + len(candidate) / (1024 * 1024), q, + img.width, img.height) + return candidate + + # If we still can't get it small enough, return the best attempt + # and let the caller decide + if candidate is not None: + logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)", + max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024)) + return candidate + + # Shouldn't reach here, but fall back to full encode + return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type) + + async def vision_analyze_tool( image_url: str, user_prompt: str, @@ -376,24 +490,27 @@ async def vision_analyze_tool( if not detected_mime_type: raise ValueError("Only real image files are supported for vision analysis.") - # Convert image to base64 data URL + # Convert image to base64 — send at full resolution first. + # If the provider rejects it as too large, we auto-resize and retry. 
logger.info("Converting image to base64...") image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type) - # Calculate size in KB for better readability data_size_kb = len(image_data_url) / 1024 logger.info("Image converted to base64 (%.1f KB)", data_size_kb) - # Pre-flight size check: most vision APIs cap base64 payloads at 5 MB. - # Reject early with a clear message instead of a cryptic provider 400. - _MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB - # The data URL includes the header (e.g. "data:image/jpeg;base64,") which - # is negligible, but measure the full string to be safe. + # Hard limit (20 MB) — no provider accepts payloads this large. if len(image_data_url) > _MAX_BASE64_BYTES: - raise ValueError( - f"Image too large for vision API: base64 payload is " - f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). " - f"Resize or compress the image and try again." - ) + # Try to resize down to 5 MB before giving up. + image_data_url = _resize_image_for_vision( + temp_image_path, mime_type=detected_mime_type) + if len(image_data_url) > _MAX_BASE64_BYTES: + raise ValueError( + f"Image too large for vision API: base64 payload is " + f"{len(image_data_url) / (1024 * 1024):.1f} MB " + f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) " + f"even after resizing. " + f"Install Pillow (`pip install Pillow`) for better auto-resize, " + f"or compress the image manually." + ) debug_call_data["image_size_bytes"] = image_size_bytes @@ -442,7 +559,24 @@ async def vision_analyze_tool( } if model: call_kwargs["model"] = model - response = await async_call_llm(**call_kwargs) + # Try full-size image first; on size-related rejection, downscale and retry. 
+ try: + response = await async_call_llm(**call_kwargs) + except Exception as _api_err: + if (_is_image_size_error(_api_err) + and len(image_data_url) > _RESIZE_TARGET_BYTES): + logger.info( + "API rejected image (%.1f MB, likely too large); " + "auto-resizing to ~%.0f MB and retrying...", + len(image_data_url) / (1024 * 1024), + _RESIZE_TARGET_BYTES / (1024 * 1024), + ) + image_data_url = _resize_image_for_vision( + temp_image_path, mime_type=detected_mime_type) + messages[0]["content"][1]["image_url"]["url"] = image_data_url + response = await async_call_llm(**call_kwargs) + else: + raise # Extract the analysis — fall back to reasoning if content is empty analysis = extract_content_or_reasoning(response) @@ -498,8 +632,8 @@ async def vision_analyze_tool( elif "invalid_request" in err_str or "image_url" in err_str: analysis = ( "The vision API rejected the image. This can happen when the " - "image is too large, in an unsupported format, or corrupted. " - "Try a smaller JPEG/PNG (under 3.5 MB) and retry. " + "image is in an unsupported format, corrupted, or still too " + "large after auto-resize. Try a smaller JPEG/PNG and retry. " f"Error: {e}" ) else: diff --git a/website/docs/developer-guide/context-compression-and-caching.md b/website/docs/developer-guide/context-compression-and-caching.md index 98dc0a6e2..d17f45b95 100644 --- a/website/docs/developer-guide/context-compression-and-caching.md +++ b/website/docs/developer-guide/context-compression-and-caching.md @@ -143,6 +143,10 @@ to find the parent assistant message, keeping groups intact. ### Phase 3: Generate Structured Summary +:::warning Summary model context length +The summary model must have a context window **at least as large** as the main agent model's. The entire middle section is sent to the summary model in a single `call_llm(task="compression")` call. 
If the middle section exceeds the summary model's context window, the API returns a context-length error — `_generate_summary()` catches it, logs a warning, and returns `None`. The compressor then drops the middle turns **without a summary**, silently losing conversation context. This is the most common cause of degraded compaction quality.
+:::
+
 The middle turns are summarized using the auxiliary LLM with a structured template:

diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md
index a8cb23f99..7b735bbde 100644
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -480,7 +480,9 @@ Points at a custom OpenAI-compatible endpoint. Uses `OPENAI_API_KEY` for auth.
 | `nous` / `openrouter` / etc. | not set | Force that provider, use its auth |
 | any | set | Use the custom endpoint directly (provider ignored) |

-The `summary_model` must support a context length at least as large as your main model's, since it receives the full middle section of the conversation for compression.
+:::warning Summary model context length requirement
+The `summary_model` **must** have a context window at least as large as your main agent model's. The compressor sends the full middle section of the conversation to the summary model — if that middle section exceeds the summary model's context window, the summarization call fails with a context-length error. When this happens, the middle turns are **dropped without a summary**, losing conversation context silently. If you override `summary_model`, verify that its context length meets or exceeds your main model's.
+::: ## Context Engine diff --git a/website/docs/user-guide/messaging/feishu.md b/website/docs/user-guide/messaging/feishu.md index 5a7e06b72..ac4bad239 100644 --- a/website/docs/user-guide/messaging/feishu.md +++ b/website/docs/user-guide/messaging/feishu.md @@ -212,7 +212,24 @@ When users click buttons or interact with interactive cards sent by the bot, the Card action events are dispatched with `MessageType.COMMAND`, so they flow through the normal command processing pipeline. -To use this feature, enable the **Interactive Card** event in your Feishu app's event subscriptions (`card.action.trigger`). +This is also how **command approval** works — when the agent needs to run a dangerous command, it sends an interactive card with Allow Once / Session / Always / Deny buttons. The user clicks a button, and the card action callback delivers the approval decision back to the agent. + +### Required Feishu App Configuration + +Interactive cards require **three** configuration steps in the Feishu Developer Console. Missing any of them causes error **200340** when users click card buttons. + +1. **Subscribe to the card action event:** + In **Event Subscriptions**, add `card.action.trigger` to your subscribed events. + +2. **Enable the Interactive Card capability:** + In **App Features > Bot**, ensure the **Interactive Card** toggle is enabled. This tells Feishu that your app can receive card action callbacks. + +3. **Configure the Card Request URL (webhook mode only):** + In **App Features > Bot > Message Card Request URL**, set the URL to the same endpoint as your event webhook (e.g. `https://your-server:8765/feishu/webhook`). In WebSocket mode this is handled automatically by the SDK. + +:::warning +Without all three steps, Feishu will successfully *send* interactive cards (sending only requires `im:message:send` permission), but clicking any button will return error 200340. The card appears to work — the error only surfaces when a user interacts with it. 
+::: ## Media Support @@ -412,6 +429,7 @@ WebSocket and per-group ACL settings are configured via `config.yaml` under `pla | Post messages show as plain text | The Feishu API rejected the post payload; this is normal fallback behavior. Check logs for details. | | Images/files not received by bot | Grant `im:message` and `im:resource` permission scopes to your Feishu app | | Bot identity not auto-detected | Grant `admin:app.info:readonly` scope, or set `FEISHU_BOT_OPEN_ID` / `FEISHU_BOT_NAME` manually | +| Error 200340 when clicking approval buttons | Enable **Interactive Card** capability and configure **Card Request URL** in the Feishu Developer Console. See [Required Feishu App Configuration](#required-feishu-app-configuration) above. | | `Webhook rate limit exceeded` | More than 120 requests/minute from the same IP. This is usually a misconfiguration or loop. | ## Toolset