Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor

This commit is contained in:
Brooklyn Nicholson 2026-04-11 13:14:36 -05:00
commit bf6af95ff5
16 changed files with 644 additions and 75 deletions

View file

@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
# Extract capability flags (default to False if missing)
supports_tools = bool(entry.get("tool_call", False))
supports_vision = bool(entry.get("attachment", False))
# Vision: check both the `attachment` flag and `modalities.input` for "image".
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
input_mods = entry.get("modalities", {})
if isinstance(input_mods, dict):
input_mods = input_mods.get("input", [])
else:
input_mods = []
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
supports_reasoning = bool(entry.get("reasoning", False))
# Extract limits

View file

@ -190,7 +190,7 @@ class StreamingConfig:
"""Configuration for real-time token streaming to messaging platforms."""
enabled: bool = False
transport: str = "edit" # "edit" (progressive editMessageText) or "off"
edit_interval: float = 0.3 # Seconds between message edits
edit_interval: float = 1.0 # Seconds between message edits (Telegram rate-limits at ~1/s)
buffer_threshold: int = 40 # Chars before forcing an edit
cursor: str = "" # Cursor shown during streaming
@ -210,7 +210,7 @@ class StreamingConfig:
return cls(
enabled=data.get("enabled", False),
transport=data.get("transport", "edit"),
edit_interval=float(data.get("edit_interval", 0.3)),
edit_interval=float(data.get("edit_interval", 1.0)),
buffer_threshold=int(data.get("buffer_threshold", 40)),
cursor=data.get("cursor", ""),
)

View file

@ -352,7 +352,16 @@ class MatrixAdapter(BasePlatformAdapter):
from mautrix.crypto import OlmMachine
from mautrix.crypto.store import MemoryCryptoStore
crypto_store = MemoryCryptoStore()
# account_id and pickle_key are required by mautrix ≥0.21.
# Use the Matrix user ID as account_id for stable identity.
# pickle_key secures in-memory serialisation; derive from
# the same user_id:device_id pair used for the on-disk HMAC.
_acct_id = self._user_id or "hermes"
_pickle_key = f"{_acct_id}:{self._device_id}"
crypto_store = MemoryCryptoStore(
account_id=_acct_id,
pickle_key=_pickle_key,
)
# Restore persisted crypto state from a previous run.
# Uses HMAC to verify integrity before unpickling.
@ -418,6 +427,11 @@ class MatrixAdapter(BasePlatformAdapter):
if isinstance(sync_data, dict):
rooms_join = sync_data.get("rooms", {}).get("join", {})
self._joined_rooms = set(rooms_join.keys())
# Store the next_batch token so incremental syncs start
# from where the initial sync left off.
nb = sync_data.get("next_batch")
if nb:
await client.sync_store.put_next_batch(nb)
logger.info(
"Matrix: initial sync complete, joined %d rooms",
len(self._joined_rooms),
@ -809,19 +823,40 @@ class MatrixAdapter(BasePlatformAdapter):
async def _sync_loop(self) -> None:
"""Continuously sync with the homeserver."""
client = self._client
# Resume from the token stored during the initial sync.
next_batch = await client.sync_store.get_next_batch()
while not self._closing:
try:
sync_data = await self._client.sync(timeout=30000)
sync_data = await client.sync(
since=next_batch, timeout=30000,
)
if isinstance(sync_data, dict):
# Update joined rooms from sync response.
rooms_join = sync_data.get("rooms", {}).get("join", {})
if rooms_join:
self._joined_rooms.update(rooms_join.keys())
# Share keys periodically if E2EE is enabled.
if self._encryption and getattr(self._client, "crypto", None):
# Advance the sync token so the next request is
# incremental instead of a full initial sync.
nb = sync_data.get("next_batch")
if nb:
next_batch = nb
await client.sync_store.put_next_batch(nb)
# Dispatch events to registered handlers so that
# _on_room_message / _on_reaction / _on_invite fire.
try:
await self._client.crypto.share_keys()
tasks = client.handle_sync(sync_data)
if tasks:
await asyncio.gather(*tasks)
except Exception as exc:
logger.warning("Matrix: sync event dispatch error: %s", exc)
# Share keys periodically if E2EE is enabled.
if self._encryption and getattr(client, "crypto", None):
try:
await client.crypto.share_keys()
except Exception as exc:
logger.warning("Matrix: E2EE key share failed: %s", exc)

View file

@ -36,7 +36,7 @@ _NEW_SEGMENT = object()
@dataclass
class StreamConsumerConfig:
"""Runtime config for a single stream consumer instance."""
edit_interval: float = 0.3
edit_interval: float = 1.0
buffer_threshold: int = 40
cursor: str = ""
@ -56,6 +56,10 @@ class GatewayStreamConsumer:
await task # wait for final edit
"""
# After this many consecutive flood-control failures, permanently disable
# progressive edits for the remainder of the stream.
_MAX_FLOOD_STRIKES = 3
def __init__(
self,
adapter: Any,
@ -76,6 +80,8 @@ class GatewayStreamConsumer:
self._last_sent_text = "" # Track last-sent text to skip redundant edits
self._fallback_final_send = False
self._fallback_prefix = ""
self._flood_strikes = 0 # Consecutive flood-control edit failures
self._current_edit_interval = self.cfg.edit_interval # Adaptive backoff
@property
def already_sent(self) -> bool:
@ -129,7 +135,7 @@ class GatewayStreamConsumer:
should_edit = (
got_done
or got_segment_break
or (elapsed >= self.cfg.edit_interval
or (elapsed >= self._current_edit_interval
and self._accumulated)
or len(self._accumulated) >= self.cfg.buffer_threshold
)
@ -173,12 +179,13 @@ class GatewayStreamConsumer:
if split_at < _safe_limit // 2:
split_at = _safe_limit
chunk = self._accumulated[:split_at]
await self._send_or_edit(chunk)
if self._fallback_final_send:
# Edit failed while attempting to split an oversized
# message. Keep the full accumulated text intact so
# the fallback final-send path can deliver the
# remaining continuation without dropping content.
ok = await self._send_or_edit(chunk)
if self._fallback_final_send or not ok:
# Edit failed (or backed off due to flood control)
# while attempting to split an oversized message.
# Keep the full accumulated text intact so the
# fallback final-send path can deliver the remaining
# continuation without dropping content.
break
self._accumulated = self._accumulated[split_at:].lstrip("\n")
self._message_id = None
@ -322,7 +329,10 @@ class GatewayStreamConsumer:
return chunks
async def _send_fallback_final(self, text: str) -> None:
"""Send the final continuation after streaming edits stop working."""
"""Send the final continuation after streaming edits stop working.
Retries each chunk once on flood-control failures with a short delay.
"""
final_text = self._clean_for_display(text)
continuation = self._continuation_text(final_text)
self._fallback_final_send = False
@ -339,12 +349,25 @@ class GatewayStreamConsumer:
last_successful_chunk = ""
sent_any_chunk = False
for chunk in chunks:
result = await self.adapter.send(
chat_id=self.chat_id,
content=chunk,
metadata=self.metadata,
)
if not result.success:
# Try sending with one retry on flood-control errors.
result = None
for attempt in range(2):
result = await self.adapter.send(
chat_id=self.chat_id,
content=chunk,
metadata=self.metadata,
)
if result.success:
break
if attempt == 0 and self._is_flood_error(result):
logger.debug(
"Flood control on fallback send, retrying in 3s"
)
await asyncio.sleep(3.0)
else:
break # non-flood error or second attempt failed
if not result or not result.success:
if sent_any_chunk:
# Some continuation text already reached the user. Suppress
# the base gateway final-send path so we don't resend the
@ -370,20 +393,52 @@ class GatewayStreamConsumer:
self._last_sent_text = chunks[-1]
self._fallback_prefix = ""
async def _send_or_edit(self, text: str) -> None:
"""Send or edit the streaming message."""
def _is_flood_error(self, result) -> bool:
"""Check if a SendResult failure is due to flood control / rate limiting."""
err = getattr(result, "error", "") or ""
err_lower = err.lower()
return "flood" in err_lower or "retry after" in err_lower or "rate" in err_lower
async def _try_strip_cursor(self) -> None:
"""Best-effort edit to remove the cursor from the last visible message.
Called when entering fallback mode so the user doesn't see a stuck
cursor () in the partial message.
"""
if not self._message_id or self._message_id == "__no_edit__":
return
prefix = self._visible_prefix()
if not prefix or not prefix.strip():
return
try:
await self.adapter.edit_message(
chat_id=self.chat_id,
message_id=self._message_id,
content=prefix,
)
self._last_sent_text = prefix
except Exception:
pass # best-effort — don't let this block the fallback path
async def _send_or_edit(self, text: str) -> bool:
"""Send or edit the streaming message.
Returns True if the text was successfully delivered (sent or edited),
False otherwise. Callers like the overflow split loop use this to
decide whether to advance past the delivered chunk.
"""
# Strip MEDIA: directives so they don't appear as visible text.
# Media files are delivered as native attachments after the stream
# finishes (via _deliver_media_from_response in gateway/run.py).
text = self._clean_for_display(text)
if not text.strip():
return
return True # nothing to send is "success"
try:
if self._message_id is not None:
if self._edit_supported:
# Skip if text is identical to what we last sent
if text == self._last_sent_text:
return
return True
# Edit existing message
result = await self.adapter.edit_message(
chat_id=self.chat_id,
@ -393,19 +448,52 @@ class GatewayStreamConsumer:
if result.success:
self._already_sent = True
self._last_sent_text = text
# Successful edit — reset flood strike counter
self._flood_strikes = 0
return True
else:
# If an edit fails mid-stream (especially Telegram flood control),
# stop progressive edits and send only the missing tail once the
# Edit failed. If this looks like flood control / rate
# limiting, use adaptive backoff: double the edit interval
# and retry on the next cycle. Only permanently disable
# edits after _MAX_FLOOD_STRIKES consecutive failures.
if self._is_flood_error(result):
self._flood_strikes += 1
self._current_edit_interval = min(
self._current_edit_interval * 2, 10.0,
)
logger.debug(
"Flood control on edit (strike %d/%d), "
"backoff interval → %.1fs",
self._flood_strikes,
self._MAX_FLOOD_STRIKES,
self._current_edit_interval,
)
if self._flood_strikes < self._MAX_FLOOD_STRIKES:
# Don't disable edits yet — just slow down.
# Update _last_edit_time so the next edit
# respects the new interval.
self._last_edit_time = time.monotonic()
return False
# Non-flood error OR flood strikes exhausted: enter
# fallback mode — send only the missing tail once the
# final response is available.
logger.debug("Edit failed, disabling streaming for this adapter")
logger.debug(
"Edit failed (strikes=%d), entering fallback mode",
self._flood_strikes,
)
self._fallback_prefix = self._visible_prefix()
self._fallback_final_send = True
self._edit_supported = False
self._already_sent = True
# Best-effort: strip the cursor from the last visible
# message so the user doesn't see a stuck ▉.
await self._try_strip_cursor()
return False
else:
# Editing not supported — skip intermediate updates.
# The final response will be sent by the fallback path.
pass
return False
else:
# First message — send new
result = await self.adapter.send(
@ -417,6 +505,7 @@ class GatewayStreamConsumer:
self._message_id = result.message_id
self._already_sent = True
self._last_sent_text = text
return True
elif result.success:
# Platform accepted the message but returned no message_id
# (e.g. Signal). Can't edit without an ID — switch to
@ -428,8 +517,11 @@ class GatewayStreamConsumer:
self._fallback_final_send = True
# Sentinel prevents re-entering this branch on every delta
self._message_id = "__no_edit__"
return True # platform accepted, just can't edit
else:
# Initial send failed — disable streaming for this session
self._edit_supported = False
return False
except Exception as e:
logger.error("Stream send/edit error: %s", e)
return False

View file

@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
"base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider)
"api_key": "", # API key for base_url (falls back to OPENAI_API_KEY)
"timeout": 30, # seconds — LLM API call timeout; increase for slow local vision models
"timeout": 120, # seconds — LLM API call timeout; vision payloads need generous timeout
"download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections
},
"web_extract": {

View file

@ -56,6 +56,18 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
_openrouter_catalog_cache: list[tuple[str, str]] | None = None
def _codex_curated_models() -> list[str]:
    """Build the curated openai-codex model list from codex_models.py.

    ``DEFAULT_CODEX_MODELS`` plus the forward-compat synthesis is the single
    source of truth, so the gateway /model picker stays in sync with the CLI
    `hermes model` flow without a duplicated static list.
    """
    # Imported lazily so this module does not hard-depend on hermes_cli
    # at import time.
    from hermes_cli.codex_models import (
        DEFAULT_CODEX_MODELS,
        _add_forward_compat_models,
    )

    curated = list(DEFAULT_CODEX_MODELS)
    return _add_forward_compat_models(curated)
_PROVIDER_MODELS: dict[str, list[str]] = {
"nous": [
"anthropic/claude-opus-4.6",
@ -86,14 +98,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
"openai/gpt-5.4-pro",
"openai/gpt-5.4-nano",
],
"openai-codex": [
"gpt-5.4",
"gpt-5.4-mini",
"gpt-5.3-codex",
"gpt-5.2-codex",
"gpt-5.1-codex-mini",
"gpt-5.1-codex-max",
],
"openai-codex": _codex_curated_models(),
"copilot-acp": [
"copilot-acp",
],

View file

@ -700,10 +700,14 @@ class AIAgent:
except Exception:
pass
# Direct OpenAI sessions use the Responses API path. GPT-5.x tool
# calls with reasoning are rejected on /v1/chat/completions, and
# Hermes is a tool-using client by default.
if self.api_mode == "chat_completions" and self._is_direct_openai_url():
# GPT-5.x models require the Responses API path — they are rejected
# on /v1/chat/completions by both OpenAI and OpenRouter. Also
# auto-upgrade for direct OpenAI URLs (api.openai.com) since all
# newer tool-calling models prefer Responses there.
if self.api_mode == "chat_completions" and (
self._is_direct_openai_url()
or self._model_requires_responses_api(self.model)
):
self.api_mode = "codex_responses"
# Pre-warm OpenRouter model metadata cache in a background thread.
@ -1702,6 +1706,21 @@ class AIAgent:
"""Return True when the base URL targets OpenRouter."""
return "openrouter" in self._base_url_lower
@staticmethod
def _model_requires_responses_api(model: str) -> bool:
"""Return True for models that require the Responses API path.
GPT-5.x models are rejected on /v1/chat/completions by both
OpenAI and OpenRouter (error: ``unsupported_api_for_model``).
Detect these so the correct api_mode is set regardless of
which provider is serving the model.
"""
m = model.lower()
# Strip vendor prefix (e.g. "openai/gpt-5.4" → "gpt-5.4")
if "/" in m:
m = m.rsplit("/", 1)[-1]
return m.startswith("gpt-5")
def _max_tokens_param(self, value: int) -> dict:
"""Return the correct max tokens kwarg for the current provider.
@ -5252,7 +5271,7 @@ class AIAgent:
except Exception:
pass
# Determine api_mode from provider / base URL
# Determine api_mode from provider / base URL / model
fb_api_mode = "chat_completions"
fb_base_url = str(fb_client.base_url)
if fb_provider == "openai-codex":
@ -5261,6 +5280,10 @@ class AIAgent:
fb_api_mode = "anthropic_messages"
elif self._is_direct_openai_url(fb_base_url):
fb_api_mode = "codex_responses"
elif self._model_requires_responses_api(fb_model):
# GPT-5.x models need Responses API on every provider
# (OpenRouter, Copilot, direct OpenAI, etc.)
fb_api_mode = "codex_responses"
old_model = self.model
self.model = fb_model
@ -5349,8 +5372,8 @@ class AIAgent:
to the fallback provider for every subsequent turn. Calling this at
the top of ``run_conversation()`` makes fallback turn-scoped.
The gateway creates a fresh agent per message so this is a no-op
there (``_fallback_activated`` is always False at turn start).
The gateway caches agents across messages (``_agent_cache`` in
``gateway/run.py``), so this restoration IS needed there too.
"""
if not self._fallback_activated:
return False

View file

@ -7,6 +7,7 @@ from agent.models_dev import (
PROVIDER_TO_MODELS_DEV,
_extract_context,
fetch_models_dev,
get_model_capabilities,
lookup_models_dev_context,
)
@ -195,3 +196,88 @@ class TestFetchModelsDev:
result = fetch_models_dev()
mock_get.assert_not_called()
assert result == SAMPLE_REGISTRY
# ---------------------------------------------------------------------------
# get_model_capabilities — vision via modalities.input
# ---------------------------------------------------------------------------
# Shared registry fixture for the get_model_capabilities tests below.
# Mirrors the models.dev JSON shape: provider -> {"models": {name: entry}}.
CAPS_REGISTRY = {
    "google": {
        "id": "google",
        "models": {
            # Vision via modalities.input only (attachment=False) — the
            # exact case the modalities fallback was added to cover.
            "gemma-4-31b-it": {
                "id": "gemma-4-31b-it",
                "attachment": False,
                "tool_call": True,
                "modalities": {"input": ["text", "image"]},
                "limit": {"context": 128000, "output": 8192},
            },
            # Text-only model: no attachment flag, no modalities entry.
            "gemma-3-1b": {
                "id": "gemma-3-1b",
                "tool_call": True,
                "limit": {"context": 32000, "output": 8192},
            },
        },
    },
    "anthropic": {
        "id": "anthropic",
        "models": {
            # Vision via the classic attachment=True flag.
            "claude-sonnet-4": {
                "id": "claude-sonnet-4",
                "attachment": True,
                "tool_call": True,
                "limit": {"context": 200000, "output": 64000},
            },
        },
    },
}
class TestGetModelCapabilities:
    """Tests for get_model_capabilities vision detection."""

    def test_vision_from_attachment_flag(self):
        """Models with attachment=True should report supports_vision=True."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("anthropic", "claude-sonnet-4")
            assert caps is not None
            assert caps.supports_vision is True

    def test_vision_from_modalities_input_image(self):
        """Models with 'image' in modalities.input but attachment=False should
        still report supports_vision=True (the core fix in this PR)."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("google", "gemma-4-31b-it")
            assert caps is not None
            assert caps.supports_vision is True

    def test_no_vision_without_attachment_or_modalities(self):
        """Models with neither attachment nor image modality should be non-vision."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("google", "gemma-3-1b")
            assert caps is not None
            assert caps.supports_vision is False

    def test_modalities_non_dict_handled(self):
        """Non-dict modalities field should not crash."""
        registry = {
            "google": {"id": "google", "models": {
                "weird-model": {
                    "id": "weird-model",
                    "modalities": "text",  # not a dict
                    "limit": {"context": 200000, "output": 8192},
                },
            }},
        }
        # NOTE(review): provider "gemini" is presumably aliased to the
        # "google" registry entry via PROVIDER_TO_MODELS_DEV — confirm.
        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
            caps = get_model_capabilities("gemini", "weird-model")
            assert caps is not None
            assert caps.supports_vision is False

    def test_model_not_found_returns_none(self):
        """Unknown model should return None."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("anthropic", "nonexistent-model")
            assert caps is None

View file

@ -157,7 +157,9 @@ def _make_fake_mautrix():
mautrix_crypto_store = types.ModuleType("mautrix.crypto.store")
class MemoryCryptoStore:
pass
def __init__(self, account_id="", pickle_key=""):
self.account_id = account_id
self.pickle_key = pickle_key
mautrix_crypto_store.MemoryCryptoStore = MemoryCryptoStore
@ -1041,20 +1043,28 @@ class TestMatrixSyncLoop:
call_count += 1
if call_count >= 1:
adapter._closing = True
return {"rooms": {"join": {"!room:example.org": {}}}}
return {"rooms": {"join": {"!room:example.org": {}}}, "next_batch": "s1234"}
mock_crypto = MagicMock()
mock_crypto.share_keys = AsyncMock()
mock_sync_store = MagicMock()
mock_sync_store.get_next_batch = AsyncMock(return_value=None)
mock_sync_store.put_next_batch = AsyncMock()
fake_client = MagicMock()
fake_client.sync = AsyncMock(side_effect=_sync_once)
fake_client.crypto = mock_crypto
fake_client.sync_store = mock_sync_store
fake_client.handle_sync = MagicMock(return_value=[])
adapter._client = fake_client
await adapter._sync_loop()
fake_client.sync.assert_awaited_once()
mock_crypto.share_keys.assert_awaited_once()
fake_client.handle_sync.assert_called_once()
mock_sync_store.put_next_batch.assert_awaited_once_with("s1234")
class TestMatrixEncryptedSendFallback:

View file

@ -222,6 +222,12 @@ def test_api_mode_normalizes_provider_case(monkeypatch):
def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypatch):
"""GPT-5.x models need codex_responses even on OpenRouter.
OpenRouter rejects GPT-5 models on /v1/chat/completions with
``unsupported_api_for_model``. The model-level check overrides
the provider default.
"""
_patch_agent_bootstrap(monkeypatch)
agent = run_agent.AIAgent(
model="gpt-5-codex",
@ -233,7 +239,7 @@ def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypat
skip_context_files=True,
skip_memory=True,
)
assert agent.api_mode == "chat_completions"
assert agent.api_mode == "codex_responses"
assert agent.provider == "openrouter"

View file

@ -15,6 +15,10 @@ from tools.vision_tools import (
_handle_vision_analyze,
_determine_mime_type,
_image_to_base64_data_url,
_resize_image_for_vision,
_is_image_size_error,
_MAX_BASE64_BYTES,
_RESIZE_TARGET_BYTES,
vision_analyze_tool,
check_vision_requirements,
get_debug_session_info,
@ -590,11 +594,13 @@ class TestBase64SizeLimit:
@pytest.mark.asyncio
async def test_oversized_image_rejected_before_api_call(self, tmp_path):
"""Images exceeding 5 MB base64 should fail with a clear size error."""
"""Images exceeding the 20 MB hard limit should fail with a clear error."""
img = tmp_path / "huge.png"
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
# Patch the hard limit to a small value so the test runs fast.
with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
result = json.loads(await vision_analyze_tool(str(img), "describe this"))
assert result["success"] is False
@ -686,3 +692,124 @@ class TestVisionRegistration:
entry = registry._tools.get("vision_analyze")
assert callable(entry.handler)
# ---------------------------------------------------------------------------
# _resize_image_for_vision — auto-resize oversized images
# ---------------------------------------------------------------------------
class TestResizeImageForVision:
    """Tests for the auto-resize function."""

    def test_small_image_returned_as_is(self, tmp_path):
        """Images under the limit should be returned unchanged."""
        # Create a small 10x10 red PNG
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        img = Image.new("RGB", (10, 10), (255, 0, 0))
        path = tmp_path / "small.png"
        img.save(path, "PNG")
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        assert len(result) < _MAX_BASE64_BYTES

    def test_large_image_is_resized(self, tmp_path):
        """Images over the default target should be auto-resized to fit."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        # Create a large image that will exceed 5 MB in base64
        # A 4000x4000 uncompressed PNG will be large
        img = Image.new("RGB", (4000, 4000), (128, 200, 50))
        path = tmp_path / "large.png"
        img.save(path, "PNG")
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        # Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB)
        assert len(result) <= _RESIZE_TARGET_BYTES

    def test_custom_max_bytes(self, tmp_path):
        """The max_base64_bytes parameter should be respected."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        img = Image.new("RGB", (200, 200), (0, 128, 255))
        path = tmp_path / "medium.png"
        img.save(path, "PNG")
        # Set a very low limit to force resizing
        result = _resize_image_for_vision(path, max_base64_bytes=500)
        # Should still return a valid data URL even when the limit is
        # unreachable (best-effort candidate is returned).
        assert result.startswith("data:image/")

    def test_jpeg_output_for_non_png(self, tmp_path):
        """Non-PNG images should be resized as JPEG."""
        try:
            from PIL import Image
        except ImportError:
            pytest.skip("Pillow not installed")
        img = Image.new("RGB", (2000, 2000), (255, 128, 0))
        path = tmp_path / "photo.jpg"
        img.save(path, "JPEG", quality=95)
        result = _resize_image_for_vision(path, mime_type="image/jpeg",
                                          max_base64_bytes=50_000)
        assert result.startswith("data:image/jpeg;base64,")

    def test_constants_sane(self):
        """Hard limit should be larger than resize target."""
        assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
        assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
        assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES

    def test_no_pillow_returns_original(self, tmp_path):
        """Without Pillow, oversized images should be returned as-is."""
        # Create a dummy file
        path = tmp_path / "test.png"
        # Write enough bytes to exceed a tiny limit
        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)
        with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64:
            # Simulate a large base64 result
            mock_b64.return_value = "data:image/png;base64," + "A" * 200
            # Blanking PIL in sys.modules makes `from PIL import Image`
            # raise ImportError inside the function under test.
            with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
                result = _resize_image_for_vision(path, max_base64_bytes=100)
        # Should return the original (oversized) data url
        assert len(result) > 100
# ---------------------------------------------------------------------------
# _is_image_size_error — detect size-related API errors
# ---------------------------------------------------------------------------
class TestIsImageSizeError:
    """Tests for the size-error detection helper.

    Positive cases cover the hint substrings ("too large", "413",
    "invalid_request", "exceeds"); negative cases pin that unrelated
    connection/auth errors and empty messages are not misclassified.
    """

    def test_too_large_message(self):
        assert _is_image_size_error(Exception("Request payload too large"))

    def test_413_status(self):
        assert _is_image_size_error(Exception("HTTP 413 Payload Too Large"))

    def test_invalid_request(self):
        assert _is_image_size_error(Exception("invalid_request_error: image too big"))

    def test_exceeds_limit(self):
        assert _is_image_size_error(Exception("Image exceeds maximum size"))

    def test_unrelated_error(self):
        assert not _is_image_size_error(Exception("Connection refused"))

    def test_auth_error(self):
        assert not _is_image_size_error(Exception("401 Unauthorized"))

    def test_empty_message(self):
        assert not _is_image_size_error(Exception(""))

View file

@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
),
}, ensure_ascii=False)
# Read and convert to base64
image_data = screenshot_path.read_bytes()
image_base64 = base64.b64encode(image_data).decode("ascii")
data_url = f"data:image/png;base64,{image_base64}"
# Convert screenshot to base64 at full resolution.
_screenshot_bytes = screenshot_path.read_bytes()
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
data_url = f"data:image/png;base64,{_screenshot_b64}"
vision_prompt = (
f"You are analyzing a screenshot of a web browser.\n\n"
@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
# Use the centralized LLM router
vision_model = _get_vision_model()
logger.debug("browser_vision: analysing screenshot (%d bytes)",
len(image_data))
len(_screenshot_bytes))
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
# Local vision models (llama.cpp, ollama) can take well over 30s for
@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
}
if vision_model:
call_kwargs["model"] = vision_model
response = call_llm(**call_kwargs)
# Try full-size screenshot; on size-related rejection, downscale and retry.
try:
response = call_llm(**call_kwargs)
except Exception as _api_err:
from tools.vision_tools import (
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
)
if (_is_image_size_error(_api_err)
and len(data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"Vision API rejected screenshot (%.1f MB); "
"auto-resizing to ~%.0f MB and retrying...",
len(data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
data_url = _resize_image_for_vision(
screenshot_path, mime_type="image/png")
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
response = call_llm(**call_kwargs)
else:
raise
analysis = (response.choices[0].message.content or "").strip()
# Redact secrets the vision LLM may have read from the screenshot.

View file

@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
return data_url
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit). Images above this are rejected.
_MAX_BASE64_BYTES = 20 * 1024 * 1024
# Target size when auto-resizing on API failure (5 MB). After a provider
# rejects an image, we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
def _is_image_size_error(error: Exception) -> bool:
"""Detect if an API error is related to image or payload size."""
err_str = str(error).lower()
return any(hint in err_str for hint in (
"too large", "payload", "413", "content_too_large",
"request_too_large", "image_url", "invalid_request",
"exceeds", "size limit",
))
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    Tries Pillow first to progressively downscale oversized images. If Pillow
    is not installed or resizing still exceeds the limit, falls back to the raw
    bytes and lets the caller handle the size check.

    Args:
        image_path: Path to the image file on disk.
        mime_type: Explicit MIME type; detected from the file when omitted.
        max_base64_bytes: Size budget for the final data URL string.

    Returns the base64 data URL string.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    if estimated_b64 <= max_base64_bytes:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
            return data_url
        # Estimate was optimistic: the encoded URL is still over budget,
        # so fall through to the Pillow resize path with data_url kept
        # as a last-resort return value.
    else:
        data_url = None  # defer full encode; try Pillow resize first
    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error
    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))
    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
    try:
        img = Image.open(image_path)
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # fall through to size-check in caller
    # Convert RGBA to RGB for JPEG output
    if pil_format == "JPEG" and img.mode in ("RGBA", "P"):
        img = img.convert("RGB")
    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    prev_dims = (img.width, img.height)
    candidate = None  # will be set on first loop iteration
    for attempt in range(5):
        if attempt > 0:
            # Halve, but never shrink below 64px on either axis.
            new_w = max(img.width // 2, 64)
            new_h = max(img.height // 2, 64)
            # Stop if dimensions can't shrink further
            if (new_w, new_h) == prev_dims:
                break
            img = img.resize((new_w, new_h), Image.LANCZOS)
            prev_dims = (new_w, new_h)
            logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)
        for q in quality_steps:
            buf = _io.BytesIO()
            save_kwargs = {"format": pil_format}
            if q is not None:
                save_kwargs["quality"] = q
            img.save(buf, **save_kwargs)
            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
            candidate = f"data:{out_mime};base64,{encoded}"
            if len(candidate) <= max_base64_bytes:
                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                            len(candidate) / (1024 * 1024), q,
                            img.width, img.height)
                return candidate
    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate
    # Shouldn't reach here, but fall back to full encode
    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
async def vision_analyze_tool(
image_url: str,
user_prompt: str,
@ -376,24 +490,27 @@ async def vision_analyze_tool(
if not detected_mime_type:
raise ValueError("Only real image files are supported for vision analysis.")
# Convert image to base64 data URL
# Convert image to base64 — send at full resolution first.
# If the provider rejects it as too large, we auto-resize and retry.
logger.info("Converting image to base64...")
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
# Calculate size in KB for better readability
data_size_kb = len(image_data_url) / 1024
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
# Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
# Reject early with a clear message instead of a cryptic provider 400.
_MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB
# The data URL includes the header (e.g. "data:image/jpeg;base64,") which
# is negligible, but measure the full string to be safe.
# Hard limit (20 MB) — no provider accepts payloads this large.
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
f"Resize or compress the image and try again."
)
# Try to resize down to 5 MB before giving up.
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB "
f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
f"even after resizing. "
f"Install Pillow (`pip install Pillow`) for better auto-resize, "
f"or compress the image manually."
)
debug_call_data["image_size_bytes"] = image_size_bytes
@ -442,7 +559,24 @@ async def vision_analyze_tool(
}
if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
# Try full-size image first; on size-related rejection, downscale and retry.
try:
response = await async_call_llm(**call_kwargs)
except Exception as _api_err:
if (_is_image_size_error(_api_err)
and len(image_data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"API rejected image (%.1f MB, likely too large); "
"auto-resizing to ~%.0f MB and retrying...",
len(image_data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
messages[0]["content"][1]["image_url"]["url"] = image_data_url
response = await async_call_llm(**call_kwargs)
else:
raise
# Extract the analysis — fall back to reasoning if content is empty
analysis = extract_content_or_reasoning(response)
@ -498,8 +632,8 @@ async def vision_analyze_tool(
elif "invalid_request" in err_str or "image_url" in err_str:
analysis = (
"The vision API rejected the image. This can happen when the "
"image is too large, in an unsupported format, or corrupted. "
"Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
"image is in an unsupported format, corrupted, or still too "
"large after auto-resize. Try a smaller JPEG/PNG and retry. "
f"Error: {e}"
)
else:

View file

@ -143,6 +143,10 @@ to find the parent assistant message, keeping groups intact.
### Phase 3: Generate Structured Summary
:::warning Summary model context length
The summary model must have a context window **at least as large** as the main agent model's, because the entire middle section is sent to it in a single `call_llm(task="compression")` call. If the summary model's context window is too small, the API returns a context-length error — `_generate_summary()` catches it, logs a warning, and returns `None`. The compressor then drops the middle turns **without a summary**, silently losing conversation context. This is the most common cause of degraded compaction quality.
:::
The middle turns are summarized using the auxiliary LLM with a structured
template:

View file

@ -480,7 +480,9 @@ Points at a custom OpenAI-compatible endpoint. Uses `OPENAI_API_KEY` for auth.
| `nous` / `openrouter` / etc. | not set | Force that provider, use its auth |
| any | set | Use the custom endpoint directly (provider ignored) |
The `summary_model` must support a context length at least as large as your main model's, since it receives the full middle section of the conversation for compression.
:::warning Summary model context length requirement
The `summary_model` **must** have a context window at least as large as your main agent model's. The compressor sends the full middle section of the conversation to the summary model — if that model's context window is smaller than the main model's, the summarization call will fail with a context length error. When this happens, the middle turns are **dropped without a summary**, losing conversation context silently. If you override `summary_model`, verify its context length meets or exceeds your main model's.
:::
## Context Engine

View file

@ -212,7 +212,24 @@ When users click buttons or interact with interactive cards sent by the bot, the
Card action events are dispatched with `MessageType.COMMAND`, so they flow through the normal command processing pipeline.
To use this feature, enable the **Interactive Card** event in your Feishu app's event subscriptions (`card.action.trigger`).
This is also how **command approval** works — when the agent needs to run a dangerous command, it sends an interactive card with Allow Once / Session / Always / Deny buttons. The user clicks a button, and the card action callback delivers the approval decision back to the agent.
### Required Feishu App Configuration
Interactive cards require **three** configuration steps in the Feishu Developer Console. Missing any of them causes error **200340** when users click card buttons.
1. **Subscribe to the card action event:**
In **Event Subscriptions**, add `card.action.trigger` to your subscribed events.
2. **Enable the Interactive Card capability:**
In **App Features > Bot**, ensure the **Interactive Card** toggle is enabled. This tells Feishu that your app can receive card action callbacks.
3. **Configure the Card Request URL (webhook mode only):**
In **App Features > Bot > Message Card Request URL**, set the URL to the same endpoint as your event webhook (e.g. `https://your-server:8765/feishu/webhook`). In WebSocket mode this is handled automatically by the SDK.
:::warning
Without all three steps, your app can still *send* interactive cards successfully (sending only requires the `im:message:send` permission), but clicking any button returns error 200340. The card appears to work — the error only surfaces when a user interacts with it.
:::
## Media Support
@ -412,6 +429,7 @@ WebSocket and per-group ACL settings are configured via `config.yaml` under `pla
| Post messages show as plain text | The Feishu API rejected the post payload; this is normal fallback behavior. Check logs for details. |
| Images/files not received by bot | Grant `im:message` and `im:resource` permission scopes to your Feishu app |
| Bot identity not auto-detected | Grant `admin:app.info:readonly` scope, or set `FEISHU_BOT_OPEN_ID` / `FEISHU_BOT_NAME` manually |
| Error 200340 when clicking approval buttons | Enable **Interactive Card** capability and configure **Card Request URL** in the Feishu Developer Console. See [Required Feishu App Configuration](#required-feishu-app-configuration) above. |
| `Webhook rate limit exceeded` | More than 120 requests/minute from the same IP. This is usually a misconfiguration or loop. |
## Toolset