mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor
This commit is contained in:
commit
bf6af95ff5
16 changed files with 644 additions and 75 deletions
|
|
@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
|
|||
|
||||
# Extract capability flags (default to False if missing)
|
||||
supports_tools = bool(entry.get("tool_call", False))
|
||||
supports_vision = bool(entry.get("attachment", False))
|
||||
# Vision: check both the `attachment` flag and `modalities.input` for "image".
|
||||
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
|
||||
input_mods = entry.get("modalities", {})
|
||||
if isinstance(input_mods, dict):
|
||||
input_mods = input_mods.get("input", [])
|
||||
else:
|
||||
input_mods = []
|
||||
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
|
||||
supports_reasoning = bool(entry.get("reasoning", False))
|
||||
|
||||
# Extract limits
|
||||
|
|
|
|||
|
|
@ -190,7 +190,7 @@ class StreamingConfig:
|
|||
"""Configuration for real-time token streaming to messaging platforms."""
|
||||
enabled: bool = False
|
||||
transport: str = "edit" # "edit" (progressive editMessageText) or "off"
|
||||
edit_interval: float = 0.3 # Seconds between message edits
|
||||
edit_interval: float = 1.0 # Seconds between message edits (Telegram rate-limits at ~1/s)
|
||||
buffer_threshold: int = 40 # Chars before forcing an edit
|
||||
cursor: str = " ▉" # Cursor shown during streaming
|
||||
|
||||
|
|
@ -210,7 +210,7 @@ class StreamingConfig:
|
|||
return cls(
|
||||
enabled=data.get("enabled", False),
|
||||
transport=data.get("transport", "edit"),
|
||||
edit_interval=float(data.get("edit_interval", 0.3)),
|
||||
edit_interval=float(data.get("edit_interval", 1.0)),
|
||||
buffer_threshold=int(data.get("buffer_threshold", 40)),
|
||||
cursor=data.get("cursor", " ▉"),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -352,7 +352,16 @@ class MatrixAdapter(BasePlatformAdapter):
|
|||
from mautrix.crypto import OlmMachine
|
||||
from mautrix.crypto.store import MemoryCryptoStore
|
||||
|
||||
crypto_store = MemoryCryptoStore()
|
||||
# account_id and pickle_key are required by mautrix ≥0.21.
|
||||
# Use the Matrix user ID as account_id for stable identity.
|
||||
# pickle_key secures in-memory serialisation; derive from
|
||||
# the same user_id:device_id pair used for the on-disk HMAC.
|
||||
_acct_id = self._user_id or "hermes"
|
||||
_pickle_key = f"{_acct_id}:{self._device_id}"
|
||||
crypto_store = MemoryCryptoStore(
|
||||
account_id=_acct_id,
|
||||
pickle_key=_pickle_key,
|
||||
)
|
||||
|
||||
# Restore persisted crypto state from a previous run.
|
||||
# Uses HMAC to verify integrity before unpickling.
|
||||
|
|
@ -418,6 +427,11 @@ class MatrixAdapter(BasePlatformAdapter):
|
|||
if isinstance(sync_data, dict):
|
||||
rooms_join = sync_data.get("rooms", {}).get("join", {})
|
||||
self._joined_rooms = set(rooms_join.keys())
|
||||
# Store the next_batch token so incremental syncs start
|
||||
# from where the initial sync left off.
|
||||
nb = sync_data.get("next_batch")
|
||||
if nb:
|
||||
await client.sync_store.put_next_batch(nb)
|
||||
logger.info(
|
||||
"Matrix: initial sync complete, joined %d rooms",
|
||||
len(self._joined_rooms),
|
||||
|
|
@ -809,19 +823,40 @@ class MatrixAdapter(BasePlatformAdapter):
|
|||
|
||||
async def _sync_loop(self) -> None:
|
||||
"""Continuously sync with the homeserver."""
|
||||
client = self._client
|
||||
# Resume from the token stored during the initial sync.
|
||||
next_batch = await client.sync_store.get_next_batch()
|
||||
while not self._closing:
|
||||
try:
|
||||
sync_data = await self._client.sync(timeout=30000)
|
||||
sync_data = await client.sync(
|
||||
since=next_batch, timeout=30000,
|
||||
)
|
||||
if isinstance(sync_data, dict):
|
||||
# Update joined rooms from sync response.
|
||||
rooms_join = sync_data.get("rooms", {}).get("join", {})
|
||||
if rooms_join:
|
||||
self._joined_rooms.update(rooms_join.keys())
|
||||
|
||||
# Share keys periodically if E2EE is enabled.
|
||||
if self._encryption and getattr(self._client, "crypto", None):
|
||||
# Advance the sync token so the next request is
|
||||
# incremental instead of a full initial sync.
|
||||
nb = sync_data.get("next_batch")
|
||||
if nb:
|
||||
next_batch = nb
|
||||
await client.sync_store.put_next_batch(nb)
|
||||
|
||||
# Dispatch events to registered handlers so that
|
||||
# _on_room_message / _on_reaction / _on_invite fire.
|
||||
try:
|
||||
await self._client.crypto.share_keys()
|
||||
tasks = client.handle_sync(sync_data)
|
||||
if tasks:
|
||||
await asyncio.gather(*tasks)
|
||||
except Exception as exc:
|
||||
logger.warning("Matrix: sync event dispatch error: %s", exc)
|
||||
|
||||
# Share keys periodically if E2EE is enabled.
|
||||
if self._encryption and getattr(client, "crypto", None):
|
||||
try:
|
||||
await client.crypto.share_keys()
|
||||
except Exception as exc:
|
||||
logger.warning("Matrix: E2EE key share failed: %s", exc)
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ _NEW_SEGMENT = object()
|
|||
@dataclass
|
||||
class StreamConsumerConfig:
|
||||
"""Runtime config for a single stream consumer instance."""
|
||||
edit_interval: float = 0.3
|
||||
edit_interval: float = 1.0
|
||||
buffer_threshold: int = 40
|
||||
cursor: str = " ▉"
|
||||
|
||||
|
|
@ -56,6 +56,10 @@ class GatewayStreamConsumer:
|
|||
await task # wait for final edit
|
||||
"""
|
||||
|
||||
# After this many consecutive flood-control failures, permanently disable
|
||||
# progressive edits for the remainder of the stream.
|
||||
_MAX_FLOOD_STRIKES = 3
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
adapter: Any,
|
||||
|
|
@ -76,6 +80,8 @@ class GatewayStreamConsumer:
|
|||
self._last_sent_text = "" # Track last-sent text to skip redundant edits
|
||||
self._fallback_final_send = False
|
||||
self._fallback_prefix = ""
|
||||
self._flood_strikes = 0 # Consecutive flood-control edit failures
|
||||
self._current_edit_interval = self.cfg.edit_interval # Adaptive backoff
|
||||
|
||||
@property
|
||||
def already_sent(self) -> bool:
|
||||
|
|
@ -129,7 +135,7 @@ class GatewayStreamConsumer:
|
|||
should_edit = (
|
||||
got_done
|
||||
or got_segment_break
|
||||
or (elapsed >= self.cfg.edit_interval
|
||||
or (elapsed >= self._current_edit_interval
|
||||
and self._accumulated)
|
||||
or len(self._accumulated) >= self.cfg.buffer_threshold
|
||||
)
|
||||
|
|
@ -173,12 +179,13 @@ class GatewayStreamConsumer:
|
|||
if split_at < _safe_limit // 2:
|
||||
split_at = _safe_limit
|
||||
chunk = self._accumulated[:split_at]
|
||||
await self._send_or_edit(chunk)
|
||||
if self._fallback_final_send:
|
||||
# Edit failed while attempting to split an oversized
|
||||
# message. Keep the full accumulated text intact so
|
||||
# the fallback final-send path can deliver the
|
||||
# remaining continuation without dropping content.
|
||||
ok = await self._send_or_edit(chunk)
|
||||
if self._fallback_final_send or not ok:
|
||||
# Edit failed (or backed off due to flood control)
|
||||
# while attempting to split an oversized message.
|
||||
# Keep the full accumulated text intact so the
|
||||
# fallback final-send path can deliver the remaining
|
||||
# continuation without dropping content.
|
||||
break
|
||||
self._accumulated = self._accumulated[split_at:].lstrip("\n")
|
||||
self._message_id = None
|
||||
|
|
@ -322,7 +329,10 @@ class GatewayStreamConsumer:
|
|||
return chunks
|
||||
|
||||
async def _send_fallback_final(self, text: str) -> None:
|
||||
"""Send the final continuation after streaming edits stop working."""
|
||||
"""Send the final continuation after streaming edits stop working.
|
||||
|
||||
Retries each chunk once on flood-control failures with a short delay.
|
||||
"""
|
||||
final_text = self._clean_for_display(text)
|
||||
continuation = self._continuation_text(final_text)
|
||||
self._fallback_final_send = False
|
||||
|
|
@ -339,12 +349,25 @@ class GatewayStreamConsumer:
|
|||
last_successful_chunk = ""
|
||||
sent_any_chunk = False
|
||||
for chunk in chunks:
|
||||
result = await self.adapter.send(
|
||||
chat_id=self.chat_id,
|
||||
content=chunk,
|
||||
metadata=self.metadata,
|
||||
)
|
||||
if not result.success:
|
||||
# Try sending with one retry on flood-control errors.
|
||||
result = None
|
||||
for attempt in range(2):
|
||||
result = await self.adapter.send(
|
||||
chat_id=self.chat_id,
|
||||
content=chunk,
|
||||
metadata=self.metadata,
|
||||
)
|
||||
if result.success:
|
||||
break
|
||||
if attempt == 0 and self._is_flood_error(result):
|
||||
logger.debug(
|
||||
"Flood control on fallback send, retrying in 3s"
|
||||
)
|
||||
await asyncio.sleep(3.0)
|
||||
else:
|
||||
break # non-flood error or second attempt failed
|
||||
|
||||
if not result or not result.success:
|
||||
if sent_any_chunk:
|
||||
# Some continuation text already reached the user. Suppress
|
||||
# the base gateway final-send path so we don't resend the
|
||||
|
|
@ -370,20 +393,52 @@ class GatewayStreamConsumer:
|
|||
self._last_sent_text = chunks[-1]
|
||||
self._fallback_prefix = ""
|
||||
|
||||
async def _send_or_edit(self, text: str) -> None:
|
||||
"""Send or edit the streaming message."""
|
||||
def _is_flood_error(self, result) -> bool:
|
||||
"""Check if a SendResult failure is due to flood control / rate limiting."""
|
||||
err = getattr(result, "error", "") or ""
|
||||
err_lower = err.lower()
|
||||
return "flood" in err_lower or "retry after" in err_lower or "rate" in err_lower
|
||||
|
||||
async def _try_strip_cursor(self) -> None:
|
||||
"""Best-effort edit to remove the cursor from the last visible message.
|
||||
|
||||
Called when entering fallback mode so the user doesn't see a stuck
|
||||
cursor (▉) in the partial message.
|
||||
"""
|
||||
if not self._message_id or self._message_id == "__no_edit__":
|
||||
return
|
||||
prefix = self._visible_prefix()
|
||||
if not prefix or not prefix.strip():
|
||||
return
|
||||
try:
|
||||
await self.adapter.edit_message(
|
||||
chat_id=self.chat_id,
|
||||
message_id=self._message_id,
|
||||
content=prefix,
|
||||
)
|
||||
self._last_sent_text = prefix
|
||||
except Exception:
|
||||
pass # best-effort — don't let this block the fallback path
|
||||
|
||||
async def _send_or_edit(self, text: str) -> bool:
|
||||
"""Send or edit the streaming message.
|
||||
|
||||
Returns True if the text was successfully delivered (sent or edited),
|
||||
False otherwise. Callers like the overflow split loop use this to
|
||||
decide whether to advance past the delivered chunk.
|
||||
"""
|
||||
# Strip MEDIA: directives so they don't appear as visible text.
|
||||
# Media files are delivered as native attachments after the stream
|
||||
# finishes (via _deliver_media_from_response in gateway/run.py).
|
||||
text = self._clean_for_display(text)
|
||||
if not text.strip():
|
||||
return
|
||||
return True # nothing to send is "success"
|
||||
try:
|
||||
if self._message_id is not None:
|
||||
if self._edit_supported:
|
||||
# Skip if text is identical to what we last sent
|
||||
if text == self._last_sent_text:
|
||||
return
|
||||
return True
|
||||
# Edit existing message
|
||||
result = await self.adapter.edit_message(
|
||||
chat_id=self.chat_id,
|
||||
|
|
@ -393,19 +448,52 @@ class GatewayStreamConsumer:
|
|||
if result.success:
|
||||
self._already_sent = True
|
||||
self._last_sent_text = text
|
||||
# Successful edit — reset flood strike counter
|
||||
self._flood_strikes = 0
|
||||
return True
|
||||
else:
|
||||
# If an edit fails mid-stream (especially Telegram flood control),
|
||||
# stop progressive edits and send only the missing tail once the
|
||||
# Edit failed. If this looks like flood control / rate
|
||||
# limiting, use adaptive backoff: double the edit interval
|
||||
# and retry on the next cycle. Only permanently disable
|
||||
# edits after _MAX_FLOOD_STRIKES consecutive failures.
|
||||
if self._is_flood_error(result):
|
||||
self._flood_strikes += 1
|
||||
self._current_edit_interval = min(
|
||||
self._current_edit_interval * 2, 10.0,
|
||||
)
|
||||
logger.debug(
|
||||
"Flood control on edit (strike %d/%d), "
|
||||
"backoff interval → %.1fs",
|
||||
self._flood_strikes,
|
||||
self._MAX_FLOOD_STRIKES,
|
||||
self._current_edit_interval,
|
||||
)
|
||||
if self._flood_strikes < self._MAX_FLOOD_STRIKES:
|
||||
# Don't disable edits yet — just slow down.
|
||||
# Update _last_edit_time so the next edit
|
||||
# respects the new interval.
|
||||
self._last_edit_time = time.monotonic()
|
||||
return False
|
||||
|
||||
# Non-flood error OR flood strikes exhausted: enter
|
||||
# fallback mode — send only the missing tail once the
|
||||
# final response is available.
|
||||
logger.debug("Edit failed, disabling streaming for this adapter")
|
||||
logger.debug(
|
||||
"Edit failed (strikes=%d), entering fallback mode",
|
||||
self._flood_strikes,
|
||||
)
|
||||
self._fallback_prefix = self._visible_prefix()
|
||||
self._fallback_final_send = True
|
||||
self._edit_supported = False
|
||||
self._already_sent = True
|
||||
# Best-effort: strip the cursor from the last visible
|
||||
# message so the user doesn't see a stuck ▉.
|
||||
await self._try_strip_cursor()
|
||||
return False
|
||||
else:
|
||||
# Editing not supported — skip intermediate updates.
|
||||
# The final response will be sent by the fallback path.
|
||||
pass
|
||||
return False
|
||||
else:
|
||||
# First message — send new
|
||||
result = await self.adapter.send(
|
||||
|
|
@ -417,6 +505,7 @@ class GatewayStreamConsumer:
|
|||
self._message_id = result.message_id
|
||||
self._already_sent = True
|
||||
self._last_sent_text = text
|
||||
return True
|
||||
elif result.success:
|
||||
# Platform accepted the message but returned no message_id
|
||||
# (e.g. Signal). Can't edit without an ID — switch to
|
||||
|
|
@ -428,8 +517,11 @@ class GatewayStreamConsumer:
|
|||
self._fallback_final_send = True
|
||||
# Sentinel prevents re-entering this branch on every delta
|
||||
self._message_id = "__no_edit__"
|
||||
return True # platform accepted, just can't edit
|
||||
else:
|
||||
# Initial send failed — disable streaming for this session
|
||||
self._edit_supported = False
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error("Stream send/edit error: %s", e)
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
|
|||
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
|
||||
"base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider)
|
||||
"api_key": "", # API key for base_url (falls back to OPENAI_API_KEY)
|
||||
"timeout": 30, # seconds — LLM API call timeout; increase for slow local vision models
|
||||
"timeout": 120, # seconds — LLM API call timeout; vision payloads need generous timeout
|
||||
"download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections
|
||||
},
|
||||
"web_extract": {
|
||||
|
|
|
|||
|
|
@ -56,6 +56,18 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
|
|||
|
||||
_openrouter_catalog_cache: list[tuple[str, str]] | None = None
|
||||
|
||||
|
||||
def _codex_curated_models() -> list[str]:
|
||||
"""Derive the openai-codex curated list from codex_models.py.
|
||||
|
||||
Single source of truth: DEFAULT_CODEX_MODELS + forward-compat synthesis.
|
||||
This keeps the gateway /model picker in sync with the CLI `hermes model`
|
||||
flow without maintaining a separate static list.
|
||||
"""
|
||||
from hermes_cli.codex_models import DEFAULT_CODEX_MODELS, _add_forward_compat_models
|
||||
return _add_forward_compat_models(list(DEFAULT_CODEX_MODELS))
|
||||
|
||||
|
||||
_PROVIDER_MODELS: dict[str, list[str]] = {
|
||||
"nous": [
|
||||
"anthropic/claude-opus-4.6",
|
||||
|
|
@ -86,14 +98,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
|
|||
"openai/gpt-5.4-pro",
|
||||
"openai/gpt-5.4-nano",
|
||||
],
|
||||
"openai-codex": [
|
||||
"gpt-5.4",
|
||||
"gpt-5.4-mini",
|
||||
"gpt-5.3-codex",
|
||||
"gpt-5.2-codex",
|
||||
"gpt-5.1-codex-mini",
|
||||
"gpt-5.1-codex-max",
|
||||
],
|
||||
"openai-codex": _codex_curated_models(),
|
||||
"copilot-acp": [
|
||||
"copilot-acp",
|
||||
],
|
||||
|
|
|
|||
37
run_agent.py
37
run_agent.py
|
|
@ -700,10 +700,14 @@ class AIAgent:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# Direct OpenAI sessions use the Responses API path. GPT-5.x tool
|
||||
# calls with reasoning are rejected on /v1/chat/completions, and
|
||||
# Hermes is a tool-using client by default.
|
||||
if self.api_mode == "chat_completions" and self._is_direct_openai_url():
|
||||
# GPT-5.x models require the Responses API path — they are rejected
|
||||
# on /v1/chat/completions by both OpenAI and OpenRouter. Also
|
||||
# auto-upgrade for direct OpenAI URLs (api.openai.com) since all
|
||||
# newer tool-calling models prefer Responses there.
|
||||
if self.api_mode == "chat_completions" and (
|
||||
self._is_direct_openai_url()
|
||||
or self._model_requires_responses_api(self.model)
|
||||
):
|
||||
self.api_mode = "codex_responses"
|
||||
|
||||
# Pre-warm OpenRouter model metadata cache in a background thread.
|
||||
|
|
@ -1702,6 +1706,21 @@ class AIAgent:
|
|||
"""Return True when the base URL targets OpenRouter."""
|
||||
return "openrouter" in self._base_url_lower
|
||||
|
||||
@staticmethod
|
||||
def _model_requires_responses_api(model: str) -> bool:
|
||||
"""Return True for models that require the Responses API path.
|
||||
|
||||
GPT-5.x models are rejected on /v1/chat/completions by both
|
||||
OpenAI and OpenRouter (error: ``unsupported_api_for_model``).
|
||||
Detect these so the correct api_mode is set regardless of
|
||||
which provider is serving the model.
|
||||
"""
|
||||
m = model.lower()
|
||||
# Strip vendor prefix (e.g. "openai/gpt-5.4" → "gpt-5.4")
|
||||
if "/" in m:
|
||||
m = m.rsplit("/", 1)[-1]
|
||||
return m.startswith("gpt-5")
|
||||
|
||||
def _max_tokens_param(self, value: int) -> dict:
|
||||
"""Return the correct max tokens kwarg for the current provider.
|
||||
|
||||
|
|
@ -5252,7 +5271,7 @@ class AIAgent:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# Determine api_mode from provider / base URL
|
||||
# Determine api_mode from provider / base URL / model
|
||||
fb_api_mode = "chat_completions"
|
||||
fb_base_url = str(fb_client.base_url)
|
||||
if fb_provider == "openai-codex":
|
||||
|
|
@ -5261,6 +5280,10 @@ class AIAgent:
|
|||
fb_api_mode = "anthropic_messages"
|
||||
elif self._is_direct_openai_url(fb_base_url):
|
||||
fb_api_mode = "codex_responses"
|
||||
elif self._model_requires_responses_api(fb_model):
|
||||
# GPT-5.x models need Responses API on every provider
|
||||
# (OpenRouter, Copilot, direct OpenAI, etc.)
|
||||
fb_api_mode = "codex_responses"
|
||||
|
||||
old_model = self.model
|
||||
self.model = fb_model
|
||||
|
|
@ -5349,8 +5372,8 @@ class AIAgent:
|
|||
to the fallback provider for every subsequent turn. Calling this at
|
||||
the top of ``run_conversation()`` makes fallback turn-scoped.
|
||||
|
||||
The gateway creates a fresh agent per message so this is a no-op
|
||||
there (``_fallback_activated`` is always False at turn start).
|
||||
The gateway caches agents across messages (``_agent_cache`` in
|
||||
``gateway/run.py``), so this restoration IS needed there too.
|
||||
"""
|
||||
if not self._fallback_activated:
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from agent.models_dev import (
|
|||
PROVIDER_TO_MODELS_DEV,
|
||||
_extract_context,
|
||||
fetch_models_dev,
|
||||
get_model_capabilities,
|
||||
lookup_models_dev_context,
|
||||
)
|
||||
|
||||
|
|
@ -195,3 +196,88 @@ class TestFetchModelsDev:
|
|||
result = fetch_models_dev()
|
||||
mock_get.assert_not_called()
|
||||
assert result == SAMPLE_REGISTRY
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_model_capabilities — vision via modalities.input
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
CAPS_REGISTRY = {
|
||||
"google": {
|
||||
"id": "google",
|
||||
"models": {
|
||||
"gemma-4-31b-it": {
|
||||
"id": "gemma-4-31b-it",
|
||||
"attachment": False,
|
||||
"tool_call": True,
|
||||
"modalities": {"input": ["text", "image"]},
|
||||
"limit": {"context": 128000, "output": 8192},
|
||||
},
|
||||
"gemma-3-1b": {
|
||||
"id": "gemma-3-1b",
|
||||
"tool_call": True,
|
||||
"limit": {"context": 32000, "output": 8192},
|
||||
},
|
||||
},
|
||||
},
|
||||
"anthropic": {
|
||||
"id": "anthropic",
|
||||
"models": {
|
||||
"claude-sonnet-4": {
|
||||
"id": "claude-sonnet-4",
|
||||
"attachment": True,
|
||||
"tool_call": True,
|
||||
"limit": {"context": 200000, "output": 64000},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class TestGetModelCapabilities:
|
||||
"""Tests for get_model_capabilities vision detection."""
|
||||
|
||||
def test_vision_from_attachment_flag(self):
|
||||
"""Models with attachment=True should report supports_vision=True."""
|
||||
with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
|
||||
caps = get_model_capabilities("anthropic", "claude-sonnet-4")
|
||||
assert caps is not None
|
||||
assert caps.supports_vision is True
|
||||
|
||||
def test_vision_from_modalities_input_image(self):
|
||||
"""Models with 'image' in modalities.input but attachment=False should
|
||||
still report supports_vision=True (the core fix in this PR)."""
|
||||
with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
|
||||
caps = get_model_capabilities("google", "gemma-4-31b-it")
|
||||
assert caps is not None
|
||||
assert caps.supports_vision is True
|
||||
|
||||
def test_no_vision_without_attachment_or_modalities(self):
|
||||
"""Models with neither attachment nor image modality should be non-vision."""
|
||||
with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
|
||||
caps = get_model_capabilities("google", "gemma-3-1b")
|
||||
assert caps is not None
|
||||
assert caps.supports_vision is False
|
||||
|
||||
def test_modalities_non_dict_handled(self):
|
||||
"""Non-dict modalities field should not crash."""
|
||||
registry = {
|
||||
"google": {"id": "google", "models": {
|
||||
"weird-model": {
|
||||
"id": "weird-model",
|
||||
"modalities": "text", # not a dict
|
||||
"limit": {"context": 200000, "output": 8192},
|
||||
},
|
||||
}},
|
||||
}
|
||||
with patch("agent.models_dev.fetch_models_dev", return_value=registry):
|
||||
caps = get_model_capabilities("gemini", "weird-model")
|
||||
assert caps is not None
|
||||
assert caps.supports_vision is False
|
||||
|
||||
def test_model_not_found_returns_none(self):
|
||||
"""Unknown model should return None."""
|
||||
with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
|
||||
caps = get_model_capabilities("anthropic", "nonexistent-model")
|
||||
assert caps is None
|
||||
|
|
|
|||
|
|
@ -157,7 +157,9 @@ def _make_fake_mautrix():
|
|||
mautrix_crypto_store = types.ModuleType("mautrix.crypto.store")
|
||||
|
||||
class MemoryCryptoStore:
|
||||
pass
|
||||
def __init__(self, account_id="", pickle_key=""):
|
||||
self.account_id = account_id
|
||||
self.pickle_key = pickle_key
|
||||
|
||||
mautrix_crypto_store.MemoryCryptoStore = MemoryCryptoStore
|
||||
|
||||
|
|
@ -1041,20 +1043,28 @@ class TestMatrixSyncLoop:
|
|||
call_count += 1
|
||||
if call_count >= 1:
|
||||
adapter._closing = True
|
||||
return {"rooms": {"join": {"!room:example.org": {}}}}
|
||||
return {"rooms": {"join": {"!room:example.org": {}}}, "next_batch": "s1234"}
|
||||
|
||||
mock_crypto = MagicMock()
|
||||
mock_crypto.share_keys = AsyncMock()
|
||||
|
||||
mock_sync_store = MagicMock()
|
||||
mock_sync_store.get_next_batch = AsyncMock(return_value=None)
|
||||
mock_sync_store.put_next_batch = AsyncMock()
|
||||
|
||||
fake_client = MagicMock()
|
||||
fake_client.sync = AsyncMock(side_effect=_sync_once)
|
||||
fake_client.crypto = mock_crypto
|
||||
fake_client.sync_store = mock_sync_store
|
||||
fake_client.handle_sync = MagicMock(return_value=[])
|
||||
adapter._client = fake_client
|
||||
|
||||
await adapter._sync_loop()
|
||||
|
||||
fake_client.sync.assert_awaited_once()
|
||||
mock_crypto.share_keys.assert_awaited_once()
|
||||
fake_client.handle_sync.assert_called_once()
|
||||
mock_sync_store.put_next_batch.assert_awaited_once_with("s1234")
|
||||
|
||||
|
||||
class TestMatrixEncryptedSendFallback:
|
||||
|
|
|
|||
|
|
@ -222,6 +222,12 @@ def test_api_mode_normalizes_provider_case(monkeypatch):
|
|||
|
||||
|
||||
def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypatch):
|
||||
"""GPT-5.x models need codex_responses even on OpenRouter.
|
||||
|
||||
OpenRouter rejects GPT-5 models on /v1/chat/completions with
|
||||
``unsupported_api_for_model``. The model-level check overrides
|
||||
the provider default.
|
||||
"""
|
||||
_patch_agent_bootstrap(monkeypatch)
|
||||
agent = run_agent.AIAgent(
|
||||
model="gpt-5-codex",
|
||||
|
|
@ -233,7 +239,7 @@ def test_api_mode_respects_explicit_openrouter_provider_over_codex_url(monkeypat
|
|||
skip_context_files=True,
|
||||
skip_memory=True,
|
||||
)
|
||||
assert agent.api_mode == "chat_completions"
|
||||
assert agent.api_mode == "codex_responses"
|
||||
assert agent.provider == "openrouter"
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,10 @@ from tools.vision_tools import (
|
|||
_handle_vision_analyze,
|
||||
_determine_mime_type,
|
||||
_image_to_base64_data_url,
|
||||
_resize_image_for_vision,
|
||||
_is_image_size_error,
|
||||
_MAX_BASE64_BYTES,
|
||||
_RESIZE_TARGET_BYTES,
|
||||
vision_analyze_tool,
|
||||
check_vision_requirements,
|
||||
get_debug_session_info,
|
||||
|
|
@ -590,11 +594,13 @@ class TestBase64SizeLimit:
|
|||
|
||||
@pytest.mark.asyncio
|
||||
async def test_oversized_image_rejected_before_api_call(self, tmp_path):
|
||||
"""Images exceeding 5 MB base64 should fail with a clear size error."""
|
||||
"""Images exceeding the 20 MB hard limit should fail with a clear error."""
|
||||
img = tmp_path / "huge.png"
|
||||
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
|
||||
|
||||
with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
|
||||
# Patch the hard limit to a small value so the test runs fast.
|
||||
with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
|
||||
patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
|
||||
result = json.loads(await vision_analyze_tool(str(img), "describe this"))
|
||||
|
||||
assert result["success"] is False
|
||||
|
|
@ -686,3 +692,124 @@ class TestVisionRegistration:
|
|||
|
||||
entry = registry._tools.get("vision_analyze")
|
||||
assert callable(entry.handler)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _resize_image_for_vision — auto-resize oversized images
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestResizeImageForVision:
|
||||
"""Tests for the auto-resize function."""
|
||||
|
||||
def test_small_image_returned_as_is(self, tmp_path):
|
||||
"""Images under the limit should be returned unchanged."""
|
||||
# Create a small 10x10 red PNG
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytest.skip("Pillow not installed")
|
||||
img = Image.new("RGB", (10, 10), (255, 0, 0))
|
||||
path = tmp_path / "small.png"
|
||||
img.save(path, "PNG")
|
||||
|
||||
result = _resize_image_for_vision(path, mime_type="image/png")
|
||||
assert result.startswith("data:image/png;base64,")
|
||||
assert len(result) < _MAX_BASE64_BYTES
|
||||
|
||||
def test_large_image_is_resized(self, tmp_path):
|
||||
"""Images over the default target should be auto-resized to fit."""
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytest.skip("Pillow not installed")
|
||||
# Create a large image that will exceed 5 MB in base64
|
||||
# A 4000x4000 uncompressed PNG will be large
|
||||
img = Image.new("RGB", (4000, 4000), (128, 200, 50))
|
||||
path = tmp_path / "large.png"
|
||||
img.save(path, "PNG")
|
||||
|
||||
result = _resize_image_for_vision(path, mime_type="image/png")
|
||||
assert result.startswith("data:image/png;base64,")
|
||||
# Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB)
|
||||
assert len(result) <= _RESIZE_TARGET_BYTES
|
||||
|
||||
def test_custom_max_bytes(self, tmp_path):
|
||||
"""The max_base64_bytes parameter should be respected."""
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytest.skip("Pillow not installed")
|
||||
img = Image.new("RGB", (200, 200), (0, 128, 255))
|
||||
path = tmp_path / "medium.png"
|
||||
img.save(path, "PNG")
|
||||
|
||||
# Set a very low limit to force resizing
|
||||
result = _resize_image_for_vision(path, max_base64_bytes=500)
|
||||
# Should still return a valid data URL
|
||||
assert result.startswith("data:image/")
|
||||
|
||||
def test_jpeg_output_for_non_png(self, tmp_path):
|
||||
"""Non-PNG images should be resized as JPEG."""
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytest.skip("Pillow not installed")
|
||||
img = Image.new("RGB", (2000, 2000), (255, 128, 0))
|
||||
path = tmp_path / "photo.jpg"
|
||||
img.save(path, "JPEG", quality=95)
|
||||
|
||||
result = _resize_image_for_vision(path, mime_type="image/jpeg",
|
||||
max_base64_bytes=50_000)
|
||||
assert result.startswith("data:image/jpeg;base64,")
|
||||
|
||||
def test_constants_sane(self):
|
||||
"""Hard limit should be larger than resize target."""
|
||||
assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
|
||||
assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
|
||||
assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES
|
||||
|
||||
def test_no_pillow_returns_original(self, tmp_path):
|
||||
"""Without Pillow, oversized images should be returned as-is."""
|
||||
# Create a dummy file
|
||||
path = tmp_path / "test.png"
|
||||
# Write enough bytes to exceed a tiny limit
|
||||
path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)
|
||||
|
||||
with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64:
|
||||
# Simulate a large base64 result
|
||||
mock_b64.return_value = "data:image/png;base64," + "A" * 200
|
||||
with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
|
||||
result = _resize_image_for_vision(path, max_base64_bytes=100)
|
||||
# Should return the original (oversized) data url
|
||||
assert len(result) > 100
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_image_size_error — detect size-related API errors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestIsImageSizeError:
|
||||
"""Tests for the size-error detection helper."""
|
||||
|
||||
def test_too_large_message(self):
|
||||
assert _is_image_size_error(Exception("Request payload too large"))
|
||||
|
||||
def test_413_status(self):
|
||||
assert _is_image_size_error(Exception("HTTP 413 Payload Too Large"))
|
||||
|
||||
def test_invalid_request(self):
|
||||
assert _is_image_size_error(Exception("invalid_request_error: image too big"))
|
||||
|
||||
def test_exceeds_limit(self):
|
||||
assert _is_image_size_error(Exception("Image exceeds maximum size"))
|
||||
|
||||
def test_unrelated_error(self):
|
||||
assert not _is_image_size_error(Exception("Connection refused"))
|
||||
|
||||
def test_auth_error(self):
|
||||
assert not _is_image_size_error(Exception("401 Unauthorized"))
|
||||
|
||||
def test_empty_message(self):
|
||||
assert not _is_image_size_error(Exception(""))
|
||||
|
|
|
|||
|
|
@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||
),
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Read and convert to base64
|
||||
image_data = screenshot_path.read_bytes()
|
||||
image_base64 = base64.b64encode(image_data).decode("ascii")
|
||||
data_url = f"data:image/png;base64,{image_base64}"
|
||||
# Convert screenshot to base64 at full resolution.
|
||||
_screenshot_bytes = screenshot_path.read_bytes()
|
||||
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
|
||||
data_url = f"data:image/png;base64,{_screenshot_b64}"
|
||||
|
||||
vision_prompt = (
|
||||
f"You are analyzing a screenshot of a web browser.\n\n"
|
||||
|
|
@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||
# Use the centralized LLM router
|
||||
vision_model = _get_vision_model()
|
||||
logger.debug("browser_vision: analysing screenshot (%d bytes)",
|
||||
len(image_data))
|
||||
len(_screenshot_bytes))
|
||||
|
||||
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
|
||||
# Local vision models (llama.cpp, ollama) can take well over 30s for
|
||||
|
|
@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||
}
|
||||
if vision_model:
|
||||
call_kwargs["model"] = vision_model
|
||||
response = call_llm(**call_kwargs)
|
||||
# Try full-size screenshot; on size-related rejection, downscale and retry.
|
||||
try:
|
||||
response = call_llm(**call_kwargs)
|
||||
except Exception as _api_err:
|
||||
from tools.vision_tools import (
|
||||
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
|
||||
)
|
||||
if (_is_image_size_error(_api_err)
|
||||
and len(data_url) > _RESIZE_TARGET_BYTES):
|
||||
logger.info(
|
||||
"Vision API rejected screenshot (%.1f MB); "
|
||||
"auto-resizing to ~%.0f MB and retrying...",
|
||||
len(data_url) / (1024 * 1024),
|
||||
_RESIZE_TARGET_BYTES / (1024 * 1024),
|
||||
)
|
||||
data_url = _resize_image_for_vision(
|
||||
screenshot_path, mime_type="image/png")
|
||||
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
|
||||
response = call_llm(**call_kwargs)
|
||||
else:
|
||||
raise
|
||||
|
||||
analysis = (response.choices[0].message.content or "").strip()
|
||||
# Redact secrets the vision LLM may have read from the screenshot.
|
||||
|
|
|
|||
|
|
@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
|
|||
return data_url
|
||||
|
||||
|
||||
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit). Images above this are rejected.
# NOTE: both limits below are compared against the length of the full base64
# data-URL string (header included), not against the raw file size.
_MAX_BASE64_BYTES = 20 * 1024 * 1024

# Target size when auto-resizing on API failure (5 MB). After a provider
# rejects an image, we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
|
||||
|
||||
|
||||
def _is_image_size_error(error: Exception) -> bool:
|
||||
"""Detect if an API error is related to image or payload size."""
|
||||
err_str = str(error).lower()
|
||||
return any(hint in err_str for hint in (
|
||||
"too large", "payload", "413", "content_too_large",
|
||||
"request_too_large", "image_url", "invalid_request",
|
||||
"exceeds", "size limit",
|
||||
))
|
||||
|
||||
|
||||
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    Tries Pillow first to progressively downscale oversized images. If Pillow
    is not installed or resizing still exceeds the limit, falls back to the raw
    bytes and lets the caller handle the size check.

    Args:
        image_path: Path to the image file on disk.
        mime_type: Optional MIME type override; auto-detected when omitted.
        max_base64_bytes: Size budget for the full data-URL string.

    Returns:
        The base64 data URL string. May still exceed ``max_base64_bytes`` when
        resizing is impossible — callers must re-check when a fit is mandatory.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    if estimated_b64 <= max_base64_bytes:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
            return data_url
    else:
        data_url = None  # defer full encode; try Pillow resize first

    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error

    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))

    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"

    try:
        src = Image.open(image_path)
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # fall through to size-check in caller

    candidate = None  # best (latest) encode attempt so far
    try:
        # Convert RGBA / palette images to RGB for JPEG output (no alpha).
        if pil_format == "JPEG" and src.mode in ("RGBA", "P"):
            img = src.convert("RGB")
        else:
            img = src

        # Strategy: halve dimensions until base64 fits, up to 4 rounds.
        # For JPEG, also try reducing quality at each size step.
        # For PNG, quality is irrelevant — only dimension reduction helps.
        quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
        prev_dims = (img.width, img.height)

        for attempt in range(5):
            if attempt > 0:
                new_w = max(img.width // 2, 64)
                new_h = max(img.height // 2, 64)
                # Stop if dimensions can't shrink further (already at floor)
                if (new_w, new_h) == prev_dims:
                    break
                img = img.resize((new_w, new_h), Image.LANCZOS)
                prev_dims = (new_w, new_h)
                logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)

            for q in quality_steps:
                buf = _io.BytesIO()
                save_kwargs = {"format": pil_format}
                if q is not None:
                    save_kwargs["quality"] = q
                img.save(buf, **save_kwargs)
                encoded = base64.b64encode(buf.getvalue()).decode("ascii")
                candidate = f"data:{out_mime};base64,{encoded}"
                if len(candidate) <= max_base64_bytes:
                    logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                                len(candidate) / (1024 * 1024), q,
                                img.width, img.height)
                    return candidate
    finally:
        # Fix: close the file-backed Pillow image so the OS file handle is not
        # leaked on every call (the original never closed it). Resized/converted
        # copies are in-memory and unaffected by closing the source.
        src.close()

    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate

    # Shouldn't reach here, but fall back to full encode
    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
|
||||
|
||||
|
||||
async def vision_analyze_tool(
|
||||
image_url: str,
|
||||
user_prompt: str,
|
||||
|
|
@ -376,24 +490,27 @@ async def vision_analyze_tool(
|
|||
if not detected_mime_type:
|
||||
raise ValueError("Only real image files are supported for vision analysis.")
|
||||
|
||||
# Convert image to base64 data URL
|
||||
# Convert image to base64 — send at full resolution first.
|
||||
# If the provider rejects it as too large, we auto-resize and retry.
|
||||
logger.info("Converting image to base64...")
|
||||
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
|
||||
# Calculate size in KB for better readability
|
||||
data_size_kb = len(image_data_url) / 1024
|
||||
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
|
||||
|
||||
# Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
|
||||
# Reject early with a clear message instead of a cryptic provider 400.
|
||||
_MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB
|
||||
# The data URL includes the header (e.g. "data:image/jpeg;base64,") which
|
||||
# is negligible, but measure the full string to be safe.
|
||||
# Hard limit (20 MB) — no provider accepts payloads this large.
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
raise ValueError(
|
||||
f"Image too large for vision API: base64 payload is "
|
||||
f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
|
||||
f"Resize or compress the image and try again."
|
||||
)
|
||||
# Try to resize down to 5 MB before giving up.
|
||||
image_data_url = _resize_image_for_vision(
|
||||
temp_image_path, mime_type=detected_mime_type)
|
||||
if len(image_data_url) > _MAX_BASE64_BYTES:
|
||||
raise ValueError(
|
||||
f"Image too large for vision API: base64 payload is "
|
||||
f"{len(image_data_url) / (1024 * 1024):.1f} MB "
|
||||
f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
|
||||
f"even after resizing. "
|
||||
f"Install Pillow (`pip install Pillow`) for better auto-resize, "
|
||||
f"or compress the image manually."
|
||||
)
|
||||
|
||||
debug_call_data["image_size_bytes"] = image_size_bytes
|
||||
|
||||
|
|
@ -442,7 +559,24 @@ async def vision_analyze_tool(
|
|||
}
|
||||
if model:
|
||||
call_kwargs["model"] = model
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
# Try full-size image first; on size-related rejection, downscale and retry.
|
||||
try:
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
except Exception as _api_err:
|
||||
if (_is_image_size_error(_api_err)
|
||||
and len(image_data_url) > _RESIZE_TARGET_BYTES):
|
||||
logger.info(
|
||||
"API rejected image (%.1f MB, likely too large); "
|
||||
"auto-resizing to ~%.0f MB and retrying...",
|
||||
len(image_data_url) / (1024 * 1024),
|
||||
_RESIZE_TARGET_BYTES / (1024 * 1024),
|
||||
)
|
||||
image_data_url = _resize_image_for_vision(
|
||||
temp_image_path, mime_type=detected_mime_type)
|
||||
messages[0]["content"][1]["image_url"]["url"] = image_data_url
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
else:
|
||||
raise
|
||||
|
||||
# Extract the analysis — fall back to reasoning if content is empty
|
||||
analysis = extract_content_or_reasoning(response)
|
||||
|
|
@ -498,8 +632,8 @@ async def vision_analyze_tool(
|
|||
elif "invalid_request" in err_str or "image_url" in err_str:
|
||||
analysis = (
|
||||
"The vision API rejected the image. This can happen when the "
|
||||
"image is too large, in an unsupported format, or corrupted. "
|
||||
"Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
|
||||
"image is in an unsupported format, corrupted, or still too "
|
||||
"large after auto-resize. Try a smaller JPEG/PNG and retry. "
|
||||
f"Error: {e}"
|
||||
)
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -143,6 +143,10 @@ to find the parent assistant message, keeping groups intact.
|
|||
|
||||
### Phase 3: Generate Structured Summary
|
||||
|
||||
:::warning Summary model context length
|
||||
The summary model must have a context window **at least as large** as the main agent model's. The entire middle section is sent to the summary model in a single `call_llm(task="compression")` call. If the summary model's context is smaller, the API returns a context-length error — `_generate_summary()` catches it, logs a warning, and returns `None`. The compressor then drops the middle turns **without a summary**, silently losing conversation context. This is the most common cause of degraded compaction quality.
|
||||
:::
|
||||
|
||||
The middle turns are summarized using the auxiliary LLM with a structured
|
||||
template:
|
||||
|
||||
|
|
|
|||
|
|
@ -480,7 +480,9 @@ Points at a custom OpenAI-compatible endpoint. Uses `OPENAI_API_KEY` for auth.
|
|||
| `nous` / `openrouter` / etc. | not set | Force that provider, use its auth |
|
||||
| any | set | Use the custom endpoint directly (provider ignored) |
|
||||
|
||||
The `summary_model` must support a context length at least as large as your main model's, since it receives the full middle section of the conversation for compression.
|
||||
:::warning Summary model context length requirement
|
||||
The `summary_model` **must** have a context window at least as large as your main agent model's. The compressor sends the full middle section of the conversation to the summary model — if that model's context window is smaller than the main model's, the summarization call will fail with a context length error. When this happens, the middle turns are **dropped without a summary**, losing conversation context silently. If you override `summary_model`, verify its context length meets or exceeds your main model's.
|
||||
:::
|
||||
|
||||
## Context Engine
|
||||
|
||||
|
|
|
|||
|
|
@ -212,7 +212,24 @@ When users click buttons or interact with interactive cards sent by the bot, the
|
|||
|
||||
Card action events are dispatched with `MessageType.COMMAND`, so they flow through the normal command processing pipeline.
|
||||
|
||||
To use this feature, enable the **Interactive Card** event in your Feishu app's event subscriptions (`card.action.trigger`).
|
||||
This is also how **command approval** works — when the agent needs to run a dangerous command, it sends an interactive card with Allow Once / Session / Always / Deny buttons. The user clicks a button, and the card action callback delivers the approval decision back to the agent.
|
||||
|
||||
### Required Feishu App Configuration
|
||||
|
||||
Interactive cards require **three** configuration steps in the Feishu Developer Console. Missing any of them causes error **200340** when users click card buttons.
|
||||
|
||||
1. **Subscribe to the card action event:**
|
||||
In **Event Subscriptions**, add `card.action.trigger` to your subscribed events.
|
||||
|
||||
2. **Enable the Interactive Card capability:**
|
||||
In **App Features > Bot**, ensure the **Interactive Card** toggle is enabled. This tells Feishu that your app can receive card action callbacks.
|
||||
|
||||
3. **Configure the Card Request URL (webhook mode only):**
|
||||
In **App Features > Bot > Message Card Request URL**, set the URL to the same endpoint as your event webhook (e.g. `https://your-server:8765/feishu/webhook`). In WebSocket mode this is handled automatically by the SDK.
|
||||
|
||||
:::warning
|
||||
Without all three steps, Feishu will successfully *send* interactive cards (sending only requires `im:message:send` permission), but clicking any button will return error 200340. The card appears to work — the error only surfaces when a user interacts with it.
|
||||
:::
|
||||
|
||||
## Media Support
|
||||
|
||||
|
|
@ -412,6 +429,7 @@ WebSocket and per-group ACL settings are configured via `config.yaml` under `pla
|
|||
| Post messages show as plain text | The Feishu API rejected the post payload; this is normal fallback behavior. Check logs for details. |
|
||||
| Images/files not received by bot | Grant `im:message` and `im:resource` permission scopes to your Feishu app |
|
||||
| Bot identity not auto-detected | Grant `admin:app.info:readonly` scope, or set `FEISHU_BOT_OPEN_ID` / `FEISHU_BOT_NAME` manually |
|
||||
| Error 200340 when clicking approval buttons | Enable **Interactive Card** capability and configure **Card Request URL** in the Feishu Developer Console. See [Required Feishu App Configuration](#required-feishu-app-configuration) above. |
|
||||
| `Webhook rate limit exceeded` | More than 120 requests/minute from the same IP. This is usually a misconfiguration or loop. |
|
||||
|
||||
## Toolset
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue