diff --git a/agent/models_dev.py b/agent/models_dev.py
index d3620733b..560e7cefe 100644
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@@ -383,7 +383,15 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
 
     # Extract capability flags (default to False if missing)
     supports_tools = bool(entry.get("tool_call", False))
-    supports_vision = bool(entry.get("attachment", False))
+    # Vision: check both the `attachment` flag and `modalities.input` for "image".
+    # Some models (e.g. gemma-4) list image in input modalities but not attachment.
+    # The registry is external JSON, so tolerate a non-dict `modalities` as well
+    # as a null or non-list `input` instead of raising on the membership test.
+    modalities = entry.get("modalities")
+    input_mods = modalities.get("input") if isinstance(modalities, dict) else None
+    if not isinstance(input_mods, (list, tuple)):
+        input_mods = []
+    supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
     supports_reasoning = bool(entry.get("reasoning", False))
 
     # Extract limits
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 89606edc2..e088bdfdf 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
             "model": "",  # e.g. "google/gemini-2.5-flash", "gpt-4o"
             "base_url": "",  # direct OpenAI-compatible endpoint (takes precedence over provider)
             "api_key": "",  # API key for base_url (falls back to OPENAI_API_KEY)
-            "timeout": 30,  # seconds — LLM API call timeout; increase for slow local vision models
+            "timeout": 120,  # seconds — LLM API call timeout; vision payloads need generous timeout
             "download_timeout": 30,  # seconds — image HTTP download timeout; increase for slow connections
         },
         "web_extract": {
diff --git a/tests/agent/test_models_dev.py b/tests/agent/test_models_dev.py
index 1b6216c50..9f11d731e 100644
--- a/tests/agent/test_models_dev.py
+++ b/tests/agent/test_models_dev.py
@@ -7,6 +7,7 @@ from agent.models_dev import (
     PROVIDER_TO_MODELS_DEV,
     _extract_context,
     fetch_models_dev,
+    get_model_capabilities,
     lookup_models_dev_context,
 )
 
@@ -195,3 +196,104 @@ class TestFetchModelsDev:
             result = fetch_models_dev()
         mock_get.assert_not_called()
         assert result == SAMPLE_REGISTRY
+
+
+# ---------------------------------------------------------------------------
+# get_model_capabilities — vision via modalities.input
+# ---------------------------------------------------------------------------
+
+
+CAPS_REGISTRY = {
+    "google": {
+        "id": "google",
+        "models": {
+            "gemma-4-31b-it": {
+                "id": "gemma-4-31b-it",
+                "attachment": False,
+                "tool_call": True,
+                "modalities": {"input": ["text", "image"]},
+                "limit": {"context": 128000, "output": 8192},
+            },
+            "gemma-3-1b": {
+                "id": "gemma-3-1b",
+                "tool_call": True,
+                "limit": {"context": 32000, "output": 8192},
+            },
+        },
+    },
+    "anthropic": {
+        "id": "anthropic",
+        "models": {
+            "claude-sonnet-4": {
+                "id": "claude-sonnet-4",
+                "attachment": True,
+                "tool_call": True,
+                "limit": {"context": 200000, "output": 64000},
+            },
+        },
+    },
+}
+
+
+class TestGetModelCapabilities:
+    """Tests for get_model_capabilities vision detection."""
+
+    def test_vision_from_attachment_flag(self):
+        """Models with attachment=True should report supports_vision=True."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("anthropic", "claude-sonnet-4")
+        assert caps is not None
+        assert caps.supports_vision is True
+
+    def test_vision_from_modalities_input_image(self):
+        """Models with 'image' in modalities.input but attachment=False should
+        still report supports_vision=True (the core fix in this PR)."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "gemma-4-31b-it")
+        assert caps is not None
+        assert caps.supports_vision is True
+
+    def test_no_vision_without_attachment_or_modalities(self):
+        """Models with neither attachment nor image modality should be non-vision."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "gemma-3-1b")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_modalities_non_dict_handled(self):
+        """Non-dict modalities field should not crash."""
+        registry = {
+            "google": {"id": "google", "models": {
+                "weird-model": {
+                    "id": "weird-model",
+                    "modalities": "text",  # not a dict
+                    "limit": {"context": 200000, "output": 8192},
+                },
+            }},
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
+            caps = get_model_capabilities("gemini", "weird-model")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_modalities_input_null_handled(self):
+        """A null modalities.input should not crash the membership test."""
+        registry = {
+            "google": {"id": "google", "models": {
+                "null-input-model": {
+                    "id": "null-input-model",
+                    "modalities": {"input": None},  # JSON null
+                    "limit": {"context": 200000, "output": 8192},
+                },
+            }},
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
+            caps = get_model_capabilities("gemini", "null-input-model")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_model_not_found_returns_none(self):
+        """Unknown model should return None."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("anthropic", "nonexistent-model")
+        assert caps is None
diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py
index cd4009877..6e9a6034e 100644
--- a/tests/tools/test_vision_tools.py
+++ b/tests/tools/test_vision_tools.py
@@ -15,6 +15,10 @@ from tools.vision_tools import (
     _handle_vision_analyze,
     _determine_mime_type,
     _image_to_base64_data_url,
+    _resize_image_for_vision,
+    _is_image_size_error,
+    _MAX_BASE64_BYTES,
+    _RESIZE_TARGET_BYTES,
     vision_analyze_tool,
     check_vision_requirements,
     get_debug_session_info,
@@ -590,11 +594,13 @@ class TestBase64SizeLimit:
 
     @pytest.mark.asyncio
     async def test_oversized_image_rejected_before_api_call(self, tmp_path):
-        """Images exceeding 5 MB base64 should fail with a clear size error."""
+        """Images exceeding the 20 MB hard limit should fail with a clear error."""
         img = tmp_path / "huge.png"
         img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
 
-        with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
+        # Patch the hard limit to a small value so the test runs fast.
+        with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
+             patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
             result = json.loads(await vision_analyze_tool(str(img), "describe this"))
 
         assert result["success"] is False
@@ -686,3 +692,128 @@ class TestVisionRegistration:
 
         entry = registry._tools.get("vision_analyze")
         assert callable(entry.handler)
+
+
+# ---------------------------------------------------------------------------
+# _resize_image_for_vision — auto-resize oversized images
+# ---------------------------------------------------------------------------
+
+
+class TestResizeImageForVision:
+    """Tests for the auto-resize function."""
+
+    def test_small_image_returned_as_is(self, tmp_path):
+        """Images under the limit should be returned unchanged."""
+        # Create a small 10x10 red PNG
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        img = Image.new("RGB", (10, 10), (255, 0, 0))
+        path = tmp_path / "small.png"
+        img.save(path, "PNG")
+
+        result = _resize_image_for_vision(path, mime_type="image/png")
+        assert result.startswith("data:image/png;base64,")
+        assert len(result) < _MAX_BASE64_BYTES
+
+    def test_large_image_is_resized(self, tmp_path):
+        """Images over the target should be auto-resized to fit."""
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        import os
+        # Random noise is incompressible, so the PNG really is oversized; a
+        # solid-colour image compresses to a few KB and never triggers a resize.
+        size = (256, 256)
+        img = Image.frombytes("RGB", size, os.urandom(size[0] * size[1] * 3))
+        path = tmp_path / "large.png"
+        img.save(path, "PNG")
+        limit = 100_000
+        assert len(_image_to_base64_data_url(path, mime_type="image/png")) > limit
+
+        result = _resize_image_for_vision(path, mime_type="image/png",
+                                          max_base64_bytes=limit)
+        assert result.startswith("data:image/png;base64,")
+        assert len(result) <= limit
+
+    def test_custom_max_bytes(self, tmp_path):
+        """The max_base64_bytes parameter should be respected."""
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        img = Image.new("RGB", (200, 200), (0, 128, 255))
+        path = tmp_path / "medium.png"
+        img.save(path, "PNG")
+
+        # Set a very low limit to force resizing
+        result = _resize_image_for_vision(path, max_base64_bytes=500)
+        # Should still return a valid data URL
+        assert result.startswith("data:image/")
+
+    def test_jpeg_output_for_non_png(self, tmp_path):
+        """Non-PNG images should be resized as JPEG."""
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        img = Image.new("RGB", (2000, 2000), (255, 128, 0))
+        path = tmp_path / "photo.jpg"
+        img.save(path, "JPEG", quality=95)
+
+        result = _resize_image_for_vision(path, mime_type="image/jpeg",
+                                          max_base64_bytes=50_000)
+        assert result.startswith("data:image/jpeg;base64,")
+
+    def test_constants_sane(self):
+        """Hard limit should be larger than resize target."""
+        assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
+        assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
+        assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES
+
+    def test_no_pillow_returns_original(self, tmp_path):
+        """Without Pillow, oversized images should be returned as-is."""
+        # Create a dummy file
+        path = tmp_path / "test.png"
+        # Write enough bytes to exceed a tiny limit
+        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)
+
+        with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64:
+            # Simulate a large base64 result
+            mock_b64.return_value = "data:image/png;base64," + "A" * 200
+            with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
+                result = _resize_image_for_vision(path, max_base64_bytes=100)
+        # Should return the original (oversized) data url
+        assert len(result) > 100
+
+
+# ---------------------------------------------------------------------------
+# _is_image_size_error — detect size-related API errors
+# ---------------------------------------------------------------------------
+
+
+class TestIsImageSizeError:
+    """Tests for the size-error detection helper."""
+
+    def test_too_large_message(self):
+        assert _is_image_size_error(Exception("Request payload too large"))
+
+    def test_413_status(self):
+        assert _is_image_size_error(Exception("HTTP 413 Payload Too Large"))
+
+    def test_invalid_request(self):
+        assert _is_image_size_error(Exception("invalid_request_error: image too big"))
+
+    def test_exceeds_limit(self):
+        assert _is_image_size_error(Exception("Image exceeds maximum size"))
+
+    def test_unrelated_error(self):
+        assert not _is_image_size_error(Exception("Connection refused"))
+
+    def test_auth_error(self):
+        assert not _is_image_size_error(Exception("401 Unauthorized"))
+
+    def test_empty_message(self):
+        assert not _is_image_size_error(Exception(""))
diff --git a/tools/browser_tool.py b/tools/browser_tool.py
index a3b408381..ed3cfbb9b 100644
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
                 ),
             }, ensure_ascii=False)
 
-        # Read and convert to base64
-        image_data = screenshot_path.read_bytes()
-        image_base64 = base64.b64encode(image_data).decode("ascii")
-        data_url = f"data:image/png;base64,{image_base64}"
+        # Convert screenshot to base64 at full resolution.
+        _screenshot_bytes = screenshot_path.read_bytes()
+        _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
+        data_url = f"data:image/png;base64,{_screenshot_b64}"
 
         vision_prompt = (
             f"You are analyzing a screenshot of a web browser.\n\n"
@@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
         # Use the centralized LLM router
         vision_model = _get_vision_model()
         logger.debug("browser_vision: analysing screenshot (%d bytes)",
-                     len(image_data))
+                     len(_screenshot_bytes))
 
         # Read vision timeout from config (auxiliary.vision.timeout), default 120s.
         # Local vision models (llama.cpp, ollama) can take well over 30s for
@@ -1922,7 +1922,32 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
         }
         if vision_model:
             call_kwargs["model"] = vision_model
-        response = call_llm(**call_kwargs)
+        # Try full-size screenshot; on size-related rejection, downscale and retry.
+        try:
+            response = call_llm(**call_kwargs)
+        except Exception as _api_err:
+            from tools.vision_tools import (
+                _is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
+            )
+            if (_is_image_size_error(_api_err)
+                    and len(data_url) > _RESIZE_TARGET_BYTES):
+                logger.info(
+                    "Vision API rejected screenshot (%.1f MB); "
+                    "auto-resizing to ~%.0f MB and retrying...",
+                    len(data_url) / (1024 * 1024),
+                    _RESIZE_TARGET_BYTES / (1024 * 1024),
+                )
+                resized_url = _resize_image_for_vision(
+                    screenshot_path, mime_type="image/png")
+                if len(resized_url) >= len(data_url):
+                    # Nothing was saved (e.g. Pillow missing) — resending the
+                    # same payload would only fail again, so surface the error.
+                    raise
+                data_url = resized_url
+                call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
+                response = call_llm(**call_kwargs)
+            else:
+                raise
 
         analysis = (response.choices[0].message.content or "").strip()
         # Redact secrets the vision LLM may have read from the screenshot.
diff --git a/tools/vision_tools.py b/tools/vision_tools.py
index df8fa68c8..8242c7883 100644
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -277,6 +277,124 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
     return data_url
 
 
+# Hard limit for vision API payloads (20 MB), chosen to match Gemini's inline
+# data limit.  Images still above this after auto-resize are rejected locally;
+# providers with a lower cap are handled by the resize-and-retry path.
+_MAX_BASE64_BYTES = 20 * 1024 * 1024
+
+# Target size when auto-resizing on API failure (5 MB).  After a provider
+# rejects an image, we downscale to this target and retry once.
+_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
+
+
+def _is_image_size_error(error: Exception) -> bool:
+    """Detect if an API error is related to image or payload size."""
+    err_str = str(error).lower()
+    return any(hint in err_str for hint in (
+        "too large", "payload", "413", "content_too_large",
+        "request_too_large", "image_url", "invalid_request",
+        "exceeds", "size limit",
+    ))
+
+
+def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
+                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
+    """Convert an image to a base64 data URL, auto-resizing if too large.
+
+    Tries Pillow first to progressively downscale oversized images.  If Pillow
+    is not installed or resizing still exceeds the limit, falls back to the raw
+    bytes and lets the caller handle the size check.
+
+    Returns the base64 data URL string.
+    """
+    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
+    # Skip the expensive full-read + encode if Pillow can resize directly.
+    file_size = image_path.stat().st_size
+    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
+    if estimated_b64 <= max_base64_bytes:
+        # Small enough — just encode directly.
+        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
+        if len(data_url) <= max_base64_bytes:
+            return data_url
+    else:
+        data_url = None  # defer full encode; try Pillow resize first
+
+    # Attempt auto-resize with Pillow (soft dependency)
+    try:
+        from PIL import Image
+        import io as _io
+    except ImportError:
+        logger.info("Pillow not installed — cannot auto-resize oversized image")
+        if data_url is None:
+            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
+        return data_url  # caller will raise the size error
+
+    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
+                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
+                max_base64_bytes / (1024 * 1024))
+
+    mime = mime_type or _determine_mime_type(image_path)
+    # Choose output format: JPEG for photos (smaller), PNG for transparency
+    pil_format = "PNG" if mime == "image/png" else "JPEG"
+    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
+
+    try:
+        # copy() forces a full decode, so corrupt files fail here, and the
+        # context manager releases the file handle before we start resizing.
+        with Image.open(image_path) as opened:
+            img = opened.copy()
+    except Exception as exc:
+        logger.info("Pillow cannot open image for resizing: %s", exc)
+        if data_url is None:
+            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
+        return data_url  # fall through to size-check in caller
+    # JPEG cannot store alpha or palette data; flatten RGBA, LA, P, etc. to RGB
+    if pil_format == "JPEG" and img.mode not in ("RGB", "L"):
+        img = img.convert("RGB")
+
+    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
+    # For JPEG, also try reducing quality at each size step.
+    # For PNG, quality is irrelevant — only dimension reduction helps.
+    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
+    prev_dims = (img.width, img.height)
+    candidate = None  # will be set on first loop iteration
+
+    for attempt in range(5):
+        if attempt > 0:
+            # Floor each side at 64 px, but never above its current size so a
+            # side that is already under 64 px is not upscaled.
+            new_w = max(img.width // 2, min(img.width, 64))
+            new_h = max(img.height // 2, min(img.height, 64))
+            # Stop if dimensions can't shrink further
+            if (new_w, new_h) == prev_dims:
+                break
+            img = img.resize((new_w, new_h), Image.LANCZOS)
+            prev_dims = (new_w, new_h)
+            logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)
+
+        for q in quality_steps:
+            buf = _io.BytesIO()
+            save_kwargs = {"format": pil_format}
+            if q is not None:
+                save_kwargs["quality"] = q
+            img.save(buf, **save_kwargs)
+            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
+            candidate = f"data:{out_mime};base64,{encoded}"
+            if len(candidate) <= max_base64_bytes:
+                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
+                            len(candidate) / (1024 * 1024), q,
+                            img.width, img.height)
+                return candidate
+
+    # If we still can't get it small enough, return the best attempt
+    # and let the caller decide
+    if candidate is not None:
+        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
+                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
+        return candidate
+
+    # Shouldn't reach here, but fall back to full encode
+    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
+
+
 async def vision_analyze_tool(
     image_url: str,
     user_prompt: str,
@@ -376,24 +494,27 @@ async def vision_analyze_tool(
         if not detected_mime_type:
             raise ValueError("Only real image files are supported for vision analysis.")
 
-        # Convert image to base64 data URL
+        # Convert image to base64 — send at full resolution first.
+        # If the provider rejects it as too large, we auto-resize and retry.
         logger.info("Converting image to base64...")
         image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
-        # Calculate size in KB for better readability
         data_size_kb = len(image_data_url) / 1024
         logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
 
-        # Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
-        # Reject early with a clear message instead of a cryptic provider 400.
-        _MAX_BASE64_BYTES = 5 * 1024 * 1024  # 5 MB
-        # The data URL includes the header (e.g. "data:image/jpeg;base64,") which
-        # is negligible, but measure the full string to be safe.
+        # Hard limit (20 MB) — no provider accepts payloads this large.
         if len(image_data_url) > _MAX_BASE64_BYTES:
-            raise ValueError(
-                f"Image too large for vision API: base64 payload is "
-                f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
-                f"Resize or compress the image and try again."
-            )
+            # Try to resize down to 5 MB before giving up.
+            image_data_url = _resize_image_for_vision(
+                temp_image_path, mime_type=detected_mime_type)
+            if len(image_data_url) > _MAX_BASE64_BYTES:
+                raise ValueError(
+                    f"Image too large for vision API: base64 payload is "
+                    f"{len(image_data_url) / (1024 * 1024):.1f} MB "
+                    f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
+                    f"even after resizing. "
+                    f"Install Pillow (`pip install Pillow`) for better auto-resize, "
+                    f"or compress the image manually."
+                )
 
         debug_call_data["image_size_bytes"] = image_size_bytes
 
@@ -442,7 +563,29 @@ async def vision_analyze_tool(
         }
         if model:
             call_kwargs["model"] = model
-        response = await async_call_llm(**call_kwargs)
+        # Try full-size image first; on size-related rejection, downscale and retry.
+        try:
+            response = await async_call_llm(**call_kwargs)
+        except Exception as _api_err:
+            if (_is_image_size_error(_api_err)
+                    and len(image_data_url) > _RESIZE_TARGET_BYTES):
+                logger.info(
+                    "API rejected image (%.1f MB, likely too large); "
+                    "auto-resizing to ~%.0f MB and retrying...",
+                    len(image_data_url) / (1024 * 1024),
+                    _RESIZE_TARGET_BYTES / (1024 * 1024),
+                )
+                resized_url = _resize_image_for_vision(
+                    temp_image_path, mime_type=detected_mime_type)
+                if len(resized_url) >= len(image_data_url):
+                    # Nothing was saved (e.g. Pillow missing) — resending the
+                    # same payload would only fail again, so surface the error.
+                    raise
+                image_data_url = resized_url
+                messages[0]["content"][1]["image_url"]["url"] = image_data_url
+                response = await async_call_llm(**call_kwargs)
+            else:
+                raise
 
         # Extract the analysis — fall back to reasoning if content is empty
         analysis = extract_content_or_reasoning(response)
@@ -498,8 +641,8 @@ async def vision_analyze_tool(
         elif "invalid_request" in err_str or "image_url" in err_str:
             analysis = (
                 "The vision API rejected the image. This can happen when the "
-                "image is too large, in an unsupported format, or corrupted. "
-                "Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
+                "image is in an unsupported format, corrupted, or still too "
+                "large after auto-resize. Try a smaller JPEG/PNG and retry. "
                 f"Error: {e}"
             )
         else: