fix(vision): auto-resize oversized images, increase default timeout, fix vision capability detection

Cherry-picked from PR #7749 by kshitijk4poor with modifications:

- Raise hard image limit from 5 MB to 20 MB (matches most restrictive provider)
- Send images at full resolution first; only auto-resize to 5 MB on API failure
- Add _is_image_size_error() helper to detect size-related API rejections
- Auto-resize uses Pillow (soft dep) with progressive downscale + JPEG quality reduction
- Fix get_model_capabilities() to check modalities.input for vision support
- Increase default vision timeout from 30s to 120s (matches hardcoded fallback intent)
- Apply retry-with-resize to both vision_analyze_tool and browser_vision

Closes #7740
2026-04-25 00:51:20 +00:00 · 2026-04-11 11:07:18 -07:00 · 2026-04-11 11:07:18 -07:00 · 50bb4fe010
commit 50bb4fe010
parent 06e1d9cdd4
6 changed files with 399 additions and 25 deletions
--- a/tests/agent/test_models_dev.py
+++ b/tests/agent/test_models_dev.py
@@ -7,6 +7,7 @@ from agent.models_dev import (
    PROVIDER_TO_MODELS_DEV,
    _extract_context,
    fetch_models_dev,
+    get_model_capabilities,
    lookup_models_dev_context,
 )

@@ -195,3 +196,88 @@ class TestFetchModelsDev:
        result = fetch_models_dev()
        mock_get.assert_not_called()
        assert result == SAMPLE_REGISTRY
+
+
+# ---------------------------------------------------------------------------
+# get_model_capabilities — vision via modalities.input
+# ---------------------------------------------------------------------------
+
+
+CAPS_REGISTRY = {
+    "google": {
+        "id": "google",
+        "models": {
+            "gemma-4-31b-it": {
+                "id": "gemma-4-31b-it",
+                "attachment": False,
+                "tool_call": True,
+                "modalities": {"input": ["text", "image"]},
+                "limit": {"context": 128000, "output": 8192},
+            },
+            "gemma-3-1b": {
+                "id": "gemma-3-1b",
+                "tool_call": True,
+                "limit": {"context": 32000, "output": 8192},
+            },
+        },
+    },
+    "anthropic": {
+        "id": "anthropic",
+        "models": {
+            "claude-sonnet-4": {
+                "id": "claude-sonnet-4",
+                "attachment": True,
+                "tool_call": True,
+                "limit": {"context": 200000, "output": 64000},
+            },
+        },
+    },
+}
+
+
+class TestGetModelCapabilities:
+    """Tests for get_model_capabilities vision detection."""
+
+    def test_vision_from_attachment_flag(self):
+        """Models with attachment=True should report supports_vision=True."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("anthropic", "claude-sonnet-4")
+        assert caps is not None
+        assert caps.supports_vision is True
+
+    def test_vision_from_modalities_input_image(self):
+        """Models with 'image' in modalities.input but attachment=False should
+        still report supports_vision=True (the core fix in this PR)."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "gemma-4-31b-it")
+        assert caps is not None
+        assert caps.supports_vision is True
+
+    def test_no_vision_without_attachment_or_modalities(self):
+        """Models with neither attachment nor image modality should be non-vision."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "gemma-3-1b")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_modalities_non_dict_handled(self):
+        """Non-dict modalities field should not crash."""
+        registry = {
+            "google": {"id": "google", "models": {
+                "weird-model": {
+                    "id": "weird-model",
+                    "modalities": "text",  # not a dict
+                    "limit": {"context": 200000, "output": 8192},
+                },
+            }},
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
+            caps = get_model_capabilities("gemini", "weird-model")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_model_not_found_returns_none(self):
+        """Unknown model should return None."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("anthropic", "nonexistent-model")
+        assert caps is None