fix(vision): auto-resize oversized images, increase default timeout, fix vision capability detection

Cherry-picked from PR #7749 by kshitijk4poor with modifications:

- Raise the hard image limit from 5 MB to 20 MB (matches the most restrictive major provider's limit)
- Send images at full resolution first; only auto-resize to 5 MB on API failure
- Add _is_image_size_error() helper to detect size-related API rejections
- Auto-resize uses Pillow (soft dep) with progressive downscale + JPEG quality reduction
- Fix get_model_capabilities() to check modalities.input for vision support
- Increase the default vision timeout from 30s to 120s (matching the intent of the hardcoded fallback)
- Applied retry-with-resize to both vision_analyze_tool and browser_vision

Closes #7740
This commit is contained in:
kshitijk4poor 2026-04-11 11:07:18 -07:00 committed by Teknium
parent 06e1d9cdd4
commit 50bb4fe010
6 changed files with 399 additions and 25 deletions

View file

@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
return data_url
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit). Images above this are rejected.
_MAX_BASE64_BYTES = 20 * 1024 * 1024
# Target size when auto-resizing on API failure (5 MB). After a provider
# rejects an image, we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
def _is_image_size_error(error: Exception) -> bool:
"""Detect if an API error is related to image or payload size."""
err_str = str(error).lower()
return any(hint in err_str for hint in (
"too large", "payload", "413", "content_too_large",
"request_too_large", "image_url", "invalid_request",
"exceeds", "size limit",
))
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    If the raw file already fits within ``max_base64_bytes`` it is encoded
    unchanged. Otherwise Pillow (a soft dependency) is used to progressively
    halve the dimensions and, for JPEG output, lower the quality until the
    data URL fits.

    Args:
        image_path: Path of the image file on disk.
        mime_type: MIME type of the image; determined from the file via
            ``_determine_mime_type`` when omitted.
        max_base64_bytes: Target upper bound for the length of the returned
            data URL.

    Returns:
        The base64 data URL string. It may still exceed ``max_base64_bytes``
        when Pillow is missing, cannot open the file, or could not shrink the
        image enough; the caller is expected to perform the final size check.
    """
    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # Skip the expensive full-read + encode if Pillow can resize directly.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    data_url = None  # full encode is deferred unless the estimate says it fits
    if estimated_b64 <= max_base64_bytes:
        # Small enough — just encode directly.
        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        if len(data_url) <= max_base64_bytes:
            return data_url

    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # caller will raise the size error

    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))

    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"

    try:
        source_img = Image.open(image_path)
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        if data_url is None:
            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
        return data_url  # fall through to size-check in caller

    # Modes Pillow's JPEG encoder can write directly; everything else
    # (RGBA, P, LA, PA, I, F, I;16, ...) must be converted first or
    # img.save() raises.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    candidate = None  # will be set on first loop iteration
    try:
        img = source_img
        if pil_format == "JPEG" and img.mode not in jpeg_modes:
            img = img.convert("RGB")

        prev_dims = (img.width, img.height)
        for attempt in range(5):
            if attempt > 0:
                # Halve each side but never go below 64 px; a side that is
                # already under 64 px is left alone rather than enlarged.
                new_w = max(img.width // 2, min(img.width, 64))
                new_h = max(img.height // 2, min(img.height, 64))
                # Stop if dimensions can't shrink further
                if (new_w, new_h) == prev_dims:
                    break
                img = img.resize((new_w, new_h), Image.LANCZOS)
                prev_dims = (new_w, new_h)
                logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)

            for q in quality_steps:
                buf = _io.BytesIO()
                save_kwargs = {"format": pil_format}
                if q is not None:
                    save_kwargs["quality"] = q
                img.save(buf, **save_kwargs)
                encoded = base64.b64encode(buf.getvalue()).decode("ascii")
                candidate = f"data:{out_mime};base64,{encoded}"
                if len(candidate) <= max_base64_bytes:
                    logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                                len(candidate) / (1024 * 1024), q,
                                img.width, img.height)
                    return candidate
    finally:
        # Release the file handle held by the image opened from disk,
        # regardless of how the resize loop exits.
        source_img.close()

    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate

    # Shouldn't reach here, but fall back to full encode
    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
async def vision_analyze_tool(
image_url: str,
user_prompt: str,
@ -376,24 +490,27 @@ async def vision_analyze_tool(
if not detected_mime_type:
raise ValueError("Only real image files are supported for vision analysis.")
# Convert image to base64 data URL
# Convert image to base64 — send at full resolution first.
# If the provider rejects it as too large, we auto-resize and retry.
logger.info("Converting image to base64...")
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
# Calculate size in KB for better readability
data_size_kb = len(image_data_url) / 1024
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
# Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
# Reject early with a clear message instead of a cryptic provider 400.
_MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB
# The data URL includes the header (e.g. "data:image/jpeg;base64,") which
# is negligible, but measure the full string to be safe.
# Hard limit (20 MB) — no provider accepts payloads this large.
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
f"Resize or compress the image and try again."
)
# Try to resize down to 5 MB before giving up.
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB "
f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
f"even after resizing. "
f"Install Pillow (`pip install Pillow`) for better auto-resize, "
f"or compress the image manually."
)
debug_call_data["image_size_bytes"] = image_size_bytes
@ -442,7 +559,24 @@ async def vision_analyze_tool(
}
if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
# Try full-size image first; on size-related rejection, downscale and retry.
try:
response = await async_call_llm(**call_kwargs)
except Exception as _api_err:
if (_is_image_size_error(_api_err)
and len(image_data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"API rejected image (%.1f MB, likely too large); "
"auto-resizing to ~%.0f MB and retrying...",
len(image_data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
messages[0]["content"][1]["image_url"]["url"] = image_data_url
response = await async_call_llm(**call_kwargs)
else:
raise
# Extract the analysis — fall back to reasoning if content is empty
analysis = extract_content_or_reasoning(response)
@ -498,8 +632,8 @@ async def vision_analyze_tool(
elif "invalid_request" in err_str or "image_url" in err_str:
analysis = (
"The vision API rejected the image. This can happen when the "
"image is too large, in an unsupported format, or corrupted. "
"Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
"image is in an unsupported format, corrupted, or still too "
"large after auto-resize. Try a smaller JPEG/PNG and retry. "
f"Error: {e}"
)
else: