fix(vision): detect Ollama vision models via /api/show (#54511)

When local Ollama models are absent from models.dev, probe the Ollama
server's /api/show capabilities so attached images are routed natively
instead of being stripped as non-vision input.
This commit is contained in:
HexLab98 2026-06-29 07:13:00 +07:00 committed by Teknium
parent b481348fbc
commit d7e573e54d
2 changed files with 144 additions and 4 deletions

View file

@ -251,6 +251,78 @@ def _supports_vision_override(
return None
def _resolve_inference_base_url(
    cfg: Optional[Dict[str, Any]],
    provider: str,
) -> str:
    """Best-effort base URL for the active inference provider.

    Sources are consulted in this order; the first non-empty value wins:

    1. ``agent.auxiliary_client._RUNTIME_MAIN_BASE_URL`` (runtime override);
    2. ``cfg["model"]["base_url"]``;
    3. ``cfg["providers"][<name>]["base_url"]``;
    4. ``base_url`` of a ``cfg["custom_providers"]`` entry whose ``name``
       matches (case-insensitively).

    For steps 3 and 4 each provider name is tried both as given and under
    its ``custom:`` alias (``foo`` <-> ``custom:foo``).  Names are tried in
    a fixed order -- *provider* before the config's ``model.provider``, and
    the exact name before its alias -- so the result does not depend on
    hash ordering when several candidates are configured.

    Args:
        cfg: Loaded configuration mapping, or ``None``.
        provider: Name of the active provider.

    Returns:
        The stripped base URL, or ``""`` when nothing could be resolved.
    """
    try:
        from agent.auxiliary_client import _RUNTIME_MAIN_BASE_URL

        runtime = str(_RUNTIME_MAIN_BASE_URL or "").strip()
        if runtime:
            return runtime
    except Exception:
        # Deliberate best-effort: if the runtime override cannot be read,
        # fall back to the static configuration below.
        pass

    if not isinstance(cfg, dict):
        return ""

    model_cfg_raw = cfg.get("model")
    model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
    base_url = str(model_cfg.get("base_url") or "").strip()
    if base_url:
        return base_url

    config_provider = str(model_cfg.get("provider") or "").strip()

    # Build the candidate names in priority order.  A list (de-duplicated
    # through dict.fromkeys) keeps insertion order, unlike a set.
    ordered: list[str] = []
    for p in filter(None, (provider, config_provider)):
        ordered.append(p)
        if p.lower().startswith("custom:"):
            alias = p.split(":", 1)[1]
        else:
            alias = f"custom:{p}"
        # A bare "custom:" has no alias; an empty name must never become a
        # candidate, otherwise it would match unnamed provider entries.
        if alias:
            ordered.append(alias)
    candidate_names = list(dict.fromkeys(ordered))

    providers_cfg = cfg.get("providers")
    if isinstance(providers_cfg, dict):
        for name in candidate_names:
            entry = providers_cfg.get(name)
            if isinstance(entry, dict):
                bu = str(entry.get("base_url") or "").strip()
                if bu:
                    return bu

    custom_providers = cfg.get("custom_providers")
    if isinstance(custom_providers, list):
        lowered = {n.lower() for n in candidate_names}
        for entry_raw in custom_providers:
            if not isinstance(entry_raw, dict):
                continue
            entry_name = str(entry_raw.get("name") or "").strip()
            # An exact match is also a case-insensitive match, so a single
            # lowered comparison covers both.
            if entry_name.lower() not in lowered:
                continue
            bu = str(entry_raw.get("base_url") or "").strip()
            if bu:
                return bu

    return ""
def _should_probe_ollama_vision(provider: str, base_url: str) -> bool:
    """Decide whether the Ollama vision probe is worth attempting.

    Returns True when *provider* is literally ``ollama`` (ignoring case and
    surrounding whitespace), or when ``detect_local_server_type`` classifies
    *base_url* as ``"ollama"``.  Returns False when there is no base URL to
    inspect or when detection raises.
    """
    normalized_provider = (provider or "").strip().lower()
    if normalized_provider == "ollama":
        return True

    if base_url:
        try:
            from agent.model_metadata import detect_local_server_type

            is_ollama = detect_local_server_type(base_url) == "ollama"
        except Exception:
            # Detection is advisory only; any failure means "do not probe".
            is_ollama = False
        return is_ollama

    return False
def _coerce_mode(raw: Any) -> str:
"""Normalize a config value into one of the valid modes."""
if not isinstance(raw, str):
@ -302,15 +374,33 @@ def _lookup_supports_vision(
return override
if not provider or not model:
return None
caps = None
try:
from agent.models_dev import get_model_capabilities
caps = get_model_capabilities(provider, model)
except Exception as exc: # pragma: no cover - defensive
logger.debug("image_routing: caps lookup failed for %s:%s%s", provider, model, exc)
return None
if caps is None:
return None
return bool(caps.supports_vision)
if caps is not None:
return bool(caps.supports_vision)
base_url = _resolve_inference_base_url(cfg, provider)
if not base_url and (provider or "").strip().lower() == "ollama":
base_url = "http://localhost:11434/v1"
if _should_probe_ollama_vision(provider, base_url):
try:
from agent.model_metadata import query_ollama_supports_vision
ollama_vision = query_ollama_supports_vision(model, base_url)
if ollama_vision is not None:
return ollama_vision
except Exception as exc: # pragma: no cover - defensive
logger.debug(
"image_routing: ollama vision probe failed for %s:%s%s",
provider,
model,
exc,
)
return None
def decide_image_input_mode(

View file

@ -1199,6 +1199,56 @@ def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Option
return None
def query_ollama_supports_vision(model: str, base_url: str, api_key: str = "") -> Optional[bool]:
    """Return True/False when Ollama ``/api/show`` reports vision support.

    Uses the ``capabilities`` field on Ollama 0.6.0+ and falls back to
    ``model_info.*.vision.block_count`` on older servers. Returns None when
    the server is unreachable, not Ollama, or the model is unknown.

    Args:
        model: Model identifier; any provider prefix is stripped before the
            request is sent.
        base_url: Server base URL. A trailing ``/v1`` is removed so the
            native ``/api/show`` endpoint is addressed.
        api_key: Optional key forwarded to server detection and used to
            build the request headers.

    Returns:
        ``True`` if ``capabilities`` lists ``vision`` or a ``model_info`` key
        contains ``vision.block_count``; ``False`` if ``capabilities`` is a
        non-empty list without ``vision``; ``None`` when the answer cannot
        be determined (including a response body that is not a JSON object).
    """
    import httpx

    bare_model = _strip_provider_prefix(model)
    if not bare_model or not base_url:
        return None

    try:
        if detect_local_server_type(base_url, api_key=api_key) != "ollama":
            return None
    except Exception:
        return None

    # Drop a trailing "/v1" so the native Ollama API root is used.
    server_url = base_url.rstrip("/")
    if server_url.endswith("/v1"):
        server_url = server_url[:-3]

    headers = _auth_headers(api_key)
    try:
        with httpx.Client(timeout=3.0, headers=headers) as client:
            resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
            if resp.status_code != 200:
                return None
            data = resp.json()
    except Exception:
        return None

    # A valid-JSON but non-object body (list, string, null) has no fields to
    # inspect; treat it as "unknown" instead of raising AttributeError.
    if not isinstance(data, dict):
        return None

    caps = data.get("capabilities")
    if isinstance(caps, list):
        if any(str(cap).lower() == "vision" for cap in caps):
            return True
        # A non-empty list without "vision" is a definitive "no"; an empty
        # list is inconclusive and falls through to the model_info check.
        if caps:
            return False

    model_info = data.get("model_info")
    if isinstance(model_info, dict) and any(
        "vision.block_count" in str(key).lower() for key in model_info
    ):
        return True

    return None
def _query_ollama_api_show(model: str, base_url: str, api_key: str = "") -> Optional[int]:
"""Query an Ollama server's native ``/api/show`` for context length.