From d7e573e54dc55a25e5f1a9a80fcca4b3b478f6e3 Mon Sep 17 00:00:00 2001
From: HexLab98
Date: Mon, 29 Jun 2026 07:13:00 +0700
Subject: [PATCH] fix(vision): detect Ollama vision models via /api/show
 (#54511)

When local Ollama models are absent from models.dev, probe the Ollama
server's /api/show capabilities so attached images are routed natively
instead of being stripped as non-vision input.
---
 agent/image_routing.py  | 114 +++++++++++++++++++++++++++++++++++++++--
 agent/model_metadata.py |  60 ++++++++++++++++++++++
 2 files changed, 170 insertions(+), 4 deletions(-)

diff --git a/agent/image_routing.py b/agent/image_routing.py
index 3014acf1cab..acd66fea827 100644
--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -251,6 +251,91 @@ def _supports_vision_override(
     return None
 
 
+def _resolve_inference_base_url(
+    cfg: Optional[Dict[str, Any]],
+    provider: str,
+) -> str:
+    """Best-effort base URL for the active inference provider.
+
+    Lookup order: ``_RUNTIME_MAIN_BASE_URL`` from ``agent.auxiliary_client``,
+    then ``cfg["model"]["base_url"]``, then a matching ``cfg["providers"]``
+    entry, then a matching ``cfg["custom_providers"]`` entry. Returns ``""``
+    when none of them yields a non-empty URL.
+    """
+    try:
+        from agent.auxiliary_client import _RUNTIME_MAIN_BASE_URL
+
+        runtime = str(_RUNTIME_MAIN_BASE_URL or "").strip()
+        if runtime:
+            return runtime
+    except Exception:
+        # Deliberate best-effort: fall through to the config lookups below.
+        pass
+
+    if not isinstance(cfg, dict):
+        return ""
+
+    model_cfg_raw = cfg.get("model")
+    model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
+    base_url = str(model_cfg.get("base_url") or "").strip()
+    if base_url:
+        return base_url
+
+    config_provider = str(model_cfg.get("provider") or "").strip()
+    # Accept each provider name in both its bare and ``custom:``-prefixed form.
+    candidate_names: set[str] = set()
+    for p in filter(None, (provider, config_provider)):
+        candidate_names.add(p)
+        if p.lower().startswith("custom:"):
+            candidate_names.add(p.split(":", 1)[1])
+        else:
+            candidate_names.add(f"custom:{p}")
+
+    providers_cfg = cfg.get("providers")
+    if isinstance(providers_cfg, dict):
+        for name in candidate_names:
+            entry = providers_cfg.get(name)
+            if isinstance(entry, dict):
+                bu = str(entry.get("base_url") or "").strip()
+                if bu:
+                    return bu
+
+    custom_providers = cfg.get("custom_providers")
+    if isinstance(custom_providers, list):
+        lowered = {n.lower() for n in candidate_names}
+        for entry_raw in custom_providers:
+            if not isinstance(entry_raw, dict):
+                continue
+            entry_name = str(entry_raw.get("name") or "").strip()
+            if entry_name not in candidate_names and entry_name.lower() not in lowered:
+                continue
+            bu = str(entry_raw.get("base_url") or "").strip()
+            if bu:
+                return bu
+
+    return ""
+
+
+def _should_probe_ollama_vision(provider: str, base_url: str) -> bool:
+    """True when the active provider likely fronts a local Ollama server.
+
+    A provider named ``ollama`` always qualifies. Otherwise the decision is
+    delegated to ``detect_local_server_type(base_url)``; an empty URL or any
+    failure there counts as "not Ollama".
+    """
+    p = (provider or "").strip().lower()
+    if p == "ollama":
+        return True
+    if not base_url:
+        return False
+    try:
+        from agent.model_metadata import detect_local_server_type
+
+        return detect_local_server_type(base_url) == "ollama"
+    except Exception:
+        return False
+
+
 def _coerce_mode(raw: Any) -> str:
     """Normalize a config value into one of the valid modes."""
     if not isinstance(raw, str):
@@ -302,15 +387,36 @@ def _lookup_supports_vision(
         return override
     if not provider or not model:
         return None
+    caps = None
     try:
         from agent.models_dev import get_model_capabilities
         caps = get_model_capabilities(provider, model)
     except Exception as exc:  # pragma: no cover - defensive
         logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc)
-        return None
-    if caps is None:
-        return None
-    return bool(caps.supports_vision)
+    if caps is not None:
+        return bool(caps.supports_vision)
+
+    # No models.dev capabilities for this model: fall back to asking an Ollama
+    # server directly, when the provider looks like one.
+    base_url = _resolve_inference_base_url(cfg, provider)
+    if not base_url and (provider or "").strip().lower() == "ollama":
+        # Nothing configured: assume Ollama's default local endpoint.
+        base_url = "http://localhost:11434/v1"
+    if _should_probe_ollama_vision(provider, base_url):
+        try:
+            from agent.model_metadata import query_ollama_supports_vision
+
+            ollama_vision = query_ollama_supports_vision(model, base_url)
+            if ollama_vision is not None:
+                return ollama_vision
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.debug(
+                "image_routing: ollama vision probe failed for %s:%s — %s",
+                provider,
+                model,
+                exc,
+            )
+    return None
 
 
 def decide_image_input_mode(
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 70177dbb3c0..9430a98bfb1 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -1199,6 +1199,66 @@ def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Option
     return None
 
 
+def query_ollama_supports_vision(model: str, base_url: str, api_key: str = "") -> Optional[bool]:
+    """Return True/False when Ollama ``/api/show`` reports vision support.
+
+    Uses the ``capabilities`` field on Ollama 0.6.0+ and falls back to
+    ``model_info.*.vision.block_count`` on older servers. Returns None when
+    the server is unreachable, not Ollama, the model is unknown, or the
+    response body is not a JSON object.
+    """
+    import httpx
+
+    bare_model = _strip_provider_prefix(model)
+    if not bare_model or not base_url:
+        return None
+
+    try:
+        if detect_local_server_type(base_url, api_key=api_key) != "ollama":
+            return None
+    except Exception:
+        return None
+
+    # ``/api/show`` is Ollama's native route; it does not live under the
+    # OpenAI-compatible ``/v1`` prefix, so strip that suffix if present.
+    server_url = base_url.rstrip("/")
+    if server_url.endswith("/v1"):
+        server_url = server_url[:-3]
+
+    headers = _auth_headers(api_key)
+
+    try:
+        with httpx.Client(timeout=3.0, headers=headers) as client:
+            resp = client.post(f"{server_url}/api/show", json={"name": bare_model})
+            if resp.status_code != 200:
+                return None
+            data = resp.json()
+    except Exception:
+        return None
+
+    # A 200 whose JSON body is not an object (list, string, null) has no
+    # ``.get``; report "unknown" instead of raising AttributeError.
+    if not isinstance(data, dict):
+        return None
+
+    caps = data.get("capabilities")
+    if isinstance(caps, list):
+        if any(str(cap).lower() == "vision" for cap in caps):
+            return True
+        # A non-empty list without "vision" is a definite no; an empty list
+        # carries no signal, so fall through to the ``model_info`` check.
+        if caps:
+            return False
+
+    model_info = data.get("model_info")
+    if isinstance(model_info, dict):
+        for key in model_info:
+            if "vision.block_count" in str(key).lower():
+                return True
+
+    return None
+
+
 def _query_ollama_api_show(model: str, base_url: str, api_key: str = "") -> Optional[int]:
     """Query an Ollama server's native ``/api/show`` for context length.
 