mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-11 08:42:11 +00:00
refactor(vision): consolidate native fast-path gate into one shared helper
The fast-path decision (native routing + provider allowlist OR supports_vision override) lived inline in vision_analyze and was copied into browser_vision. Extract it to _should_use_native_vision_fast_path() so both tools share one source of truth. - vision_tools: gate logic now one helper; vision_analyze calls it in 3 lines - browser_tool: thin envelope decoration over the shared helper, not a copy - browser_vision typed Union[str, Dict] to match its real return shape - tests slimmed to target the override path + text-mode-wins invariant
This commit is contained in:
parent
c3f28c651d
commit
c77a697fa4
4 changed files with 140 additions and 193 deletions
|
|
@ -251,6 +251,7 @@ class TestBrowserVisionConfig:
|
|||
assert mock_llm.call_args.kwargs["timeout"] == 120.0
|
||||
|
||||
def test_browser_vision_native_fast_path_returns_multimodal(self, tmp_path):
|
||||
"""supports_vision override → screenshot attached natively, no aux call."""
|
||||
from agent.auxiliary_client import clear_runtime_main, set_runtime_main
|
||||
from tools.browser_tool import browser_vision
|
||||
|
||||
|
|
@ -265,10 +266,7 @@ class TestBrowserVisionConfig:
|
|||
"tools.browser_tool._run_browser_command",
|
||||
return_value={
|
||||
"success": True,
|
||||
"data": {
|
||||
"path": str(screenshot),
|
||||
"annotations": annotations,
|
||||
},
|
||||
"data": {"path": str(screenshot), "annotations": annotations},
|
||||
},
|
||||
),
|
||||
patch(
|
||||
|
|
@ -278,9 +276,7 @@ class TestBrowserVisionConfig:
|
|||
patch("tools.browser_tool._get_vision_model") as mock_get_vision_model,
|
||||
patch("tools.browser_tool.call_llm") as mock_llm,
|
||||
):
|
||||
result = browser_vision(
|
||||
"what is on the page?", annotate=True, task_id="test"
|
||||
)
|
||||
result = browser_vision("what is on the page?", annotate=True, task_id="test")
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
|
|
@ -289,55 +285,12 @@ class TestBrowserVisionConfig:
|
|||
assert result["meta"]["screenshot_path"] == str(screenshot)
|
||||
assert result["meta"]["annotations"] == annotations
|
||||
assert any(p.get("type") == "image_url" for p in result["content"])
|
||||
assert "what is on the page?" in result["content"][0]["text"]
|
||||
assert str(screenshot) in result["content"][0]["text"]
|
||||
assert "Screenshot path:" in result["text_summary"]
|
||||
assert f"Screenshot path: {screenshot}" in result["text_summary"]
|
||||
mock_get_vision_model.assert_not_called()
|
||||
mock_llm.assert_not_called()
|
||||
|
||||
def test_browser_vision_native_mode_without_supports_vision_uses_aux_llm(self, tmp_path):
|
||||
from agent.auxiliary_client import clear_runtime_main, set_runtime_main
|
||||
from tools.browser_tool import browser_vision
|
||||
|
||||
shots_dir, screenshot = self._setup_screenshot(tmp_path)
|
||||
mock_response = MagicMock()
|
||||
mock_choice = MagicMock()
|
||||
mock_choice.message.content = "Fallback screenshot analysis"
|
||||
mock_response.choices = [mock_choice]
|
||||
|
||||
set_runtime_main("brand-new-provider", "opaque-model")
|
||||
try:
|
||||
with (
|
||||
patch("hermes_constants.get_hermes_dir", return_value=shots_dir),
|
||||
patch("tools.browser_tool._cleanup_old_screenshots"),
|
||||
patch(
|
||||
"tools.browser_tool._run_browser_command",
|
||||
return_value={"success": True, "data": {"path": str(screenshot)}},
|
||||
),
|
||||
patch(
|
||||
"hermes_cli.config.load_config",
|
||||
return_value={"agent": {"image_input_mode": "native"}},
|
||||
),
|
||||
patch("tools.browser_tool._get_vision_model", return_value="test-model"),
|
||||
patch("tools.browser_tool.call_llm", return_value=mock_response) as mock_llm,
|
||||
):
|
||||
result = json.loads(browser_vision("what is on the page?", task_id="test"))
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
assert result["success"] is True
|
||||
assert result["analysis"] == "Fallback screenshot analysis"
|
||||
assert result["screenshot_path"] == str(screenshot)
|
||||
mock_llm.assert_called_once()
|
||||
kwargs = mock_llm.call_args.kwargs
|
||||
assert kwargs["task"] == "vision"
|
||||
assert kwargs["model"] == "test-model"
|
||||
assert kwargs["messages"][0]["content"][1]["type"] == "image_url"
|
||||
assert kwargs["messages"][0]["content"][1]["image_url"]["url"].startswith(
|
||||
"data:image/png;base64,"
|
||||
)
|
||||
|
||||
def test_browser_vision_text_mode_blocks_native_fast_path(self, tmp_path):
|
||||
"""Explicit text routing → aux LLM used even with supports_vision."""
|
||||
from agent.auxiliary_client import clear_runtime_main, set_runtime_main
|
||||
from tools.browser_tool import browser_vision
|
||||
|
||||
|
|
@ -372,7 +325,6 @@ class TestBrowserVisionConfig:
|
|||
|
||||
assert result["success"] is True
|
||||
assert result["analysis"] == "Text-mode screenshot analysis"
|
||||
assert result["screenshot_path"] == str(screenshot)
|
||||
mock_llm.assert_called_once()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -146,35 +146,32 @@ class TestVisionAnalyzeNative:
|
|||
class TestHandleVisionAnalyzeFastPath:
|
||||
"""Verify the dispatcher chooses fast-path vs aux-LLM correctly."""
|
||||
|
||||
def test_native_mode_with_supported_transport_uses_fast_path(self, tmp_path):
|
||||
"""Explicit native mode + known transport returns multimodal."""
|
||||
def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch):
|
||||
"""Main model supports native vision → fast path returns multimodal."""
|
||||
img = tmp_path / "x.png"
|
||||
img.write_bytes(_TINY_PNG)
|
||||
|
||||
async def _aux_sentinel(*args, **kwargs):
|
||||
return '{"sentinel": "aux-path"}'
|
||||
|
||||
# Set runtime override so the handler thinks we're on opus@openrouter
|
||||
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
|
||||
set_runtime_main("openrouter", "anthropic/claude-opus-4.6")
|
||||
try:
|
||||
# Mock decide_image_input_mode to always return "native" so the
|
||||
# fast path fires regardless of model-catalog state in CI.
|
||||
with patch(
|
||||
"hermes_cli.config.load_config",
|
||||
return_value={"agent": {"image_input_mode": "native"}},
|
||||
), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux:
|
||||
result = asyncio.get_event_loop().run_until_complete(
|
||||
_handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
)
|
||||
"agent.image_routing.decide_image_input_mode",
|
||||
return_value="native",
|
||||
):
|
||||
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
result = asyncio.get_event_loop().run_until_complete(coro)
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
assert isinstance(result, dict), (
|
||||
assert isinstance(result, dict), \
|
||||
f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
|
||||
)
|
||||
assert result.get("_multimodal") is True
|
||||
mock_aux.assert_not_called()
|
||||
|
||||
def test_native_mode_with_unsupported_transport_falls_through(self, tmp_path):
|
||||
"""Explicit native mode still respects the transport gate."""
|
||||
def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch):
|
||||
"""Non-vision main model → fast path skipped, aux LLM path attempted."""
|
||||
img = tmp_path / "x.png"
|
||||
img.write_bytes(_TINY_PNG)
|
||||
|
||||
|
|
@ -182,27 +179,39 @@ class TestHandleVisionAnalyzeFastPath:
|
|||
return '{"sentinel": "aux-path"}'
|
||||
|
||||
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
|
||||
set_runtime_main("brand-new-provider", "opaque-model")
|
||||
set_runtime_main("openrouter", "qwen/qwen3-coder")
|
||||
try:
|
||||
with (
|
||||
patch(
|
||||
"hermes_cli.config.load_config",
|
||||
return_value={"agent": {"image_input_mode": "native"}},
|
||||
),
|
||||
patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux,
|
||||
):
|
||||
result = asyncio.get_event_loop().run_until_complete(
|
||||
_handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
)
|
||||
with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
|
||||
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
result = asyncio.get_event_loop().run_until_complete(coro)
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
assert isinstance(result, str)
|
||||
assert json.loads(result) == {"sentinel": "aux-path"}
|
||||
mock_aux.assert_called_once()
|
||||
assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
|
||||
"Fast path fired for non-vision model; should have fallen through to aux LLM"
|
||||
|
||||
def test_supports_vision_bypasses_transport_gate(self, tmp_path):
|
||||
"""supports_vision=True enables fast path even on unknown providers."""
|
||||
def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch):
|
||||
"""Even with vision-capable model, unknown provider → fall through."""
|
||||
img = tmp_path / "x.png"
|
||||
img.write_bytes(_TINY_PNG)
|
||||
|
||||
async def _aux_sentinel(*args, **kwargs):
|
||||
return '{"sentinel": "aux-path"}'
|
||||
|
||||
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
|
||||
set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6")
|
||||
try:
|
||||
with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
|
||||
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
result = asyncio.get_event_loop().run_until_complete(coro)
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
|
||||
"Fast path fired for unknown provider; should have fallen through"
|
||||
|
||||
def test_supports_vision_override_bypasses_provider_allowlist(self, tmp_path):
|
||||
"""supports_vision=true enables the fast path on an unlisted provider."""
|
||||
img = tmp_path / "x.png"
|
||||
img.write_bytes(_TINY_PNG)
|
||||
|
||||
|
|
@ -215,21 +224,19 @@ class TestHandleVisionAnalyzeFastPath:
|
|||
with patch(
|
||||
"hermes_cli.config.load_config",
|
||||
return_value={"model": {"supports_vision": True}},
|
||||
), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux:
|
||||
result = asyncio.get_event_loop().run_until_complete(
|
||||
_handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
)
|
||||
), patch(
|
||||
"tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel,
|
||||
) as mock_aux:
|
||||
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
result = asyncio.get_event_loop().run_until_complete(coro)
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
assert isinstance(result, dict), (
|
||||
f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
|
||||
)
|
||||
assert result.get("_multimodal") is True
|
||||
assert isinstance(result, dict) and result.get("_multimodal") is True
|
||||
mock_aux.assert_not_called()
|
||||
|
||||
def test_text_mode_still_blocks_fast_path_when_supports_vision_true(self, tmp_path):
|
||||
"""Routing mode wins over supports_vision when text mode was chosen."""
|
||||
def test_text_mode_wins_over_supports_vision_override(self, tmp_path):
|
||||
"""Explicit text routing blocks the fast path even with supports_vision."""
|
||||
img = tmp_path / "x.png"
|
||||
img.write_bytes(_TINY_PNG)
|
||||
|
||||
|
|
@ -239,19 +246,17 @@ class TestHandleVisionAnalyzeFastPath:
|
|||
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
|
||||
set_runtime_main("brand-new-provider", "llava-v1.6")
|
||||
try:
|
||||
with (
|
||||
patch(
|
||||
"hermes_cli.config.load_config",
|
||||
return_value={
|
||||
"agent": {"image_input_mode": "text"},
|
||||
"model": {"supports_vision": True},
|
||||
},
|
||||
),
|
||||
patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux,
|
||||
):
|
||||
result = asyncio.get_event_loop().run_until_complete(
|
||||
_handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
)
|
||||
with patch(
|
||||
"hermes_cli.config.load_config",
|
||||
return_value={
|
||||
"agent": {"image_input_mode": "text"},
|
||||
"model": {"supports_vision": True},
|
||||
},
|
||||
), patch(
|
||||
"tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel,
|
||||
) as mock_aux:
|
||||
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
|
||||
result = asyncio.get_event_loop().run_until_complete(coro)
|
||||
finally:
|
||||
clear_runtime_main()
|
||||
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ import tempfile
|
|||
import threading
|
||||
import time
|
||||
import requests
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
from typing import Dict, Any, Optional, List, Tuple, Union
|
||||
from pathlib import Path
|
||||
from agent.auxiliary_client import call_llm
|
||||
from hermes_constants import get_hermes_home
|
||||
|
|
@ -3044,16 +3044,16 @@ def browser_get_images(task_id: Optional[str] = None) -> str:
|
|||
return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False)
|
||||
|
||||
|
||||
def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
|
||||
def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> Union[str, Dict[str, Any]]:
|
||||
"""
|
||||
Take a screenshot of the current page for visual inspection.
|
||||
|
||||
This tool captures what's visually displayed in the browser. When the
|
||||
active model supports native vision, the screenshot is attached directly
|
||||
to the conversation so the model can inspect it on the next turn.
|
||||
Otherwise Hermes falls back to the auxiliary vision model. Useful for
|
||||
understanding visual content that the text-based snapshot may not capture
|
||||
(CAPTCHAs, verification challenges, images, complex layouts, etc.).
|
||||
Captures what's visually displayed in the browser. When the active model
|
||||
supports native vision, the screenshot is attached directly to the
|
||||
conversation so the model can inspect it on the next turn; otherwise Hermes
|
||||
falls back to the auxiliary vision model and returns a text analysis. Useful
|
||||
for visual content the text-based snapshot may not capture (CAPTCHAs,
|
||||
verification challenges, images, complex layouts, etc.).
|
||||
|
||||
The screenshot is saved persistently and its file path is returned so it
|
||||
can be shared with users via MEDIA:<path> in the response.
|
||||
|
|
@ -3064,8 +3064,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||
task_id: Task identifier for session isolation
|
||||
|
||||
Returns:
|
||||
Either a JSON string with vision analysis results and screenshot_path,
|
||||
or a multimodal tool-result envelope with the screenshot and metadata.
|
||||
A JSON string with vision analysis results and screenshot_path, or a
|
||||
multimodal tool-result envelope carrying the screenshot and metadata.
|
||||
"""
|
||||
if _is_camofox_mode():
|
||||
from tools.browser_camofox import camofox_vision
|
||||
|
|
@ -3190,55 +3190,33 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
|
|||
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
|
||||
data_url = f"data:image/png;base64,{_screenshot_b64}"
|
||||
|
||||
# Fast path: when the active main model supports native vision AND the
|
||||
# provider supports image content inside tool results, short-circuit
|
||||
# the auxiliary LLM and return the image bytes as a multimodal
|
||||
# tool-result envelope. The user can force native vision with the
|
||||
# supports_vision override. The main model sees the pixels directly on its
|
||||
# next turn — no aux call, no information loss, no extra latency.
|
||||
try:
|
||||
from agent.auxiliary_client import _read_main_model, _read_main_provider
|
||||
from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
|
||||
from hermes_cli.config import load_config
|
||||
from tools.vision_tools import (
|
||||
_build_native_vision_tool_result,
|
||||
_supports_media_in_tool_results,
|
||||
)
|
||||
# Fast path: when native image routing is in effect for the active main
|
||||
# model, attach the screenshot directly instead of describing it through
|
||||
# an auxiliary vision LLM. The model inspects the pixels on its next
|
||||
# turn — no aux call, no information loss. Consistent with vision_analyze.
|
||||
from tools.vision_tools import (
|
||||
_build_native_vision_tool_result,
|
||||
_should_use_native_vision_fast_path,
|
||||
)
|
||||
|
||||
_provider = _read_main_provider()
|
||||
_model = _read_main_model()
|
||||
_cfg = load_config()
|
||||
_mode = decide_image_input_mode(_provider, _model, _cfg)
|
||||
_supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
|
||||
if _mode == "native" and (
|
||||
_supports_media_in_tool_results(_provider, _model)
|
||||
or _supports_vision
|
||||
):
|
||||
native_result = _build_native_vision_tool_result(
|
||||
image_url=str(screenshot_path),
|
||||
question=question,
|
||||
image_data_url=data_url,
|
||||
image_size_bytes=len(_screenshot_bytes),
|
||||
)
|
||||
native_result.setdefault("meta", {})
|
||||
native_result["meta"]["screenshot_path"] = str(screenshot_path)
|
||||
if _lp_fallback_warning:
|
||||
native_result["meta"]["fallback_warning"] = _lp_fallback_warning
|
||||
if annotate and result.get("data", {}).get("annotations"):
|
||||
native_result["meta"]["annotations"] = result["data"]["annotations"]
|
||||
text_parts = native_result.get("content") or []
|
||||
if text_parts and isinstance(text_parts[0], dict) and text_parts[0].get("type") == "text":
|
||||
text_parts[0]["text"] = (
|
||||
str(text_parts[0].get("text", ""))
|
||||
+ f"\n\nScreenshot path: {screenshot_path}"
|
||||
)
|
||||
native_result["text_summary"] = (
|
||||
str(native_result.get("text_summary") or "")
|
||||
+ f" Screenshot path: {screenshot_path}"
|
||||
).strip()
|
||||
return native_result
|
||||
except Exception:
|
||||
pass
|
||||
if _should_use_native_vision_fast_path():
|
||||
native_result = _build_native_vision_tool_result(
|
||||
image_url=str(screenshot_path),
|
||||
question=question,
|
||||
image_data_url=data_url,
|
||||
image_size_bytes=len(_screenshot_bytes),
|
||||
)
|
||||
meta = native_result.setdefault("meta", {})
|
||||
meta["screenshot_path"] = str(screenshot_path)
|
||||
if _lp_fallback_warning:
|
||||
meta["fallback_warning"] = _lp_fallback_warning
|
||||
if annotate and result.get("data", {}).get("annotations"):
|
||||
meta["annotations"] = result["data"]["annotations"]
|
||||
native_result["text_summary"] = (
|
||||
f"{native_result.get('text_summary', '')} "
|
||||
f"Screenshot path: {screenshot_path}"
|
||||
).strip()
|
||||
return native_result
|
||||
|
||||
vision_prompt = (
|
||||
f"You are analyzing a screenshot of a web browser.\n\n"
|
||||
|
|
|
|||
|
|
@ -476,6 +476,36 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _should_use_native_vision_fast_path() -> bool:
|
||||
"""Whether vision tools should attach the image to the main model directly
|
||||
instead of routing through the auxiliary vision LLM.
|
||||
|
||||
True when image routing resolves to ``native`` AND either the provider is
|
||||
known to accept images inside tool results, or the user explicitly declared
|
||||
the model vision-capable via the ``model.supports_vision`` config override.
|
||||
The override is the escape hatch for custom/local providers that aren't in
|
||||
the static allowlist. Best-effort: any resolution failure returns False so
|
||||
the caller falls back to the legacy aux-LLM path.
|
||||
"""
|
||||
try:
|
||||
from agent.auxiliary_client import _read_main_provider, _read_main_model
|
||||
from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
provider = _read_main_provider()
|
||||
model = _read_main_model()
|
||||
cfg = load_config()
|
||||
if decide_image_input_mode(provider, model, cfg) != "native":
|
||||
return False
|
||||
return (
|
||||
_supports_media_in_tool_results(provider, model)
|
||||
or _lookup_supports_vision(provider, model, cfg) is True
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("Native vision fast-path check failed: %s", exc)
|
||||
return False
|
||||
|
||||
|
||||
def _build_native_vision_tool_result(
|
||||
image_url: str,
|
||||
question: str,
|
||||
|
|
@ -1030,33 +1060,15 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
|
|||
image_url = args.get("image_url", "")
|
||||
question = args.get("question", "")
|
||||
|
||||
# Fast path: when the active main model supports native vision AND the
|
||||
# provider supports image content inside tool results, short-circuit
|
||||
# the auxiliary LLM and return the image bytes as a multimodal
|
||||
# tool-result envelope. The user can force native vision with the
|
||||
# supports_vision override. The main model sees the pixels directly on its
|
||||
# next turn — no aux call, no information loss, no extra latency.
|
||||
try:
|
||||
from agent.auxiliary_client import _read_main_provider, _read_main_model
|
||||
from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
_provider = _read_main_provider()
|
||||
_model = _read_main_model()
|
||||
_cfg = load_config()
|
||||
_mode = decide_image_input_mode(_provider, _model, _cfg)
|
||||
_supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
|
||||
if _mode == "native" and (
|
||||
_supports_media_in_tool_results(_provider, _model)
|
||||
or _supports_vision
|
||||
):
|
||||
logger.info(
|
||||
"vision_analyze: native fast path (provider=%s, model=%s)",
|
||||
_provider, _model,
|
||||
)
|
||||
return _vision_analyze_native(image_url, question)
|
||||
except Exception as exc:
|
||||
logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc)
|
||||
# Fast path: when native image routing is in effect for the active main
|
||||
# model (provider accepts images in tool results, or the user set the
|
||||
# model.supports_vision override), short-circuit the auxiliary LLM and
|
||||
# return the image bytes as a multimodal tool-result envelope. The main
|
||||
# model sees the pixels directly on its next turn — no aux call, no
|
||||
# information loss, no extra latency.
|
||||
if _should_use_native_vision_fast_path():
|
||||
logger.info("vision_analyze: native fast path")
|
||||
return _vision_analyze_native(image_url, question)
|
||||
|
||||
# Legacy path: aux LLM describes the image and we return its text.
|
||||
full_prompt = (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue