mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(vision): cap vision_analyze fan-out concurrency process-wide
A single agent turn can fan out N vision_analyze calls at once — the classic trigger is "analyze every frame of this video", where ffmpeg explodes a clip into dozens of frames and the model calls vision_analyze on each. Every call does a CPU-heavy base64-encode/resize burst AND holds a long-lived LLM stream open. The tool executor runs concurrent tool calls on a per-session ThreadPoolExecutor (_MAX_TOOL_WORKERS=8), and multiple agent sessions share one process (the dashboard runs the agent in-process), so there was no global ceiling. In prod (June 2026) a video-frame fan-out pinned a worker thread at ~100% CPU and starved the shared asyncio event loop that also serves the dashboard's /api/status liveness probe, flapping the instance to UNHEALTHY even though nothing had crashed. Add a process-global threading.BoundedSemaphore that bounds how many vision analyses run concurrently across the whole process, held across the entire analysis (image load + encode + LLM call) in the single _handle_vision_analyze chokepoint (covers both the native fast path and the legacy aux-LLM path). It is a threading semaphore, NOT asyncio: each vision call is dispatched through model_tools._run_async on a per-thread event loop, so an asyncio primitive bound to one loop cannot coordinate across them. The acquire is offloaded via run_in_executor so waiting for a slot never blocks the calling loop. Default: min(host CPUs, 4), floored at 1 — respect the host's concurrency, or lower. Override via auxiliary.vision.max_concurrency (config.yaml) or HERMES_VISION_MAX_CONCURRENCY (env). Values < 1 are ignored so the cap can never be disabled into an unbounded fan-out. Tests: bounded-fan-out regression guard + a control proving it would fail without the cap; resolver tests for host-cpu default, ceiling clamp, low-cpu host, env override, and sub-1 rejection. Pre-existing handler tests updated for the now-async _handle_vision_analyze. Verified via the real registry.dispatch -> _run_async per-thread-loop path (16 concurrent calls, peak bounded to cap).
This commit is contained in:
parent
115e78c377
commit
eddfecd2ce
5 changed files with 346 additions and 37 deletions
|
|
@ -7,7 +7,9 @@ Also tests the vision_tools and browser_tool model override env vars.
|
|||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
from unittest.mock import patch, MagicMock, AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
|
@ -238,22 +240,30 @@ class TestGatewayBridgeCodeParity:
|
|||
class TestVisionModelOverride:
|
||||
"""Test that AUXILIARY_VISION_MODEL env var overrides the default model in the handler."""
|
||||
|
||||
def test_env_var_overrides_default(self, monkeypatch):
|
||||
@pytest.mark.asyncio
|
||||
async def test_env_var_overrides_default(self, monkeypatch):
|
||||
monkeypatch.setenv("AUXILIARY_VISION_MODEL", "openai/gpt-4o")
|
||||
from tools.vision_tools import _handle_vision_analyze
|
||||
with patch("tools.vision_tools.vision_analyze_tool", new_callable=MagicMock) as mock_tool:
|
||||
with (
|
||||
patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool,
|
||||
patch("tools.vision_tools._should_use_native_vision_fast_path", return_value=False),
|
||||
):
|
||||
mock_tool.return_value = '{"success": true}'
|
||||
_handle_vision_analyze({"image_url": "http://test.jpg", "question": "test"})
|
||||
await _handle_vision_analyze({"image_url": "http://test.jpg", "question": "test"})
|
||||
call_args = mock_tool.call_args
|
||||
# 3rd positional arg = model
|
||||
assert call_args[0][2] == "openai/gpt-4o"
|
||||
|
||||
def test_default_model_when_no_override(self, monkeypatch):
|
||||
@pytest.mark.asyncio
|
||||
async def test_default_model_when_no_override(self, monkeypatch):
|
||||
monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False)
|
||||
from tools.vision_tools import _handle_vision_analyze
|
||||
with patch("tools.vision_tools.vision_analyze_tool", new_callable=MagicMock) as mock_tool:
|
||||
with (
|
||||
patch("tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock) as mock_tool,
|
||||
patch("tools.vision_tools._should_use_native_vision_fast_path", return_value=False),
|
||||
):
|
||||
mock_tool.return_value = '{"success": true}'
|
||||
_handle_vision_analyze({"image_url": "http://test.jpg", "question": "test"})
|
||||
await _handle_vision_analyze({"image_url": "http://test.jpg", "question": "test"})
|
||||
call_args = mock_tool.call_args
|
||||
# With no AUXILIARY_VISION_MODEL env var, model should be None
|
||||
# (the centralized call_llm router picks the provider default)
|
||||
|
|
|
|||
|
|
@ -191,57 +191,70 @@ class TestHandleVisionAnalyze:
|
|||
# Clean up the coroutine to avoid RuntimeWarning
|
||||
result.close()
|
||||
|
||||
def test_prompt_contains_question(self):
|
||||
@pytest.mark.asyncio
|
||||
async def test_prompt_contains_question(self):
|
||||
"""The full prompt should incorporate the user's question."""
|
||||
with patch(
|
||||
"tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
|
||||
) as mock_tool:
|
||||
with (
|
||||
patch(
|
||||
"tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
|
||||
) as mock_tool,
|
||||
patch(
|
||||
"tools.vision_tools._should_use_native_vision_fast_path",
|
||||
return_value=False,
|
||||
),
|
||||
):
|
||||
mock_tool.return_value = json.dumps({"result": "ok"})
|
||||
coro = _handle_vision_analyze(
|
||||
await _handle_vision_analyze(
|
||||
{
|
||||
"image_url": "https://example.com/img.png",
|
||||
"question": "Describe the cat",
|
||||
}
|
||||
)
|
||||
# Clean up coroutine
|
||||
coro.close()
|
||||
call_args = mock_tool.call_args
|
||||
full_prompt = call_args[0][1] # second positional arg
|
||||
assert "Describe the cat" in full_prompt
|
||||
assert "Fully describe and explain" in full_prompt
|
||||
|
||||
def test_uses_auxiliary_vision_model_env(self):
|
||||
@pytest.mark.asyncio
|
||||
async def test_uses_auxiliary_vision_model_env(self):
|
||||
"""AUXILIARY_VISION_MODEL env var should override DEFAULT_VISION_MODEL."""
|
||||
with (
|
||||
patch(
|
||||
"tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
|
||||
) as mock_tool,
|
||||
patch(
|
||||
"tools.vision_tools._should_use_native_vision_fast_path",
|
||||
return_value=False,
|
||||
),
|
||||
patch.dict(os.environ, {"AUXILIARY_VISION_MODEL": "custom/model-v1"}),
|
||||
):
|
||||
mock_tool.return_value = json.dumps({"result": "ok"})
|
||||
coro = _handle_vision_analyze(
|
||||
await _handle_vision_analyze(
|
||||
{"image_url": "https://example.com/img.png", "question": "test"}
|
||||
)
|
||||
coro.close()
|
||||
call_args = mock_tool.call_args
|
||||
model = call_args[0][2] # third positional arg
|
||||
assert model == "custom/model-v1"
|
||||
|
||||
def test_falls_back_to_default_model(self):
|
||||
@pytest.mark.asyncio
|
||||
async def test_falls_back_to_default_model(self):
|
||||
"""Without AUXILIARY_VISION_MODEL, model should be None (let call_llm resolve default)."""
|
||||
with (
|
||||
patch(
|
||||
"tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
|
||||
) as mock_tool,
|
||||
patch(
|
||||
"tools.vision_tools._should_use_native_vision_fast_path",
|
||||
return_value=False,
|
||||
),
|
||||
patch.dict(os.environ, {}, clear=False),
|
||||
):
|
||||
# Ensure AUXILIARY_VISION_MODEL is not set
|
||||
os.environ.pop("AUXILIARY_VISION_MODEL", None)
|
||||
mock_tool.return_value = json.dumps({"result": "ok"})
|
||||
coro = _handle_vision_analyze(
|
||||
await _handle_vision_analyze(
|
||||
{"image_url": "https://example.com/img.png", "question": "test"}
|
||||
)
|
||||
coro.close()
|
||||
call_args = mock_tool.call_args
|
||||
model = call_args[0][2]
|
||||
# With no AUXILIARY_VISION_MODEL set, model should be None
|
||||
|
|
@ -1069,3 +1082,169 @@ class TestDownloadRetryClassification:
|
|||
# All three attempts used, two backoff sleeps between them.
|
||||
assert mock_client.get.await_count == 3
|
||||
assert mock_sleep.await_count == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fan-out concurrency cap — a single turn (or several concurrent sessions in
|
||||
# one process) can launch dozens of vision_analyze calls at once. The
|
||||
# process-global semaphore must bound how many run simultaneously so a video-
|
||||
# frame storm can't pin a worker thread and starve the dashboard event loop.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestVisionFanoutConcurrencyCap:
|
||||
"""The process-global semaphore bounds concurrent vision analyses."""
|
||||
|
||||
def test_resolver_defaults_to_min_cpus_and_ceiling(self):
|
||||
from tools import vision_tools as vt
|
||||
|
||||
with (
|
||||
patch.dict(os.environ, {}, clear=False),
|
||||
patch("tools.vision_tools._detect_host_cpus", return_value=64),
|
||||
):
|
||||
os.environ.pop("HERMES_VISION_MAX_CONCURRENCY", None)
|
||||
# No config override available in the test env → falls to default,
|
||||
# which is clamped to the ceiling even on a 64-core host.
|
||||
with patch("hermes_cli.config.load_config", side_effect=Exception):
|
||||
assert vt._resolve_vision_max_concurrency() == vt._VISION_DEFAULT_CONCURRENCY_CEILING
|
||||
|
||||
def test_resolver_respects_low_host_cpu_count(self):
|
||||
from tools import vision_tools as vt
|
||||
|
||||
with (
|
||||
patch.dict(os.environ, {}, clear=False),
|
||||
patch("tools.vision_tools._detect_host_cpus", return_value=2),
|
||||
patch("hermes_cli.config.load_config", side_effect=Exception),
|
||||
):
|
||||
os.environ.pop("HERMES_VISION_MAX_CONCURRENCY", None)
|
||||
# 2-core host → cap is 2 (host limit, below the ceiling of 4).
|
||||
assert vt._resolve_vision_max_concurrency() == 2
|
||||
|
||||
def test_resolver_env_override(self):
|
||||
from tools import vision_tools as vt
|
||||
|
||||
with patch.dict(os.environ, {"HERMES_VISION_MAX_CONCURRENCY": "1"}):
|
||||
assert vt._resolve_vision_max_concurrency() == 1
|
||||
|
||||
def test_resolver_rejects_sub_one_override(self):
|
||||
from tools import vision_tools as vt
|
||||
|
||||
with (
|
||||
patch.dict(os.environ, {"HERMES_VISION_MAX_CONCURRENCY": "0"}),
|
||||
patch("tools.vision_tools._detect_host_cpus", return_value=2),
|
||||
patch("hermes_cli.config.load_config", side_effect=Exception),
|
||||
):
|
||||
# 0 is ignored (cap can never be disabled) → falls back to default.
|
||||
assert vt._resolve_vision_max_concurrency() == 2
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_fanout_is_bounded_by_semaphore(self):
|
||||
"""Firing many concurrent vision calls must never exceed the cap in flight.
|
||||
|
||||
This is the regression guard for the prod incident: an unbounded
|
||||
fan-out pinned the event loop. With the cap, peak concurrency is
|
||||
clamped to the semaphore value regardless of how many calls launch.
|
||||
"""
|
||||
import asyncio
|
||||
import importlib
|
||||
import threading
|
||||
# Resolve the module fresh and drive BOTH the handler and the patch
|
||||
# targets through that SAME module object. Sibling suites
|
||||
# (test_vision_routing_31179) delete tools.vision_tools from
|
||||
# sys.modules, so the top-level ``_handle_vision_analyze`` import can
|
||||
# be bound to a stale module while ``patch`` hits the current one —
|
||||
# patching the wrong object lets the real function run (peak stays 0).
|
||||
vt = importlib.import_module("tools.vision_tools")
|
||||
|
||||
CAP = 3
|
||||
in_flight = 0
|
||||
peak = 0
|
||||
lock = asyncio.Lock()
|
||||
|
||||
async def fake_native(image_url, question):
|
||||
nonlocal in_flight, peak
|
||||
async with lock:
|
||||
in_flight += 1
|
||||
peak = max(peak, in_flight)
|
||||
try:
|
||||
# Hold the slot long enough that, without a cap, all callers
|
||||
# would overlap and drive peak up to N.
|
||||
await asyncio.sleep(0.05)
|
||||
finally:
|
||||
async with lock:
|
||||
in_flight -= 1
|
||||
return json.dumps({"ok": True})
|
||||
|
||||
N = 12
|
||||
# Install a fresh semaphore at the test cap so the assertion is
|
||||
# deterministic regardless of the host's core count.
|
||||
with (
|
||||
patch.object(vt, "_vision_concurrency_semaphore",
|
||||
threading.BoundedSemaphore(CAP)),
|
||||
patch.object(vt, "_should_use_native_vision_fast_path",
|
||||
return_value=True),
|
||||
patch.object(vt, "_vision_analyze_native", side_effect=fake_native),
|
||||
):
|
||||
await asyncio.gather(*[
|
||||
vt._handle_vision_analyze(
|
||||
{"image_url": f"https://example.com/frame_{i}.png",
|
||||
"question": "what is this"}
|
||||
)
|
||||
for i in range(N)
|
||||
])
|
||||
|
||||
assert peak <= CAP, f"peak concurrency {peak} exceeded cap {CAP}"
|
||||
# Sanity: with N > CAP and a real wait, we should have actually
|
||||
# saturated the cap (otherwise the test proves nothing).
|
||||
assert peak == CAP, f"expected to saturate cap {CAP}, only reached {peak}"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unbounded_fanout_would_exceed_cap_without_semaphore(self):
|
||||
"""Control: with a no-op (effectively unbounded) semaphore, peak blows past CAP.
|
||||
|
||||
Proves the guard above would fail if the semaphore weren't enforcing
|
||||
the limit — i.e. the test is actually exercising the cap.
|
||||
"""
|
||||
import asyncio
|
||||
import importlib
|
||||
import threading
|
||||
vt = importlib.import_module("tools.vision_tools")
|
||||
|
||||
CAP = 3
|
||||
in_flight = 0
|
||||
peak = 0
|
||||
lock = asyncio.Lock()
|
||||
|
||||
async def fake_native(image_url, question):
|
||||
nonlocal in_flight, peak
|
||||
async with lock:
|
||||
in_flight += 1
|
||||
peak = max(peak, in_flight)
|
||||
try:
|
||||
await asyncio.sleep(0.05)
|
||||
finally:
|
||||
async with lock:
|
||||
in_flight -= 1
|
||||
return json.dumps({"ok": True})
|
||||
|
||||
N = 12
|
||||
# A semaphore sized to N imposes no real limit for this workload.
|
||||
with (
|
||||
patch.object(vt, "_vision_concurrency_semaphore",
|
||||
threading.BoundedSemaphore(N)),
|
||||
patch.object(vt, "_should_use_native_vision_fast_path",
|
||||
return_value=True),
|
||||
patch.object(vt, "_vision_analyze_native", side_effect=fake_native),
|
||||
):
|
||||
await asyncio.gather(*[
|
||||
vt._handle_vision_analyze(
|
||||
{"image_url": f"https://example.com/frame_{i}.png",
|
||||
"question": "what is this"}
|
||||
)
|
||||
for i in range(N)
|
||||
])
|
||||
|
||||
assert peak > CAP, (
|
||||
"control failed: peak did not exceed CAP even without a real cap "
|
||||
f"(peak={peak})"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue