mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-05 02:31:47 +00:00
feat: add video_analyze tool for native video understanding (#19301)
* feat: add video_analyze tool for native video understanding Adds a video_analyze tool that sends video files to multimodal LLMs (e.g. Gemini) for analysis via the OpenRouter-compatible video_url content type. Mirrors vision_analyze in structure, error handling, and registration pattern. Key design: - Base64 encodes entire video (no frame extraction, no ffmpeg dep) - Uses 'video_url' content block type (OpenRouter standard) - Supports mp4, webm, mov, avi, mkv, mpeg formats - 50 MB hard cap, 20 MB warning threshold - 180s minimum timeout (videos take longer than images) - AUXILIARY_VIDEO_MODEL env override, falls back to AUXILIARY_VISION_MODEL - Same SSRF protection, retry logic, and cleanup as vision_analyze Default disabled: registered in 'video' toolset (not in _HERMES_CORE_TOOLS). Users opt in via: hermes tools enable video, or enabled_toolsets=['video']. * feat(video): add models.dev capability pre-check + CONFIGURABLE_TOOLSETS entry - Pre-checks model video capability via models.dev modalities.input before expensive base64 encoding. Fails early with helpful message suggesting video-capable alternatives (gemini, mimo-v2.5-pro). - Passes optimistically if model unknown or lookup fails. - Adds ModelInfo.supports_video_input() helper. - Adds 'video' to CONFIGURABLE_TOOLSETS and _DEFAULT_OFF_TOOLSETS so 'hermes tools enable video' works from CLI. - 8 new tests for the capability check (37 total). * refactor(video): remove models.dev capability pre-check Removes _check_video_model_capability and ModelInfo.supports_video_input. The vision_analyze tool doesn't pre-check image capability either — both tools rely on the same pattern: send request, handle API errors gracefully with categorized user-facing messages. The pre-check was inconsistent (only worked for some providers/models) so drop it for parity. * cleanup: compress comments, fix fragile timeout coupling - Replace _VISION_DOWNLOAD_TIMEOUT * 2 with hardcoded 60s (no silent breakage if vision timeout changes independently) - Strip verbose comments and redundant log lines throughout - No behavioral changes
This commit is contained in:
parent
0dd8e3f8d8
commit
c9a3f36f56
4 changed files with 706 additions and 1 deletions
|
|
@ -801,3 +801,364 @@ registry.register(
|
|||
is_async=True,
|
||||
emoji="👁️",
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Video Analysis Tool
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Extension → MIME. avi/mkv fall back to mp4.
|
||||
_VIDEO_MIME_TYPES = {
|
||||
".mp4": "video/mp4",
|
||||
".webm": "video/webm",
|
||||
".mov": "video/mov",
|
||||
".avi": "video/mp4",
|
||||
".mkv": "video/mp4",
|
||||
".mpeg": "video/mpeg",
|
||||
".mpg": "video/mpeg",
|
||||
}
|
||||
|
||||
_MAX_VIDEO_BASE64_BYTES = 50 * 1024 * 1024 # 50 MB hard cap
|
||||
_VIDEO_SIZE_WARN_BYTES = 20 * 1024 * 1024
|
||||
|
||||
|
||||
def _detect_video_mime_type(video_path: Path) -> Optional[str]:
|
||||
"""Return a video MIME type based on file extension, or None if unsupported."""
|
||||
ext = video_path.suffix.lower()
|
||||
return _VIDEO_MIME_TYPES.get(ext)
|
||||
|
||||
|
||||
def _video_to_base64_data_url(video_path: Path, mime_type: Optional[str] = None) -> str:
|
||||
"""Convert a video file to a base64-encoded data URL."""
|
||||
data = video_path.read_bytes()
|
||||
encoded = base64.b64encode(data).decode("ascii")
|
||||
mime = mime_type or _VIDEO_MIME_TYPES.get(video_path.suffix.lower(), "video/mp4")
|
||||
return f"data:{mime};base64,{encoded}"
|
||||
|
||||
|
||||
async def _download_video(video_url: str, destination: Path, max_retries: int = 3) -> Path:
|
||||
"""Download video from URL with SSRF protection and retry."""
|
||||
import asyncio
|
||||
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
async def _ssrf_redirect_guard(response):
|
||||
if response.is_redirect and response.next_request:
|
||||
redirect_url = str(response.next_request.url)
|
||||
from tools.url_safety import is_safe_url
|
||||
if not is_safe_url(redirect_url):
|
||||
raise ValueError(
|
||||
f"Blocked redirect to private/internal address: {redirect_url}"
|
||||
)
|
||||
|
||||
last_error = None
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
blocked = check_website_access(video_url)
|
||||
if blocked:
|
||||
raise PermissionError(blocked["message"])
|
||||
|
||||
async with httpx.AsyncClient(
|
||||
timeout=60.0,
|
||||
follow_redirects=True,
|
||||
event_hooks={"response": [_ssrf_redirect_guard]},
|
||||
) as client:
|
||||
response = await client.get(
|
||||
video_url,
|
||||
headers={
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Accept": "video/*,*/*;q=0.8",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
cl = response.headers.get("content-length")
|
||||
if cl and int(cl) > _MAX_VIDEO_BASE64_BYTES:
|
||||
raise ValueError(
|
||||
f"Video too large ({int(cl)} bytes, max {_MAX_VIDEO_BASE64_BYTES})"
|
||||
)
|
||||
|
||||
final_url = str(response.url)
|
||||
blocked = check_website_access(final_url)
|
||||
if blocked:
|
||||
raise PermissionError(blocked["message"])
|
||||
|
||||
body = response.content
|
||||
if len(body) > _MAX_VIDEO_BASE64_BYTES:
|
||||
raise ValueError(
|
||||
f"Video too large ({len(body)} bytes, max {_MAX_VIDEO_BASE64_BYTES})"
|
||||
)
|
||||
destination.write_bytes(body)
|
||||
|
||||
return destination
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** (attempt + 1)
|
||||
logger.warning("Video download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
|
||||
await asyncio.sleep(wait_time)
|
||||
else:
|
||||
logger.error(
|
||||
"Video download failed after %s attempts: %s",
|
||||
max_retries, str(e)[:100], exc_info=True,
|
||||
)
|
||||
|
||||
if last_error is None:
|
||||
raise RuntimeError(
|
||||
f"_download_video exited retry loop without attempting (max_retries={max_retries})"
|
||||
)
|
||||
raise last_error
|
||||
|
||||
|
||||
async def video_analyze_tool(
|
||||
video_url: str,
|
||||
user_prompt: str,
|
||||
model: str = None,
|
||||
) -> str:
|
||||
"""Analyze a video via multimodal LLM. Returns JSON {success, analysis}."""
|
||||
debug_call_data = {
|
||||
"parameters": {
|
||||
"video_url": video_url,
|
||||
"user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
|
||||
"model": model,
|
||||
},
|
||||
"error": None,
|
||||
"success": False,
|
||||
"analysis_length": 0,
|
||||
"model_used": model,
|
||||
"video_size_bytes": 0,
|
||||
}
|
||||
|
||||
temp_video_path = None
|
||||
should_cleanup = True
|
||||
|
||||
try:
|
||||
from tools.interrupt import is_interrupted
|
||||
if is_interrupted():
|
||||
return tool_error("Interrupted", success=False)
|
||||
|
||||
logger.info("Analyzing video: %s", video_url[:60])
|
||||
logger.info("User prompt: %s", user_prompt[:100])
|
||||
|
||||
# Resolve local path vs remote URL
|
||||
resolved_url = video_url
|
||||
if resolved_url.startswith("file://"):
|
||||
resolved_url = resolved_url[len("file://"):]
|
||||
local_path = Path(os.path.expanduser(resolved_url))
|
||||
|
||||
if local_path.is_file():
|
||||
logger.info("Using local video file: %s", video_url)
|
||||
temp_video_path = local_path
|
||||
should_cleanup = False
|
||||
elif _validate_image_url(video_url):
|
||||
blocked = check_website_access(video_url)
|
||||
if blocked:
|
||||
raise PermissionError(blocked["message"])
|
||||
temp_dir = get_hermes_dir("cache/video", "temp_video_files")
|
||||
temp_video_path = temp_dir / f"temp_video_{uuid.uuid4()}.mp4"
|
||||
await _download_video(video_url, temp_video_path)
|
||||
should_cleanup = True
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid video source. Provide an HTTP/HTTPS URL or a valid local file path."
|
||||
)
|
||||
|
||||
video_size_bytes = temp_video_path.stat().st_size
|
||||
video_size_mb = video_size_bytes / (1024 * 1024)
|
||||
logger.info("Video ready (%.1f MB)", video_size_mb)
|
||||
|
||||
detected_mime = _detect_video_mime_type(temp_video_path)
|
||||
if not detected_mime:
|
||||
raise ValueError(
|
||||
f"Unsupported video format: '{temp_video_path.suffix}'. "
|
||||
f"Supported: {', '.join(sorted(_VIDEO_MIME_TYPES.keys()))}"
|
||||
)
|
||||
|
||||
if video_size_bytes > _VIDEO_SIZE_WARN_BYTES:
|
||||
logger.warning("Video is %.1f MB — may be slow or rejected", video_size_mb)
|
||||
|
||||
video_data_url = _video_to_base64_data_url(temp_video_path, mime_type=detected_mime)
|
||||
data_size_mb = len(video_data_url) / (1024 * 1024)
|
||||
|
||||
if len(video_data_url) > _MAX_VIDEO_BASE64_BYTES:
|
||||
raise ValueError(
|
||||
f"Video too large for API: base64 payload is {data_size_mb:.1f} MB "
|
||||
f"(limit {_MAX_VIDEO_BASE64_BYTES / (1024 * 1024):.0f} MB). "
|
||||
f"Compress or trim the video and retry."
|
||||
)
|
||||
|
||||
debug_call_data["video_size_bytes"] = video_size_bytes
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": user_prompt,
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": video_data_url,
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
vision_timeout = 180.0
|
||||
vision_temperature = 0.1
|
||||
try:
|
||||
from hermes_cli.config import cfg_get, load_config
|
||||
_cfg = load_config()
|
||||
_vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
|
||||
_vt = _vision_cfg.get("timeout")
|
||||
if _vt is not None:
|
||||
vision_timeout = max(float(_vt), 180.0)
|
||||
_vtemp = _vision_cfg.get("temperature")
|
||||
if _vtemp is not None:
|
||||
vision_temperature = float(_vtemp)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
call_kwargs = {
|
||||
"task": "vision",
|
||||
"messages": messages,
|
||||
"temperature": vision_temperature,
|
||||
"max_tokens": 4000,
|
||||
"timeout": vision_timeout,
|
||||
}
|
||||
if model:
|
||||
call_kwargs["model"] = model
|
||||
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
analysis = extract_content_or_reasoning(response)
|
||||
|
||||
if not analysis:
|
||||
logger.warning("Empty video response, retrying once")
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
analysis = extract_content_or_reasoning(response)
|
||||
|
||||
analysis_length = len(analysis) if analysis else 0
|
||||
logger.info("Video analysis completed (%s characters)", analysis_length)
|
||||
|
||||
result = {
|
||||
"success": True,
|
||||
"analysis": analysis or "There was a problem with the request and the video could not be analyzed.",
|
||||
}
|
||||
|
||||
debug_call_data["success"] = True
|
||||
debug_call_data["analysis_length"] = analysis_length
|
||||
_debug.log_call("video_analyze_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
return json.dumps(result, indent=2, ensure_ascii=False)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error analyzing video: {str(e)}"
|
||||
logger.error("%s", error_msg, exc_info=True)
|
||||
|
||||
err_str = str(e).lower()
|
||||
if any(hint in err_str for hint in (
|
||||
"402", "insufficient", "payment required", "credits", "billing",
|
||||
)):
|
||||
analysis = (
|
||||
"Insufficient credits or payment required. Please top up your "
|
||||
f"API provider account and try again. Error: {e}"
|
||||
)
|
||||
elif any(hint in err_str for hint in (
|
||||
"does not support", "not support video",
|
||||
"content_policy", "multimodal",
|
||||
"unrecognized request argument", "video input",
|
||||
"video_url",
|
||||
)):
|
||||
analysis = (
|
||||
f"The model does not support video analysis or the request was "
|
||||
f"rejected. Ensure you're using a video-capable model "
|
||||
f"(e.g. google/gemini-2.5-flash). Error: {e}"
|
||||
)
|
||||
elif any(hint in err_str for hint in (
|
||||
"too large", "payload", "413", "content_too_large",
|
||||
"request_too_large", "exceeds", "size limit",
|
||||
)):
|
||||
analysis = (
|
||||
"The video is too large for the API. Try compressing or trimming "
|
||||
f"the video (max ~50 MB). Error: {e}"
|
||||
)
|
||||
else:
|
||||
analysis = (
|
||||
"There was a problem with the request and the video could not "
|
||||
f"be analyzed. Error: {e}"
|
||||
)
|
||||
|
||||
result = {
|
||||
"success": False,
|
||||
"error": error_msg,
|
||||
"analysis": analysis,
|
||||
}
|
||||
|
||||
debug_call_data["error"] = error_msg
|
||||
_debug.log_call("video_analyze_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
return json.dumps(result, indent=2, ensure_ascii=False)
|
||||
|
||||
finally:
|
||||
if should_cleanup and temp_video_path and temp_video_path.exists():
|
||||
try:
|
||||
temp_video_path.unlink()
|
||||
logger.debug("Cleaned up temporary video file")
|
||||
except Exception as cleanup_error:
|
||||
logger.warning(
|
||||
"Could not delete temporary file: %s", cleanup_error, exc_info=True
|
||||
)
|
||||
|
||||
|
||||
VIDEO_ANALYZE_SCHEMA = {
|
||||
"name": "video_analyze",
|
||||
"description": (
|
||||
"Analyze a video from a URL or local file path using a multimodal AI model. "
|
||||
"Sends the video to a video-capable model (e.g. Gemini) for understanding. "
|
||||
"Use this for video files — for images, use vision_analyze instead. "
|
||||
"Supports mp4, webm, mov, avi, mkv, mpeg formats. "
|
||||
"Note: large videos (>20 MB) may be slow; max ~50 MB."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"video_url": {
|
||||
"type": "string",
|
||||
"description": "Video URL (http/https) or local file path to analyze.",
|
||||
},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "Your specific question about the video. The AI will describe what happens in the video and answer your question.",
|
||||
},
|
||||
},
|
||||
"required": ["video_url", "question"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
|
||||
video_url = args.get("video_url", "")
|
||||
question = args.get("question", "")
|
||||
full_prompt = (
|
||||
"Fully describe and explain everything happening in this video, "
|
||||
"including visual content, motion, audio cues, text overlays, and scene "
|
||||
f"transitions. Then answer the following question:\n\n{question}"
|
||||
)
|
||||
model = os.getenv("AUXILIARY_VIDEO_MODEL", "").strip() or os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
|
||||
return video_analyze_tool(video_url, full_prompt, model)
|
||||
|
||||
|
||||
registry.register(
|
||||
name="video_analyze",
|
||||
toolset="video",
|
||||
schema=VIDEO_ANALYZE_SCHEMA,
|
||||
handler=_handle_video_analyze,
|
||||
check_fn=check_vision_requirements,
|
||||
is_async=True,
|
||||
emoji="🎬",
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue