feat: add video_analyze tool for native video understanding (#19301)

* feat: add video_analyze tool for native video understanding Adds a video_analyze tool that sends video files to multimodal LLMs (e.g. Gemini) for analysis via the OpenRouter-compatible video_url content type. Mirrors vision_analyze in structure, error handling, and registration pattern. Key design: - Base64 encodes entire video (no frame extraction, no ffmpeg dep) - Uses 'video_url' content block type (OpenRouter standard) - Supports mp4, webm, mov, avi, mkv, mpeg formats - 50 MB hard cap, 20 MB warning threshold - 180s minimum timeout (videos take longer than images) - AUXILIARY_VIDEO_MODEL env override, falls back to AUXILIARY_VISION_MODEL - Same SSRF protection, retry logic, and cleanup as vision_analyze Default disabled: registered in 'video' toolset (not in _HERMES_CORE_TOOLS). Users opt in via: hermes tools enable video, or enabled_toolsets=['video']. * feat(video): add models.dev capability pre-check + CONFIGURABLE_TOOLSETS entry - Pre-checks model video capability via models.dev modalities.input before expensive base64 encoding. Fails early with helpful message suggesting video-capable alternatives (gemini, mimo-v2.5-pro). - Passes optimistically if model unknown or lookup fails. - Adds ModelInfo.supports_video_input() helper. - Adds 'video' to CONFIGURABLE_TOOLSETS and _DEFAULT_OFF_TOOLSETS so 'hermes tools enable video' works from CLI. - 8 new tests for the capability check (37 total). * refactor(video): remove models.dev capability pre-check Removes _check_video_model_capability and ModelInfo.supports_video_input. The vision_analyze tool doesn't pre-check image capability either — both tools rely on the same pattern: send request, handle API errors gracefully with categorized user-facing messages. The pre-check was inconsistent (only worked for some providers/models) so drop it for parity. 
* cleanup: compress comments, fix fragile timeout coupling - Replace _VISION_DOWNLOAD_TIMEOUT * 2 with hardcoded 60s (no silent breakage if vision timeout changes independently) - Strip verbose comments and redundant log lines throughout - No behavioral changes
2026-05-05 02:31:47 +00:00 · 2026-05-04 00:04:36 +05:30 · 2026-05-04 00:04:36 +05:30 · c9a3f36f56
commit c9a3f36f56
parent 0dd8e3f8d8
4 changed files with 706 additions and 1 deletions
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@ -801,3 +801,364 @@ registry.register(
    is_async=True,
    emoji="👁️",
 )
+
+
+# ---------------------------------------------------------------------------
+# Video Analysis Tool
+# ---------------------------------------------------------------------------
+
+# Extension → MIME. avi/mkv fall back to mp4.
+# Keys are lowercase file suffixes including the leading dot; values are the
+# MIME type embedded in the base64 data URL sent to the model. A suffix that is
+# missing from this table is rejected as "Unsupported video format" by
+# video_analyze_tool.
+# NOTE(review): ".mov" maps to "video/mov" rather than the IANA-registered
+# "video/quicktime" — presumably chosen to match what the target provider
+# accepts; confirm before changing.
+_VIDEO_MIME_TYPES = {
+    ".mp4": "video/mp4",
+    ".webm": "video/webm",
+    ".mov": "video/mov",
+    ".avi": "video/mp4",
+    ".mkv": "video/mp4",
+    ".mpeg": "video/mpeg",
+    ".mpg": "video/mpeg",
+}
+
+# Used both as the raw download size limit in _download_video and as the limit
+# on the length of the base64 data URL in video_analyze_tool.
+_MAX_VIDEO_BASE64_BYTES = 50 * 1024 * 1024  # 50 MB hard cap
+# Files larger than this only trigger a log warning; they are still processed.
+_VIDEO_SIZE_WARN_BYTES = 20 * 1024 * 1024
+
+
+def _detect_video_mime_type(video_path: Path) -> Optional[str]:
+    """Look up the video MIME type for *video_path* from its file extension.
+
+    The suffix is lowercased before the lookup in ``_VIDEO_MIME_TYPES``.
+    Returns ``None`` when the extension is not a supported video format.
+    """
+    suffix = video_path.suffix.lower()
+    if suffix in _VIDEO_MIME_TYPES:
+        return _VIDEO_MIME_TYPES[suffix]
+    return None
+
+
+def _video_to_base64_data_url(video_path: Path, mime_type: Optional[str] = None) -> str:
+    """Read *video_path* and return its contents as a ``data:`` URL.
+
+    The whole file is loaded into memory and base64-encoded. When *mime_type*
+    is empty or ``None`` the MIME type is looked up from the file extension,
+    defaulting to ``video/mp4`` for unknown extensions.
+    """
+    raw_bytes = video_path.read_bytes()
+    payload = base64.b64encode(raw_bytes).decode("ascii")
+    if mime_type:
+        resolved_mime = mime_type
+    else:
+        resolved_mime = _VIDEO_MIME_TYPES.get(video_path.suffix.lower(), "video/mp4")
+    return "data:" + resolved_mime + ";base64," + payload
+
+
+async def _download_video(video_url: str, destination: Path, max_retries: int = 3) -> Path:
+    """Download a video to *destination* with SSRF protection, a size cap, and retry.
+
+    The response body is streamed and the transfer is aborted as soon as it
+    would exceed ``_MAX_VIDEO_BASE64_BYTES``, so an oversized response (or one
+    with a missing/incorrect ``Content-Length``) is never buffered in full.
+
+    Transient failures are retried with exponential backoff (2s, 4s, ...).
+    Deterministic rejections — policy blocks, unsafe redirects, oversized
+    videos — are raised immediately because retrying cannot change the outcome.
+
+    Args:
+        video_url: HTTP/HTTPS URL to fetch.
+        destination: File path to write the downloaded bytes to. Parent
+            directories are created if missing.
+        max_retries: Maximum number of download attempts.
+
+    Returns:
+        *destination*, once the file has been written.
+
+    Raises:
+        PermissionError: The initial or final (post-redirect) URL is blocked
+            by ``check_website_access``.
+        ValueError: A redirect targets an unsafe address, or the video exceeds
+            the size cap.
+        RuntimeError: ``max_retries`` is less than 1, so no attempt was made.
+        Exception: The last transient error once all attempts are exhausted.
+    """
+    import asyncio
+
+    destination.parent.mkdir(parents=True, exist_ok=True)
+
+    async def _ssrf_redirect_guard(response):
+        # Runs for every response, including intermediate redirects, so each
+        # hop is validated before httpx follows it.
+        if response.is_redirect and response.next_request:
+            redirect_url = str(response.next_request.url)
+            from tools.url_safety import is_safe_url
+            if not is_safe_url(redirect_url):
+                raise ValueError(
+                    f"Blocked redirect to private/internal address: {redirect_url}"
+                )
+
+    last_error = None
+    for attempt in range(max_retries):
+        try:
+            blocked = check_website_access(video_url)
+            if blocked:
+                raise PermissionError(blocked["message"])
+
+            async with httpx.AsyncClient(
+                timeout=60.0,
+                follow_redirects=True,
+                event_hooks={"response": [_ssrf_redirect_guard]},
+            ) as client:
+                async with client.stream(
+                    "GET",
+                    video_url,
+                    headers={
+                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                        "Accept": "video/*,*/*;q=0.8",
+                    },
+                ) as response:
+                    response.raise_for_status()
+
+                    # Re-check the policy against the post-redirect URL before
+                    # spending bandwidth on the body.
+                    final_url = str(response.url)
+                    blocked = check_website_access(final_url)
+                    if blocked:
+                        raise PermissionError(blocked["message"])
+
+                    # Content-Length is only a hint: reject early when it is
+                    # present and too big, ignore it when malformed. The
+                    # streaming cap below is the authoritative limit.
+                    cl = (response.headers.get("content-length") or "").strip()
+                    if cl.isdigit() and int(cl) > _MAX_VIDEO_BASE64_BYTES:
+                        raise ValueError(
+                            f"Video too large ({int(cl)} bytes, max {_MAX_VIDEO_BASE64_BYTES})"
+                        )
+
+                    body = bytearray()
+                    async for chunk in response.aiter_bytes():
+                        if len(body) + len(chunk) > _MAX_VIDEO_BASE64_BYTES:
+                            raise ValueError(
+                                f"Video too large (exceeds max {_MAX_VIDEO_BASE64_BYTES} bytes)"
+                            )
+                        body.extend(chunk)
+
+                destination.write_bytes(body)
+
+            return destination
+        except (PermissionError, ValueError) as e:
+            # Policy/SSRF/size rejections are deterministic — do not retry.
+            logger.error("Video download failed (not retryable): %s", str(e)[:100])
+            raise
+        except Exception as e:
+            last_error = e
+            if attempt < max_retries - 1:
+                wait_time = 2 ** (attempt + 1)
+                logger.warning("Video download failed (attempt %s/%s): %s", attempt + 1, max_retries, str(e)[:50])
+                await asyncio.sleep(wait_time)
+            else:
+                logger.error(
+                    "Video download failed after %s attempts: %s",
+                    max_retries, str(e)[:100], exc_info=True,
+                )
+
+    # Only reachable without an error when max_retries < 1 (loop never ran).
+    if last_error is None:
+        raise RuntimeError(
+            f"_download_video exited retry loop without attempting (max_retries={max_retries})"
+        )
+    raise last_error
+
+
+async def video_analyze_tool(
+    video_url: str,
+    user_prompt: str,
+    model: Optional[str] = None,
+) -> str:
+    """Analyze a video via a multimodal LLM and return a JSON result string.
+
+    The video is taken from a local file (plain path, ``~`` path or
+    ``file://`` URL) or downloaded from an HTTP/HTTPS URL, base64-encoded into
+    a data URL and sent as a ``video_url`` content block together with
+    *user_prompt*.
+
+    Args:
+        video_url: Local file path, ``file://`` URL, or HTTP/HTTPS URL.
+        user_prompt: Prompt text sent alongside the video.
+        model: Optional model override; when falsy the LLM call's default for
+            the ``vision`` task is used.
+
+    Returns:
+        A JSON string. On success: ``{"success": true, "analysis": ...}``.
+        On any failure: ``{"success": false, "error": ..., "analysis": ...}``
+        where ``analysis`` is a categorized, user-facing explanation. All
+        exceptions are caught and reported this way rather than raised.
+    """
+    debug_call_data = {
+        "parameters": {
+            "video_url": video_url,
+            "user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
+            "model": model,
+        },
+        "error": None,
+        "success": False,
+        "analysis_length": 0,
+        "model_used": model,
+        "video_size_bytes": 0,
+    }
+
+    def _payload_too_large(payload_len: int) -> ValueError:
+        # Single source for the oversize message used by both size checks.
+        return ValueError(
+            f"Video too large for API: base64 payload is {payload_len / (1024 * 1024):.1f} MB "
+            f"(limit {_MAX_VIDEO_BASE64_BYTES / (1024 * 1024):.0f} MB). "
+            f"Compress or trim the video and retry."
+        )
+
+    temp_video_path = None
+    should_cleanup = True
+
+    try:
+        from tools.interrupt import is_interrupted
+        if is_interrupted():
+            return tool_error("Interrupted", success=False)
+
+        logger.info("Analyzing video: %s", video_url[:60])
+        logger.info("User prompt: %s", user_prompt[:100])
+
+        # Resolve local path vs remote URL
+        resolved_url = video_url
+        if resolved_url.startswith("file://"):
+            resolved_url = resolved_url[len("file://"):]
+        local_path = Path(os.path.expanduser(resolved_url))
+
+        if local_path.is_file():
+            logger.info("Using local video file: %s", video_url)
+            temp_video_path = local_path
+            # Never delete a file the user owns.
+            should_cleanup = False
+        elif _validate_image_url(video_url):
+            blocked = check_website_access(video_url)
+            if blocked:
+                raise PermissionError(blocked["message"])
+            # Keep the URL's extension when it is a supported video format so
+            # the MIME type sent to the model matches the actual container;
+            # fall back to .mp4 when the URL carries no recognizable suffix.
+            from urllib.parse import urlparse
+            url_suffix = Path(urlparse(video_url).path).suffix.lower()
+            temp_suffix = url_suffix if url_suffix in _VIDEO_MIME_TYPES else ".mp4"
+            temp_dir = get_hermes_dir("cache/video", "temp_video_files")
+            temp_video_path = temp_dir / f"temp_video_{uuid.uuid4()}{temp_suffix}"
+            await _download_video(video_url, temp_video_path)
+            should_cleanup = True
+        else:
+            raise ValueError(
+                "Invalid video source. Provide an HTTP/HTTPS URL or a valid local file path."
+            )
+
+        video_size_bytes = temp_video_path.stat().st_size
+        video_size_mb = video_size_bytes / (1024 * 1024)
+        debug_call_data["video_size_bytes"] = video_size_bytes
+        logger.info("Video ready (%.1f MB)", video_size_mb)
+
+        detected_mime = _detect_video_mime_type(temp_video_path)
+        if not detected_mime:
+            raise ValueError(
+                f"Unsupported video format: '{temp_video_path.suffix}'. "
+                f"Supported: {', '.join(sorted(_VIDEO_MIME_TYPES.keys()))}"
+            )
+
+        if video_size_bytes > _VIDEO_SIZE_WARN_BYTES:
+            logger.warning("Video is %.1f MB — may be slow or rejected", video_size_mb)
+
+        # Reject oversized files BEFORE reading them into memory. Base64 output
+        # is exactly 4 * ceil(n / 3) characters, so the final data-URL length
+        # can be computed from the file size alone.
+        estimated_payload = (
+            len(f"data:{detected_mime};base64,") + 4 * ((video_size_bytes + 2) // 3)
+        )
+        if estimated_payload > _MAX_VIDEO_BASE64_BYTES:
+            raise _payload_too_large(estimated_payload)
+
+        video_data_url = _video_to_base64_data_url(temp_video_path, mime_type=detected_mime)
+
+        # Safety net in case the file grew between stat() and read.
+        if len(video_data_url) > _MAX_VIDEO_BASE64_BYTES:
+            raise _payload_too_large(len(video_data_url))
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": user_prompt,
+                    },
+                    {
+                        "type": "video_url",
+                        "video_url": {
+                            "url": video_data_url,
+                        },
+                    },
+                ],
+            }
+        ]
+
+        # Timeout/temperature come from the auxiliary.vision config section;
+        # the timeout is floored at 180s regardless of the configured value.
+        vision_timeout = 180.0
+        vision_temperature = 0.1
+        try:
+            from hermes_cli.config import cfg_get, load_config
+            _cfg = load_config()
+            _vision_cfg = cfg_get(_cfg, "auxiliary", "vision", default={})
+            _vt = _vision_cfg.get("timeout")
+            if _vt is not None:
+                vision_timeout = max(float(_vt), 180.0)
+            _vtemp = _vision_cfg.get("temperature")
+            if _vtemp is not None:
+                vision_temperature = float(_vtemp)
+        except Exception as cfg_error:
+            # Best-effort: config problems must not block analysis.
+            logger.debug("Could not read vision config, using defaults: %s", cfg_error)
+
+        call_kwargs = {
+            "task": "vision",
+            "messages": messages,
+            "temperature": vision_temperature,
+            "max_tokens": 4000,
+            "timeout": vision_timeout,
+        }
+        if model:
+            call_kwargs["model"] = model
+
+        response = await async_call_llm(**call_kwargs)
+        analysis = extract_content_or_reasoning(response)
+
+        if not analysis:
+            logger.warning("Empty video response, retrying once")
+            response = await async_call_llm(**call_kwargs)
+            analysis = extract_content_or_reasoning(response)
+
+        analysis_length = len(analysis) if analysis else 0
+        logger.info("Video analysis completed (%s characters)", analysis_length)
+
+        result = {
+            "success": True,
+            "analysis": analysis or "There was a problem with the request and the video could not be analyzed.",
+        }
+
+        debug_call_data["success"] = True
+        debug_call_data["analysis_length"] = analysis_length
+        _debug.log_call("video_analyze_tool", debug_call_data)
+        _debug.save()
+
+        return json.dumps(result, indent=2, ensure_ascii=False)
+
+    except Exception as e:
+        error_msg = f"Error analyzing video: {str(e)}"
+        logger.error("%s", error_msg, exc_info=True)
+
+        # Map the raw error text onto a user-facing category by substring.
+        err_str = str(e).lower()
+        if any(hint in err_str for hint in (
+            "402", "insufficient", "payment required", "credits", "billing",
+        )):
+            analysis = (
+                "Insufficient credits or payment required. Please top up your "
+                f"API provider account and try again. Error: {e}"
+            )
+        elif any(hint in err_str for hint in (
+            "does not support", "not support video",
+            "content_policy", "multimodal",
+            "unrecognized request argument", "video input",
+            "video_url",
+        )):
+            analysis = (
+                f"The model does not support video analysis or the request was "
+                f"rejected. Ensure you're using a video-capable model "
+                f"(e.g. google/gemini-2.5-flash). Error: {e}"
+            )
+        elif any(hint in err_str for hint in (
+            "too large", "payload", "413", "content_too_large",
+            "request_too_large", "exceeds", "size limit",
+        )):
+            analysis = (
+                "The video is too large for the API. Try compressing or trimming "
+                f"the video (max ~50 MB). Error: {e}"
+            )
+        else:
+            analysis = (
+                "There was a problem with the request and the video could not "
+                f"be analyzed. Error: {e}"
+            )
+
+        result = {
+            "success": False,
+            "error": error_msg,
+            "analysis": analysis,
+        }
+
+        debug_call_data["error"] = error_msg
+        _debug.log_call("video_analyze_tool", debug_call_data)
+        _debug.save()
+
+        return json.dumps(result, indent=2, ensure_ascii=False)
+
+    finally:
+        # Only downloaded temp files are removed; local inputs are left alone.
+        if should_cleanup and temp_video_path and temp_video_path.exists():
+            try:
+                temp_video_path.unlink()
+                logger.debug("Cleaned up temporary video file")
+            except Exception as cleanup_error:
+                logger.warning(
+                    "Could not delete temporary file: %s", cleanup_error, exc_info=True
+                )
+
+
+# Tool schema for video_analyze, passed to registry.register below. Both
+# "video_url" and "question" are required; _handle_video_analyze reads them
+# from the call arguments under exactly these names.
+VIDEO_ANALYZE_SCHEMA = {
+    "name": "video_analyze",
+    "description": (
+        "Analyze a video from a URL or local file path using a multimodal AI model. "
+        "Sends the video to a video-capable model (e.g. Gemini) for understanding. "
+        "Use this for video files — for images, use vision_analyze instead. "
+        "Supports mp4, webm, mov, avi, mkv, mpeg formats. "
+        "Note: large videos (>20 MB) may be slow; max ~50 MB."
+    ),
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "video_url": {
+                "type": "string",
+                "description": "Video URL (http/https) or local file path to analyze.",
+            },
+            "question": {
+                "type": "string",
+                "description": "Your specific question about the video. The AI will describe what happens in the video and answer your question.",
+            },
+        },
+        "required": ["video_url", "question"],
+    },
+}
+
+
+def _handle_video_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
+    """Registry handler: build the prompt, pick the model, start the analysis.
+
+    Reads ``video_url`` and ``question`` from *args* (each defaulting to an
+    empty string), wraps the question in a fixed descriptive prompt, and
+    returns the un-awaited ``video_analyze_tool`` coroutine. The model comes
+    from ``AUXILIARY_VIDEO_MODEL``, then ``AUXILIARY_VISION_MODEL``; ``None``
+    is passed when neither is set to a non-blank value.
+    """
+    source = args.get("video_url", "")
+    user_question = args.get("question", "")
+    prompt = (
+        "Fully describe and explain everything happening in this video, "
+        "including visual content, motion, audio cues, text overlays, and scene "
+        f"transitions. Then answer the following question:\n\n{user_question}"
+    )
+
+    # First non-blank environment override wins, video-specific one first.
+    model_override = None
+    for env_name in ("AUXILIARY_VIDEO_MODEL", "AUXILIARY_VISION_MODEL"):
+        candidate = os.getenv(env_name, "").strip()
+        if candidate:
+            model_override = candidate
+            break
+
+    return video_analyze_tool(source, prompt, model_override)
+
+
+# Register video_analyze under its own "video" toolset. The handler returns an
+# awaitable (is_async=True), and availability is gated by the same
+# check_vision_requirements function passed here as check_fn.
+registry.register(
+    name="video_analyze",
+    toolset="video",
+    schema=VIDEO_ANALYZE_SCHEMA,
+    handler=_handle_video_analyze,
+    check_fn=check_vision_requirements,
+    is_async=True,
+    emoji="🎬",
+)