mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-27 01:11:40 +00:00
feat: devex help, add Makefile, ruff, pre-commit, and modernize CI
This commit is contained in:
parent
172a38c344
commit
f4d7e6a29e
111 changed files with 11655 additions and 10200 deletions
|
|
@ -19,7 +19,7 @@ Features:
|
|||
Usage:
|
||||
from vision_tools import vision_analyze_tool
|
||||
import asyncio
|
||||
|
||||
|
||||
# Analyze an image
|
||||
result = await vision_analyze_tool(
|
||||
image_url="https://example.com/image.jpg",
|
||||
|
|
@ -33,11 +33,14 @@ import json
|
|||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from collections.abc import Awaitable
|
||||
from pathlib import Path
|
||||
from typing import Any, Awaitable, Dict, Optional
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from agent.auxiliary_client import get_vision_auxiliary_client
|
||||
from tools.debug_helpers import DebugSession
|
||||
|
||||
|
|
@ -55,7 +58,7 @@ if _aux_sync_client is not None:
|
|||
_async_kwargs["default_headers"] = {
|
||||
"HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
|
||||
"X-OpenRouter-Title": "Hermes Agent",
|
||||
"X-OpenRouter-Categories": "productivity,cli-agent",
|
||||
"X-OpenRouter-Categories": "productivity,cli-agent",
|
||||
}
|
||||
_aux_async_client = AsyncOpenAI(**_async_kwargs)
|
||||
|
||||
|
|
@ -65,10 +68,10 @@ _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
|
|||
def _validate_image_url(url: str) -> bool:
|
||||
"""
|
||||
Basic validation of image URL format.
|
||||
|
||||
|
||||
Args:
|
||||
url (str): The URL to validate
|
||||
|
||||
|
||||
Returns:
|
||||
bool: True if URL appears to be valid, False otherwise
|
||||
"""
|
||||
|
|
@ -91,23 +94,22 @@ def _validate_image_url(url: str) -> bool:
|
|||
async def _download_image(image_url: str, destination: Path, max_retries: int = 3) -> Path:
|
||||
"""
|
||||
Download an image from a URL to a local destination (async) with retry logic.
|
||||
|
||||
|
||||
Args:
|
||||
image_url (str): The URL of the image to download
|
||||
destination (Path): The path where the image should be saved
|
||||
max_retries (int): Maximum number of retry attempts (default: 3)
|
||||
|
||||
|
||||
Returns:
|
||||
Path: The path to the downloaded image
|
||||
|
||||
|
||||
Raises:
|
||||
Exception: If download fails after all retries
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
|
||||
# Create parent directories if they don't exist
|
||||
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
last_error = None
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
|
|
@ -122,10 +124,10 @@ async def _download_image(image_url: str, destination: Path, max_retries: int =
|
|||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
# Save the image content
|
||||
destination.write_bytes(response.content)
|
||||
|
||||
|
||||
return destination
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
|
|
@ -141,56 +143,56 @@ async def _download_image(image_url: str, destination: Path, max_retries: int =
|
|||
str(e)[:100],
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
|
||||
raise last_error
|
||||
|
||||
|
||||
def _determine_mime_type(image_path: Path) -> str:
|
||||
"""
|
||||
Determine the MIME type of an image based on its file extension.
|
||||
|
||||
|
||||
Args:
|
||||
image_path (Path): Path to the image file
|
||||
|
||||
|
||||
Returns:
|
||||
str: The MIME type (defaults to image/jpeg if unknown)
|
||||
"""
|
||||
extension = image_path.suffix.lower()
|
||||
mime_types = {
|
||||
'.jpg': 'image/jpeg',
|
||||
'.jpeg': 'image/jpeg',
|
||||
'.png': 'image/png',
|
||||
'.gif': 'image/gif',
|
||||
'.bmp': 'image/bmp',
|
||||
'.webp': 'image/webp',
|
||||
'.svg': 'image/svg+xml'
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".gif": "image/gif",
|
||||
".bmp": "image/bmp",
|
||||
".webp": "image/webp",
|
||||
".svg": "image/svg+xml",
|
||||
}
|
||||
return mime_types.get(extension, 'image/jpeg')
|
||||
return mime_types.get(extension, "image/jpeg")
|
||||
|
||||
|
||||
def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) -> str:
|
||||
def _image_to_base64_data_url(image_path: Path, mime_type: str | None = None) -> str:
|
||||
"""
|
||||
Convert an image file to a base64-encoded data URL.
|
||||
|
||||
|
||||
Args:
|
||||
image_path (Path): Path to the image file
|
||||
mime_type (Optional[str]): MIME type of the image (auto-detected if None)
|
||||
|
||||
|
||||
Returns:
|
||||
str: Base64-encoded data URL (e.g., "data:image/jpeg;base64,...")
|
||||
"""
|
||||
# Read the image as bytes
|
||||
data = image_path.read_bytes()
|
||||
|
||||
|
||||
# Encode to base64
|
||||
encoded = base64.b64encode(data).decode("ascii")
|
||||
|
||||
|
||||
# Determine MIME type
|
||||
mime = mime_type or _determine_mime_type(image_path)
|
||||
|
||||
|
||||
# Create data URL
|
||||
data_url = f"data:{mime};base64,{encoded}"
|
||||
|
||||
|
||||
return data_url
|
||||
|
||||
|
||||
|
|
@ -201,31 +203,31 @@ async def vision_analyze_tool(
|
|||
) -> str:
|
||||
"""
|
||||
Analyze an image from a URL or local file path using vision AI.
|
||||
|
||||
|
||||
This tool accepts either an HTTP/HTTPS URL or a local file path. For URLs,
|
||||
it downloads the image first. In both cases, the image is converted to base64
|
||||
and processed using Gemini 3 Flash Preview via OpenRouter API.
|
||||
|
||||
|
||||
The user_prompt parameter is expected to be pre-formatted by the calling
|
||||
function (typically model_tools.py) to include both full description
|
||||
requests and specific questions.
|
||||
|
||||
|
||||
Args:
|
||||
image_url (str): The URL or local file path of the image to analyze.
|
||||
Accepts http://, https:// URLs or absolute/relative file paths.
|
||||
user_prompt (str): The pre-formatted prompt for the vision model
|
||||
model (str): The vision model to use (default: google/gemini-3-flash-preview)
|
||||
|
||||
|
||||
Returns:
|
||||
str: JSON string containing the analysis results with the following structure:
|
||||
{
|
||||
"success": bool,
|
||||
"analysis": str (defaults to error message if None)
|
||||
}
|
||||
|
||||
|
||||
Raises:
|
||||
Exception: If download fails, analysis fails, or API key is not set
|
||||
|
||||
|
||||
Note:
|
||||
- For URLs, temporary images are stored in ./temp_vision_images/ and cleaned up
|
||||
- For local file paths, the file is used directly and NOT deleted
|
||||
|
|
@ -235,36 +237,41 @@ async def vision_analyze_tool(
|
|||
"parameters": {
|
||||
"image_url": image_url,
|
||||
"user_prompt": user_prompt[:200] + "..." if len(user_prompt) > 200 else user_prompt,
|
||||
"model": model
|
||||
"model": model,
|
||||
},
|
||||
"error": None,
|
||||
"success": False,
|
||||
"analysis_length": 0,
|
||||
"model_used": model,
|
||||
"image_size_bytes": 0
|
||||
"image_size_bytes": 0,
|
||||
}
|
||||
|
||||
|
||||
temp_image_path = None
|
||||
# Track whether we should clean up the file after processing.
|
||||
# Local files (e.g. from the image cache) should NOT be deleted.
|
||||
should_cleanup = True
|
||||
|
||||
|
||||
try:
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
if is_interrupted():
|
||||
return json.dumps({"success": False, "error": "Interrupted"})
|
||||
|
||||
logger.info("Analyzing image: %s", image_url[:60])
|
||||
logger.info("User prompt: %s", user_prompt[:100])
|
||||
|
||||
|
||||
# Check auxiliary vision client availability
|
||||
if _aux_async_client is None or DEFAULT_VISION_MODEL is None:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
|
||||
"Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools."
|
||||
}, indent=2, ensure_ascii=False)
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
"success": False,
|
||||
"analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
|
||||
"Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.",
|
||||
},
|
||||
indent=2,
|
||||
ensure_ascii=False,
|
||||
)
|
||||
|
||||
# Determine if this is a local file path or a remote URL
|
||||
local_path = Path(image_url)
|
||||
if local_path.is_file():
|
||||
|
|
@ -280,50 +287,41 @@ async def vision_analyze_tool(
|
|||
await _download_image(image_url, temp_image_path)
|
||||
should_cleanup = True
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid image source. Provide an HTTP/HTTPS URL or a valid local file path."
|
||||
)
|
||||
|
||||
raise ValueError("Invalid image source. Provide an HTTP/HTTPS URL or a valid local file path.")
|
||||
|
||||
# Get image file size for logging
|
||||
image_size_bytes = temp_image_path.stat().st_size
|
||||
image_size_kb = image_size_bytes / 1024
|
||||
logger.info("Image ready (%.1f KB)", image_size_kb)
|
||||
|
||||
|
||||
# Convert image to base64 data URL
|
||||
logger.info("Converting image to base64...")
|
||||
image_data_url = _image_to_base64_data_url(temp_image_path)
|
||||
# Calculate size in KB for better readability
|
||||
data_size_kb = len(image_data_url) / 1024
|
||||
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
|
||||
|
||||
|
||||
debug_call_data["image_size_bytes"] = image_size_bytes
|
||||
|
||||
|
||||
# Use the prompt as provided (model_tools.py now handles full description formatting)
|
||||
comprehensive_prompt = user_prompt
|
||||
|
||||
|
||||
# Prepare the message with base64-encoded image
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": comprehensive_prompt
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_data_url
|
||||
}
|
||||
}
|
||||
]
|
||||
{"type": "text", "text": comprehensive_prompt},
|
||||
{"type": "image_url", "image_url": {"url": image_data_url}},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
logger.info("Processing image with %s...", model)
|
||||
|
||||
|
||||
# Call the vision API
|
||||
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
|
||||
from agent.auxiliary_client import auxiliary_max_tokens_param, get_auxiliary_extra_body
|
||||
|
||||
_extra = get_auxiliary_extra_body()
|
||||
response = await _aux_async_client.chat.completions.create(
|
||||
model=model,
|
||||
|
|
@ -332,44 +330,44 @@ async def vision_analyze_tool(
|
|||
**auxiliary_max_tokens_param(2000),
|
||||
**({} if not _extra else {"extra_body": _extra}),
|
||||
)
|
||||
|
||||
|
||||
# Extract the analysis
|
||||
analysis = response.choices[0].message.content.strip()
|
||||
analysis_length = len(analysis)
|
||||
|
||||
|
||||
logger.info("Image analysis completed (%s characters)", analysis_length)
|
||||
|
||||
|
||||
# Prepare successful response
|
||||
result = {
|
||||
"success": True,
|
||||
"analysis": analysis or "There was a problem with the request and the image could not be analyzed."
|
||||
"analysis": analysis or "There was a problem with the request and the image could not be analyzed.",
|
||||
}
|
||||
|
||||
|
||||
debug_call_data["success"] = True
|
||||
debug_call_data["analysis_length"] = analysis_length
|
||||
|
||||
|
||||
# Log debug information
|
||||
_debug.log_call("vision_analyze_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
|
||||
return json.dumps(result, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error analyzing image: {str(e)}"
|
||||
logger.error("%s", error_msg, exc_info=True)
|
||||
|
||||
|
||||
# Prepare error response
|
||||
result = {
|
||||
"success": False,
|
||||
"analysis": "There was a problem with the request and the image could not be analyzed."
|
||||
"analysis": "There was a problem with the request and the image could not be analyzed.",
|
||||
}
|
||||
|
||||
|
||||
debug_call_data["error"] = error_msg
|
||||
_debug.log_call("vision_analyze_tool", debug_call_data)
|
||||
_debug.save()
|
||||
|
||||
|
||||
return json.dumps(result, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
finally:
|
||||
# Clean up temporary image file (but NOT local/cached files)
|
||||
if should_cleanup and temp_image_path and temp_image_path.exists():
|
||||
|
|
@ -377,9 +375,7 @@ async def vision_analyze_tool(
|
|||
temp_image_path.unlink()
|
||||
logger.debug("Cleaned up temporary image file")
|
||||
except Exception as cleanup_error:
|
||||
logger.warning(
|
||||
"Could not delete temporary file: %s", cleanup_error, exc_info=True
|
||||
)
|
||||
logger.warning("Could not delete temporary file: %s", cleanup_error, exc_info=True)
|
||||
|
||||
|
||||
def check_vision_requirements() -> bool:
|
||||
|
|
@ -387,10 +383,10 @@ def check_vision_requirements() -> bool:
|
|||
return _aux_async_client is not None
|
||||
|
||||
|
||||
def get_debug_session_info() -> Dict[str, Any]:
|
||||
def get_debug_session_info() -> dict[str, Any]:
|
||||
"""
|
||||
Get information about the current debug session.
|
||||
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Dictionary containing debug session information
|
||||
"""
|
||||
|
|
@ -403,27 +399,27 @@ if __name__ == "__main__":
|
|||
"""
|
||||
print("👁️ Vision Tools Module")
|
||||
print("=" * 40)
|
||||
|
||||
|
||||
# Check if vision model is available
|
||||
api_available = check_vision_requirements()
|
||||
|
||||
|
||||
if not api_available:
|
||||
print("❌ No auxiliary vision model available")
|
||||
print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
|
||||
exit(1)
|
||||
else:
|
||||
print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}")
|
||||
|
||||
|
||||
print("🛠️ Vision tools ready for use!")
|
||||
print(f"🧠 Using model: {DEFAULT_VISION_MODEL}")
|
||||
|
||||
|
||||
# Show debug mode status
|
||||
if _debug.active:
|
||||
print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
|
||||
print(f" Debug logs will be saved to: ./logs/vision_tools_debug_{_debug.session_id}.json")
|
||||
else:
|
||||
print("🐛 Debug mode disabled (set VISION_TOOLS_DEBUG=true to enable)")
|
||||
|
||||
|
||||
print("\nBasic usage:")
|
||||
print(" from vision_tools import vision_analyze_tool")
|
||||
print(" import asyncio")
|
||||
|
|
@ -435,14 +431,14 @@ if __name__ == "__main__":
|
|||
print(" )")
|
||||
print(" print(result)")
|
||||
print(" asyncio.run(main())")
|
||||
|
||||
|
||||
print("\nExample prompts:")
|
||||
print(" - 'What architectural style is this building?'")
|
||||
print(" - 'Describe the emotions and mood in this image'")
|
||||
print(" - 'What text can you read in this image?'")
|
||||
print(" - 'Identify any safety hazards visible'")
|
||||
print(" - 'What products or brands are shown?'")
|
||||
|
||||
|
||||
print("\nDebug mode:")
|
||||
print(" # Enable debug logging")
|
||||
print(" export VISION_TOOLS_DEBUG=true")
|
||||
|
|
@ -461,30 +457,24 @@ VISION_ANALYZE_SCHEMA = {
|
|||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"image_url": {
|
||||
"type": "string",
|
||||
"description": "Image URL (http/https) or local file path to analyze."
|
||||
},
|
||||
"image_url": {"type": "string", "description": "Image URL (http/https) or local file path to analyze."},
|
||||
"question": {
|
||||
"type": "string",
|
||||
"description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question."
|
||||
}
|
||||
"description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question.",
|
||||
},
|
||||
},
|
||||
"required": ["image_url", "question"]
|
||||
}
|
||||
"required": ["image_url", "question"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
|
||||
def _handle_vision_analyze(args: dict[str, Any], **kw: Any) -> Awaitable[str]:
|
||||
image_url = args.get("image_url", "")
|
||||
question = args.get("question", "")
|
||||
full_prompt = (
|
||||
"Fully describe and explain everything about this image, then answer the "
|
||||
f"following question:\n\n{question}"
|
||||
f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
|
||||
)
|
||||
model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
|
||||
or DEFAULT_VISION_MODEL
|
||||
or "google/gemini-3-flash-preview")
|
||||
model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview"
|
||||
return vision_analyze_tool(image_url, full_prompt, model)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue