mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat: browser screenshot sharing via MEDIA: on all messaging platforms
browser_vision now saves screenshots persistently to ~/.hermes/browser_screenshots/ and returns the screenshot_path in its JSON response. The model can include MEDIA:<path> in its response to share screenshots as native photos. Changes: - browser_tool.py: Save screenshots persistently, return screenshot_path, auto-cleanup files older than 24 hours, mkdir moved inside try/except - telegram.py: Add send_image_file() — sends local images via bot.send_photo() - discord.py: Add send_image_file() — sends local images via discord.File - slack.py: Add send_image_file() — sends local images via files_upload_v2() (WhatsApp already had send_image_file — no changes needed) - prompt_builder.py: Updated Telegram hint to list image extensions, added Discord and Slack MEDIA: platform hints - browser.md: Document screenshot sharing and 24h cleanup - send_file_integration_map.md: Updated to reflect send_image_file is now implemented on Telegram/Discord/Slack - test_send_image_file.py: 19 tests covering MEDIA: .png extraction, send_image_file on all platforms, and screenshot cleanup Partially addresses #466 (Phase 0: platform adapter gaps for send_image_file).
This commit is contained in:
parent
a680367568
commit
b8c3bc7841
8 changed files with 489 additions and 21 deletions
|
|
@ -424,7 +424,7 @@ BROWSER_TOOL_SCHEMAS = [
|
|||
},
|
||||
{
|
||||
"name": "browser_vision",
|
||||
"description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Requires browser_navigate to be called first.",
|
||||
"description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Returns both the AI analysis and a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -1289,15 +1289,17 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||
text-based snapshot may not capture (CAPTCHAs, verification challenges,
|
||||
images, complex layouts, etc.).
|
||||
|
||||
The screenshot is saved persistently and its file path is returned alongside
|
||||
the analysis, so it can be shared with users via MEDIA:<path> in the response.
|
||||
|
||||
Args:
|
||||
question: What you want to know about the page visually
|
||||
task_id: Task identifier for session isolation
|
||||
|
||||
Returns:
|
||||
JSON string with vision analysis results
|
||||
JSON string with vision analysis results and screenshot_path
|
||||
"""
|
||||
import base64
|
||||
import tempfile
|
||||
import uuid as uuid_mod
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -1311,11 +1313,17 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||
"Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision."
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Create a temporary file for the screenshot
|
||||
temp_dir = Path(tempfile.gettempdir())
|
||||
screenshot_path = temp_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png"
|
||||
# Save screenshot to persistent location so it can be shared with users
|
||||
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
|
||||
screenshots_dir = hermes_home / "browser_screenshots"
|
||||
screenshot_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png"
|
||||
|
||||
try:
|
||||
screenshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Prune old screenshots (older than 24 hours) to prevent unbounded disk growth
|
||||
_cleanup_old_screenshots(screenshots_dir, max_age_hours=24)
|
||||
|
||||
# Take screenshot using agent-browser
|
||||
result = _run_browser_command(
|
||||
effective_task_id,
|
||||
|
|
@ -1372,21 +1380,35 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||
return json.dumps({
|
||||
"success": True,
|
||||
"analysis": analysis,
|
||||
"screenshot_path": str(screenshot_path),
|
||||
}, ensure_ascii=False)
|
||||
|
||||
except Exception as e:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": f"Error during vision analysis: {str(e)}"
|
||||
}, ensure_ascii=False)
|
||||
|
||||
finally:
|
||||
# Clean up screenshot file
|
||||
# Clean up screenshot on failure
|
||||
if screenshot_path.exists():
|
||||
try:
|
||||
screenshot_path.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": f"Error during vision analysis: {str(e)}"
|
||||
}, ensure_ascii=False)
|
||||
|
||||
|
||||
def _cleanup_old_screenshots(screenshots_dir, max_age_hours=24):
|
||||
"""Remove browser screenshots older than max_age_hours to prevent disk bloat."""
|
||||
import time
|
||||
try:
|
||||
cutoff = time.time() - (max_age_hours * 3600)
|
||||
for f in screenshots_dir.glob("browser_screenshot_*.png"):
|
||||
try:
|
||||
if f.stat().st_mtime < cutoff:
|
||||
f.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass # Non-critical — don't fail the screenshot operation
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue