feat: browser console/errors tool, annotated screenshots, auto-recording, and dogfood QA skill

New browser capabilities and a built-in skill for agent-driven web QA. ## New tool: browser_console Returns console messages (log/warn/error/info) AND uncaught JavaScript exceptions in a single call. Uses agent-browser's 'console' and 'errors' commands through the existing session plumbing. Supports --clear to reset buffers. Verified working in both local and Browserbase cloud modes. ## Enhanced tool: browser_vision(annotate=True) New boolean parameter on browser_vision. When true, agent-browser overlays numbered [N] labels on interactive elements — each [N] maps to ref @eN. Annotation data (element name, role, bounding box) returned alongside the vision analysis. Useful for QA reports and spatial reasoning. ## Config: browser.record_sessions Auto-record browser sessions as WebM video files when enabled: - Starts recording on first browser_navigate - Stops and saves on browser_close - Saves to ~/.hermes/browser_recordings/ - Works in both local and cloud modes (verified) - Disabled by default ## Built-in skill: dogfood Systematic exploratory QA testing for web applications. Teaches the agent a 5-phase workflow: 1. Plan — accept URL, create output dirs, set scope 2. Explore — systematic crawl with annotated screenshots 3. Collect Evidence — screenshots, console errors, JS exceptions 4. Categorize — severity (Critical/High/Medium/Low) and category (Functional/Visual/Accessibility/Console/UX/Content) 5. Report — structured markdown with per-issue evidence Includes: - skills/dogfood/SKILL.md — full workflow instructions - skills/dogfood/references/issue-taxonomy.md — severity/category defs - skills/dogfood/templates/dogfood-report-template.md — report template ## Tests 21 new tests covering: - browser_console message/error parsing, clear flag, empty/failed states - browser_console schema registration - browser_vision annotate schema and flag passing - record_sessions config defaults and recording lifecycle - Dogfood skill file existence and content validation Addresses #315.
2026-04-25 00:51:20 +00:00 · 2026-03-08 21:02:14 -07:00 · 2026-03-08 21:02:14 -07:00 · a8bf414f4a
commit a8bf414f4a
parent 0c4cff352a
11 changed files with 835 additions and 9 deletions
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@ -144,6 +144,7 @@ def _socket_safe_tmpdir() -> str:
 # Track active sessions per task
 # Stores: session_name (always), bb_session_id + cdp_url (cloud mode only)
 _active_sessions: Dict[str, Dict[str, str]] = {}  # task_id -> {session_name, ...}
+_recording_sessions: set = set()  # task_ids with active recordings

 # Flag to track if cleanup has been done
 _cleanup_done = False
@ -478,11 +479,31 @@ BROWSER_TOOL_SCHEMAS = [
                "question": {
                    "type": "string",
                    "description": "What you want to know about the page visually. Be specific about what you're looking for."
+                },
+                "annotate": {
+                    "type": "boolean",
+                    "default": False,
+                    "description": "If true, overlay numbered [N] labels on interactive elements. Each [N] maps to ref @eN for subsequent browser commands. Useful for QA and spatial reasoning about page layout."
                }
            },
            "required": ["question"]
        }
    },
+    {
+        "name": "browser_console",
+        "description": "Get browser console output and JavaScript errors from the current page. Returns console.log/warn/error/info messages and uncaught JS exceptions. Use this to detect silent JavaScript errors, failed API calls, and application warnings. Requires browser_navigate to be called first.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "clear": {
+                    "type": "boolean",
+                    "default": False,
+                    "description": "If true, clear the message buffers after reading"
+                }
+            },
+            "required": []
+        }
+    },
 ]


@ -998,9 +1019,10 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str:
    session_info = _get_session_info(effective_task_id)
    is_first_nav = session_info.get("_first_nav", True)
    
-    # Mark that we've done at least one navigation
+    # Auto-start recording if configured and this is first navigation
    if is_first_nav:
        session_info["_first_nav"] = False
+        _maybe_start_recording(effective_task_id)
    
    result = _run_browser_command(effective_task_id, "open", [url], timeout=60)
    
@ -1264,6 +1286,10 @@ def browser_close(task_id: Optional[str] = None) -> str:
        JSON string with close result
    """
    effective_task_id = task_id or "default"
+    
+    # Stop auto-recording before closing
+    _maybe_stop_recording(effective_task_id)
+    
    result = _run_browser_command(effective_task_id, "close", [])
    
    # Close the backend session (Browserbase API in cloud mode, nothing extra in local mode)
@ -1294,6 +1320,103 @@ def browser_close(task_id: Optional[str] = None) -> str:
        }, ensure_ascii=False)


+def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str:
+    """Get browser console messages and JavaScript errors.
+    
+    Returns both console output (log/warn/error/info from the page's JS)
+    and uncaught exceptions (crashes, unhandled promise rejections).
+    
+    Args:
+        clear: If True, clear the message/error buffers after reading
+        task_id: Task identifier for session isolation
+        
+    Returns:
+        JSON string with console messages and JS errors
+    """
+    effective_task_id = task_id or "default"
+    
+    console_args = ["--clear"] if clear else []
+    error_args = ["--clear"] if clear else []
+    
+    console_result = _run_browser_command(effective_task_id, "console", console_args)
+    errors_result = _run_browser_command(effective_task_id, "errors", error_args)
+    
+    messages = []
+    if console_result.get("success"):
+        for msg in console_result.get("data", {}).get("messages", []):
+            messages.append({
+                "type": msg.get("type", "log"),
+                "text": msg.get("text", ""),
+                "source": "console",
+            })
+    
+    errors = []
+    if errors_result.get("success"):
+        for err in errors_result.get("data", {}).get("errors", []):
+            errors.append({
+                "message": err.get("message", ""),
+                "source": "exception",
+            })
+    
+    return json.dumps({
+        "success": True,
+        "console_messages": messages,
+        "js_errors": errors,
+        "total_messages": len(messages),
+        "total_errors": len(errors),
+    }, ensure_ascii=False)
+
+
+def _maybe_start_recording(task_id: str):
+    """Start recording if browser.record_sessions is enabled in config."""
+    if task_id in _recording_sessions:
+        return
+    try:
+        hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
+        config_path = hermes_home / "config.yaml"
+        record_enabled = False
+        if config_path.exists():
+            import yaml
+            with open(config_path) as f:
+                cfg = yaml.safe_load(f) or {}
+            record_enabled = cfg.get("browser", {}).get("record_sessions", False)
+        
+        if not record_enabled:
+            return
+        
+        recordings_dir = hermes_home / "browser_recordings"
+        recordings_dir.mkdir(parents=True, exist_ok=True)
+        _cleanup_old_recordings(max_age_hours=72)
+        
+        import time
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        recording_path = recordings_dir / f"session_{timestamp}_{task_id[:16]}.webm"
+        
+        result = _run_browser_command(task_id, "record", ["start", str(recording_path)])
+        if result.get("success"):
+            _recording_sessions.add(task_id)
+            logger.info("Auto-recording browser session %s to %s", task_id, recording_path)
+        else:
+            logger.debug("Could not start auto-recording: %s", result.get("error"))
+    except Exception as e:
+        logger.debug("Auto-recording setup failed: %s", e)
+
+
+def _maybe_stop_recording(task_id: str):
+    """Stop recording if one is active for this session."""
+    if task_id not in _recording_sessions:
+        return
+    try:
+        result = _run_browser_command(task_id, "record", ["stop"])
+        if result.get("success"):
+            path = result.get("data", {}).get("path", "")
+            logger.info("Saved browser recording for session %s: %s", task_id, path)
+    except Exception as e:
+        logger.debug("Could not stop recording for %s: %s", task_id, e)
+    finally:
+        _recording_sessions.discard(task_id)
+
+
 def browser_get_images(task_id: Optional[str] = None) -> str:
    """
    Get all images on the current page.
@ -1348,7 +1471,7 @@ def browser_get_images(task_id: Optional[str] = None) -> str:
        }, ensure_ascii=False)


-def browser_vision(question: str, task_id: Optional[str] = None) -> str:
+def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
    """
    Take a screenshot of the current page and analyze it with vision AI.
    
@ -1362,6 +1485,7 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
    
    Args:
        question: What you want to know about the page visually
+        annotate: If True, overlay numbered [N] labels on interactive elements
        task_id: Task identifier for session isolation
        
    Returns:
@ -1393,10 +1517,13 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
        _cleanup_old_screenshots(screenshots_dir, max_age_hours=24)
        
        # Take screenshot using agent-browser
+        screenshot_args = [str(screenshot_path)]
+        if annotate:
+            screenshot_args.insert(0, "--annotate")
        result = _run_browser_command(
            effective_task_id, 
            "screenshot", 
-            [str(screenshot_path)],
+            screenshot_args,
            timeout=30
        )
        
@ -1456,11 +1583,15 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
        )
        
        analysis = response.choices[0].message.content
-        return json.dumps({
+        response_data = {
            "success": True,
            "analysis": analysis,
            "screenshot_path": str(screenshot_path),
-        }, ensure_ascii=False)
+        }
+        # Include annotation data if annotated screenshot was taken
+        if annotate and result.get("data", {}).get("annotations"):
+            response_data["annotations"] = result["data"]["annotations"]
+        return json.dumps(response_data, ensure_ascii=False)
    
    except Exception as e:
        # Keep the screenshot if it was captured successfully — the failure is
@ -1490,6 +1621,25 @@ def _cleanup_old_screenshots(screenshots_dir, max_age_hours=24):
        pass  # Non-critical — don't fail the screenshot operation


+def _cleanup_old_recordings(max_age_hours=72):
+    """Remove browser recordings older than max_age_hours to prevent disk bloat."""
+    import time
+    try:
+        hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
+        recordings_dir = hermes_home / "browser_recordings"
+        if not recordings_dir.exists():
+            return
+        cutoff = time.time() - (max_age_hours * 3600)
+        for f in recordings_dir.glob("session_*.webm"):
+            try:
+                if f.stat().st_mtime < cutoff:
+                    f.unlink()
+            except Exception:
+                pass
+    except Exception:
+        pass
+
+
 # ============================================================================
 # Cleanup and Management Functions
 # ============================================================================
@ -1561,6 +1711,9 @@ def cleanup_browser(task_id: Optional[str] = None) -> None:
        bb_session_id = session_info.get("bb_session_id", "unknown")
        logger.debug("Found session for task %s: bb_session_id=%s", task_id, bb_session_id)
        
+        # Stop auto-recording before closing (saves the file)
+        _maybe_stop_recording(task_id)
+        
        # Try to close via agent-browser first (needs session in _active_sessions)
        try:
            _run_browser_command(task_id, "close", [], timeout=10)
@ -1776,6 +1929,13 @@ registry.register(
    name="browser_vision",
    toolset="browser",
    schema=_BROWSER_SCHEMA_MAP["browser_vision"],
-    handler=lambda args, **kw: browser_vision(question=args.get("question", ""), task_id=kw.get("task_id")),
+    handler=lambda args, **kw: browser_vision(question=args.get("question", ""), annotate=args.get("annotate", False), task_id=kw.get("task_id")),
+    check_fn=check_browser_requirements,
+)
+registry.register(
+    name="browser_console",
+    toolset="browser",
+    schema=_BROWSER_SCHEMA_MAP["browser_console"],
+    handler=lambda args, **kw: browser_console(clear=args.get("clear", False), task_id=kw.get("task_id")),
    check_fn=check_browser_requirements,
 )