diff --git a/tests/tools/test_browser_lightpanda.py b/tests/tools/test_browser_lightpanda.py index b65d1e7e52..a618df72a9 100644 --- a/tests/tools/test_browser_lightpanda.py +++ b/tests/tools/test_browser_lightpanda.py @@ -157,6 +157,14 @@ class TestNeedsLightpandaFallback: result = {"success": False, "error": "page.goto: Timeout"} assert _needs_lightpanda_fallback("lightpanda", "open", result) is True + def test_failed_command_reason_is_user_visible(self): + from tools.browser_tool import _lightpanda_fallback_reason + result = {"success": False, "error": "page.goto: Timeout"} + reason = _lightpanda_fallback_reason("lightpanda", "open", result) + assert reason is not None + assert "page.goto: Timeout" in reason + assert "retried with Chrome" in reason + def test_empty_snapshot_triggers_fallback(self): from tools.browser_tool import _needs_lightpanda_fallback result = {"success": True, "data": {"snapshot": ""}} @@ -260,6 +268,145 @@ class TestCleanupResetsEngineCache: assert bt._browser_engine_resolved is False + + +# --------------------------------------------------------------------------- +# fallback warning annotation +# --------------------------------------------------------------------------- + +class TestLightpandaFallbackWarning: + """Verify Chrome fallback results are annotated for users.""" + + def test_fallback_result_gets_user_visible_warning(self): + from tools.browser_tool import _annotate_lightpanda_fallback + + result = {"success": True, "data": {"snapshot": "- heading \"Hello\" [ref=e1]"}} + annotated = _annotate_lightpanda_fallback( + result, + "Lightpanda returned an empty/too-short snapshot; retried with Chrome.", + ) + + assert annotated["browser_engine"] == "chrome" + assert "Lightpanda fallback" in annotated["fallback_warning"] + assert annotated["browser_engine_fallback"] == { + "from": "lightpanda", + "to": "chrome", + "reason": "Lightpanda returned an empty/too-short snapshot; retried with Chrome.", + } + assert annotated["data"]["fallback_warning"] == annotated["fallback_warning"] + assert annotated["data"]["browser_engine"] == "chrome" + + + def test_browser_navigate_surfaces_fallback_warning(self): + import json + import tools.browser_tool as bt + + result = bt._annotate_lightpanda_fallback( + {"success": True, "data": {"title": "Fallback OK", "url": "https://example.com/"}}, + "synthetic Lightpanda failure; retried with Chrome.", + ) + + with patch("tools.browser_tool._is_local_backend", return_value=True), \ + patch("tools.browser_tool._get_cloud_provider", return_value=None), \ + patch("tools.browser_tool._get_session_info", return_value={ + "session_name": "test", "_first_nav": False, "features": {"local": True, "proxies": True} + }), \ + patch("tools.browser_tool._run_browser_command", side_effect=[ + result, + {"success": True, "data": {"snapshot": "- heading \"Fallback OK\" [ref=e1]", "refs": {"e1": {}}}}, + ]): + response = json.loads(bt.browser_navigate("https://example.com", task_id="warn-test")) + + assert response["success"] is True + assert response["browser_engine"] == "chrome" + assert "Lightpanda fallback" in response["fallback_warning"] + assert response["browser_engine_fallback"]["from"] == "lightpanda" + assert response["browser_engine_fallback"]["to"] == "chrome" + bt._last_active_session_key.pop("warn-test", None) + + def test_browser_navigate_surfaces_auto_snapshot_fallback_warning(self): + import json + import tools.browser_tool as bt + + snapshot_result = bt._annotate_lightpanda_fallback( + {"success": True, "data": {"snapshot": "- heading \"Fallback OK\" [ref=e1]", "refs": {"e1": {}}}}, + "Lightpanda returned an empty/too-short snapshot; retried with Chrome.", + ) + + with patch("tools.browser_tool._is_local_backend", return_value=True), \ + patch("tools.browser_tool._get_cloud_provider", return_value=None), \ + patch("tools.browser_tool._get_session_info", return_value={ + "session_name": "test", "_first_nav": False, "features": {"local": True, "proxies": True} + }), \ + patch("tools.browser_tool._run_browser_command", side_effect=[ + {"success": True, "data": {"title": "Fallback OK", "url": "https://example.com/"}}, + snapshot_result, + ]): + response = json.loads(bt.browser_navigate("https://example.com", task_id="warn-test2")) + + assert response["success"] is True + assert response["browser_engine"] == "chrome" + assert "Lightpanda fallback" in response["fallback_warning"] + assert response["element_count"] == 1 + bt._last_active_session_key.pop("warn-test2", None) + + def test_failed_fallback_warning_is_preserved_on_click_error(self): + import json + import tools.browser_tool as bt + + result = bt._annotate_lightpanda_fallback( + {"success": False, "error": "Chrome fallback failed"}, + "Lightpanda 'click' failed (timeout); retried with Chrome.", + ) + bt._last_active_session_key["warn-test3"] = "warn-test3" + with patch("tools.browser_tool._run_browser_command", return_value=result): + response = json.loads(bt.browser_click("@e1", task_id="warn-test3")) + + assert response["success"] is False + assert "Lightpanda fallback" in response["fallback_warning"] + assert response["browser_engine"] == "chrome" + bt._last_active_session_key.pop("warn-test3", None) + + + def test_browser_vision_lightpanda_uses_chrome_capture_and_normal_call_llm_shape(self, tmp_path): + import json + import tools.browser_tool as bt + + chrome_shot = tmp_path / "chrome.png" + chrome_shot.write_bytes(b"\x89PNG" + b"0" * 128) + + class _Msg: + content = "Example Domain screenshot" + + class _Choice: + message = _Msg() + + class _Response: + choices = [_Choice()] + + captured_kwargs = {} + + def fake_call_llm(**kwargs): + captured_kwargs.update(kwargs) + return _Response() + + with patch("tools.browser_tool._get_browser_engine", return_value="lightpanda"), \ + patch("tools.browser_tool._should_inject_engine", return_value=True), \ + patch("tools.browser_tool._chrome_fallback_screenshot", return_value={ + "success": True, "data": {"path": str(chrome_shot)} + }), \ + patch("hermes_constants.get_hermes_dir", return_value=tmp_path), \ + patch("tools.browser_tool.call_llm", side_effect=fake_call_llm): + response = json.loads(bt.browser_vision("what is this?", task_id="vision-test")) + + assert response["success"] is True + assert response["analysis"] == "Example Domain screenshot" + assert response["browser_engine"] == "chrome" + assert "Lightpanda fallback" in response["fallback_warning"] + assert "messages" in captured_kwargs + assert "images" not in captured_kwargs + assert captured_kwargs["task"] == "vision" + # --------------------------------------------------------------------------- # _engine_override parameter # --------------------------------------------------------------------------- @@ -361,3 +508,48 @@ class TestEngineOverride: assert "--engine" in captured_cmds[0] engine_idx = captured_cmds[0].index("--engine") assert captured_cmds[0][engine_idx + 1] == "lightpanda" + + def test_hybrid_local_sidecar_injects_engine_even_with_cloud_provider(self): + """A task::local sidecar is local even when global cloud config exists.""" + import tools.browser_tool as bt + + bt._cached_browser_engine = "lightpanda" + bt._browser_engine_resolved = True + captured_cmds = [] + mock_provider = MagicMock() + + mock_proc = MagicMock() + mock_proc.wait.return_value = None + mock_proc.returncode = 0 + + def capture_popen(cmd, **kwargs): + captured_cmds.append(cmd) + return mock_proc + + mock_stdout = json.dumps({ + "success": True, + "data": {"snapshot": '- heading "Hello" [ref=e1]', "refs": {"e1": {}}}, + }) + with patch("tools.browser_tool._get_session_info", return_value={"session_name": "local-sidecar"}), \ + patch("tools.browser_tool._find_agent_browser", return_value="/usr/bin/agent-browser"), \ + patch("tools.browser_tool._is_local_mode", return_value=False), \ + patch("tools.browser_tool._chromium_installed", return_value=True), \ + patch("tools.browser_tool._get_cloud_provider", return_value=mock_provider), \ + patch("tools.browser_tool._get_cdp_override", return_value=""), \ + patch("tools.browser_tool._is_camofox_mode", return_value=False), \ + patch("subprocess.Popen", side_effect=capture_popen), \ + patch("os.open", return_value=99), \ + patch("os.close"), \ + patch("os.unlink"), \ + patch("os.makedirs"), \ + patch("builtins.open", MagicMock(return_value=MagicMock( + __enter__=MagicMock(return_value=MagicMock(read=MagicMock(return_value=mock_stdout))), + __exit__=MagicMock(return_value=False), + ))), \ + patch("tools.interrupt.is_interrupted", return_value=False), \ + patch("tools.browser_tool._write_owner_pid"): + bt._run_browser_command("task::local", "snapshot", []) + + assert len(captured_cmds) == 1 + assert "--engine" in captured_cmds[0] + assert captured_cmds[0][captured_cmds[0].index("--engine") + 1] == "lightpanda" diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 195aa1ee44..9c8e355c87 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -38,13 +38,13 @@ Environment Variables: Usage: from tools.browser_tool import browser_navigate, browser_snapshot, browser_click - + # Navigate to a page result = browser_navigate("https://example.com", task_id="task_123") - + # Get page snapshot snapshot = browser_snapshot(task_id="task_123") - + # Click an element browser_click("@e5", task_id="task_123") """ @@ -555,18 +555,15 @@ def _should_inject_engine(engine: str) -> bool: return _is_local_mode() -def _needs_lightpanda_fallback(engine: str, command: str, result: Dict[str, Any]) -> bool: - """Check if a Lightpanda result should trigger an automatic Chrome fallback. +def _lightpanda_fallback_reason(engine: str, command: str, result: Dict[str, Any]) -> Optional[str]: + """Return the user-visible reason a Lightpanda result needs Chrome fallback. - Returns True when: - - The engine is lightpanda AND - - The command is fallback-eligible (not close/record) AND - - The command failed, OR - - A snapshot came back empty/suspiciously short, OR - - A screenshot returned but is likely the Lightpanda placeholder PNG + ``None`` means no fallback should run. The returned string is copied into + the fallback result so CLI/TUI/gateway users can see when Hermes silently + switched from Lightpanda to Chrome for completeness. """ if engine != "lightpanda": - return False + return None # Only retry commands where Chrome can meaningfully produce a different # result. Session-management commands (close, record) are tied to the @@ -574,11 +571,12 @@ def _needs_lightpanda_fallback(engine: str, command: str, result: Dict[str, Any] _FALLBACK_ELIGIBLE = {"open", "snapshot", "screenshot", "eval", "click", "fill", "scroll", "back", "press", "console", "errors"} if command not in _FALLBACK_ELIGIBLE: - return False + return None # Explicit failure if not result.get("success"): - return True + error = str(result.get("error") or "command failed").strip() + return f"Lightpanda {command!r} failed ({error}); retried with Chrome." data = result.get("data", {}) @@ -586,7 +584,7 @@ def _needs_lightpanda_fallback(engine: str, command: str, result: Dict[str, Any] snap = data.get("snapshot", "") # Empty or near-empty snapshots indicate Lightpanda couldn't render if not snap or len(snap.strip()) < 20: - return True + return "Lightpanda returned an empty/too-short snapshot; retried with Chrome." if command == "screenshot": # Lightpanda returns a placeholder PNG with its panda logo. @@ -599,32 +597,79 @@ def _needs_lightpanda_fallback(engine: str, command: str, result: Dict[str, Any] if size < 20480: logger.debug("Lightpanda screenshot is suspiciously small (%d bytes), " "triggering Chrome fallback", size) - return True + return ( + f"Lightpanda screenshot was suspiciously small ({size} bytes); " + "retried with Chrome." + ) except OSError: - return True # file doesn't exist or can't be read + return "Lightpanda screenshot file was missing/unreadable; retried with Chrome." - return False + return None -def _chrome_fallback_screenshot( +def _needs_lightpanda_fallback(engine: str, command: str, result: Dict[str, Any]) -> bool: + """Check if a Lightpanda result should trigger an automatic Chrome fallback.""" + return _lightpanda_fallback_reason(engine, command, result) is not None + + +def _annotate_lightpanda_fallback(result: Dict[str, Any], reason: str) -> Dict[str, Any]: + """Add a user-visible Chrome fallback warning to a browser command result.""" + warning = ( + "⚠ Lightpanda fallback: Chrome was used for this browser action. " + f"{reason}" + ) + annotated = dict(result) + annotated["fallback_warning"] = warning + annotated["browser_engine"] = "chrome" + annotated["browser_engine_fallback"] = { + "from": "lightpanda", + "to": "chrome", + "reason": reason, + } + data = annotated.get("data") + if isinstance(data, dict): + data = dict(data) + data.setdefault("fallback_warning", warning) + data.setdefault("browser_engine", "chrome") + data.setdefault( + "browser_engine_fallback", + {"from": "lightpanda", "to": "chrome", "reason": reason}, + ) + annotated["data"] = data + return annotated + + +def _copy_fallback_warning(target: Dict[str, Any], result: Dict[str, Any]) -> Dict[str, Any]: + """Copy browser fallback metadata from an internal result into a tool response.""" + if result.get("fallback_warning"): + target["fallback_warning"] = result["fallback_warning"] + target["browser_engine"] = result.get("browser_engine") + target["browser_engine_fallback"] = result.get("browser_engine_fallback") + return target + + +def _run_chrome_fallback_command( task_id: str, + command: str, args: List[str], timeout: int, ) -> Dict[str, Any]: - """Take a screenshot using a temporary Chrome session. + """Run a browser command in a temporary Chrome session at the current URL. - When the active session uses Lightpanda, ``--engine chrome`` on the same - session has no effect — the engine is locked at daemon startup. This - helper spins up a **separate** Chrome session, navigates to the same URL - the agent is currently viewing, takes the screenshot, then tears down the - temporary session. - - Returns the screenshot result dict (same shape as ``_run_browser_command``). + agent-browser locks the engine when a named daemon starts. Passing + ``--engine chrome`` to the same Lightpanda ``--session`` cannot change that + running daemon. This helper always uses a fresh temporary Chrome session, + navigates it to the current Lightpanda URL, runs ``command``, then tears it + down. """ import uuid - # 1. Grab the current URL from the Lightpanda session. - url_result = _run_browser_command(task_id, "eval", ["window.location.href"], timeout=10) + # 1. Grab the current URL from the Lightpanda session. Use + # ``_engine_override=\"auto\"`` so this helper does not recursively trigger + # Lightpanda→Chrome fallback if the eval call itself fails. + url_result = _run_browser_command( + task_id, "eval", ["window.location.href"], timeout=10, _engine_override="auto" + ) current_url = None if url_result.get("success"): current_url = url_result.get("data", {}).get("result", "").strip().strip('"').strip("'") @@ -647,6 +692,9 @@ def _chrome_fallback_screenshot( browser_env = {**os.environ, "AGENT_BROWSER_SOCKET_DIR": task_socket_dir} browser_env["PATH"] = _merge_browser_path(browser_env.get("PATH", "")) + if "AGENT_BROWSER_IDLE_TIMEOUT_MS" not in browser_env: + browser_env["AGENT_BROWSER_IDLE_TIMEOUT_MS"] = str(BROWSER_SESSION_INACTIVITY_TIMEOUT * 1000) + def _run_tmp(cmd: str, cmd_args: List[str]) -> Dict[str, Any]: full = base_args + [cmd] + cmd_args # Use temp-file stdout/stderr pattern (same as _run_browser_command) @@ -677,9 +725,9 @@ def _chrome_fallback_screenshot( except Exception as exc: logger.debug("Chrome fallback tmp cmd '%s' error: %s", cmd, exc) finally: - for p in (stdout_path, stderr_path): + for pth in (stdout_path, stderr_path): try: - os.unlink(p) + os.unlink(pth) except OSError: pass return {"success": False, "error": f"Chrome fallback '{cmd}' failed"} @@ -691,9 +739,8 @@ def _chrome_fallback_screenshot( logger.warning("Chrome fallback: navigate failed: %s", nav.get("error")) return {"success": False, "error": f"Chrome fallback navigate failed: {nav.get('error')}"} - # 4. Take the screenshot. - result = _run_tmp("screenshot", args) - return result + # 4. Run the requested command in Chrome. + return _run_tmp(command, args) finally: # 5. Tear down the temporary Chrome session. @@ -706,6 +753,15 @@ def _chrome_fallback_screenshot( _shutil.rmtree(task_socket_dir, ignore_errors=True) +def _chrome_fallback_screenshot( + task_id: str, + args: List[str], + timeout: int, +) -> Dict[str, Any]: + """Take a screenshot using a temporary Chrome session.""" + return _run_chrome_fallback_command(task_id, "screenshot", args, timeout) + + def _auto_local_for_private_urls() -> bool: """Return whether a cloud-configured install should auto-spawn a local Chromium for LAN/localhost URLs. @@ -981,19 +1037,19 @@ atexit.register(_emergency_cleanup_all_sessions) def _cleanup_inactive_browser_sessions(): """ Clean up browser sessions that have been inactive for longer than the timeout. - + This function is called periodically by the background cleanup thread to automatically close sessions that haven't been used recently, preventing orphaned sessions (local or Browserbase) from accumulating. """ current_time = time.time() sessions_to_cleanup = [] - + with _cleanup_lock: for task_id, last_time in list(_session_last_activity.items()): if current_time - last_time > BROWSER_SESSION_INACTIVITY_TIMEOUT: sessions_to_cleanup.append(task_id) - + for task_id in sessions_to_cleanup: try: elapsed = int(current_time - _session_last_activity.get(task_id, current_time)) @@ -1147,7 +1203,7 @@ def _reap_orphaned_browser_sessions(): def _browser_cleanup_thread_worker(): """ Background thread that periodically cleans up inactive browser sessions. - + Runs every 30 seconds and checks for sessions that haven't been used within the BROWSER_SESSION_INACTIVITY_TIMEOUT period. On first run, also reaps orphaned sessions from previous process lifetimes. @@ -1163,7 +1219,7 @@ def _browser_cleanup_thread_worker(): _cleanup_inactive_browser_sessions() except Exception as e: logger.warning("Cleanup thread error: %s", e) - + # Sleep in 1-second intervals so we can stop quickly if needed for _ in range(30): if not _cleanup_running: @@ -1174,7 +1230,7 @@ def _browser_cleanup_thread_worker(): def _start_browser_cleanup_thread(): """Start the background cleanup thread if not already running.""" global _cleanup_thread, _cleanup_running - + with _cleanup_lock: if _cleanup_thread is None or not _cleanup_thread.is_alive(): _cleanup_running = True @@ -1493,13 +1549,13 @@ def _get_session_info(task_id: Optional[str] = None) -> Dict[str, str]: def _find_agent_browser() -> str: """ Find the agent-browser CLI executable. - + Checks in order: current PATH, Homebrew/common bin dirs, Hermes-managed node, local node_modules/.bin/, npx fallback. - + Returns: Path to agent-browser executable - + Raises: FileNotFoundError: If agent-browser is not installed """ @@ -1542,7 +1598,7 @@ def _find_agent_browser() -> str: _cached_agent_browser = str(local_bin) _agent_browser_resolved = True return _cached_agent_browser - + # Check common npx locations (also search the extended fallback PATH) npx_path = shutil.which("npx") if not npx_path and extended_path: @@ -1551,7 +1607,7 @@ def _find_agent_browser() -> str: _cached_agent_browser = "npx agent-browser" _agent_browser_resolved = True return _cached_agent_browser - + # Nothing found — cache the failure so subsequent calls don't re-scan. _agent_browser_resolved = True raise FileNotFoundError( @@ -1592,7 +1648,7 @@ def _run_browser_command( ) -> Dict[str, Any]: """ Run an agent-browser CLI command using our pre-created Browserbase session. - + Args: task_id: Task identifier to get the right session command: The command to run (e.g., "open", "click") @@ -1602,14 +1658,14 @@ def _run_browser_command( _engine_override: Force a specific engine for this call only. Used internally by the Lightpanda fallback to retry with Chrome without touching global state. - + Returns: Parsed JSON response from agent-browser """ if timeout is None: timeout = _get_command_timeout() args = args or [] - + # Build the command try: browser_cmd = _find_agent_browser() @@ -1640,7 +1696,7 @@ def _run_browser_command( ) logger.warning("browser command blocked: %s", hint) return {"success": False, "error": hint} - + from tools.interrupt import is_interrupted if is_interrupted(): return {"success": False, "error": "Interrupted"} @@ -1651,7 +1707,7 @@ def _run_browser_command( except Exception as e: logger.warning("Failed to create browser session for task=%s: %s", task_id, e) return {"success": False, "error": f"Failed to create browser session: {str(e)}"} - + # Build the command with the appropriate backend flag. # Cloud mode: --cdp connects to Browserbase. # Local mode: --session launches a local headless Chromium. @@ -1665,9 +1721,12 @@ def _run_browser_command( # Local mode — launch a headless Chromium instance backend_args = ["--session", session_info["session_name"]] - # Lightpanda engine injection (local mode only, agent-browser v0.25.3+) + # Lightpanda engine injection (local mode only, agent-browser v0.25.3+). + # Use the resolved session backend rather than global cloud-provider state: + # hybrid private-URL routing can create a local sidecar while a cloud + # provider remains configured for public URLs. engine = _engine_override or _get_browser_engine() - if _should_inject_engine(engine): + if engine != "auto" and not _is_camofox_mode() and not session_info.get("cdp_url"): backend_args += ["--engine", engine] # Keep concrete executable paths intact, even when they contain spaces. @@ -1678,7 +1737,7 @@ def _run_browser_command( "--json", command ] + args - + try: # Give each task its own socket directory to prevent concurrency conflicts. # Without this, parallel workers fight over the same default socket path, @@ -1693,7 +1752,7 @@ def _run_browser_command( _write_owner_pid(task_socket_dir, session_info['session_name']) logger.debug("browser cmd=%s task=%s socket_dir=%s (%d chars)", command, task_id, task_socket_dir, len(task_socket_dir)) - + browser_env = {**os.environ} # Ensure subprocesses inherit the same browser-specific PATH fallbacks @@ -1737,7 +1796,7 @@ def _run_browser_command( browser_env["AGENT_BROWSER_CHROME_FLAGS"] = ( "--no-sandbox --disable-dev-shm-usage" ) - + # Use temp files for stdout/stderr instead of pipes. # agent-browser starts a background daemon that inherits file # descriptors. With capture_output=True (pipes), the daemon keeps @@ -1786,7 +1845,7 @@ def _run_browser_command( if stderr and stderr.strip(): level = logging.WARNING if returncode != 0 else logging.DEBUG logger.log(level, "browser '%s' stderr: %s", command, stderr.strip()[:500]) - + stdout_text = stdout.strip() # Empty output with rc=0 is a broken state — treat as failure rather @@ -1847,7 +1906,7 @@ def _run_browser_command( result = {"success": False, "error": error_msg} else: result = {"success": True, "data": {}} - + except Exception as e: logger.warning("browser '%s' exception: %s", command, e, exc_info=True) result = {"success": False, "error": str(e)} @@ -1855,14 +1914,21 @@ def _run_browser_command( # --- Lightpanda automatic Chrome fallback --- # If engine is lightpanda and the result looks broken, retry with Chrome. # This runs for ALL exit paths (timeout, empty, non-JSON, nonzero rc, parsed). - if _needs_lightpanda_fallback(engine, command, result): - logger.info("Lightpanda fallback: retrying '%s' with Chrome (task=%s)", command, task_id) + fallback_reason = _lightpanda_fallback_reason(engine, command, result) + if fallback_reason: + logger.info( + "Lightpanda fallback: retrying '%s' with Chrome (task=%s): %s", + command, + task_id, + fallback_reason, + ) # For screenshots, use the dedicated Chrome fallback helper # (spins up a separate Chrome session to the same URL). if command == "screenshot": - return _chrome_fallback_screenshot(task_id, args or [], timeout) - # For other commands, re-run with engine forced to "auto" (Chrome). - return _run_browser_command(task_id, command, args, timeout, _engine_override="auto") + fallback_result = _chrome_fallback_screenshot(task_id, args or [], timeout) + else: + fallback_result = _run_chrome_fallback_command(task_id, command, args, timeout) + return _annotate_lightpanda_fallback(fallback_result, fallback_reason) return result @@ -1961,11 +2027,11 @@ def _truncate_snapshot(snapshot_text: str, max_chars: int = 8000) -> str: def browser_navigate(url: str, task_id: Optional[str] = None) -> str: """ Navigate to a URL in the browser. - + Args: url: The URL to navigate to task_id: Task identifier for session isolation - + Returns: JSON string with navigation result (includes stealth features info on first nav) """ @@ -2045,7 +2111,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: # on the same task_id hit it (critical when hybrid routing has both a # cloud session and a local sidecar alive concurrently). _last_active_session_key[effective_task_id] = nav_session_key - + if result.get("success"): data = result.get("data", {}) title = data.get("title", "") @@ -2075,7 +2141,8 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "url": final_url, "title": title } - + _copy_fallback_warning(response, result) + # Detect common "blocked" page patterns from title/url blocked_patterns = [ "access denied", "access to this page has been denied", @@ -2085,7 +2152,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "just a moment", "attention required" ] title_lower = title.lower() - + if any(pattern in title_lower for pattern in blocked_patterns): response["bot_detection_warning"] = ( f"Page title '{title}' suggests bot detection. The site may have blocked this request. " @@ -2093,7 +2160,7 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "3) Enable advanced stealth (BROWSERBASE_ADVANCED_STEALTH=true, requires Scale plan), " "4) Some sites have very aggressive bot detection that may be unavoidable." ) - + # Include feature info on first navigation so model knows what's active if is_first_nav and "features" in session_info: features = session_info["features"] @@ -2117,6 +2184,8 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: snapshot_text = _truncate_snapshot(snapshot_text) response["snapshot"] = snapshot_text response["element_count"] = len(refs) if refs else 0 + if snap_result.get("fallback_warning") and not response.get("fallback_warning"): + _copy_fallback_warning(response, snap_result) except Exception as e: logger.debug("Auto-snapshot after navigate failed: %s", e) @@ -2135,12 +2204,12 @@ def browser_snapshot( ) -> str: """ Get a text-based snapshot of the current page's accessibility tree. - + Args: full: If True, return complete snapshot. If False, return compact view. task_id: Task identifier for session isolation user_task: The user's current task (for task-aware extraction) - + Returns: JSON string with page snapshot """ @@ -2149,30 +2218,31 @@ def browser_snapshot( return camofox_snapshot(full, task_id, user_task) effective_task_id = _last_session_key(task_id or "default") - + # Build command args based on full flag args = [] if not full: args.extend(["-c"]) # Compact mode - + result = _run_browser_command(effective_task_id, "snapshot", args) - + if result.get("success"): data = result.get("data", {}) snapshot_text = data.get("snapshot", "") refs = data.get("refs", {}) - + # Check if snapshot needs summarization if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD and user_task: snapshot_text = _extract_relevant_content(snapshot_text, user_task) elif len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD: snapshot_text = _truncate_snapshot(snapshot_text) - + response = { "success": True, "snapshot": snapshot_text, "element_count": len(refs) if refs else 0 } + _copy_fallback_warning(response, result) # Merge supervisor state (pending dialogs + frame tree) when a CDP # supervisor is attached to this task. No-op otherwise. See @@ -2189,20 +2259,21 @@ def browser_snapshot( return json.dumps(response, ensure_ascii=False) else: - return json.dumps({ + response = { "success": False, "error": result.get("error", "Failed to get snapshot") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) def browser_click(ref: str, task_id: Optional[str] = None) -> str: """ Click on an element. - + Args: ref: Element reference (e.g., "@e5") task_id: Task identifier for session isolation - + Returns: JSON string with click result """ @@ -2211,34 +2282,36 @@ def browser_click(ref: str, task_id: Optional[str] = None) -> str: return camofox_click(ref, task_id) effective_task_id = _last_session_key(task_id or "default") - + # Ensure ref starts with @ if not ref.startswith("@"): ref = f"@{ref}" - + result = _run_browser_command(effective_task_id, "click", [ref]) - + if result.get("success"): - return json.dumps({ + response = { "success": True, "clicked": ref - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) else: - return json.dumps({ + response = { "success": False, "error": result.get("error", f"Failed to click {ref}") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: """ Type text into an input field. - + Args: ref: Element reference (e.g., "@e3") text: Text to type task_id: Task identifier for session isolation - + Returns: JSON string with type result """ @@ -2247,35 +2320,37 @@ def browser_type(ref: str, text: str, task_id: Optional[str] = None) -> str: return camofox_type(ref, text, task_id) effective_task_id = _last_session_key(task_id or "default") - + # Ensure ref starts with @ if not ref.startswith("@"): ref = f"@{ref}" - + # Use fill command (clears then types) result = _run_browser_command(effective_task_id, "fill", [ref, text]) - + if result.get("success"): - return json.dumps({ + response = { "success": True, "typed": text, "element": ref - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) else: - return json.dumps({ + response = { "success": False, "error": result.get("error", f"Failed to type into {ref}") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: """ Scroll the page. - + Args: direction: "up" or "down" task_id: Task identifier for session isolation - + Returns: JSON string with scroll result """ @@ -2304,24 +2379,26 @@ def browser_scroll(direction: str, task_id: Optional[str] = None) -> str: result = _run_browser_command(effective_task_id, "scroll", [direction, str(_SCROLL_PIXELS)]) if not result.get("success"): - return json.dumps({ + response = { "success": False, "error": result.get("error", f"Failed to scroll {direction}") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) - return json.dumps({ + response = { "success": True, "scrolled": direction - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) def browser_back(task_id: Optional[str] = None) -> str: """ Navigate back in browser history. - + Args: task_id: Task identifier for session isolation - + Returns: JSON string with navigation result """ @@ -2331,28 +2408,30 @@ def browser_back(task_id: Optional[str] = None) -> str: effective_task_id = _last_session_key(task_id or "default") result = _run_browser_command(effective_task_id, "back", []) - + if result.get("success"): data = result.get("data", {}) - return json.dumps({ + response = { "success": True, "url": data.get("url", "") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) else: - return json.dumps({ + response = { "success": False, "error": result.get("error", "Failed to go back") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) def browser_press(key: str, task_id: Optional[str] = None) -> str: """ Press a keyboard key. - + Args: key: Key to press (e.g., "Enter", "Tab") task_id: Task identifier for session isolation - + Returns: JSON string with key press result """ @@ -2362,17 +2441,19 @@ def browser_press(key: str, task_id: Optional[str] = None) -> str: effective_task_id = _last_session_key(task_id or "default") result = _run_browser_command(effective_task_id, "press", [key]) - + if result.get("success"): - return json.dumps({ + response = { "success": True, "pressed": key - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) else: - return json.dumps({ + response = { "success": False, "error": result.get("error", f"Failed to press {key}") - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) @@ -2380,16 +2461,16 @@ def browser_press(key: str, task_id: Optional[str] = None) -> str: def browser_console(clear: bool = False, expression: Optional[str] = None, task_id: Optional[str] = None) -> str: """Get browser console messages and JavaScript errors, or evaluate JS in the page. - + When ``expression`` is provided, evaluates JavaScript in the page context (like the DevTools console) and returns the result. Otherwise returns console output (log/warn/error/info) and uncaught exceptions. - + Args: clear: If True, clear the message/error buffers after reading expression: JavaScript expression to evaluate in the page context task_id: Task identifier for session isolation - + Returns: JSON string with console messages/errors, or eval result """ @@ -2403,13 +2484,13 @@ def browser_console(clear: bool = False, expression: Optional[str] = None, task_ return camofox_console(clear, task_id) effective_task_id = _last_session_key(task_id or "default") - + console_args = ["--clear"] if clear else [] error_args = ["--clear"] if clear else [] - + console_result = _run_browser_command(effective_task_id, "console", console_args) errors_result = _run_browser_command(effective_task_id, "errors", error_args) - + messages = [] if console_result.get("success"): for msg in console_result.get("data", {}).get("messages", []): @@ -2418,7 +2499,7 @@ def browser_console(clear: bool = False, expression: Optional[str] = None, task_ "text": msg.get("text", ""), "source": "console", }) - + errors = [] if errors_result.get("success"): for err in errors_result.get("data", {}).get("errors", []): @@ -2426,14 +2507,18 @@ def browser_console(clear: bool = False, expression: Optional[str] = None, task_ "message": err.get("message", ""), "source": "exception", }) - - return json.dumps({ + + response = { "success": True, "console_messages": messages, "js_errors": errors, "total_messages": len(messages), "total_errors": len(errors), - }, ensure_ascii=False) + } + _copy_fallback_warning(response, console_result) + if errors_result.get("fallback_warning") and not response.get("fallback_warning"): + _copy_fallback_warning(response, errors_result) + return json.dumps(response, ensure_ascii=False) def _browser_eval(expression: str, task_id: Optional[str] = None) -> str: @@ -2448,14 +2533,16 @@ def _browser_eval(expression: str, task_id: Optional[str] = None) -> str: err = result.get("error", "eval failed") # Detect backend capability gaps and give the model a clear signal if any(hint in err.lower() for hint in ("unknown command", "not supported", "not found", "no such command")): - return json.dumps({ + response = { "success": False, "error": f"JavaScript evaluation is not supported by this browser backend. {err}", - }) - return json.dumps({ + } + return json.dumps(_copy_fallback_warning(response, result)) + response = { "success": False, "error": err, - }) + } + return json.dumps(_copy_fallback_warning(response, result)) data = result.get("data", {}) raw_result = data.get("result") @@ -2469,11 +2556,12 @@ def _browser_eval(expression: str, task_id: Optional[str] = None) -> str: except (json.JSONDecodeError, ValueError): pass # keep as string - return json.dumps({ + response = { "success": True, "result": parsed, "result_type": type(parsed).__name__, - }, ensure_ascii=False, default=str) + } + return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False, default=str) def _camofox_eval(expression: str, task_id: Optional[str] = None) -> str: @@ -2520,17 +2608,17 @@ def _maybe_start_recording(task_id: str): hermes_home = get_hermes_home() cfg = read_raw_config() record_enabled = cfg_get(cfg, "browser", "record_sessions", default=False) - + if not record_enabled: return - + recordings_dir = hermes_home / "browser_recordings" recordings_dir.mkdir(parents=True, exist_ok=True) _cleanup_old_recordings(max_age_hours=72) - + timestamp = time.strftime("%Y%m%d_%H%M%S") recording_path = recordings_dir / f"session_{timestamp}_{task_id[:16]}.webm" - + result = _run_browser_command(task_id, "record", ["start", str(recording_path)]) if result.get("success"): with _cleanup_lock: @@ -2562,10 +2650,10 @@ def _maybe_stop_recording(task_id: str): def browser_get_images(task_id: Optional[str] = None) -> str: """ Get all images on the current page. - + Args: task_id: Task identifier for session isolation - + Returns: JSON string with list of images (src and alt) """ @@ -2574,7 +2662,7 @@ def browser_get_images(task_id: Optional[str] = None) -> str: return camofox_get_images(task_id) effective_task_id = _last_session_key(task_id or "default") - + # Use eval to run JavaScript that extracts images js_code = """JSON.stringify( [...document.images].map(img => ({ @@ -2584,20 +2672,20 @@ def browser_get_images(task_id: Optional[str] = None) -> str: height: img.naturalHeight })).filter(img => img.src && !img.src.startsWith('data:')) )""" - + result = _run_browser_command(effective_task_id, "eval", [js_code]) - + if result.get("success"): data = result.get("data", {}) raw_result = data.get("result", "[]") - + try: # Parse the JSON string returned by JavaScript if isinstance(raw_result, str): images = json.loads(raw_result) else: images = raw_result - + return json.dumps({ "success": True, "images": images, @@ -2620,20 +2708,20 @@ def browser_get_images(task_id: Optional[str] = None) -> str: def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str: """ Take a screenshot of the current page and analyze it with vision AI. - + This tool captures what's visually displayed in the browser and sends it to Gemini for analysis. Useful for understanding visual content that the text-based snapshot may not capture (CAPTCHAs, verification challenges, images, complex layouts, etc.). - + The screenshot is saved persistently and its file path is returned alongside the analysis, so it can be shared with users via MEDIA: in the response. - + Args: question: What you want to know about the page visually annotate: If True, overlay numbered [N] labels on interactive elements task_id: Task identifier for session isolation - + Returns: JSON string with vision analysis results and screenshot_path """ @@ -2643,13 +2731,19 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] import base64 import uuid as uuid_mod + from hermes_constants import get_hermes_dir + screenshots_dir = get_hermes_dir("cache/screenshots", "browser_screenshots") + screenshot_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" effective_task_id = _last_session_key(task_id or "default") # Lightpanda has no graphical renderer — pre-route screenshots to Chrome # via the fallback helper instead of letting the normal path fail with a - # CDP error or return a placeholder PNG. + # CDP error or return a placeholder PNG. The normal analysis path below + # still owns base64 encoding, provider routing, resizing retry, redaction, + # and response shape. engine = _get_browser_engine() _lp_prerouted = False + _lp_fallback_warning = None if engine == "lightpanda" and _should_inject_engine(engine): logger.debug("browser_vision: pre-routing screenshot to Chrome (engine=lightpanda)") screenshot_args = [] @@ -2658,70 +2752,73 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] fb_result = _chrome_fallback_screenshot( effective_task_id, screenshot_args, _get_command_timeout(), ) + fb_reason = "Lightpanda has no graphical renderer for screenshots; used Chrome for vision capture." + fb_result = _annotate_lightpanda_fallback(fb_result, fb_reason) if fb_result.get("success"): - # Proceed with the Chrome screenshot for vision analysis + _lp_prerouted = True + _lp_fallback_warning = fb_result.get("fallback_warning") fb_path = fb_result.get("data", {}).get("path", "") if fb_path and os.path.exists(fb_path): - try: - with open(fb_path, "rb") as f: - image_data = base64.b64encode(f.read()).decode("utf-8") - analysis = call_llm( - f"Analyze this browser screenshot and answer: {question}", - images=[{"data": image_data, "media_type": "image/png"}], - task="vision", - ) - from hermes_constants import get_hermes_dir - screenshots_dir = get_hermes_dir("cache/screenshots", "browser_screenshots") - screenshots_dir.mkdir(parents=True, exist_ok=True) - # Copy to persistent location - import shutil as _shutil_vision - persistent_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" - _shutil_vision.copy2(fb_path, persistent_path) - return json.dumps({ - "analysis": analysis, - "screenshot_path": str(persistent_path), - }) - except Exception as e: - logger.warning("Lightpanda Chrome fallback vision failed: %s", e) - # Fall through to normal path as last resort - # Mark that we already tried the Chrome fallback, so the normal - # _run_browser_command path doesn't trigger it a second time. - _lp_prerouted = True - - # Save screenshot to persistent location so it can be shared with users - from hermes_constants import get_hermes_dir - screenshots_dir = get_hermes_dir("cache/screenshots", "browser_screenshots") - screenshot_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" - + from hermes_constants import get_hermes_dir + screenshots_dir = get_hermes_dir("cache/screenshots", "browser_screenshots") + screenshots_dir.mkdir(parents=True, exist_ok=True) + import shutil as _shutil_vision + persistent_path = screenshots_dir / f"browser_screenshot_{uuid_mod.uuid4().hex}.png" + _shutil_vision.copy2(fb_path, persistent_path) + screenshot_path = persistent_path + else: + logger.warning("Lightpanda Chrome fallback vision screenshot failed: %s", fb_result.get("error")) + # Fall through to normal path as last resort. Mark that we already + # tried Chrome so _run_browser_command doesn't recursively fallback. + _lp_prerouted = True + try: screenshots_dir.mkdir(parents=True, exist_ok=True) - + # Prune old screenshots (older than 24 hours) to prevent unbounded disk growth _cleanup_old_screenshots(screenshots_dir, max_age_hours=24) - - # Take screenshot using agent-browser - screenshot_args = [] - if annotate: - screenshot_args.append("--annotate") - screenshot_args.append("--full") - screenshot_args.append(str(screenshot_path)) - result = _run_browser_command( - effective_task_id, - "screenshot", - screenshot_args, - # If the Lightpanda pre-route already failed, force Chrome so - # _run_browser_command doesn't trigger a redundant LP fallback. - _engine_override="auto" if _lp_prerouted else None, - ) - + + if _lp_prerouted and screenshot_path.exists(): + result = { + "success": True, + "data": { + "path": str(screenshot_path), + "fallback_warning": _lp_fallback_warning, + "browser_engine": "chrome", + "browser_engine_fallback": { + "from": "lightpanda", + "to": "chrome", + "reason": "Lightpanda has no graphical renderer for screenshots; used Chrome for vision capture.", + }, + }, + "fallback_warning": _lp_fallback_warning, + "browser_engine": "chrome", + } + else: + # Take screenshot using agent-browser + screenshot_args = [] + if annotate: + screenshot_args.append("--annotate") + screenshot_args.append("--full") + screenshot_args.append(str(screenshot_path)) + result = _run_browser_command( + effective_task_id, + "screenshot", + screenshot_args, + # If the Lightpanda pre-route already failed, force Chrome so + # _run_browser_command doesn't trigger a redundant LP fallback. + _engine_override="auto" if _lp_prerouted else None, + ) + if not result.get("success"): error_detail = result.get("error", "Unknown error") _cp = _get_cloud_provider() mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})" - return json.dumps({ + error_response = { "success": False, "error": f"Failed to take screenshot ({mode} mode): {error_detail}" - }, ensure_ascii=False) + } + return json.dumps(_copy_fallback_warning(error_response, result), ensure_ascii=False) actual_screenshot_path = result.get("data", {}).get("path") if actual_screenshot_path: @@ -2740,12 +2837,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] f"or a stale daemon process." ), }, ensure_ascii=False) - + # Convert screenshot to base64 at full resolution. _screenshot_bytes = screenshot_path.read_bytes() _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii") data_url = f"data:image/png;base64,{_screenshot_b64}" - + vision_prompt = ( f"You are analyzing a screenshot of a web browser.\n\n" f"User's question: {question}\n\n" @@ -2816,7 +2913,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] response = call_llm(**call_kwargs) else: raise - + analysis = (response.choices[0].message.content or "").strip() # Redact secrets the vision LLM may have read from the screenshot. from agent.redact import redact_sensitive_text @@ -2826,11 +2923,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] "analysis": analysis or "Vision analysis returned no content.", "screenshot_path": str(screenshot_path), } + _copy_fallback_warning(response_data, result) # Include annotation data if annotated screenshot was taken if annotate and result.get("data", {}).get("annotations"): response_data["annotations"] = result["data"]["annotations"] return json.dumps(response_data, ensure_ascii=False) - + except Exception as e: # Keep the screenshot if it was captured successfully — the failure is # in the LLM vision analysis, not the capture. Deleting a valid @@ -2841,6 +2939,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] if screenshot_path.exists(): error_info["screenshot_path"] = str(screenshot_path) error_info["note"] = "Screenshot was captured but vision analysis failed. You can still share it via MEDIA:." + _copy_fallback_warning(error_info, result if 'result' in locals() else {}) return json.dumps(error_info, ensure_ascii=False) @@ -2985,7 +3084,7 @@ def _cleanup_single_browser_session(task_id: str) -> None: provider.close_session(bb_session_id) except Exception as e: logger.warning("Could not close cloud browser session: %s", e) - + # Kill the daemon process and clean up socket directory session_name = session_info.get("session_name", "") if session_name: @@ -3001,7 +3100,7 @@ def _cleanup_single_browser_session(task_id: str) -> None: except (ProcessLookupError, ValueError, PermissionError, OSError): logger.debug("Could not kill daemon pid for %s (already dead or inaccessible)", session_name) shutil.rmtree(socket_dir, ignore_errors=True) - + logger.debug("Removed task %s from active sessions", task_id) else: logger.debug("No active session found for task_id: %s", task_id) @@ -3010,7 +3109,7 @@ def _cleanup_single_browser_session(task_id: str) -> None: def cleanup_all_browsers() -> None: """ Clean up all active browser sessions. - + Useful for cleanup on shutdown. """ with _cleanup_lock: @@ -3208,7 +3307,7 @@ if __name__ == "__main__": _cp = _get_cloud_provider() mode = "local" if _cp is None else f"cloud ({_cp.provider_name()})" print(f" Mode: {mode}") - + # Check requirements if check_browser_requirements(): print("✅ All requirements met") @@ -3239,11 +3338,11 @@ if __name__ == "__main__": if _cp is not None and not _cp.is_configured(): print(f" - {_cp.provider_name()} credentials not configured") print(" Tip: set browser.cloud_provider to 'local' to use free local mode instead") - + print("\n📋 Available Browser Tools:") for schema in BROWSER_TOOL_SCHEMAS: print(f" 🔹 {schema['name']}: {schema['description'][:60]}...") - + print("\n💡 Usage:") print(" from tools.browser_tool import browser_navigate, browser_snapshot") print(" result = browser_navigate('https://example.com', task_id='my_task')") diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 1e1bb2af34..ade8d57ebb 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -1473,6 +1473,11 @@ def _tool_summary(name: str, result: str, duration_s: float | None) -> str | Non if n is not None: text = f"Extracted {n} {'page' if n == 1 else 'pages'}" + if isinstance(data, dict) and data.get("fallback_warning"): + warning = str(data.get("fallback_warning") or "").strip() + if warning: + return f"{warning}{suffix}" + return f"{text}{suffix}" if text else None