From 333f01bc7ffa95616805a62e1f6979d8cc69c026 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Sun, 7 Jun 2026 11:42:26 -0600 Subject: [PATCH] fix(tools): percent-encode non-ascii URL components --- tests/tools/test_browser_secret_exfil.py | 69 ++++++++++++++++++++++++ tests/tools/test_url_safety.py | 27 ++++++++++ tools/browser_tool.py | 10 ++++ tools/url_safety.py | 43 ++++++++++++++- tools/web_tools.py | 18 +++++-- 5 files changed, 161 insertions(+), 6 deletions(-) diff --git a/tests/tools/test_browser_secret_exfil.py b/tests/tools/test_browser_secret_exfil.py index 82fa7e490e1..fbf35727bb9 100644 --- a/tests/tools/test_browser_secret_exfil.py +++ b/tests/tools/test_browser_secret_exfil.py @@ -42,6 +42,25 @@ class TestBrowserSecretExfil: # Should NOT be blocked by secret detection assert "API key or token" not in parsed.get("error", "") + def test_normalizes_non_ascii_url_before_navigation(self): + from tools.browser_tool import browser_navigate + + captured = {} + + def mock_run(_session_key, command, args, **_kwargs): + if command == "open": + captured["url"] = args[0] + return {"success": True, "data": {"title": "ok", "url": args[0]}} + + with patch("tools.browser_tool._run_browser_command", side_effect=mock_run), \ + patch("tools.browser_tool._get_session_info", return_value={"_first_nav": False}), \ + patch("tools.browser_tool._is_local_backend", return_value=True): + result = browser_navigate("https://wttr.in/Köln") + + parsed = json.loads(result) + assert parsed["success"] is True + assert captured["url"] == "https://wttr.in/K%C3%B6ln" + class TestWebExtractSecretExfil: """Verify web_extract_tool blocks URLs containing secrets.""" @@ -65,6 +84,56 @@ class TestWebExtractSecretExfil: # Should fail for API/config reason, not secret blocking assert "API key" not in parsed.get("error", "") or "Blocked" not in parsed.get("error", "") + @pytest.mark.asyncio + async def test_normalizes_non_ascii_url_before_extract_provider(self, monkeypatch): + from agent.web_search_provider import WebSearchProvider + from agent import web_search_registry + from tools import web_tools + + class FakeExtractProvider(WebSearchProvider): + @property + def name(self) -> str: + return "fake-extract" + + def is_available(self) -> bool: + return True + + def supports_search(self) -> bool: + return False + + def supports_extract(self) -> bool: + return True + + def extract(self, urls, **_kwargs): + return [ + { + "url": urls[0], + "title": "ok", + "content": "ok", + "raw_content": "ok", + } + ] + + async def allow_url(_url: str) -> bool: + return True + + web_search_registry._reset_for_tests() + web_search_registry.register_provider(FakeExtractProvider()) + monkeypatch.setattr(web_tools, "_ensure_web_plugins_loaded", lambda: None) + monkeypatch.setattr(web_tools, "_get_extract_backend", lambda: "fake-extract") + monkeypatch.setattr(web_tools, "async_is_safe_url", allow_url) + + try: + result = await web_tools.web_extract_tool( + urls=["https://wttr.in/Köln"], + use_llm_processing=False, + ) + finally: + web_search_registry._reset_for_tests() + + parsed = json.loads(result) + assert parsed["results"][0]["url"] == "https://wttr.in/K%C3%B6ln" + class TestBrowserSnapshotRedaction: """Verify secrets in page snapshots are redacted before auxiliary LLM calls.""" diff --git a/tests/tools/test_url_safety.py b/tests/tools/test_url_safety.py index a5e00dcf64e..c68dd6e82dc 100644 --- a/tests/tools/test_url_safety.py +++ b/tests/tools/test_url_safety.py @@ -7,6 +7,7 @@ from tools.url_safety import ( is_safe_url, async_is_safe_url, is_always_blocked_url, + normalize_url_for_request, _is_blocked_ip, _global_allow_private_urls, _reset_allow_private_cache, @@ -16,6 +17,32 @@ import ipaddress import pytest +class TestNormalizeUrlForRequest: + def test_percent_encodes_non_ascii_path(self): + assert ( + normalize_url_for_request("https://wttr.in/Köln") + == "https://wttr.in/K%C3%B6ln" + ) + + def test_preserves_existing_percent_escapes(self): + assert ( + normalize_url_for_request("https://wttr.in/K%C3%B6ln") + == "https://wttr.in/K%C3%B6ln" + ) + + def test_preserves_reserved_query_syntax(self): + assert ( + normalize_url_for_request("https://example.com/search?q=Köln&lang=de") + == "https://example.com/search?q=K%C3%B6ln&lang=de" + ) + + def test_idna_encodes_hostname(self): + assert ( + normalize_url_for_request("https://münich.example/Köln") + == "https://xn--mnich-kva.example/K%C3%B6ln" + ) + + class TestIsSafeUrl: def test_public_url_allowed(self): with patch("socket.getaddrinfo", return_value=[ diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 5a486b7f244..66684b8ee35 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -78,10 +78,12 @@ try: from tools.url_safety import ( is_safe_url as _is_safe_url, is_always_blocked_url as _is_always_blocked_url, + normalize_url_for_request as _normalize_url_for_request, ) except Exception: _is_safe_url = lambda url: False # noqa: E731 — fail-closed: block all if safety module unavailable _is_always_blocked_url = lambda url: True # noqa: E731 — fail-closed on the floor too + _normalize_url_for_request = lambda url: url # noqa: E731 — best-effort fallback # Browser-provider ABC + registry — PR #25214 moved the per-vendor providers # (Browserbase / Browser Use / Firecrawl) out of ``tools/browser_providers/`` # and into ``plugins/browser//``. The dispatcher consults the @@ -2310,6 +2312,14 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str: "error": "Blocked: URL contains what appears to be an API key or token. " "Secrets must not be sent in URLs.", }) + url = _normalize_url_for_request(url) + normalized_decoded = urllib.parse.unquote(url) + if _PREFIX_RE.search(url) or _PREFIX_RE.search(normalized_decoded): + return json.dumps({ + "success": False, + "error": "Blocked: URL contains what appears to be an API key or token. " + "Secrets must not be sent in URLs.", + }) # SSRF protection — block private/internal addresses before navigating. # Skipped for local backends (Camofox, headless Chromium without a cloud diff --git a/tools/url_safety.py b/tools/url_safety.py index 13117d76031..ac6326e306f 100644 --- a/tools/url_safety.py +++ b/tools/url_safety.py @@ -28,12 +28,53 @@ import logging import os import socket import asyncio -from urllib.parse import urlparse +from urllib.parse import quote, urlparse, urlsplit, urlunsplit from utils import is_truthy_value logger = logging.getLogger(__name__) + +def normalize_url_for_request(url: str) -> str: + """Return an ASCII-safe HTTP URL for Hermes-owned URL tools. + + Browsers and HTTP clients expect URIs, but users and models often provide + IRIs such as ``https://wttr.in/Köln``. Preserve URL syntax and existing + percent escapes while encoding non-ASCII host/path/query/fragment text. + This is intentionally for URL tool inputs only; arbitrary shell commands + must not be rewritten. + """ + if not isinstance(url, str): + return url + + raw = url.strip() + if not raw: + return raw + + try: + parsed = urlsplit(raw) + except ValueError: + return raw + + if parsed.scheme.lower() not in {"http", "https"}: + return raw + + netloc = parsed.netloc + hostname = parsed.hostname + if hostname: + try: + ascii_host = hostname.encode("idna").decode("ascii") + except UnicodeError: + ascii_host = hostname + if ascii_host != hostname: + netloc = netloc.replace(hostname, ascii_host, 1) + + path = quote(parsed.path, safe="/%:@!$&'()*+,;=") + query = quote(parsed.query, safe="/%:@!$&'()*+,;=?") + fragment = quote(parsed.fragment, safe="/%:@!$&'()*+,;=?") + + return urlunsplit((parsed.scheme, netloc, path, query, fragment)) + # Hostnames that should always be blocked regardless of IP resolution # or any config toggle. These are cloud metadata endpoints that an # attacker could use to steal instance credentials. diff --git a/tools/web_tools.py b/tools/web_tools.py index a97370c4835..9bdd7c32d6e 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -102,7 +102,7 @@ from tools.tool_backend_helpers import ( # noqa: F401 nous_tool_gateway_unavailable_message, prefers_gateway, ) -from tools.url_safety import async_is_safe_url +from tools.url_safety import async_is_safe_url, normalize_url_for_request import sys logger = logging.getLogger(__name__) @@ -902,17 +902,25 @@ async def web_extract_tool( # URL-decode first so percent-encoded secrets (%73k- = sk-) are caught. from agent.redact import _PREFIX_RE from urllib.parse import unquote + normalized_urls: List[str] = [] for _url in urls: - if _PREFIX_RE.search(_url) or _PREFIX_RE.search(unquote(_url)): + normalized_url = normalize_url_for_request(_url) + if ( + _PREFIX_RE.search(_url) + or _PREFIX_RE.search(unquote(_url)) + or _PREFIX_RE.search(normalized_url) + or _PREFIX_RE.search(unquote(normalized_url)) + ): return json.dumps({ "success": False, "error": "Blocked: URL contains what appears to be an API key or token. " "Secrets must not be sent in URLs.", }) + normalized_urls.append(normalized_url) debug_call_data = { "parameters": { - "urls": urls, + "urls": normalized_urls, "format": format, "use_llm_processing": use_llm_processing, "model": model, @@ -928,12 +936,12 @@ async def web_extract_tool( } try: - logger.info("Extracting content from %d URL(s)", len(urls)) + logger.info("Extracting content from %d URL(s)", len(normalized_urls)) # ── SSRF protection — filter out private/internal URLs before any backend ── safe_urls = [] ssrf_blocked: List[Dict[str, Any]] = [] - for url in urls: + for url in normalized_urls: if not await async_is_safe_url(url): ssrf_blocked.append({ "url": url, "title": "", "content": "",