From ee8cbfdc03eb9b7cdd486165486f2e3cad0d8645 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 29 Jun 2026 10:00:49 -0700 Subject: [PATCH] feat(web_extract): truncate-and-store instead of LLM summarization (#54843) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(web_extract): truncate-and-store instead of LLM summarization web_extract no longer runs an auxiliary LLM over scraped pages. The extract backends (Firecrawl/Tavily/Exa/Parallel) already return clean, boilerplate- stripped markdown, so we return it directly: pages within a char budget (default 15000, web.extract_char_limit) come back whole; larger pages get a head+tail window plus an explicit footer giving the stored full-text path and the read_file call to page through the omitted middle. The full clean text is written to cache/web (mounted read-only into remote backends like the other cache dirs), so nothing is lost. Inline base64 images are converted to [IMAGE: alt] placeholders (token bombs dropped) while real http(s) image URLs are preserved as links so the agent can still web_extract/vision_analyze them. Removes process_content_with_llm + the chunked summarizer + check_auxiliary_model + _resolve_web_extract_auxiliary. context_references._default_url_fetcher is updated to the truncate path and its stale data.documents shape read is fixed to results (it was silently returning empty). Live before/after eval (firecrawl, 4 URLs): 11.7x faster overall (176.6s -> 15.1s); 10-60x on large pages. Quality identical; findability 4/4 (answer recoverable from stored full text on every truncated page). web_search is unchanged. No own scraper added; no changes to web_search. 
* fix(web_extract): add char_limit to execute_code web_extract stub The new web_extract char_limit param must appear in the code_execution_tool _TOOL_STUBS signature (and doc line) or test_stubs_cover_all_schema_params fails — the stub schema must cover every real schema param. --- agent/context_references.py | 4 +- hermes_cli/config.py | 1 + tests/integration/test_web_tools.py | 20 +- tests/tools/test_browser_secret_exfil.py | 1 - tests/tools/test_web_providers.py | 1 - tests/tools/test_web_tools_config.py | 41 -- tests/tools/test_web_tools_tavily.py | 6 +- tests/tools/test_web_tools_truncate.py | 142 ++++ tests/tools/test_website_policy.py | 4 +- tools/code_execution_tool.py | 11 +- tools/credential_files.py | 1 + tools/web_tools.py | 799 ++++++----------------- 12 files changed, 370 insertions(+), 661 deletions(-) create mode 100644 tests/tools/test_web_tools_truncate.py diff --git a/agent/context_references.py b/agent/context_references.py index fad1ff00159..d77857584a7 100644 --- a/agent/context_references.py +++ b/agent/context_references.py @@ -328,9 +328,9 @@ async def _fetch_url_content( async def _default_url_fetcher(url: str) -> str: from tools.web_tools import web_extract_tool - raw = await web_extract_tool([url], format="markdown", use_llm_processing=True) + raw = await web_extract_tool([url], format="markdown") payload = json.loads(raw) - docs = payload.get("data", {}).get("documents", []) + docs = payload.get("results", []) if not docs: return "" doc = docs[0] diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 731397ba13a..3474ee35a0e 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1161,6 +1161,7 @@ DEFAULT_CONFIG = { "backend": "", # shared fallback — applies to both search and extract "search_backend": "", # per-capability override for web_search (e.g. "searxng") "extract_backend": "", # per-capability override for web_extract (e.g. 
"native") + "extract_char_limit": 15000, # per-page char budget for web_extract; larger pages truncate + store full text in cache/web }, "browser": { diff --git a/tests/integration/test_web_tools.py b/tests/integration/test_web_tools.py index f5281140066..6be64b6b2a6 100644 --- a/tests/integration/test_web_tools.py +++ b/tests/integration/test_web_tools.py @@ -32,7 +32,6 @@ from tools.web_tools import ( web_extract_tool, check_firecrawl_api_key, check_web_api_key, - check_auxiliary_model, _get_backend, ) @@ -129,12 +128,11 @@ class WebToolsTester: backend = _get_backend() self.log_result("Web Backend API Key", "passed", f"Using {backend} backend") - # Check auxiliary LLM provider (optional) - if not check_auxiliary_model(): - self.log_result("Auxiliary LLM", "skipped", "No auxiliary LLM provider available (LLM tests will be skipped)") - self.test_llm = False - else: - self.log_result("Auxiliary LLM", "passed", "Found") + # Auxiliary LLM summarization was removed — web_extract is now + # truncate-and-store (no LLM). Keep the flag off so any residual + # LLM-path assertions stay skipped. 
+ self.log_result("Auxiliary LLM", "skipped", "web_extract no longer uses an LLM (truncate-and-store)") + self.test_llm = False return True @@ -261,12 +259,11 @@ class WebToolsTester: print(f" - {url}") if self.verbose: - print(f" Calling web_extract_tool(urls={test_urls}, format='markdown', use_llm_processing=False)") + print(f" Calling web_extract_tool(urls={test_urls}, format='markdown')") result = await web_extract_tool( test_urls, format="markdown", - use_llm_processing=False ) # Parse result @@ -360,8 +357,7 @@ class WebToolsTester: result = await web_extract_tool( [test_url], format="markdown", - use_llm_processing=True, - min_length=1000 # Lower threshold for testing + char_limit=1000, # small budget to force truncation in the test ) data = json.loads(result) @@ -466,7 +462,7 @@ class WebToolsTester: "web_backend": _get_backend() if check_web_api_key() else None, "firecrawl_api_key": check_firecrawl_api_key(), "parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")), - "auxiliary_model": check_auxiliary_model(), + "auxiliary_model": False, } } diff --git a/tests/tools/test_browser_secret_exfil.py b/tests/tools/test_browser_secret_exfil.py index fbf35727bb9..2ccc9193b49 100644 --- a/tests/tools/test_browser_secret_exfil.py +++ b/tests/tools/test_browser_secret_exfil.py @@ -126,7 +126,6 @@ class TestWebExtractSecretExfil: try: result = await web_tools.web_extract_tool( urls=["https://wttr.in/Köln"], - use_llm_processing=False, ) finally: web_search_registry._reset_for_tests() diff --git a/tests/tools/test_web_providers.py b/tests/tools/test_web_providers.py index 177b34ccc92..f71b1f3b7d8 100644 --- a/tests/tools/test_web_providers.py +++ b/tests/tools/test_web_providers.py @@ -418,7 +418,6 @@ class TestDispatchersTriggerPluginDiscovery: result = json.loads(asyncio.run( web_tools.web_extract_tool( ["https://example.com"], - use_llm_processing=False, ) )) diff --git a/tests/tools/test_web_tools_config.py b/tests/tools/test_web_tools_config.py index 
5838ea00b78..667d5350c4a 100644 --- a/tests/tools/test_web_tools_config.py +++ b/tests/tools/test_web_tools_config.py @@ -160,47 +160,6 @@ class TestFirecrawlClientConfig: importlib.reload(tools.web_tools) assert tools.web_tools._read_nous_access_token() == "nous-token" - def test_check_auxiliary_model_re_resolves_backend_each_call(self): - """Availability checks should not be pinned to module import state.""" - import tools.web_tools - - # Simulate the pre-fix import-time cache slot for regression coverage. - tools.web_tools.__dict__["_aux_async_client"] = None - - with patch( - "tools.web_tools.get_async_text_auxiliary_client", - side_effect=[(None, None), (MagicMock(base_url="https://api.openrouter.ai/v1"), "test-model")], - ): - assert tools.web_tools.check_auxiliary_model() is False - assert tools.web_tools.check_auxiliary_model() is True - - @pytest.mark.asyncio - async def test_summarizer_re_resolves_backend_after_initial_unavailable_state(self): - """Summarization should pick up a backend that becomes available later in-process.""" - import tools.web_tools - - tools.web_tools.__dict__["_aux_async_client"] = None - - response = MagicMock() - response.choices = [MagicMock(message=MagicMock(content="summary text"))] - - with patch( - "tools.web_tools._resolve_web_extract_auxiliary", - side_effect=[(None, None, {}), (MagicMock(base_url="https://api.openrouter.ai/v1"), "test-model", {})], - ), patch( - "tools.web_tools.async_call_llm", - new=AsyncMock(return_value=response), - ) as mock_async_call: - assert tools.web_tools.check_auxiliary_model() is False - result = await tools.web_tools._call_summarizer_llm( - "Some content worth summarizing", - "Source: https://example.com\n\n", - None, - ) - - assert result == "summary text" - mock_async_call.assert_awaited_once() - # ── Singleton caching ──────────────────────────────────────────── def test_singleton_returns_same_instance(self): diff --git a/tests/tools/test_web_tools_tavily.py 
b/tests/tools/test_web_tools_tavily.py index de820794965..d65baac3e19 100644 --- a/tests/tools/test_web_tools_tavily.py +++ b/tests/tools/test_web_tools_tavily.py @@ -215,13 +215,13 @@ class TestWebExtractTavily: with patch("tools.web_tools._get_backend", return_value="tavily"), \ patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ - patch("tools.web_tools.httpx.post", return_value=mock_response), \ - patch("tools.web_tools.process_content_with_llm", return_value=None): + patch("tools.web_tools.httpx.post", return_value=mock_response): from tools.web_tools import web_extract_tool result = json.loads(asyncio.get_event_loop().run_until_complete( - web_extract_tool(["https://example.com"], use_llm_processing=False) + web_extract_tool(["https://example.com"]) )) assert "results" in result assert len(result["results"]) == 1 assert result["results"][0]["url"] == "https://example.com" + assert "Extracted content" in result["results"][0]["content"] diff --git a/tests/tools/test_web_tools_truncate.py b/tests/tools/test_web_tools_truncate.py new file mode 100644 index 00000000000..310a9b896dc --- /dev/null +++ b/tests/tools/test_web_tools_truncate.py @@ -0,0 +1,142 @@ +"""Unit tests for the truncate-and-store web_extract path (no LLM). + +Covers convert_base64_images_to_links, _truncate_with_footer, _store_full_text, +_get_extract_char_limit, and the end-to-end web_extract_tool truncation behavior. 
+""" +import asyncio +import json +import os +from unittest.mock import patch + +import pytest + +import tools.web_tools as wt + + +class TestImageConversion: + def test_markdown_base64_image_keeps_alt_drops_blob(self): + blob = "A" * 5000 + text = f"before ![a cat]( data:image/png;base64,{blob}) after" + out = wt.convert_base64_images_to_links(text) + assert "[IMAGE: a cat]" in out + assert "base64" not in out + assert blob not in out + assert "before" in out and "after" in out + + def test_markdown_base64_image_no_alt(self): + out = wt.convert_base64_images_to_links("x ![](data:image/jpeg;base64,QQ==) y") + assert "[IMAGE]" in out + assert "base64" not in out + + def test_real_http_image_links_preserved(self): + text = "see ![logo](https://example.com/logo.png) here" + out = wt.convert_base64_images_to_links(text) + # Real image URLs must survive so the agent can inspect them. + assert "![logo](https://example.com/logo.png)" in out + + def test_bare_and_parenthesised_base64_become_placeholder(self): + blob = "Z" * 3000 + bare = wt.convert_base64_images_to_links(f"data:image/gif;base64,{blob}") + assert bare == "[IMAGE]" + paren = wt.convert_base64_images_to_links(f"(data:image/gif;base64,{blob})") + assert paren == "[IMAGE]" + + +class TestTruncation: + def test_short_content_returned_whole(self): + content = "# Title\n\nshort body\n" + out, truncated = wt._truncate_with_footer(content, "https://e.com", 15000) + assert out == content + assert truncated is False + + def test_long_content_truncated_with_footer(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes")) + body = "\n".join(f"line {i} " + "x" * 50 for i in range(2000)) + out, truncated = wt._truncate_with_footer(body, "https://example.com/page", 4000) + assert truncated is True + assert "[TRUNCATED]" in out + assert "Full text saved to:" in out + assert "read_file" in out + # Head and tail are both present (first and last lines survive). 
+ assert "line 0 " in out + assert "line 1999 " in out + # The omitted middle is gone. + assert "line 1000 " not in out + # Sent text is bounded near the budget (+ footer overhead). + assert len(out) < 4000 + 2000 + + def test_truncation_stores_full_text_readable(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes")) + body = "UNIQUE_MIDDLE_MARKER\n" + ("\n".join(f"row {i}" for i in range(5000))) + out, truncated = wt._truncate_with_footer(body, "https://example.com/doc", 3000) + assert truncated is True + # Extract the stored path from the footer and confirm full text is there. + path_line = next(ln for ln in out.splitlines() if "Full text saved to:" in ln) + stored_path = path_line.split("Full text saved to:", 1)[1].strip() + assert os.path.exists(stored_path) + full = open(stored_path).read() + assert "UNIQUE_MIDDLE_MARKER" in full + assert "row 2500" in full # the omitted-middle row is in the stored file + + +class TestCharLimitConfig: + def test_default_when_unset(self): + with patch("tools.web_tools._load_web_config", return_value={}): + assert wt._get_extract_char_limit() == wt.DEFAULT_EXTRACT_CHAR_LIMIT + + def test_config_override(self): + with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": 40000}): + assert wt._get_extract_char_limit() == 40000 + + def test_clamps_floor(self): + with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": 100}): + assert wt._get_extract_char_limit() == 2000 + + def test_bad_value_falls_back(self): + with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": "nope"}): + assert wt._get_extract_char_limit() == wt.DEFAULT_EXTRACT_CHAR_LIMIT + + +class TestEndToEnd: + def test_web_extract_truncates_large_page_no_llm(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes")) + big = "\n".join(f"para {i} " + "y" * 80 for i in range(3000)) + + class FakeProvider: + name = "fake" + 
display_name = "Fake" + + def supports_extract(self): + return True + + async def extract(self, urls, **kwargs): + return [{"url": urls[0], "title": "Big Page", "content": big, + "raw_content": big, "metadata": {}}] + + with patch("tools.web_tools._ensure_web_plugins_loaded"), \ + patch("tools.web_tools._get_extract_backend", return_value="fake"), \ + patch("tools.web_tools.async_is_safe_url", new=_AsyncTrue()), \ + patch("agent.web_search_registry.get_provider", return_value=FakeProvider()): + result = json.loads(asyncio.new_event_loop().run_until_complete( + wt.web_extract_tool(["https://example.com/big"], char_limit=5000) + )) + + assert "results" in result + content = result["results"][0]["content"] + assert "[TRUNCATED]" in content + assert "Full text saved to:" in content + # No LLM was involved: para 0 (head) and the last para (tail) are verbatim. + assert "para 0 " in content + assert "para 2999 " in content + + +def _make_awaitable(value): + async def _coro(*a, **k): + return value + return _coro() + + +class _AsyncTrue: + """Async callable that always returns True (re-awaitable per call).""" + async def __call__(self, *a, **k): + return True diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py index 712a372867a..9f488ee1189 100644 --- a/tests/tools/test_website_policy.py +++ b/tests/tools/test_website_policy.py @@ -398,7 +398,7 @@ class TestWebToolPolicy: # Force the firecrawl plugin to be the active extract provider. 
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") - result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False)) + result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"])) assert result["results"][0]["url"] == "https://blocked.test" assert "Blocked by website policy" in result["results"][0]["error"] @@ -443,7 +443,7 @@ class TestWebToolPolicy: monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") - result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False)) + result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"])) assert result["results"][0]["url"] == "https://blocked.test/final" assert result["results"][0]["content"] == "" diff --git a/tools/code_execution_tool.py b/tools/code_execution_tool.py index 8946de73750..402777eab61 100644 --- a/tools/code_execution_tool.py +++ b/tools/code_execution_tool.py @@ -219,9 +219,9 @@ _TOOL_STUBS = { ), "web_extract": ( "web_extract", - "urls: list", - '"""Extract content from URLs. Returns dict with results list of {url, title, content, error}."""', - '{"urls": urls}', + "urls: list, char_limit: int = None", + '"""Extract content from URLs (no LLM summarization). Returns dict with results list of {url, title, content, error}. Pages over char_limit (default 15000) are head+tail truncated with the full text stored on disk; the content footer gives the path. 
content is markdown."""', + '{"urls": urls, "char_limit": char_limit}', ), "read_file": ( "read_file", @@ -1727,8 +1727,9 @@ _TOOL_DOC_LINES = [ " web_search(query: str, limit: int = 5) -> dict\n" " Returns {\"data\": {\"web\": [{\"url\", \"title\", \"description\"}, ...]}}"), ("web_extract", - " web_extract(urls: list[str]) -> dict\n" - " Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown"), + " web_extract(urls: list[str], char_limit: int = None) -> dict\n" + " Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown.\n" + " No LLM summarization. Pages over char_limit (default 15000) are head+tail truncated; full text stored on disk (path in the content footer)."), ("read_file", " read_file(path: str, offset: int = 1, limit: int = 500) -> dict\n" " Lines are 1-indexed. Returns {\"content\": \"...\", \"total_lines\": N}"), diff --git a/tools/credential_files.py b/tools/credential_files.py index 7d6520820c7..b7f1ff773e8 100644 --- a/tools/credential_files.py +++ b/tools/credential_files.py @@ -349,6 +349,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [ ("cache/audio", "audio_cache"), ("cache/videos", "video_cache"), ("cache/screenshots", "browser_screenshots"), + ("cache/web", "web_cache"), ] diff --git a/tools/web_tools.py b/tools/web_tools.py index 133489b0a89..0635b23f5d4 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -83,11 +83,6 @@ _parallel_client: Optional[Any] = None _async_parallel_client: Optional[Any] = None _exa_client: Optional[Any] = None -from agent.auxiliary_client import ( - async_call_llm, - extract_content_or_reasoning, - get_async_text_auxiliary_client, -) from tools.debug_helpers import DebugSession # Imported solely so unit tests can monkeypatch these names on # tools.web_tools (the firecrawl plugin reads them via its own import chain). @@ -305,445 +300,144 @@ def _web_requires_env() -> list[str]: # unit-test patches. 
-DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 - -def _is_nous_auxiliary_client(client: Any) -> bool: - """Return True when the resolved auxiliary backend is Nous Portal.""" - from urllib.parse import urlparse - - base_url = str(getattr(client, "base_url", "") or "") - host = (urlparse(base_url).hostname or "").lower() - return host == "nousresearch.com" or host.endswith(".nousresearch.com") - - -def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optional[Any], Optional[str], Dict[str, Any]]: - """Resolve the current web-extract auxiliary client, model, and extra body.""" - client, default_model = get_async_text_auxiliary_client("web_extract") - configured_model = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() - effective_model = model or configured_model or default_model - - extra_body: Dict[str, Any] = {} - if client is not None and _is_nous_auxiliary_client(client): - from agent.auxiliary_client import get_auxiliary_extra_body - from agent.portal_tags import nous_portal_tags - extra_body = get_auxiliary_extra_body() or {"tags": nous_portal_tags()} - - return client, effective_model, extra_body - - -def _get_default_summarizer_model() -> Optional[str]: - """Return the current default model for web extraction summarization.""" - _, model, _ = _resolve_web_extract_auxiliary() - return model +# Default budget (characters) of clean page text sent to the model. Pages at +# or under this size are returned whole; larger pages are head+tail truncated +# and the full text is stored on disk (see _store_full_text). Spending context, +# not API dollars — so this is generous relative to the old 5k summary cap. +# Override via web.extract_char_limit in config.yaml. 
+DEFAULT_EXTRACT_CHAR_LIMIT = 15000 _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") -async def process_content_with_llm( - content: str, - url: str = "", - title: str = "", - model: Optional[str] = None, - min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION -) -> Optional[str]: - """ - Process web content using LLM to create intelligent summaries with key excerpts. - - This function uses Gemini 3 Flash Preview (or specified model) via OpenRouter API - to intelligently extract key information and create markdown summaries, - significantly reducing token usage while preserving all important information. - - For very large content (>500k chars), uses chunked processing with synthesis. - For extremely large content (>2M chars), refuses to process entirely. - - Args: - content (str): The raw content to process - url (str): The source URL (for context, optional) - title (str): The page title (for context, optional) - model (str): The model to use for processing (default: google/gemini-3-flash-preview) - min_length (int): Minimum content length to trigger processing (default: 5000) - - Returns: - Optional[str]: Processed markdown content, or None if content too short or processing fails - """ - # Size thresholds - MAX_CONTENT_SIZE = 2_000_000 # 2M chars - refuse entirely above this - CHUNK_THRESHOLD = 500_000 # 500k chars - use chunked processing above this - CHUNK_SIZE = 100_000 # 100k chars per chunk - MAX_OUTPUT_SIZE = 5000 # Hard cap on final output size - +def _get_extract_char_limit() -> int: + """Resolve the per-page char budget from config, clamped to a sane range.""" try: - content_len = len(content) - - # Refuse if content is absurdly large - if content_len > MAX_CONTENT_SIZE: - size_mb = content_len / 1_000_000 - logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb) - return f"[Content too large to process: {size_mb:.1f}MB. 
Try a more focused source URL.]" - - # Skip processing if content is too short - if content_len < min_length: - logger.debug("Content too short (%d < %d chars), skipping LLM processing", content_len, min_length) - return None - - # Create context information - context_info = [] - if title: - context_info.append(f"Title: {title}") - if url: - context_info.append(f"Source: {url}") - context_str = "\n".join(context_info) + "\n\n" if context_info else "" - - # Check if we need chunked processing - if content_len > CHUNK_THRESHOLD: - logger.info("Content large (%d chars). Using chunked processing...", content_len) - return await _process_large_content_chunked( - content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE - ) - - # Standard single-pass processing for normal content - logger.info("Processing content with LLM (%d characters)", content_len) - - processed_content = await _call_summarizer_llm(content, context_str, model) - - if processed_content: - # Enforce output cap - if len(processed_content) > MAX_OUTPUT_SIZE: - processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]" - - # Log compression metrics - processed_length = len(processed_content) - compression_ratio = processed_length / content_len if content_len > 0 else 1.0 - logger.info("Content processed: %d -> %d chars (%.1f%%)", content_len, processed_length, compression_ratio * 100) - - return processed_content - - except Exception as e: - logger.warning( - "web_extract LLM summarization failed (%s). " - "Tip: increase auxiliary.web_extract.timeout in config.yaml " - "or switch to a faster auxiliary model.", - str(e)[:120], + configured = _load_web_config().get("extract_char_limit") + if configured is not None: + value = int(configured) + # Floor at 2k (below that the footer dominates), no hard ceiling + # beyond a generous guard so a typo can't blow up context. 
+ return max(2000, min(value, 500_000)) + except (TypeError, ValueError): + pass + return DEFAULT_EXTRACT_CHAR_LIMIT + + +def convert_base64_images_to_links(text: str) -> str: + """Replace inline base64 image blobs with labeled ``[IMAGE]`` placeholders. + + base64 image payloads are token bombs (a single inline PNG can be tens of + thousands of characters), so we never send the raw bytes to the model. But + we preserve the fact that an image was there, and its alt text, as an + inspectable placeholder. Real (http/https) markdown image links are left + untouched so the agent can ``web_extract`` / ``vision_analyze`` them. + + Transformations: + ``![alt](data:image/png;base64,AAAA...)`` -> ``[IMAGE: alt]`` (``[IMAGE]`` when alt is empty) + ``(data:image/png;base64,AAAA...)`` -> ``[IMAGE]`` + bare ``data:image/...;base64,AAAA...`` -> ``[IMAGE]`` + """ + # 1. Markdown image with base64 source -> keep alt text, drop the blob. + def _md_repl(m: "re.Match[str]") -> str: + alt = (m.group("alt") or "").strip() + return f"[IMAGE: {alt}]" if alt else "[IMAGE]" + + md_b64 = re.compile( + r"!\[(?P<alt>[^\]]*)\]\(\s*data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+\)" + ) + out = md_b64.sub(_md_repl, text) + + # 2. Parenthesised base64 (non-markdown) and 3. bare base64 -> [IMAGE]. + out = re.sub(r"\(\s*data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+\)", "[IMAGE]", out) + out = re.sub(r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+", "[IMAGE]", out) + return out + + +def _store_full_text(url: str, content: str) -> Optional[str]: + """Write the full extracted page to cache/web and return its absolute path. + + The file is mounted read-only into remote backends (Docker/Modal/SSH) via + credential_files._CACHE_DIRS, so the agent's terminal/read_file tools can + page through the complete text on any backend. Returns None on failure + (storage is best-effort; truncated content is still returned to the model). 
+ """ + try: + import hashlib + from urllib.parse import urlparse + from hermes_constants import get_hermes_dir + + cache_dir = get_hermes_dir("cache/web", "web_cache") + cache_dir.mkdir(parents=True, exist_ok=True) + + host = (urlparse(url).hostname or "page").replace(":", "_") + slug = re.sub(r"[^A-Za-z0-9._-]", "-", host)[:60].strip("-") or "page" + digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:10] + path = cache_dir / f"{slug}-{digest}.md" + path.write_text(content, encoding="utf-8") + return str(path) + except Exception as exc: # noqa: BLE001 + logger.debug("Failed to store full web_extract text for %s: %s", url, exc) + return None + + +def _truncate_with_footer( + content: str, + url: str, + char_limit: int, +) -> tuple[str, bool]: + """Return (model_text, was_truncated) for one page's clean content. + + Pages at or under ``char_limit`` are returned whole. Larger pages get a + head+tail window (~75% head / ~25% tail) cut on a markdown line boundary + where possible, plus an explicit footer telling the model exactly how much + it is seeing, where the full text is stored, and which read_file call pages + in the omitted middle. Deterministic — no model involvement. + """ + if len(content) <= char_limit: + return content, False + + head_budget = int(char_limit * 0.75) + tail_budget = char_limit - head_budget + + head = content[:head_budget] + tail = content[-tail_budget:] + # Snap the head cut back to the last newline so we don't slice mid-line. + nl = head.rfind("\n") + if nl > head_budget * 0.5: + head = head[:nl] + # Snap the tail cut forward to the next newline for the same reason. 
+ nl = tail.find("\n") + if 0 <= nl < tail_budget * 0.5: + tail = tail[nl + 1:] + + total = len(content) + stored_path = _store_full_text(url, content) + shown = len(head) + len(tail) + + footer_lines = [ + "", + "─" * 8 + " [TRUNCATED] " + "─" * 8, + f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) " + f"of {total:,} total clean characters.", + ] + if stored_path: + footer_lines.append(f"Full text saved to: {stored_path}") + footer_lines.append( + f'To read the omitted middle: read_file path="{stored_path}" ' + f"offset= limit= (the file is the complete page)." ) - # Fall back to truncated raw content instead of returning a useless - # error message. The first ~5000 chars are almost always more useful - # to the model than "[Failed to process content: ...]". - truncated = content[:MAX_OUTPUT_SIZE] - if len(content) > MAX_OUTPUT_SIZE: - truncated += ( - f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of " - f"{len(content):,} chars. LLM summarization timed out. " - f"To fix: increase auxiliary.web_extract.timeout in config.yaml, " - f"or use a faster auxiliary model. Use browser_navigate for the full page.]" - ) - return truncated - - -async def _call_summarizer_llm( - content: str, - context_str: str, - model: Optional[str], - max_tokens: int = 20000, - is_chunk: bool = False, - chunk_info: str = "" -) -> Optional[str]: - """ - Make a single LLM call to summarize content. - - Args: - content: The content to summarize - context_str: Context information (title, URL) - model: Model to use - max_tokens: Maximum output tokens - is_chunk: Whether this is a chunk of a larger document - chunk_info: Information about chunk position (e.g., "Chunk 2/5") - - Returns: - Summarized content or None on failure - """ - if is_chunk: - # Chunk-specific prompt - aware that this is partial content - system_prompt = """You are an expert content analyst processing a SECTION of a larger document. 
Your job is to extract and summarize the key information from THIS SECTION ONLY. - -Important guidelines for chunk processing: -1. Do NOT write introductions or conclusions - this is a partial document -2. Focus on extracting ALL key facts, figures, data points, and insights from this section -3. Preserve important quotes, code snippets, and specific details verbatim -4. Use bullet points and structured formatting for easy synthesis later -5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them - -Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow.""" - - user_prompt = f"""Extract key information from this SECTION of a larger document: - -{context_str}{chunk_info} - -SECTION CONTENT: -{content} - -Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions.""" - else: - # Standard full-document prompt - system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk. + footer_lines.append( + "Full text could not be stored; re-run web_extract on a more " + "specific URL or use browser_navigate for the complete page." + ) + footer_lines.append("─" * 29) -Create a well-structured markdown summary that includes: -1. Key excerpts (quotes, code snippets, important facts) in their original format -2. Comprehensive summary of all other important information -3. Proper markdown formatting with headers, bullets, and emphasis + model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail + model_text += "\n" + "\n".join(footer_lines) + return model_text, True -Your goal is to preserve ALL important information while reducing length. 
Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized.""" - - user_prompt = f"""Please process this web content and create a comprehensive markdown summary: - -{context_str}CONTENT TO PROCESS: -{content} - -Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights.""" - - # Call the LLM with retry logic — keep retries low since summarization - # is a nice-to-have; the caller falls back to truncated content on failure. - max_retries = 2 - retry_delay = 2 - last_error = None - - for attempt in range(max_retries): - try: - aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model) - if aux_client is None or not effective_model: - logger.warning("No auxiliary model available for web content processing") - return None - call_kwargs = { - "task": "web_extract", - "model": effective_model, - "messages": [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ], - "temperature": 0.1, - "max_tokens": max_tokens, - # No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout - # from config.yaml. Fresh configs ship with 360s; if the key is absent - # the runtime default is 30s (_DEFAULT_AUX_TIMEOUT in - # agent/auxiliary_client.py). Users with slow local models should set - # or increase auxiliary.web_extract.timeout in config.yaml. 
- } - if extra_body: - call_kwargs["extra_body"] = extra_body - response = await async_call_llm(**call_kwargs) - content = extract_content_or_reasoning(response) - if content: - return content - # Reasoning-only / empty response — let the retry loop handle it - logger.warning("LLM returned empty content (attempt %d/%d), retrying", attempt + 1, max_retries) - if attempt < max_retries - 1: - await asyncio.sleep(retry_delay) - retry_delay = min(retry_delay * 2, 60) - continue - return content # Return whatever we got after exhausting retries - except RuntimeError: - logger.warning("No auxiliary model available for web content processing") - return None - except Exception as api_error: - last_error = api_error - if attempt < max_retries - 1: - logger.warning("LLM API call failed (attempt %d/%d): %s", attempt + 1, max_retries, str(api_error)[:100]) - logger.warning("Retrying in %ds...", retry_delay) - await asyncio.sleep(retry_delay) - retry_delay = min(retry_delay * 2, 60) - else: - raise last_error - - return None - - -async def _process_large_content_chunked( - content: str, - context_str: str, - model: Optional[str], - chunk_size: int, - max_output_size: int -) -> Optional[str]: - """ - Process large content by chunking, summarizing each chunk in parallel, - then synthesizing the summaries. 
- - Args: - content: The large content to process - context_str: Context information - model: Model to use - chunk_size: Size of each chunk in characters - max_output_size: Maximum final output size - - Returns: - Synthesized summary or None on failure - """ - # Split content into chunks - chunks = [] - for i in range(0, len(content), chunk_size): - chunk = content[i:i + chunk_size] - chunks.append(chunk) - - logger.info("Split into %d chunks of ~%d chars each", len(chunks), chunk_size) - - # Summarize each chunk in parallel - async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]: - """Summarize a single chunk.""" - try: - chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]" - summary = await _call_summarizer_llm( - chunk_content, - context_str, - model, - max_tokens=10000, - is_chunk=True, - chunk_info=chunk_info - ) - if summary: - logger.info("Chunk %d/%d summarized: %d -> %d chars", chunk_idx + 1, len(chunks), len(chunk_content), len(summary)) - return chunk_idx, summary - except Exception as e: - logger.warning("Chunk %d/%d failed: %s", chunk_idx + 1, len(chunks), str(e)[:50]) - return chunk_idx, None - - # Run all chunk summarizations in parallel - tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)] - # Use return_exceptions=True so a single task failure does not discard - # all other successfully summarized chunks. 
- results = await asyncio.gather(*tasks, return_exceptions=True) - - # Filter out exceptions, then collect successful summaries in order - successful_results = [] - for result_item in results: - if isinstance(result_item, BaseException): - logger.warning("Chunk summarization task failed: %s", result_item) - continue - successful_results.append(result_item) - - summaries = [] - for chunk_idx, summary in sorted(successful_results, key=lambda x: x[0]): - if summary: - summaries.append(f"## Section {chunk_idx + 1}\n{summary}") - - if not summaries: - logger.debug("All chunk summarizations failed") - return "[Failed to process large content: all chunk summarizations failed]" - - logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks)) - - # If only one chunk succeeded, just return it (with cap) - if len(summaries) == 1: - result = summaries[0] - if len(result) > max_output_size: - result = result[:max_output_size] + "\n\n[... truncated ...]" - return result - - # Synthesize the summaries into a final summary - logger.info("Synthesizing %d summaries...", len(summaries)) - - combined_summaries = "\n\n---\n\n".join(summaries) - - synthesis_prompt = f"""You have been given summaries of different sections of a large document. -Synthesize these into ONE cohesive, comprehensive summary that: -1. Removes redundancy between sections -2. Preserves all key facts, figures, and actionable information -3. Is well-organized with clear structure -4. Is under {max_output_size} characters - -{context_str}SECTION SUMMARIES: -{combined_summaries} - -Create a single, unified markdown summary.""" - - try: - aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model) - if aux_client is None or not effective_model: - logger.warning("No auxiliary model for synthesis, concatenating summaries") - fallback = "\n\n".join(summaries) - if len(fallback) > max_output_size: - fallback = fallback[:max_output_size] + "\n\n[... 
truncated ...]" - return fallback - - call_kwargs = { - "task": "web_extract", - "model": effective_model, - "messages": [ - {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."}, - {"role": "user", "content": synthesis_prompt}, - ], - "temperature": 0.1, - "max_tokens": 20000, - } - if extra_body: - call_kwargs["extra_body"] = extra_body - response = await async_call_llm(**call_kwargs) - final_summary = extract_content_or_reasoning(response) - - # Retry once on empty content (reasoning-only response) - if not final_summary: - logger.warning("Synthesis LLM returned empty content, retrying once") - response = await async_call_llm(**call_kwargs) - final_summary = extract_content_or_reasoning(response) - - # If still None after retry, fall back to concatenated summaries - if not final_summary: - logger.warning("Synthesis failed after retry — concatenating chunk summaries") - fallback = "\n\n".join(summaries) - if len(fallback) > max_output_size: - fallback = fallback[:max_output_size] + "\n\n[... truncated ...]" - return fallback - - # Enforce hard cap - if len(final_summary) > max_output_size: - final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]" - - original_len = len(content) - final_len = len(final_summary) - compression = final_len / original_len if original_len > 0 else 1.0 - - logger.info("Synthesis complete: %d -> %d chars (%.2f%%)", original_len, final_len, compression * 100) - return final_summary - - except Exception as e: - logger.warning("Synthesis failed: %s", str(e)[:100]) - # Fall back to concatenated summaries with truncation - fallback = "\n\n".join(summaries) - if len(fallback) > max_output_size: - fallback = fallback[:max_output_size] + "\n\n[... 
truncated due to synthesis failure ...]" - return fallback - - -def clean_base64_images(text: str) -> str: - """ - Remove base64 encoded images from text to reduce token count and clutter. - - This function finds and removes base64 encoded images in various formats: - - (data:image/png;base64,...) - - (data:image/jpeg;base64,...) - - (data:image/svg+xml;base64,...) - - data:image/[type];base64,... (without parentheses) - - Args: - text: The text content to clean - - Returns: - Cleaned text with base64 images replaced with placeholders - """ - # Pattern to match base64 encoded images wrapped in parentheses - # Matches: (data:image/[type];base64,[base64-string]) - base64_with_parens_pattern = r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)' - - # Pattern to match base64 encoded images without parentheses - # Matches: data:image/[type];base64,[base64-string] - base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+' - - # Replace parentheses-wrapped images first - cleaned_text = re.sub(base64_with_parens_pattern, '[BASE64_IMAGE_REMOVED]', text) - - # Then replace any remaining non-parentheses images - cleaned_text = re.sub(base64_pattern, '[BASE64_IMAGE_REMOVED]', cleaned_text) - - return cleaned_text # ─── Exa / Parallel inline helpers — moved into plugins ────────────────────── @@ -894,29 +588,32 @@ def web_search_tool(query: str, limit: int = 5) -> str: async def web_extract_tool( urls: List[str], format: str = None, - use_llm_processing: bool = True, - model: Optional[str] = None, - min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION + char_limit: Optional[int] = None, ) -> str: """ Extract content from specific web pages using available extraction API backend. - This function provides a generic interface for web content extraction that - can work with multiple backends. Currently uses Firecrawl. + Returns clean page content (markdown/text) with NO LLM summarization. 
The + extract backends (Firecrawl, Tavily, Exa, Parallel) already return clean, + boilerplate-stripped content, so we return it directly and fast. Pages over + ``char_limit`` are head+tail truncated with an explicit footer; the full + text is stored under cache/web and the footer tells the model how to + read_file the omitted middle. Inline base64 images are replaced with + ``[IMAGE: alt]`` placeholders (real image URLs are preserved as links). Args: urls (List[str]): List of URLs to extract content from format (str): Desired output format ("markdown" or "html", optional) - use_llm_processing (bool): Whether to process content with LLM for summarization (default: True) - model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model) - min_length (int): Minimum content length to trigger LLM processing (default: 5000) + char_limit (Optional[int]): Per-page char budget sent to the model + (default: web.extract_char_limit or 15000). Larger pages truncate. Security: URLs are checked for embedded secrets before fetching. - + Returns: - str: JSON string containing extracted content. If LLM processing is enabled and successful, - the 'content' field will contain the processed markdown summary instead of raw content. - + str: JSON string with a ``results`` list; each entry has + ``url``, ``title``, ``content``, ``error``. ``content`` is the + (possibly truncated) clean page text. 
+ Raises: Exception: If extraction fails or API key is not set """ @@ -944,16 +641,14 @@ async def web_extract_tool( "parameters": { "urls": normalized_urls, "format": format, - "use_llm_processing": use_llm_processing, - "model": model, - "min_length": min_length + "char_limit": char_limit, }, "error": None, "pages_extracted": 0, - "pages_processed_with_llm": 0, + "pages_truncated": 0, "original_response_size": 0, "final_response_size": 0, - "compression_metrics": [], + "truncation_metrics": [], "processing_applied": [] } @@ -1053,91 +748,39 @@ async def web_extract_tool( debug_call_data["pages_extracted"] = pages_extracted debug_call_data["original_response_size"] = len(json.dumps(response)) - effective_model = model or _get_default_summarizer_model() - auxiliary_available = check_auxiliary_model() - - # Process each result with LLM if enabled - if use_llm_processing and auxiliary_available: - logger.info("Processing extracted content with LLM (parallel)...") - debug_call_data["processing_applied"].append("llm_processing") - - # Prepare tasks for parallel processing - async def process_single_result(result): - """Process a single result with LLM and return updated result with metrics.""" - url = result.get('url', 'Unknown URL') - title = result.get('title', '') - raw_content = result.get('raw_content', '') or result.get('content', '') - - if not raw_content: - return result, None, "no_content" - - original_size = len(raw_content) - - # Process content with LLM - processed = await process_content_with_llm( - raw_content, url, title, effective_model, min_length - ) - - if processed: - processed_size = len(processed) - compression_ratio = processed_size / original_size if original_size > 0 else 1.0 - - # Update result with processed content - result['content'] = processed - result['raw_content'] = raw_content - - metrics = { - "url": url, - "original_size": original_size, - "processed_size": processed_size, - "compression_ratio": compression_ratio, - "model_used": 
effective_model - } - return result, metrics, "processed" - else: - metrics = { - "url": url, - "original_size": original_size, - "processed_size": original_size, - "compression_ratio": 1.0, - "model_used": None, - "reason": "content_too_short" - } - return result, metrics, "too_short" - - # Run all LLM processing in parallel - results_list = response.get('results', []) - tasks = [process_single_result(result) for result in results_list] - # Use return_exceptions=True so a single task failure does not - # discard all other successfully processed results. - processed_results = await asyncio.gather(*tasks, return_exceptions=True) - # Collect metrics and print results - for result_item in processed_results: - if isinstance(result_item, BaseException): - logger.warning("Web result processing task failed: %s", result_item) - continue - result, metrics, status = result_item - url = result.get('url', 'Unknown URL') - if status == "processed": - debug_call_data["compression_metrics"].append(metrics) - debug_call_data["pages_processed_with_llm"] += 1 - logger.info("%s (processed)", url) - elif status == "too_short": - debug_call_data["compression_metrics"].append(metrics) - logger.info("%s (no processing - content too short)", url) - else: - logger.warning("%s (no content to process)", url) - else: - if use_llm_processing and not auxiliary_available: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - # Print summary of extracted pages for debugging (original behavior) - for result in response.get('results', []): - url = result.get('url', 'Unknown URL') - content_length = len(result.get('raw_content', '')) - logger.info("%s (%d characters)", url, content_length) - + effective_char_limit = char_limit if char_limit is not None else _get_extract_char_limit() + try: + effective_char_limit = max(2000, min(int(effective_char_limit), 500_000)) + except 
(TypeError, ValueError): + effective_char_limit = DEFAULT_EXTRACT_CHAR_LIMIT + + # Truncate-and-store: no LLM. For each result, convert inline base64 + # images to labeled placeholders (keeping alt text + real image URLs), + # then return the clean content directly if within budget, or a + # head+tail window plus a footer pointing at the stored full text. + debug_call_data["processing_applied"].append("truncate_and_store") + for result in response.get("results", []): + if result.get("error"): + continue + url = result.get("url", "") + raw_content = result.get("raw_content", "") or result.get("content", "") + if not raw_content: + continue + clean = convert_base64_images_to_links(raw_content) + model_text, truncated = _truncate_with_footer(clean, url, effective_char_limit) + result["content"] = model_text + if truncated: + debug_call_data["pages_truncated"] += 1 + debug_call_data["truncation_metrics"].append({ + "url": url, + "original_size": len(clean), + "sent_size": len(model_text), + }) + logger.info("%s (truncated %d -> %d chars)", url, len(clean), len(model_text)) + else: + logger.info("%s (%d chars, whole)", url, len(clean)) + # Trim output to minimal fields per entry: title, content, error trimmed_results = [ { @@ -1153,16 +796,16 @@ async def web_extract_tool( if trimmed_response.get("results") == []: result_json = tool_error("Content was inaccessible or not found") - - cleaned_result = clean_base64_images(result_json) - else: result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False) - - cleaned_result = clean_base64_images(result_json) - + + # base64 images were already converted to placeholders per-result above; + # this is a belt-and-suspenders sweep over the serialized JSON in case a + # provider tucked a blob somewhere unexpected (e.g. metadata). 
+ cleaned_result = convert_base64_images_to_links(result_json) + debug_call_data["final_response_size"] = len(cleaned_result) - debug_call_data["processing_applied"].append("base64_image_removal") + debug_call_data["processing_applied"].append("base64_image_conversion") # Log debug information _debug.log_call("web_extract_tool", debug_call_data) @@ -1193,28 +836,18 @@ def check_web_api_key() -> bool: ) -def check_auxiliary_model() -> bool: - """Check if an auxiliary text model is available for LLM content processing.""" - client, _, _ = _resolve_web_extract_auxiliary() - return client is not None - - - - if __name__ == "__main__": """ Simple test/demo when run directly """ print("🌐 Standalone Web Tools Module") print("=" * 40) - + # Check if API keys are available web_available = check_web_api_key() tool_gateway_available = _is_tool_gateway_ready() firecrawl_key_available = bool(os.getenv("FIRECRAWL_API_KEY", "").strip()) firecrawl_url_available = bool(os.getenv("FIRECRAWL_API_URL", "").strip()) - nous_available = check_auxiliary_model() - default_summarizer_model = _get_default_summarizer_model() if web_available: backend = _get_backend() @@ -1246,29 +879,20 @@ if __name__ == "__main__": f"{_firecrawl_backend_help_suffix()}" ) - if not nous_available: - print("❌ No auxiliary model available for LLM content processing") - print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY") - print("⚠️ Without an auxiliary model, LLM content processing will be disabled") - else: - print(f"✅ Auxiliary model available: {default_summarizer_model}") - if not web_available: sys.exit(1) print("🛠️ Web tools ready for use!") - - if nous_available: - print(f"🧠 LLM content processing available with {default_summarizer_model}") - print(f" Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars") - + print(f" Extract char limit: {_get_extract_char_limit()} chars " + "(pages over this are truncated; full text stored in 
cache/web)") + # Show debug mode status if _debug.active: print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}") print(f" Debug logs will be saved to: {_debug.log_dir}/web_tools_debug_{_debug.session_id}.json") else: print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)") - + print("\nBasic usage:") print(" from web_tools import web_search_tool, web_extract_tool") print(" import asyncio") @@ -1276,37 +900,16 @@ if __name__ == "__main__": print(" # Search (synchronous)") print(" results = web_search_tool('Python tutorials')") print("") - print(" # Extract (asynchronous)") + print(" # Extract (asynchronous, no LLM — truncate-and-store)") print(" async def main():") print(" content = await web_extract_tool(['https://example.com'])") + print(" # bigger budget for one call:") + print(" content = await web_extract_tool(['https://docs.python.org'], char_limit=40000)") print(" asyncio.run(main())") - - if nous_available: - print("\nLLM-enhanced usage:") - print(" # Content automatically processed for pages >5000 chars (default)") - print(" content = await web_extract_tool(['https://python.org/about/'])") - print("") - print(" # Customize processing parameters") - print(" content = await web_extract_tool(") - print(" ['https://docs.python.org'],") - print(" model='google/gemini-3-flash-preview',") - print(" min_length=3000") - print(" )") - print("") - print(" # Disable LLM processing") - print(" raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)") - + print("\nDebug mode:") - print(" # Enable debug logging") print(" export WEB_TOOLS_DEBUG=true") - print(" # Debug logs capture:") - print(" # - All tool calls with parameters") - print(" # - Original API responses") - print(" # - LLM compression metrics") - print(" # - Final processed results") print(" # Logs saved to: ./logs/web_tools_debug_UUID.json") - - print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities") # 
--------------------------------------------------------------------------- @@ -1338,7 +941,7 @@ WEB_SEARCH_SCHEMA = { WEB_EXTRACT_SCHEMA = { "name": "web_extract", - "description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.", + "description": "Extract content from web page URLs. Returns clean page content in markdown/text (no LLM summarization — fast). Also works with PDF URLs (arxiv papers, documents) — pass the PDF link directly. Pages within the char budget (default 15000) return whole; larger pages return a head+tail window with a footer telling you the full text's saved file path and the read_file call to page through the omitted middle. Inline images appear as [IMAGE: alt] placeholders; real image URLs are kept as links. If a URL fails or times out, use the browser tool instead.", "parameters": { "type": "object", "properties": { @@ -1347,6 +950,11 @@ WEB_EXTRACT_SCHEMA = { "items": {"type": "string"}, "description": "List of URLs to extract content from (max 5 URLs per call)", "maxItems": 5 + }, + "char_limit": { + "type": "integer", + "description": "Optional per-page character budget sent back (default 15000). Pages larger than this are head+tail truncated with the full text stored to disk. 
Raise it when you need more of a long page inline.", + "minimum": 2000 } }, "required": ["urls"] @@ -1368,7 +976,10 @@ registry.register( toolset="web", schema=WEB_EXTRACT_SCHEMA, handler=lambda args, **kw: web_extract_tool( - args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"), + args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], + "markdown", + char_limit=args.get("char_limit"), + ), check_fn=check_web_api_key, requires_env=_web_requires_env(), is_async=True,