From ee8cbfdc03eb9b7cdd486165486f2e3cad0d8645 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 29 Jun 2026 10:00:49 -0700
Subject: [PATCH] feat(web_extract): truncate-and-store instead of LLM
 summarization (#54843)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(web_extract): truncate-and-store instead of LLM summarization

web_extract no longer runs an auxiliary LLM over scraped pages. The extract
backends (Firecrawl/Tavily/Exa/Parallel) already return clean, boilerplate-
stripped markdown, so we return it directly: pages within a char budget
(default 15000, web.extract_char_limit) come back whole; larger pages get a
head+tail window plus an explicit footer giving the stored full-text path and
the read_file call to page through the omitted middle. The full clean text is
written to cache/web (mounted read-only into remote backends like the other
cache dirs), so nothing is lost.

Inline base64 images are converted to [IMAGE: alt] placeholders (token bombs
dropped) while real http(s) image URLs are preserved as links so the agent can
still web_extract/vision_analyze them.

Removes process_content_with_llm + the chunked summarizer + check_auxiliary_model
+ _resolve_web_extract_auxiliary. context_references._default_url_fetcher is
updated to the truncate path, and its stale read of payload["data"]["documents"]
is fixed to payload["results"] (it was silently returning empty).

Live before/after eval (firecrawl, 4 URLs): 11.7x faster overall (176.6s ->
15.1s); 10-60x on large pages. Quality identical; findability 4/4 (answer
recoverable from stored full text on every truncated page). web_search is
unchanged.

No own scraper added; no changes to web_search.

* fix(web_extract): add char_limit to execute_code web_extract stub

The new web_extract char_limit param must appear in the code_execution_tool
_TOOL_STUBS signature (and doc line) or test_stubs_cover_all_schema_params
fails — the stub schema must cover every real schema param.
---
 agent/context_references.py              |   4 +-
 hermes_cli/config.py                     |   1 +
 tests/integration/test_web_tools.py      |  20 +-
 tests/tools/test_browser_secret_exfil.py |   1 -
 tests/tools/test_web_providers.py        |   1 -
 tests/tools/test_web_tools_config.py     |  41 --
 tests/tools/test_web_tools_tavily.py     |   6 +-
 tests/tools/test_web_tools_truncate.py   | 142 ++++
 tests/tools/test_website_policy.py       |   4 +-
 tools/code_execution_tool.py             |  11 +-
 tools/credential_files.py                |   1 +
 tools/web_tools.py                       | 799 ++++++-----------------
 12 files changed, 370 insertions(+), 661 deletions(-)
 create mode 100644 tests/tools/test_web_tools_truncate.py

diff --git a/agent/context_references.py b/agent/context_references.py
index fad1ff00159..d77857584a7 100644
--- a/agent/context_references.py
+++ b/agent/context_references.py
@@ -328,9 +328,9 @@ async def _fetch_url_content(
 async def _default_url_fetcher(url: str) -> str:
     from tools.web_tools import web_extract_tool
 
-    raw = await web_extract_tool([url], format="markdown", use_llm_processing=True)
+    raw = await web_extract_tool([url], format="markdown")
     payload = json.loads(raw)
-    docs = payload.get("data", {}).get("documents", [])
+    docs = payload.get("results", [])
     if not docs:
         return ""
     doc = docs[0]
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 731397ba13a..3474ee35a0e 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -1161,6 +1161,7 @@ DEFAULT_CONFIG = {
         "backend": "",           # shared fallback — applies to both search and extract
         "search_backend": "",    # per-capability override for web_search (e.g. "searxng")
         "extract_backend": "",   # per-capability override for web_extract (e.g. "native")
+        "extract_char_limit": 15000,  # per-page char budget for web_extract; larger pages truncate + store full text in cache/web
     },
 
     "browser": {
diff --git a/tests/integration/test_web_tools.py b/tests/integration/test_web_tools.py
index f5281140066..6be64b6b2a6 100644
--- a/tests/integration/test_web_tools.py
+++ b/tests/integration/test_web_tools.py
@@ -32,7 +32,6 @@ from tools.web_tools import (
     web_extract_tool,
     check_firecrawl_api_key,
     check_web_api_key,
-    check_auxiliary_model,
     _get_backend,
 )
 
@@ -129,12 +128,11 @@ class WebToolsTester:
             backend = _get_backend()
             self.log_result("Web Backend API Key", "passed", f"Using {backend} backend")
         
-        # Check auxiliary LLM provider (optional)
-        if not check_auxiliary_model():
-            self.log_result("Auxiliary LLM", "skipped", "No auxiliary LLM provider available (LLM tests will be skipped)")
-            self.test_llm = False
-        else:
-            self.log_result("Auxiliary LLM", "passed", "Found")
+        # Auxiliary LLM summarization was removed — web_extract is now
+        # truncate-and-store (no LLM). Keep the flag off so any residual
+        # LLM-path assertions stay skipped.
+        self.log_result("Auxiliary LLM", "skipped", "web_extract no longer uses an LLM (truncate-and-store)")
+        self.test_llm = False
         
         return True
     
@@ -261,12 +259,11 @@ class WebToolsTester:
                     print(f"    - {url}")
                 
                 if self.verbose:
-                    print(f"  Calling web_extract_tool(urls={test_urls}, format='markdown', use_llm_processing=False)")
+                    print(f"  Calling web_extract_tool(urls={test_urls}, format='markdown')")
                 
                 result = await web_extract_tool(
                     test_urls,
                     format="markdown",
-                    use_llm_processing=False
                 )
                 
                 # Parse result
@@ -360,8 +357,7 @@ class WebToolsTester:
             result = await web_extract_tool(
                 [test_url],
                 format="markdown",
-                use_llm_processing=True,
-                min_length=1000  # Lower threshold for testing
+                char_limit=1000,  # small budget to force truncation in the test
             )
             
             data = json.loads(result)
@@ -466,7 +462,7 @@ class WebToolsTester:
                 "web_backend": _get_backend() if check_web_api_key() else None,
                 "firecrawl_api_key": check_firecrawl_api_key(),
                 "parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")),
-                "auxiliary_model": check_auxiliary_model(),
+                "auxiliary_model": False,
             }
         }
         
diff --git a/tests/tools/test_browser_secret_exfil.py b/tests/tools/test_browser_secret_exfil.py
index fbf35727bb9..2ccc9193b49 100644
--- a/tests/tools/test_browser_secret_exfil.py
+++ b/tests/tools/test_browser_secret_exfil.py
@@ -126,7 +126,6 @@ class TestWebExtractSecretExfil:
         try:
             result = await web_tools.web_extract_tool(
                 urls=["https://wttr.in/Köln"],
-                use_llm_processing=False,
             )
         finally:
             web_search_registry._reset_for_tests()
diff --git a/tests/tools/test_web_providers.py b/tests/tools/test_web_providers.py
index 177b34ccc92..f71b1f3b7d8 100644
--- a/tests/tools/test_web_providers.py
+++ b/tests/tools/test_web_providers.py
@@ -418,7 +418,6 @@ class TestDispatchersTriggerPluginDiscovery:
             result = json.loads(asyncio.run(
                 web_tools.web_extract_tool(
                     ["https://example.com"],
-                    use_llm_processing=False,
                 )
             ))
 
diff --git a/tests/tools/test_web_tools_config.py b/tests/tools/test_web_tools_config.py
index 5838ea00b78..667d5350c4a 100644
--- a/tests/tools/test_web_tools_config.py
+++ b/tests/tools/test_web_tools_config.py
@@ -160,47 +160,6 @@ class TestFirecrawlClientConfig:
             importlib.reload(tools.web_tools)
             assert tools.web_tools._read_nous_access_token() == "nous-token"
 
-    def test_check_auxiliary_model_re_resolves_backend_each_call(self):
-        """Availability checks should not be pinned to module import state."""
-        import tools.web_tools
-
-        # Simulate the pre-fix import-time cache slot for regression coverage.
-        tools.web_tools.__dict__["_aux_async_client"] = None
-
-        with patch(
-            "tools.web_tools.get_async_text_auxiliary_client",
-            side_effect=[(None, None), (MagicMock(base_url="https://api.openrouter.ai/v1"), "test-model")],
-        ):
-            assert tools.web_tools.check_auxiliary_model() is False
-            assert tools.web_tools.check_auxiliary_model() is True
-
-    @pytest.mark.asyncio
-    async def test_summarizer_re_resolves_backend_after_initial_unavailable_state(self):
-        """Summarization should pick up a backend that becomes available later in-process."""
-        import tools.web_tools
-
-        tools.web_tools.__dict__["_aux_async_client"] = None
-
-        response = MagicMock()
-        response.choices = [MagicMock(message=MagicMock(content="summary text"))]
-
-        with patch(
-            "tools.web_tools._resolve_web_extract_auxiliary",
-            side_effect=[(None, None, {}), (MagicMock(base_url="https://api.openrouter.ai/v1"), "test-model", {})],
-        ), patch(
-            "tools.web_tools.async_call_llm",
-            new=AsyncMock(return_value=response),
-        ) as mock_async_call:
-            assert tools.web_tools.check_auxiliary_model() is False
-            result = await tools.web_tools._call_summarizer_llm(
-                "Some content worth summarizing",
-                "Source: https://example.com\n\n",
-                None,
-            )
-
-        assert result == "summary text"
-        mock_async_call.assert_awaited_once()
-
     # ── Singleton caching ────────────────────────────────────────────
 
     def test_singleton_returns_same_instance(self):
diff --git a/tests/tools/test_web_tools_tavily.py b/tests/tools/test_web_tools_tavily.py
index de820794965..d65baac3e19 100644
--- a/tests/tools/test_web_tools_tavily.py
+++ b/tests/tools/test_web_tools_tavily.py
@@ -215,13 +215,13 @@ class TestWebExtractTavily:
 
         with patch("tools.web_tools._get_backend", return_value="tavily"), \
              patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \
-             patch("tools.web_tools.httpx.post", return_value=mock_response), \
-             patch("tools.web_tools.process_content_with_llm", return_value=None):
+             patch("tools.web_tools.httpx.post", return_value=mock_response):
             from tools.web_tools import web_extract_tool
             result = json.loads(asyncio.get_event_loop().run_until_complete(
-                web_extract_tool(["https://example.com"], use_llm_processing=False)
+                web_extract_tool(["https://example.com"])
             ))
             assert "results" in result
             assert len(result["results"]) == 1
             assert result["results"][0]["url"] == "https://example.com"
+            assert "Extracted content" in result["results"][0]["content"]
 
diff --git a/tests/tools/test_web_tools_truncate.py b/tests/tools/test_web_tools_truncate.py
new file mode 100644
index 00000000000..310a9b896dc
--- /dev/null
+++ b/tests/tools/test_web_tools_truncate.py
@@ -0,0 +1,142 @@
+"""Unit tests for the truncate-and-store web_extract path (no LLM).
+
+Covers convert_base64_images_to_links, _truncate_with_footer, _store_full_text,
+_get_extract_char_limit, and the end-to-end web_extract_tool truncation behavior.
+"""
+import asyncio
+import json
+import os
+from unittest.mock import patch
+
+import pytest
+
+import tools.web_tools as wt
+
+
+class TestImageConversion:
+    def test_markdown_base64_image_keeps_alt_drops_blob(self):
+        blob = "A" * 5000
+        text = f"before ![a cat]( data:image/png;base64,{blob}) after"
+        out = wt.convert_base64_images_to_links(text)
+        assert "[IMAGE: a cat]" in out
+        assert "base64" not in out
+        assert blob not in out
+        assert "before" in out and "after" in out
+
+    def test_markdown_base64_image_no_alt(self):
+        out = wt.convert_base64_images_to_links("x ![](data:image/jpeg;base64,QQ==) y")
+        assert "[IMAGE]" in out
+        assert "base64" not in out
+
+    def test_real_http_image_links_preserved(self):
+        text = "see ![logo](https://example.com/logo.png) here"
+        out = wt.convert_base64_images_to_links(text)
+        # Real image URLs must survive so the agent can inspect them.
+        assert "![logo](https://example.com/logo.png)" in out
+
+    def test_bare_and_parenthesised_base64_become_placeholder(self):
+        blob = "Z" * 3000
+        bare = wt.convert_base64_images_to_links(f"data:image/gif;base64,{blob}")
+        assert bare == "[IMAGE]"
+        paren = wt.convert_base64_images_to_links(f"(data:image/gif;base64,{blob})")
+        assert paren == "[IMAGE]"
+
+
+class TestTruncation:
+    def test_short_content_returned_whole(self):
+        content = "# Title\n\nshort body\n"
+        out, truncated = wt._truncate_with_footer(content, "https://e.com", 15000)
+        assert out == content
+        assert truncated is False
+
+    def test_long_content_truncated_with_footer(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+        body = "\n".join(f"line {i} " + "x" * 50 for i in range(2000))
+        out, truncated = wt._truncate_with_footer(body, "https://example.com/page", 4000)
+        assert truncated is True
+        assert "[TRUNCATED]" in out
+        assert "Full text saved to:" in out
+        assert "read_file" in out
+        # Head and tail are both present (first and last lines survive).
+        assert "line 0 " in out
+        assert "line 1999 " in out
+        # The omitted middle is gone.
+        assert "line 1000 " not in out
+        # Sent text is bounded near the budget (+ footer overhead).
+        assert len(out) < 4000 + 2000
+
+    def test_truncation_stores_full_text_readable(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+        body = "UNIQUE_MIDDLE_MARKER\n" + ("\n".join(f"row {i}" for i in range(5000)))
+        out, truncated = wt._truncate_with_footer(body, "https://example.com/doc", 3000)
+        assert truncated is True
+        # Extract the stored path from the footer and confirm full text is there.
+        path_line = next(ln for ln in out.splitlines() if "Full text saved to:" in ln)
+        stored_path = path_line.split("Full text saved to:", 1)[1].strip()
+        assert os.path.exists(stored_path)
+        full = open(stored_path).read()
+        assert "UNIQUE_MIDDLE_MARKER" in full
+        assert "row 2500" in full  # the omitted-middle row is in the stored file
+
+
+class TestCharLimitConfig:
+    def test_default_when_unset(self):
+        with patch("tools.web_tools._load_web_config", return_value={}):
+            assert wt._get_extract_char_limit() == wt.DEFAULT_EXTRACT_CHAR_LIMIT
+
+    def test_config_override(self):
+        with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": 40000}):
+            assert wt._get_extract_char_limit() == 40000
+
+    def test_clamps_floor(self):
+        with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": 100}):
+            assert wt._get_extract_char_limit() == 2000
+
+    def test_bad_value_falls_back(self):
+        with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": "nope"}):
+            assert wt._get_extract_char_limit() == wt.DEFAULT_EXTRACT_CHAR_LIMIT
+
+
+class TestEndToEnd:
+    def test_web_extract_truncates_large_page_no_llm(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+        big = "\n".join(f"para {i} " + "y" * 80 for i in range(3000))
+
+        class FakeProvider:
+            name = "fake"
+            display_name = "Fake"
+
+            def supports_extract(self):
+                return True
+
+            async def extract(self, urls, **kwargs):
+                return [{"url": urls[0], "title": "Big Page", "content": big,
+                         "raw_content": big, "metadata": {}}]
+
+        with patch("tools.web_tools._ensure_web_plugins_loaded"), \
+             patch("tools.web_tools._get_extract_backend", return_value="fake"), \
+             patch("tools.web_tools.async_is_safe_url", new=_AsyncTrue()), \
+             patch("agent.web_search_registry.get_provider", return_value=FakeProvider()):
+            result = json.loads(asyncio.new_event_loop().run_until_complete(
+                wt.web_extract_tool(["https://example.com/big"], char_limit=5000)
+            ))
+
+        assert "results" in result
+        content = result["results"][0]["content"]
+        assert "[TRUNCATED]" in content
+        assert "Full text saved to:" in content
+        # No LLM was involved: para 0 (head) and the last para (tail) are verbatim.
+        assert "para 0 " in content
+        assert "para 2999 " in content
+
+
+def _make_awaitable(value):
+    async def _coro(*a, **k):
+        return value
+    return _coro()
+
+
+class _AsyncTrue:
+    """Async callable that always returns True (re-awaitable per call)."""
+    async def __call__(self, *a, **k):
+        return True
diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py
index 712a372867a..9f488ee1189 100644
--- a/tests/tools/test_website_policy.py
+++ b/tests/tools/test_website_policy.py
@@ -398,7 +398,7 @@ class TestWebToolPolicy:
         # Force the firecrawl plugin to be the active extract provider.
         monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
 
-        result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False))
+        result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"]))
 
         assert result["results"][0]["url"] == "https://blocked.test"
         assert "Blocked by website policy" in result["results"][0]["error"]
@@ -443,7 +443,7 @@ class TestWebToolPolicy:
         monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False)
         monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
 
-        result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False))
+        result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"]))
 
         assert result["results"][0]["url"] == "https://blocked.test/final"
         assert result["results"][0]["content"] == ""
diff --git a/tools/code_execution_tool.py b/tools/code_execution_tool.py
index 8946de73750..402777eab61 100644
--- a/tools/code_execution_tool.py
+++ b/tools/code_execution_tool.py
@@ -219,9 +219,9 @@ _TOOL_STUBS = {
     ),
     "web_extract": (
         "web_extract",
-        "urls: list",
-        '"""Extract content from URLs. Returns dict with results list of {url, title, content, error}."""',
-        '{"urls": urls}',
+        "urls: list, char_limit: int = None",
+        '"""Extract content from URLs (no LLM summarization). Returns dict with results list of {url, title, content, error}. Pages over char_limit (default 15000) are head+tail truncated with the full text stored on disk; the content footer gives the path. content is markdown."""',
+        '{"urls": urls, "char_limit": char_limit}',
     ),
     "read_file": (
         "read_file",
@@ -1727,8 +1727,9 @@ _TOOL_DOC_LINES = [
      "  web_search(query: str, limit: int = 5) -> dict\n"
      "    Returns {\"data\": {\"web\": [{\"url\", \"title\", \"description\"}, ...]}}"),
     ("web_extract",
-     "  web_extract(urls: list[str]) -> dict\n"
-     "    Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown"),
+     "  web_extract(urls: list[str], char_limit: int = None) -> dict\n"
+     "    Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown.\n"
+     "    No LLM summarization. Pages over char_limit (default 15000) are head+tail truncated; full text stored on disk (path in the content footer)."),
     ("read_file",
      "  read_file(path: str, offset: int = 1, limit: int = 500) -> dict\n"
      "    Lines are 1-indexed. Returns {\"content\": \"...\", \"total_lines\": N}"),
diff --git a/tools/credential_files.py b/tools/credential_files.py
index 7d6520820c7..b7f1ff773e8 100644
--- a/tools/credential_files.py
+++ b/tools/credential_files.py
@@ -349,6 +349,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [
     ("cache/audio", "audio_cache"),
     ("cache/videos", "video_cache"),
     ("cache/screenshots", "browser_screenshots"),
+    ("cache/web", "web_cache"),
 ]
 
 
diff --git a/tools/web_tools.py b/tools/web_tools.py
index 133489b0a89..0635b23f5d4 100644
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -83,11 +83,6 @@ _parallel_client: Optional[Any] = None
 _async_parallel_client: Optional[Any] = None
 _exa_client: Optional[Any] = None
 
-from agent.auxiliary_client import (
-    async_call_llm,
-    extract_content_or_reasoning,
-    get_async_text_auxiliary_client,
-)
 from tools.debug_helpers import DebugSession
 # Imported solely so unit tests can monkeypatch these names on
 # tools.web_tools (the firecrawl plugin reads them via its own import chain).
@@ -305,445 +300,144 @@ def _web_requires_env() -> list[str]:
 # unit-test patches.
 
 
-DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
-
-def _is_nous_auxiliary_client(client: Any) -> bool:
-    """Return True when the resolved auxiliary backend is Nous Portal."""
-    from urllib.parse import urlparse
-
-    base_url = str(getattr(client, "base_url", "") or "")
-    host = (urlparse(base_url).hostname or "").lower()
-    return host == "nousresearch.com" or host.endswith(".nousresearch.com")
-
-
-def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optional[Any], Optional[str], Dict[str, Any]]:
-    """Resolve the current web-extract auxiliary client, model, and extra body."""
-    client, default_model = get_async_text_auxiliary_client("web_extract")
-    configured_model = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
-    effective_model = model or configured_model or default_model
-
-    extra_body: Dict[str, Any] = {}
-    if client is not None and _is_nous_auxiliary_client(client):
-        from agent.auxiliary_client import get_auxiliary_extra_body
-        from agent.portal_tags import nous_portal_tags
-        extra_body = get_auxiliary_extra_body() or {"tags": nous_portal_tags()}
-
-    return client, effective_model, extra_body
-
-
-def _get_default_summarizer_model() -> Optional[str]:
-    """Return the current default model for web extraction summarization."""
-    _, model, _ = _resolve_web_extract_auxiliary()
-    return model
+# Default budget (characters) of clean page text sent to the model. Pages at
+# or under this size are returned whole; larger pages are head+tail truncated
+# and the full text is stored on disk (see _store_full_text). This budget costs
+# context, not API dollars, so it is generous relative to the old 5k summary cap.
+# Override via web.extract_char_limit in config.yaml.
+DEFAULT_EXTRACT_CHAR_LIMIT = 15000
 
 _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
 
 
-async def process_content_with_llm(
-    content: str, 
-    url: str = "", 
-    title: str = "",
-    model: Optional[str] = None,
-    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
-) -> Optional[str]:
-    """
-    Process web content using LLM to create intelligent summaries with key excerpts.
-    
-    This function uses Gemini 3 Flash Preview (or specified model) via OpenRouter API 
-    to intelligently extract key information and create markdown summaries,
-    significantly reducing token usage while preserving all important information.
-    
-    For very large content (>500k chars), uses chunked processing with synthesis.
-    For extremely large content (>2M chars), refuses to process entirely.
-    
-    Args:
-        content (str): The raw content to process
-        url (str): The source URL (for context, optional)
-        title (str): The page title (for context, optional)
-        model (str): The model to use for processing (default: google/gemini-3-flash-preview)
-        min_length (int): Minimum content length to trigger processing (default: 5000)
-        
-    Returns:
-        Optional[str]: Processed markdown content, or None if content too short or processing fails
-    """
-    # Size thresholds
-    MAX_CONTENT_SIZE = 2_000_000  # 2M chars - refuse entirely above this
-    CHUNK_THRESHOLD = 500_000     # 500k chars - use chunked processing above this
-    CHUNK_SIZE = 100_000          # 100k chars per chunk
-    MAX_OUTPUT_SIZE = 5000        # Hard cap on final output size
-    
+def _get_extract_char_limit() -> int:
+    """Resolve the per-page char budget from config, clamped to a sane range."""
     try:
-        content_len = len(content)
-        
-        # Refuse if content is absurdly large
-        if content_len > MAX_CONTENT_SIZE:
-            size_mb = content_len / 1_000_000
-            logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb)
-            return f"[Content too large to process: {size_mb:.1f}MB. Try a more focused source URL.]"
-        
-        # Skip processing if content is too short
-        if content_len < min_length:
-            logger.debug("Content too short (%d < %d chars), skipping LLM processing", content_len, min_length)
-            return None
-        
-        # Create context information
-        context_info = []
-        if title:
-            context_info.append(f"Title: {title}")
-        if url:
-            context_info.append(f"Source: {url}")
-        context_str = "\n".join(context_info) + "\n\n" if context_info else ""
-        
-        # Check if we need chunked processing
-        if content_len > CHUNK_THRESHOLD:
-            logger.info("Content large (%d chars). Using chunked processing...", content_len)
-            return await _process_large_content_chunked(
-                content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE
-            )
-        
-        # Standard single-pass processing for normal content
-        logger.info("Processing content with LLM (%d characters)", content_len)
-        
-        processed_content = await _call_summarizer_llm(content, context_str, model)
-        
-        if processed_content:
-            # Enforce output cap
-            if len(processed_content) > MAX_OUTPUT_SIZE:
-                processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]"
-            
-            # Log compression metrics
-            processed_length = len(processed_content)
-            compression_ratio = processed_length / content_len if content_len > 0 else 1.0
-            logger.info("Content processed: %d -> %d chars (%.1f%%)", content_len, processed_length, compression_ratio * 100)
-        
-        return processed_content
-        
-    except Exception as e:
-        logger.warning(
-            "web_extract LLM summarization failed (%s). "
-            "Tip: increase auxiliary.web_extract.timeout in config.yaml "
-            "or switch to a faster auxiliary model.",
-            str(e)[:120],
+        configured = _load_web_config().get("extract_char_limit")
+        if configured is not None:
+            value = int(configured)
+            # Floor at 2k (below that the footer dominates); cap at 500k as a
+            # generous guard so a typo can't blow up context.
+            return max(2000, min(value, 500_000))
+    except (TypeError, ValueError):
+        pass
+    return DEFAULT_EXTRACT_CHAR_LIMIT
+
+
+def convert_base64_images_to_links(text: str) -> str:
+    """Replace inline base64 image blobs with short ``[IMAGE]`` placeholders.
+
+    base64 image payloads are token bombs (a single inline PNG can be tens of
+    thousands of characters), so we never send the raw bytes to the model. But
+    we preserve the fact that an image was there, and its alt text, as an
+    inspectable placeholder. Real (http/https) markdown image links are left
+    untouched so the agent can ``web_extract`` / ``vision_analyze`` them.
+
+    Transformations:
+      ``![alt](data:image/png;base64,AAAA...)``  -> ``[IMAGE: alt]`` (``[IMAGE]`` if alt is empty)
+      ``(data:image/png;base64,AAAA...)``        -> ``[IMAGE]``
+      bare ``data:image/...;base64,AAAA...``     -> ``[IMAGE]``
+    """
+    # 1. Markdown image with base64 source -> keep alt text, drop the blob.
+    def _md_repl(m: "re.Match[str]") -> str:
+        alt = (m.group("alt") or "").strip()
+        return f"[IMAGE: {alt}]" if alt else "[IMAGE]"
+
+    md_b64 = re.compile(
+        r"!\[(?P<alt>[^\]]*)\]\(\s*data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+\)"
+    )
+    out = md_b64.sub(_md_repl, text)
+
+    # 2. Parenthesised base64 (non-markdown) and 3. bare base64 -> [IMAGE].
+    out = re.sub(r"\(\s*data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+\)", "[IMAGE]", out)
+    out = re.sub(r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+", "[IMAGE]", out)
+    return out
+
+
+def _store_full_text(url: str, content: str) -> Optional[str]:
+    """Write the full extracted page to cache/web and return its absolute path.
+
+    The file is mounted read-only into remote backends (Docker/Modal/SSH) via
+    credential_files._CACHE_DIRS, so the agent's terminal/read_file tools can
+    page through the complete text on any backend. Returns None on failure
+    (storage is best-effort; truncated content is still returned to the model).
+    """
+    try:
+        import hashlib
+        from urllib.parse import urlparse
+        from hermes_constants import get_hermes_dir
+
+        cache_dir = get_hermes_dir("cache/web", "web_cache")
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        host = (urlparse(url).hostname or "page").replace(":", "_")
+        slug = re.sub(r"[^A-Za-z0-9._-]", "-", host)[:60].strip("-") or "page"
+        digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:10]
+        path = cache_dir / f"{slug}-{digest}.md"
+        path.write_text(content, encoding="utf-8")
+        return str(path)
+    except Exception as exc:  # noqa: BLE001
+        logger.debug("Failed to store full web_extract text for %s: %s", url, exc)
+        return None
+
+
+def _truncate_with_footer(
+    content: str,
+    url: str,
+    char_limit: int,
+) -> tuple[str, bool]:
+    """Return (model_text, was_truncated) for one page's clean content.
+
+    Pages at or under ``char_limit`` are returned whole. Larger pages get a
+    head+tail window (~75% head / ~25% tail) cut on a markdown line boundary
+    where possible, plus an explicit footer telling the model exactly how much
+    it is seeing, where the full text is stored, and which read_file call pages
+    in the omitted middle. Deterministic — no model involvement.
+    """
+    if len(content) <= char_limit:
+        return content, False
+
+    head_budget = int(char_limit * 0.75)
+    tail_budget = char_limit - head_budget
+
+    head = content[:head_budget]
+    tail = content[-tail_budget:]
+    # Snap the head cut back to the last newline so we don't slice mid-line.
+    nl = head.rfind("\n")
+    if nl > head_budget * 0.5:
+        head = head[:nl]
+    # Snap the tail cut forward to the next newline for the same reason.
+    nl = tail.find("\n")
+    if 0 <= nl < tail_budget * 0.5:
+        tail = tail[nl + 1:]
+
+    total = len(content)
+    stored_path = _store_full_text(url, content)
+    shown = len(head) + len(tail)
+
+    footer_lines = [
+        "",
+        "─" * 8 + " [TRUNCATED] " + "─" * 8,
+        f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) "
+        f"of {total:,} total clean characters.",
+    ]
+    if stored_path:
+        footer_lines.append(f"Full text saved to: {stored_path}")
+        footer_lines.append(
+            f'To read the omitted middle: read_file path="{stored_path}" '
+            f"offset=<line> limit=<n>  (the file is the complete page)."
         )
-        # Fall back to truncated raw content instead of returning a useless
-        # error message.  The first ~5000 chars are almost always more useful
-        # to the model than "[Failed to process content: ...]".
-        truncated = content[:MAX_OUTPUT_SIZE]
-        if len(content) > MAX_OUTPUT_SIZE:
-            truncated += (
-                f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of "
-                f"{len(content):,} chars. LLM summarization timed out. "
-                f"To fix: increase auxiliary.web_extract.timeout in config.yaml, "
-                f"or use a faster auxiliary model. Use browser_navigate for the full page.]"
-            )
-        return truncated
-
-
-async def _call_summarizer_llm(
-    content: str, 
-    context_str: str, 
-    model: Optional[str], 
-    max_tokens: int = 20000,
-    is_chunk: bool = False,
-    chunk_info: str = ""
-) -> Optional[str]:
-    """
-    Make a single LLM call to summarize content.
-    
-    Args:
-        content: The content to summarize
-        context_str: Context information (title, URL)
-        model: Model to use
-        max_tokens: Maximum output tokens
-        is_chunk: Whether this is a chunk of a larger document
-        chunk_info: Information about chunk position (e.g., "Chunk 2/5")
-        
-    Returns:
-        Summarized content or None on failure
-    """
-    if is_chunk:
-        # Chunk-specific prompt - aware that this is partial content
-        system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.
-
-Important guidelines for chunk processing:
-1. Do NOT write introductions or conclusions - this is a partial document
-2. Focus on extracting ALL key facts, figures, data points, and insights from this section
-3. Preserve important quotes, code snippets, and specific details verbatim
-4. Use bullet points and structured formatting for easy synthesis later
-5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them
-
-Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow."""
-
-        user_prompt = f"""Extract key information from this SECTION of a larger document:
-
-{context_str}{chunk_info}
-
-SECTION CONTENT:
-{content}
-
-Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions."""
-
     else:
-        # Standard full-document prompt
-        system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.
+        footer_lines.append(
+            "Full text could not be stored; re-run web_extract on a more "
+            "specific URL or use browser_navigate for the complete page."
+        )
+    footer_lines.append("─" * 29)
 
-Create a well-structured markdown summary that includes:
-1. Key excerpts (quotes, code snippets, important facts) in their original format
-2. Comprehensive summary of all other important information
-3. Proper markdown formatting with headers, bullets, and emphasis
+    model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail
+    model_text += "\n" + "\n".join(footer_lines)
+    return model_text, True
 
-Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""
-
-        user_prompt = f"""Please process this web content and create a comprehensive markdown summary:
-
-{context_str}CONTENT TO PROCESS:
-{content}
-
-Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
-
-    # Call the LLM with retry logic — keep retries low since summarization
-    # is a nice-to-have; the caller falls back to truncated content on failure.
-    max_retries = 2
-    retry_delay = 2
-    last_error = None
-
-    for attempt in range(max_retries):
-        try:
-            aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
-            if aux_client is None or not effective_model:
-                logger.warning("No auxiliary model available for web content processing")
-                return None
-            call_kwargs = {
-                "task": "web_extract",
-                "model": effective_model,
-                "messages": [
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt},
-                ],
-                "temperature": 0.1,
-                "max_tokens": max_tokens,
-                # No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout
-                # from config.yaml. Fresh configs ship with 360s; if the key is absent
-                # the runtime default is 30s (_DEFAULT_AUX_TIMEOUT in
-                # agent/auxiliary_client.py). Users with slow local models should set
-                # or increase auxiliary.web_extract.timeout in config.yaml.
-            }
-            if extra_body:
-                call_kwargs["extra_body"] = extra_body
-            response = await async_call_llm(**call_kwargs)
-            content = extract_content_or_reasoning(response)
-            if content:
-                return content
-            # Reasoning-only / empty response — let the retry loop handle it
-            logger.warning("LLM returned empty content (attempt %d/%d), retrying", attempt + 1, max_retries)
-            if attempt < max_retries - 1:
-                await asyncio.sleep(retry_delay)
-                retry_delay = min(retry_delay * 2, 60)
-                continue
-            return content  # Return whatever we got after exhausting retries
-        except RuntimeError:
-            logger.warning("No auxiliary model available for web content processing")
-            return None
-        except Exception as api_error:
-            last_error = api_error
-            if attempt < max_retries - 1:
-                logger.warning("LLM API call failed (attempt %d/%d): %s", attempt + 1, max_retries, str(api_error)[:100])
-                logger.warning("Retrying in %ds...", retry_delay)
-                await asyncio.sleep(retry_delay)
-                retry_delay = min(retry_delay * 2, 60)
-            else:
-                raise last_error
-    
-    return None
-
-
-async def _process_large_content_chunked(
-    content: str, 
-    context_str: str, 
-    model: Optional[str], 
-    chunk_size: int,
-    max_output_size: int
-) -> Optional[str]:
-    """
-    Process large content by chunking, summarizing each chunk in parallel,
-    then synthesizing the summaries.
-    
-    Args:
-        content: The large content to process
-        context_str: Context information
-        model: Model to use
-        chunk_size: Size of each chunk in characters
-        max_output_size: Maximum final output size
-        
-    Returns:
-        Synthesized summary or None on failure
-    """
-    # Split content into chunks
-    chunks = []
-    for i in range(0, len(content), chunk_size):
-        chunk = content[i:i + chunk_size]
-        chunks.append(chunk)
-    
-    logger.info("Split into %d chunks of ~%d chars each", len(chunks), chunk_size)
-    
-    # Summarize each chunk in parallel
-    async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]:
-        """Summarize a single chunk."""
-        try:
-            chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]"
-            summary = await _call_summarizer_llm(
-                chunk_content, 
-                context_str, 
-                model, 
-                max_tokens=10000,
-                is_chunk=True,
-                chunk_info=chunk_info
-            )
-            if summary:
-                logger.info("Chunk %d/%d summarized: %d -> %d chars", chunk_idx + 1, len(chunks), len(chunk_content), len(summary))
-            return chunk_idx, summary
-        except Exception as e:
-            logger.warning("Chunk %d/%d failed: %s", chunk_idx + 1, len(chunks), str(e)[:50])
-            return chunk_idx, None
-    
-    # Run all chunk summarizations in parallel
-    tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
-    # Use return_exceptions=True so a single task failure does not discard
-    # all other successfully summarized chunks.
-    results = await asyncio.gather(*tasks, return_exceptions=True)
-
-    # Filter out exceptions, then collect successful summaries in order
-    successful_results = []
-    for result_item in results:
-        if isinstance(result_item, BaseException):
-            logger.warning("Chunk summarization task failed: %s", result_item)
-            continue
-        successful_results.append(result_item)
-
-    summaries = []
-    for chunk_idx, summary in sorted(successful_results, key=lambda x: x[0]):
-        if summary:
-            summaries.append(f"## Section {chunk_idx + 1}\n{summary}")
-    
-    if not summaries:
-        logger.debug("All chunk summarizations failed")
-        return "[Failed to process large content: all chunk summarizations failed]"
-    
-    logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks))
-    
-    # If only one chunk succeeded, just return it (with cap)
-    if len(summaries) == 1:
-        result = summaries[0]
-        if len(result) > max_output_size:
-            result = result[:max_output_size] + "\n\n[... truncated ...]"
-        return result
-    
-    # Synthesize the summaries into a final summary
-    logger.info("Synthesizing %d summaries...", len(summaries))
-    
-    combined_summaries = "\n\n---\n\n".join(summaries)
-    
-    synthesis_prompt = f"""You have been given summaries of different sections of a large document. 
-Synthesize these into ONE cohesive, comprehensive summary that:
-1. Removes redundancy between sections
-2. Preserves all key facts, figures, and actionable information
-3. Is well-organized with clear structure
-4. Is under {max_output_size} characters
-
-{context_str}SECTION SUMMARIES:
-{combined_summaries}
-
-Create a single, unified markdown summary."""
-
-    try:
-        aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
-        if aux_client is None or not effective_model:
-            logger.warning("No auxiliary model for synthesis, concatenating summaries")
-            fallback = "\n\n".join(summaries)
-            if len(fallback) > max_output_size:
-                fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
-            return fallback
-
-        call_kwargs = {
-            "task": "web_extract",
-            "model": effective_model,
-            "messages": [
-                {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
-                {"role": "user", "content": synthesis_prompt},
-            ],
-            "temperature": 0.1,
-            "max_tokens": 20000,
-        }
-        if extra_body:
-            call_kwargs["extra_body"] = extra_body
-        response = await async_call_llm(**call_kwargs)
-        final_summary = extract_content_or_reasoning(response)
-
-        # Retry once on empty content (reasoning-only response)
-        if not final_summary:
-            logger.warning("Synthesis LLM returned empty content, retrying once")
-            response = await async_call_llm(**call_kwargs)
-            final_summary = extract_content_or_reasoning(response)
-
-        # If still None after retry, fall back to concatenated summaries
-        if not final_summary:
-            logger.warning("Synthesis failed after retry — concatenating chunk summaries")
-            fallback = "\n\n".join(summaries)
-            if len(fallback) > max_output_size:
-                fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
-            return fallback
-
-        # Enforce hard cap
-        if len(final_summary) > max_output_size:
-            final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"
-        
-        original_len = len(content)
-        final_len = len(final_summary)
-        compression = final_len / original_len if original_len > 0 else 1.0
-        
-        logger.info("Synthesis complete: %d -> %d chars (%.2f%%)", original_len, final_len, compression * 100)
-        return final_summary
-        
-    except Exception as e:
-        logger.warning("Synthesis failed: %s", str(e)[:100])
-        # Fall back to concatenated summaries with truncation
-        fallback = "\n\n".join(summaries)
-        if len(fallback) > max_output_size:
-            fallback = fallback[:max_output_size] + "\n\n[... truncated due to synthesis failure ...]"
-        return fallback
-
-
-def clean_base64_images(text: str) -> str:
-    """
-    Remove base64 encoded images from text to reduce token count and clutter.
-    
-    This function finds and removes base64 encoded images in various formats:
-    - (data:image/png;base64,...)
-    - (data:image/jpeg;base64,...)
-    - (data:image/svg+xml;base64,...)
-    - data:image/[type];base64,... (without parentheses)
-    
-    Args:
-        text: The text content to clean
-        
-    Returns:
-        Cleaned text with base64 images replaced with placeholders
-    """
-    # Pattern to match base64 encoded images wrapped in parentheses
-    # Matches: (data:image/[type];base64,[base64-string])
-    base64_with_parens_pattern = r'\(data:image/[^;]+;base64,[A-Za-z0-9+/=]+\)'
-    
-    # Pattern to match base64 encoded images without parentheses
-    # Matches: data:image/[type];base64,[base64-string]
-    base64_pattern = r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+'
-    
-    # Replace parentheses-wrapped images first
-    cleaned_text = re.sub(base64_with_parens_pattern, '[BASE64_IMAGE_REMOVED]', text)
-    
-    # Then replace any remaining non-parentheses images
-    cleaned_text = re.sub(base64_pattern, '[BASE64_IMAGE_REMOVED]', cleaned_text)
-    
-    return cleaned_text
 
 
 # ─── Exa / Parallel inline helpers — moved into plugins ──────────────────────
@@ -894,29 +588,32 @@ def web_search_tool(query: str, limit: int = 5) -> str:
 async def web_extract_tool(
     urls: List[str],
     format: str = None,
-    use_llm_processing: bool = True,
-    model: Optional[str] = None,
-    min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
+    char_limit: Optional[int] = None,
 ) -> str:
     """
     Extract content from specific web pages using available extraction API backend.
 
-    This function provides a generic interface for web content extraction that
-    can work with multiple backends. Currently uses Firecrawl.
+    Returns clean page content (markdown/text) with NO LLM summarization. The
+    extract backends (Firecrawl, Tavily, Exa, Parallel) already return clean,
+    boilerplate-stripped content, so we return it directly and fast. Pages over
+    ``char_limit`` are head+tail truncated with an explicit footer; the full
+    text is stored under cache/web and the footer tells the model how to
+    read_file the omitted middle. Inline base64 images are replaced with
+    ``[IMAGE: alt]`` placeholders (real image URLs are preserved as links).
 
     Args:
         urls (List[str]): List of URLs to extract content from
         format (str): Desired output format ("markdown" or "html", optional)
-        use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
-        model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
-        min_length (int): Minimum content length to trigger LLM processing (default: 5000)
+        char_limit (Optional[int]): Per-page char budget, clamped to 2000-500000
+            (default: web.extract_char_limit or 15000). Larger pages truncate.
 
     Security: URLs are checked for embedded secrets before fetching.
-    
+
     Returns:
-        str: JSON string containing extracted content. If LLM processing is enabled and successful,
-             the 'content' field will contain the processed markdown summary instead of raw content.
-    
+        str: JSON string with a ``results`` list; each entry has
+             ``url``, ``title``, ``content``, ``error``. ``content`` is the
+             (possibly truncated) clean page text.
+
     Raises:
         Exception: If extraction fails or API key is not set
     """
@@ -944,16 +641,14 @@ async def web_extract_tool(
         "parameters": {
             "urls": normalized_urls,
             "format": format,
-            "use_llm_processing": use_llm_processing,
-            "model": model,
-            "min_length": min_length
+            "char_limit": char_limit,
         },
         "error": None,
         "pages_extracted": 0,
-        "pages_processed_with_llm": 0,
+        "pages_truncated": 0,
         "original_response_size": 0,
         "final_response_size": 0,
-        "compression_metrics": [],
+        "truncation_metrics": [],
         "processing_applied": []
     }
     
@@ -1053,91 +748,39 @@ async def web_extract_tool(
         
         debug_call_data["pages_extracted"] = pages_extracted
         debug_call_data["original_response_size"] = len(json.dumps(response))
-        effective_model = model or _get_default_summarizer_model()
-        auxiliary_available = check_auxiliary_model()
-        
-        # Process each result with LLM if enabled
-        if use_llm_processing and auxiliary_available:
-            logger.info("Processing extracted content with LLM (parallel)...")
-            debug_call_data["processing_applied"].append("llm_processing")
-            
-            # Prepare tasks for parallel processing
-            async def process_single_result(result):
-                """Process a single result with LLM and return updated result with metrics."""
-                url = result.get('url', 'Unknown URL')
-                title = result.get('title', '')
-                raw_content = result.get('raw_content', '') or result.get('content', '')
-                
-                if not raw_content:
-                    return result, None, "no_content"
-                
-                original_size = len(raw_content)
-                
-                # Process content with LLM
-                processed = await process_content_with_llm(
-                    raw_content, url, title, effective_model, min_length
-                )
-                
-                if processed:
-                    processed_size = len(processed)
-                    compression_ratio = processed_size / original_size if original_size > 0 else 1.0
-                    
-                    # Update result with processed content
-                    result['content'] = processed
-                    result['raw_content'] = raw_content
-                    
-                    metrics = {
-                        "url": url,
-                        "original_size": original_size,
-                        "processed_size": processed_size,
-                        "compression_ratio": compression_ratio,
-                        "model_used": effective_model
-                    }
-                    return result, metrics, "processed"
-                else:
-                    metrics = {
-                        "url": url,
-                        "original_size": original_size,
-                        "processed_size": original_size,
-                        "compression_ratio": 1.0,
-                        "model_used": None,
-                        "reason": "content_too_short"
-                    }
-                    return result, metrics, "too_short"
-            
-            # Run all LLM processing in parallel
-            results_list = response.get('results', [])
-            tasks = [process_single_result(result) for result in results_list]
-            # Use return_exceptions=True so a single task failure does not
-            # discard all other successfully processed results.
-            processed_results = await asyncio.gather(*tasks, return_exceptions=True)
 
-            # Collect metrics and print results
-            for result_item in processed_results:
-                if isinstance(result_item, BaseException):
-                    logger.warning("Web result processing task failed: %s", result_item)
-                    continue
-                result, metrics, status = result_item
-                url = result.get('url', 'Unknown URL')
-                if status == "processed":
-                    debug_call_data["compression_metrics"].append(metrics)
-                    debug_call_data["pages_processed_with_llm"] += 1
-                    logger.info("%s (processed)", url)
-                elif status == "too_short":
-                    debug_call_data["compression_metrics"].append(metrics)
-                    logger.info("%s (no processing - content too short)", url)
-                else:
-                    logger.warning("%s (no content to process)", url)
-        else:
-            if use_llm_processing and not auxiliary_available:
-                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
-                debug_call_data["processing_applied"].append("llm_processing_unavailable")
-            # Print summary of extracted pages for debugging (original behavior)
-            for result in response.get('results', []):
-                url = result.get('url', 'Unknown URL')
-                content_length = len(result.get('raw_content', ''))
-                logger.info("%s (%d characters)", url, content_length)
-        
+        effective_char_limit = char_limit if char_limit is not None else _get_extract_char_limit()
+        try:
+            effective_char_limit = max(2000, min(int(effective_char_limit), 500_000))
+        except (TypeError, ValueError):
+            effective_char_limit = DEFAULT_EXTRACT_CHAR_LIMIT
+
+        # Truncate-and-store: no LLM. For each result, convert inline base64
+        # images to labeled placeholders (keeping alt text + real image URLs),
+        # then return the clean content directly if within budget, or a
+        # head+tail window plus a footer pointing at the stored full text.
+        debug_call_data["processing_applied"].append("truncate_and_store")
+        for result in response.get("results", []):
+            if result.get("error"):
+                continue
+            url = result.get("url", "")
+            raw_content = result.get("raw_content", "") or result.get("content", "")
+            if not raw_content:
+                continue
+            clean = convert_base64_images_to_links(raw_content)
+            model_text, truncated = _truncate_with_footer(clean, url, effective_char_limit)
+            result["content"] = model_text
+            if truncated:
+                debug_call_data["pages_truncated"] += 1
+                debug_call_data["truncation_metrics"].append({
+                    "url": url,
+                    "original_size": len(clean),
+                    "sent_size": len(model_text),
+                })
+                logger.info("%s (truncated %d -> %d chars)", url, len(clean), len(model_text))
+            else:
+                logger.info("%s (%d chars, whole)", url, len(clean))
+
         # Trim output to minimal fields per entry: title, content, error
         trimmed_results = [
             {
@@ -1153,16 +796,16 @@ async def web_extract_tool(
 
         if trimmed_response.get("results") == []:
             result_json = tool_error("Content was inaccessible or not found")
-
-            cleaned_result = clean_base64_images(result_json)
-        
         else:
             result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False)
-            
-            cleaned_result = clean_base64_images(result_json)
-        
+
+        # base64 images were already converted to placeholders per-result above;
+        # this is a belt-and-suspenders sweep over the serialized JSON in case a
+        # provider tucked a blob somewhere unexpected (e.g. metadata).
+        cleaned_result = convert_base64_images_to_links(result_json)
+
         debug_call_data["final_response_size"] = len(cleaned_result)
-        debug_call_data["processing_applied"].append("base64_image_removal")
+        debug_call_data["processing_applied"].append("base64_image_conversion")
         
         # Log debug information
         _debug.log_call("web_extract_tool", debug_call_data)
@@ -1193,28 +836,18 @@ def check_web_api_key() -> bool:
     )
 
 
-def check_auxiliary_model() -> bool:
-    """Check if an auxiliary text model is available for LLM content processing."""
-    client, _, _ = _resolve_web_extract_auxiliary()
-    return client is not None
-
-
-
-
 if __name__ == "__main__":
     """
     Simple test/demo when run directly
     """
     print("🌐 Standalone Web Tools Module")
     print("=" * 40)
-    
+
     # Check if API keys are available
     web_available = check_web_api_key()
     tool_gateway_available = _is_tool_gateway_ready()
     firecrawl_key_available = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
     firecrawl_url_available = bool(os.getenv("FIRECRAWL_API_URL", "").strip())
-    nous_available = check_auxiliary_model()
-    default_summarizer_model = _get_default_summarizer_model()
 
     if web_available:
         backend = _get_backend()
@@ -1246,29 +879,20 @@ if __name__ == "__main__":
             f"{_firecrawl_backend_help_suffix()}"
         )
 
-    if not nous_available:
-        print("❌ No auxiliary model available for LLM content processing")
-        print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
-        print("⚠️  Without an auxiliary model, LLM content processing will be disabled")
-    else:
-        print(f"✅ Auxiliary model available: {default_summarizer_model}")
-
     if not web_available:
         sys.exit(1)
 
     print("🛠️  Web tools ready for use!")
-    
-    if nous_available:
-        print(f"🧠 LLM content processing available with {default_summarizer_model}")
-        print(f"   Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")
-    
+    print(f"   Extract char limit: {_get_extract_char_limit()} chars "
+          "(pages over this are truncated; full text stored in cache/web)")
+
     # Show debug mode status
     if _debug.active:
         print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
         print(f"   Debug logs will be saved to: {_debug.log_dir}/web_tools_debug_{_debug.session_id}.json")
     else:
         print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)")
-    
+
     print("\nBasic usage:")
     print("  from web_tools import web_search_tool, web_extract_tool")
     print("  import asyncio")
@@ -1276,37 +900,16 @@ if __name__ == "__main__":
     print("  # Search (synchronous)")
     print("  results = web_search_tool('Python tutorials')")
     print("")
-    print("  # Extract (asynchronous)")
+    print("  # Extract (asynchronous, no LLM — truncate-and-store)")
     print("  async def main():")
     print("      content = await web_extract_tool(['https://example.com'])")
+    print("      # bigger budget for one call:")
+    print("      content = await web_extract_tool(['https://docs.python.org'], char_limit=40000)")
     print("  asyncio.run(main())")
-    
-    if nous_available:
-        print("\nLLM-enhanced usage:")
-        print("  # Content automatically processed for pages >5000 chars (default)")
-        print("  content = await web_extract_tool(['https://python.org/about/'])")
-        print("")
-        print("  # Customize processing parameters")
-        print("  content = await web_extract_tool(")
-        print("      ['https://docs.python.org'],")
-        print("      model='google/gemini-3-flash-preview',")
-        print("      min_length=3000")
-        print("  )")
-        print("")
-        print("  # Disable LLM processing")
-        print("  raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")
-    
+
     print("\nDebug mode:")
-    print("  # Enable debug logging")
     print("  export WEB_TOOLS_DEBUG=true")
-    print("  # Debug logs capture:")
-    print("  # - All tool calls with parameters")
-    print("  # - Original API responses")
-    print("  # - LLM compression metrics")
-    print("  # - Final processed results")
     print("  # Logs saved to: ./logs/web_tools_debug_UUID.json")
-    
-    print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")
 
 
 # ---------------------------------------------------------------------------
@@ -1338,7 +941,7 @@ WEB_SEARCH_SCHEMA = {
 
 WEB_EXTRACT_SCHEMA = {
     "name": "web_extract",
-    "description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
+    "description": "Extract content from web page URLs. Returns clean page content in markdown/text (no LLM summarization — fast). Also works with PDF URLs (arxiv papers, documents) — pass the PDF link directly. Pages within the char budget (default 15000) return whole; larger pages return a head+tail window with a footer telling you the full text's saved file path and the read_file call to page through the omitted middle. Inline images appear as [IMAGE: alt] placeholders; real image URLs are kept as links. If a URL fails or times out, use the browser tool instead.",
     "parameters": {
         "type": "object",
         "properties": {
@@ -1347,6 +950,11 @@ WEB_EXTRACT_SCHEMA = {
                 "items": {"type": "string"},
                 "description": "List of URLs to extract content from (max 5 URLs per call)",
                 "maxItems": 5
+            },
+            "char_limit": {
+                "type": "integer",
+                "description": "Optional per-page character budget sent back (default 15000). Pages larger than this are head+tail truncated with the full text stored to disk. Raise it when you need more of a long page inline.",
+                "minimum": 2000
             }
         },
         "required": ["urls"]
@@ -1368,7 +976,10 @@ registry.register(
     toolset="web",
     schema=WEB_EXTRACT_SCHEMA,
     handler=lambda args, **kw: web_extract_tool(
-        args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"),
+        args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [],
+        "markdown",
+        char_limit=args.get("char_limit"),
+    ),
     check_fn=check_web_api_key,
     requires_env=_web_requires_env(),
     is_async=True,