mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
feat(web_extract): truncate-and-store instead of LLM summarization (#54843)
* feat(web_extract): truncate-and-store instead of LLM summarization web_extract no longer runs an auxiliary LLM over scraped pages. The extract backends (Firecrawl/Tavily/Exa/Parallel) already return clean, boilerplate-stripped markdown, so we return it directly: pages within a char budget (default 15000, web.extract_char_limit) come back whole; larger pages get a head+tail window plus an explicit footer giving the stored full-text path and the read_file call to page through the omitted middle. The full clean text is written to cache/web (mounted read-only into remote backends like the other cache dirs), so nothing is lost. Inline base64 images are converted to [IMAGE: alt] placeholders (token bombs dropped) while real http(s) image URLs are preserved as links so the agent can still web_extract/vision_analyze them. Removes process_content_with_llm + the chunked summarizer + check_auxiliary_model + _resolve_web_extract_auxiliary. context_references._default_url_fetcher is updated to the truncate path, and its stale read of the data.documents shape is fixed to read results (it was silently returning empty). Live before/after eval (firecrawl, 4 URLs): 11.7x faster overall (176.6s -> 15.1s); 10-60x on large pages. Quality identical; findability 4/4 (answer recoverable from stored full text on every truncated page). web_search is unchanged. No scraper of our own was added; no changes to web_search. * fix(web_extract): add char_limit to execute_code web_extract stub The new web_extract char_limit param must appear in the code_execution_tool _TOOL_STUBS signature (and doc line) or test_stubs_cover_all_schema_params fails — the stub schema must cover every real schema param.
This commit is contained in:
parent
c6c1fd8b6b
commit
ee8cbfdc03
12 changed files with 370 additions and 661 deletions
|
|
@ -328,9 +328,9 @@ async def _fetch_url_content(
|
|||
async def _default_url_fetcher(url: str) -> str:
|
||||
from tools.web_tools import web_extract_tool
|
||||
|
||||
raw = await web_extract_tool([url], format="markdown", use_llm_processing=True)
|
||||
raw = await web_extract_tool([url], format="markdown")
|
||||
payload = json.loads(raw)
|
||||
docs = payload.get("data", {}).get("documents", [])
|
||||
docs = payload.get("results", [])
|
||||
if not docs:
|
||||
return ""
|
||||
doc = docs[0]
|
||||
|
|
|
|||
|
|
@ -1161,6 +1161,7 @@ DEFAULT_CONFIG = {
|
|||
"backend": "", # shared fallback — applies to both search and extract
|
||||
"search_backend": "", # per-capability override for web_search (e.g. "searxng")
|
||||
"extract_backend": "", # per-capability override for web_extract (e.g. "native")
|
||||
"extract_char_limit": 15000, # per-page char budget for web_extract; larger pages truncate + store full text in cache/web
|
||||
},
|
||||
|
||||
"browser": {
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ from tools.web_tools import (
|
|||
web_extract_tool,
|
||||
check_firecrawl_api_key,
|
||||
check_web_api_key,
|
||||
check_auxiliary_model,
|
||||
_get_backend,
|
||||
)
|
||||
|
||||
|
|
@ -129,12 +128,11 @@ class WebToolsTester:
|
|||
backend = _get_backend()
|
||||
self.log_result("Web Backend API Key", "passed", f"Using {backend} backend")
|
||||
|
||||
# Check auxiliary LLM provider (optional)
|
||||
if not check_auxiliary_model():
|
||||
self.log_result("Auxiliary LLM", "skipped", "No auxiliary LLM provider available (LLM tests will be skipped)")
|
||||
self.test_llm = False
|
||||
else:
|
||||
self.log_result("Auxiliary LLM", "passed", "Found")
|
||||
# Auxiliary LLM summarization was removed — web_extract is now
|
||||
# truncate-and-store (no LLM). Keep the flag off so any residual
|
||||
# LLM-path assertions stay skipped.
|
||||
self.log_result("Auxiliary LLM", "skipped", "web_extract no longer uses an LLM (truncate-and-store)")
|
||||
self.test_llm = False
|
||||
|
||||
return True
|
||||
|
||||
|
|
@ -261,12 +259,11 @@ class WebToolsTester:
|
|||
print(f" - {url}")
|
||||
|
||||
if self.verbose:
|
||||
print(f" Calling web_extract_tool(urls={test_urls}, format='markdown', use_llm_processing=False)")
|
||||
print(f" Calling web_extract_tool(urls={test_urls}, format='markdown')")
|
||||
|
||||
result = await web_extract_tool(
|
||||
test_urls,
|
||||
format="markdown",
|
||||
use_llm_processing=False
|
||||
)
|
||||
|
||||
# Parse result
|
||||
|
|
@ -360,8 +357,7 @@ class WebToolsTester:
|
|||
result = await web_extract_tool(
|
||||
[test_url],
|
||||
format="markdown",
|
||||
use_llm_processing=True,
|
||||
min_length=1000 # Lower threshold for testing
|
||||
char_limit=1000, # small budget to force truncation in the test
|
||||
)
|
||||
|
||||
data = json.loads(result)
|
||||
|
|
@ -466,7 +462,7 @@ class WebToolsTester:
|
|||
"web_backend": _get_backend() if check_web_api_key() else None,
|
||||
"firecrawl_api_key": check_firecrawl_api_key(),
|
||||
"parallel_api_key": bool(os.getenv("PARALLEL_API_KEY")),
|
||||
"auxiliary_model": check_auxiliary_model(),
|
||||
"auxiliary_model": False,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -126,7 +126,6 @@ class TestWebExtractSecretExfil:
|
|||
try:
|
||||
result = await web_tools.web_extract_tool(
|
||||
urls=["https://wttr.in/Köln"],
|
||||
use_llm_processing=False,
|
||||
)
|
||||
finally:
|
||||
web_search_registry._reset_for_tests()
|
||||
|
|
|
|||
|
|
@ -418,7 +418,6 @@ class TestDispatchersTriggerPluginDiscovery:
|
|||
result = json.loads(asyncio.run(
|
||||
web_tools.web_extract_tool(
|
||||
["https://example.com"],
|
||||
use_llm_processing=False,
|
||||
)
|
||||
))
|
||||
|
||||
|
|
|
|||
|
|
@ -160,47 +160,6 @@ class TestFirecrawlClientConfig:
|
|||
importlib.reload(tools.web_tools)
|
||||
assert tools.web_tools._read_nous_access_token() == "nous-token"
|
||||
|
||||
def test_check_auxiliary_model_re_resolves_backend_each_call(self):
|
||||
"""Availability checks should not be pinned to module import state."""
|
||||
import tools.web_tools
|
||||
|
||||
# Simulate the pre-fix import-time cache slot for regression coverage.
|
||||
tools.web_tools.__dict__["_aux_async_client"] = None
|
||||
|
||||
with patch(
|
||||
"tools.web_tools.get_async_text_auxiliary_client",
|
||||
side_effect=[(None, None), (MagicMock(base_url="https://api.openrouter.ai/v1"), "test-model")],
|
||||
):
|
||||
assert tools.web_tools.check_auxiliary_model() is False
|
||||
assert tools.web_tools.check_auxiliary_model() is True
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_summarizer_re_resolves_backend_after_initial_unavailable_state(self):
|
||||
"""Summarization should pick up a backend that becomes available later in-process."""
|
||||
import tools.web_tools
|
||||
|
||||
tools.web_tools.__dict__["_aux_async_client"] = None
|
||||
|
||||
response = MagicMock()
|
||||
response.choices = [MagicMock(message=MagicMock(content="summary text"))]
|
||||
|
||||
with patch(
|
||||
"tools.web_tools._resolve_web_extract_auxiliary",
|
||||
side_effect=[(None, None, {}), (MagicMock(base_url="https://api.openrouter.ai/v1"), "test-model", {})],
|
||||
), patch(
|
||||
"tools.web_tools.async_call_llm",
|
||||
new=AsyncMock(return_value=response),
|
||||
) as mock_async_call:
|
||||
assert tools.web_tools.check_auxiliary_model() is False
|
||||
result = await tools.web_tools._call_summarizer_llm(
|
||||
"Some content worth summarizing",
|
||||
"Source: https://example.com\n\n",
|
||||
None,
|
||||
)
|
||||
|
||||
assert result == "summary text"
|
||||
mock_async_call.assert_awaited_once()
|
||||
|
||||
# ── Singleton caching ────────────────────────────────────────────
|
||||
|
||||
def test_singleton_returns_same_instance(self):
|
||||
|
|
|
|||
|
|
@ -215,13 +215,13 @@ class TestWebExtractTavily:
|
|||
|
||||
with patch("tools.web_tools._get_backend", return_value="tavily"), \
|
||||
patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \
|
||||
patch("tools.web_tools.httpx.post", return_value=mock_response), \
|
||||
patch("tools.web_tools.process_content_with_llm", return_value=None):
|
||||
patch("tools.web_tools.httpx.post", return_value=mock_response):
|
||||
from tools.web_tools import web_extract_tool
|
||||
result = json.loads(asyncio.get_event_loop().run_until_complete(
|
||||
web_extract_tool(["https://example.com"], use_llm_processing=False)
|
||||
web_extract_tool(["https://example.com"])
|
||||
))
|
||||
assert "results" in result
|
||||
assert len(result["results"]) == 1
|
||||
assert result["results"][0]["url"] == "https://example.com"
|
||||
assert "Extracted content" in result["results"][0]["content"]
|
||||
|
||||
|
|
|
|||
142
tests/tools/test_web_tools_truncate.py
Normal file
142
tests/tools/test_web_tools_truncate.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
"""Unit tests for the truncate-and-store web_extract path (no LLM).
|
||||
|
||||
Covers convert_base64_images_to_links, _truncate_with_footer, _store_full_text,
|
||||
_get_extract_char_limit, and the end-to-end web_extract_tool truncation behavior.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
import tools.web_tools as wt
|
||||
|
||||
|
||||
class TestImageConversion:
    """Tests for ``convert_base64_images_to_links``.

    Inline base64 image payloads must collapse to ``[IMAGE]`` /
    ``[IMAGE: alt]`` placeholders, while ordinary http(s) markdown image
    links must pass through unchanged.
    """

    def test_markdown_base64_image_keeps_alt_drops_blob(self):
        blob = "A" * 5000
        # The blob must actually be embedded in the input, otherwise the
        # "blob not in out" assertion below proves nothing.
        text = f"before ![a cat](data:image/png;base64,{blob}) after"
        out = wt.convert_base64_images_to_links(text)
        assert "[IMAGE: a cat]" in out
        assert "base64" not in out
        assert blob not in out
        assert "before" in out and "after" in out

    def test_markdown_base64_image_no_alt(self):
        out = wt.convert_base64_images_to_links("x ![](data:image/png;base64,AAAA) y")
        assert "[IMAGE]" in out
        assert "base64" not in out

    def test_real_http_image_links_preserved(self):
        text = "see ![diagram](https://example.com/diagram.png) here"
        out = wt.convert_base64_images_to_links(text)
        # Real image URLs must survive so the agent can inspect them.
        assert "![diagram](https://example.com/diagram.png)" in out

    def test_bare_and_parenthesised_base64_become_placeholder(self):
        blob = "Z" * 3000
        bare = wt.convert_base64_images_to_links(f"data:image/gif;base64,{blob}")
        assert bare == "[IMAGE]"
        paren = wt.convert_base64_images_to_links(f"(data:image/gif;base64,{blob})")
        assert paren == "[IMAGE]"
|
||||
|
||||
|
||||
class TestTruncation:
    """Tests for ``_truncate_with_footer``: pass-through, head+tail cut, and storage."""

    def test_short_content_returned_whole(self):
        content = "# Title\n\nshort body\n"
        out, truncated = wt._truncate_with_footer(content, "https://e.com", 15000)
        assert out == content
        assert truncated is False

    def test_long_content_truncated_with_footer(self, tmp_path, monkeypatch):
        # Point HERMES_HOME at a temp dir for the duration of the test.
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        body = "\n".join(f"line {i} " + "x" * 50 for i in range(2000))
        out, truncated = wt._truncate_with_footer(body, "https://example.com/page", 4000)
        assert truncated is True
        assert "[TRUNCATED]" in out
        assert "Full text saved to:" in out
        assert "read_file" in out
        # Head and tail are both present (first and last lines survive).
        assert "line 0 " in out
        assert "line 1999 " in out
        # The omitted middle is gone.
        assert "line 1000 " not in out
        # Sent text is bounded near the budget (+ footer overhead).
        assert len(out) < 4000 + 2000

    def test_truncation_stores_full_text_readable(self, tmp_path, monkeypatch):
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        body = "UNIQUE_MIDDLE_MARKER\n" + ("\n".join(f"row {i}" for i in range(5000)))
        out, truncated = wt._truncate_with_footer(body, "https://example.com/doc", 3000)
        assert truncated is True
        # Extract the stored path from the footer and confirm full text is there.
        path_line = next(ln for ln in out.splitlines() if "Full text saved to:" in ln)
        stored_path = path_line.split("Full text saved to:", 1)[1].strip()
        assert os.path.exists(stored_path)
        # Read with an explicit encoding (the file is written as UTF-8) and
        # close the handle deterministically instead of leaking it.
        with open(stored_path, encoding="utf-8") as fh:
            full = fh.read()
        assert "UNIQUE_MIDDLE_MARKER" in full
        assert "row 2500" in full  # the omitted-middle row is in the stored file
|
||||
|
||||
|
||||
class TestCharLimitConfig:
    """Tests for ``_get_extract_char_limit`` against a stubbed web config."""

    @staticmethod
    def _limit_for(web_config):
        """Return the resolved char limit with ``_load_web_config`` stubbed to ``web_config``."""
        with patch("tools.web_tools._load_web_config", return_value=web_config):
            return wt._get_extract_char_limit()

    def test_default_when_unset(self):
        resolved = self._limit_for({})
        assert resolved == wt.DEFAULT_EXTRACT_CHAR_LIMIT

    def test_config_override(self):
        resolved = self._limit_for({"extract_char_limit": 40000})
        assert resolved == 40000

    def test_clamps_floor(self):
        # A configured value of 100 is expected to be raised to 2000.
        resolved = self._limit_for({"extract_char_limit": 100})
        assert resolved == 2000

    def test_bad_value_falls_back(self):
        # A non-numeric setting must not raise; the default is used instead.
        resolved = self._limit_for({"extract_char_limit": "nope"})
        assert resolved == wt.DEFAULT_EXTRACT_CHAR_LIMIT
|
||||
|
||||
|
||||
class TestEndToEnd:
    """End-to-end check of ``web_extract_tool`` truncation with a stubbed provider."""

    def test_web_extract_truncates_large_page_no_llm(self, tmp_path, monkeypatch):
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        big = "\n".join(f"para {i} " + "y" * 80 for i in range(3000))

        class FakeProvider:
            """Minimal extract provider that returns one oversized page."""

            name = "fake"
            display_name = "Fake"

            def supports_extract(self):
                return True

            async def extract(self, urls, **kwargs):
                return [{"url": urls[0], "title": "Big Page", "content": big,
                         "raw_content": big, "metadata": {}}]

        with patch("tools.web_tools._ensure_web_plugins_loaded"), \
             patch("tools.web_tools._get_extract_backend", return_value="fake"), \
             patch("tools.web_tools.async_is_safe_url", new=_AsyncTrue()), \
             patch("agent.web_search_registry.get_provider", return_value=FakeProvider()):
            # Use a private loop and always close it, so the test does not
            # leak an unclosed event loop.
            loop = asyncio.new_event_loop()
            try:
                raw = loop.run_until_complete(
                    wt.web_extract_tool(["https://example.com/big"], char_limit=5000)
                )
            finally:
                loop.close()
            result = json.loads(raw)

        assert "results" in result
        content = result["results"][0]["content"]
        assert "[TRUNCATED]" in content
        assert "Full text saved to:" in content
        # No LLM was involved: para 0 (head) and the last para (tail) are verbatim.
        assert "para 0 " in content
        assert "para 2999 " in content
|
||||
|
||||
|
||||
def _make_awaitable(value):
    """Return a new coroutine object that yields ``value`` when awaited.

    Each call builds a fresh coroutine, so the result can be awaited
    exactly once.
    """

    async def _resolve(*_args, **_kwargs):
        return value

    pending = _resolve()
    return pending
|
||||
|
||||
|
||||
class _AsyncTrue:
    """Async callable stand-in whose every call awaits to ``True``.

    Instances accept and ignore any arguments, and can be called
    repeatedly because each call produces a new coroutine.
    """

    async def __call__(self, *_args, **_kwargs):
        return True
|
||||
|
|
@ -398,7 +398,7 @@ class TestWebToolPolicy:
|
|||
# Force the firecrawl plugin to be the active extract provider.
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
|
||||
result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False))
|
||||
result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"]))
|
||||
|
||||
assert result["results"][0]["url"] == "https://blocked.test"
|
||||
assert "Blocked by website policy" in result["results"][0]["error"]
|
||||
|
|
@ -443,7 +443,7 @@ class TestWebToolPolicy:
|
|||
monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False)
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
|
||||
result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False))
|
||||
result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"]))
|
||||
|
||||
assert result["results"][0]["url"] == "https://blocked.test/final"
|
||||
assert result["results"][0]["content"] == ""
|
||||
|
|
|
|||
|
|
@ -219,9 +219,9 @@ _TOOL_STUBS = {
|
|||
),
|
||||
"web_extract": (
|
||||
"web_extract",
|
||||
"urls: list",
|
||||
'"""Extract content from URLs. Returns dict with results list of {url, title, content, error}."""',
|
||||
'{"urls": urls}',
|
||||
"urls: list, char_limit: int = None",
|
||||
'"""Extract content from URLs (no LLM summarization). Returns dict with results list of {url, title, content, error}. Pages over char_limit (default 15000) are head+tail truncated with the full text stored on disk; the content footer gives the path. content is markdown."""',
|
||||
'{"urls": urls, "char_limit": char_limit}',
|
||||
),
|
||||
"read_file": (
|
||||
"read_file",
|
||||
|
|
@ -1727,8 +1727,9 @@ _TOOL_DOC_LINES = [
|
|||
" web_search(query: str, limit: int = 5) -> dict\n"
|
||||
" Returns {\"data\": {\"web\": [{\"url\", \"title\", \"description\"}, ...]}}"),
|
||||
("web_extract",
|
||||
" web_extract(urls: list[str]) -> dict\n"
|
||||
" Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown"),
|
||||
" web_extract(urls: list[str], char_limit: int = None) -> dict\n"
|
||||
" Returns {\"results\": [{\"url\", \"title\", \"content\", \"error\"}, ...]} where content is markdown.\n"
|
||||
" No LLM summarization. Pages over char_limit (default 15000) are head+tail truncated; full text stored on disk (path in the content footer)."),
|
||||
("read_file",
|
||||
" read_file(path: str, offset: int = 1, limit: int = 500) -> dict\n"
|
||||
" Lines are 1-indexed. Returns {\"content\": \"...\", \"total_lines\": N}"),
|
||||
|
|
|
|||
|
|
@ -349,6 +349,7 @@ _CACHE_DIRS: list[tuple[str, str]] = [
|
|||
("cache/audio", "audio_cache"),
|
||||
("cache/videos", "video_cache"),
|
||||
("cache/screenshots", "browser_screenshots"),
|
||||
("cache/web", "web_cache"),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -83,11 +83,6 @@ _parallel_client: Optional[Any] = None
|
|||
_async_parallel_client: Optional[Any] = None
|
||||
_exa_client: Optional[Any] = None
|
||||
|
||||
from agent.auxiliary_client import (
|
||||
async_call_llm,
|
||||
extract_content_or_reasoning,
|
||||
get_async_text_auxiliary_client,
|
||||
)
|
||||
from tools.debug_helpers import DebugSession
|
||||
# Imported solely so unit tests can monkeypatch these names on
|
||||
# tools.web_tools (the firecrawl plugin reads them via its own import chain).
|
||||
|
|
@ -305,445 +300,144 @@ def _web_requires_env() -> list[str]:
|
|||
# unit-test patches.
|
||||
|
||||
|
||||
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
|
||||
|
||||
def _is_nous_auxiliary_client(client: Any) -> bool:
|
||||
"""Return True when the resolved auxiliary backend is Nous Portal."""
|
||||
from urllib.parse import urlparse
|
||||
|
||||
base_url = str(getattr(client, "base_url", "") or "")
|
||||
host = (urlparse(base_url).hostname or "").lower()
|
||||
return host == "nousresearch.com" or host.endswith(".nousresearch.com")
|
||||
|
||||
|
||||
def _resolve_web_extract_auxiliary(model: Optional[str] = None) -> tuple[Optional[Any], Optional[str], Dict[str, Any]]:
|
||||
"""Resolve the current web-extract auxiliary client, model, and extra body."""
|
||||
client, default_model = get_async_text_auxiliary_client("web_extract")
|
||||
configured_model = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
|
||||
effective_model = model or configured_model or default_model
|
||||
|
||||
extra_body: Dict[str, Any] = {}
|
||||
if client is not None and _is_nous_auxiliary_client(client):
|
||||
from agent.auxiliary_client import get_auxiliary_extra_body
|
||||
from agent.portal_tags import nous_portal_tags
|
||||
extra_body = get_auxiliary_extra_body() or {"tags": nous_portal_tags()}
|
||||
|
||||
return client, effective_model, extra_body
|
||||
|
||||
|
||||
def _get_default_summarizer_model() -> Optional[str]:
|
||||
"""Return the current default model for web extraction summarization."""
|
||||
_, model, _ = _resolve_web_extract_auxiliary()
|
||||
return model
|
||||
# Default budget (characters) of clean page text sent to the model. Pages at
|
||||
# or under this size are returned whole; larger pages are head+tail truncated
|
||||
# and the full text is stored on disk (see _store_full_text). Spending context,
|
||||
# not API dollars — so this is generous relative to the old 5k summary cap.
|
||||
# Override via web.extract_char_limit in config.yaml.
|
||||
DEFAULT_EXTRACT_CHAR_LIMIT = 15000
|
||||
|
||||
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
|
||||
|
||||
|
||||
async def process_content_with_llm(
|
||||
content: str,
|
||||
url: str = "",
|
||||
title: str = "",
|
||||
model: Optional[str] = None,
|
||||
min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Process web content using LLM to create intelligent summaries with key excerpts.
|
||||
|
||||
This function uses Gemini 3 Flash Preview (or specified model) via OpenRouter API
|
||||
to intelligently extract key information and create markdown summaries,
|
||||
significantly reducing token usage while preserving all important information.
|
||||
|
||||
For very large content (>500k chars), uses chunked processing with synthesis.
|
||||
For extremely large content (>2M chars), refuses to process entirely.
|
||||
|
||||
Args:
|
||||
content (str): The raw content to process
|
||||
url (str): The source URL (for context, optional)
|
||||
title (str): The page title (for context, optional)
|
||||
model (str): The model to use for processing (default: google/gemini-3-flash-preview)
|
||||
min_length (int): Minimum content length to trigger processing (default: 5000)
|
||||
|
||||
Returns:
|
||||
Optional[str]: Processed markdown content, or None if content too short or processing fails
|
||||
"""
|
||||
# Size thresholds
|
||||
MAX_CONTENT_SIZE = 2_000_000 # 2M chars - refuse entirely above this
|
||||
CHUNK_THRESHOLD = 500_000 # 500k chars - use chunked processing above this
|
||||
CHUNK_SIZE = 100_000 # 100k chars per chunk
|
||||
MAX_OUTPUT_SIZE = 5000 # Hard cap on final output size
|
||||
|
||||
def _get_extract_char_limit() -> int:
|
||||
"""Resolve the per-page char budget from config, clamped to a sane range."""
|
||||
try:
|
||||
content_len = len(content)
|
||||
|
||||
# Refuse if content is absurdly large
|
||||
if content_len > MAX_CONTENT_SIZE:
|
||||
size_mb = content_len / 1_000_000
|
||||
logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb)
|
||||
return f"[Content too large to process: {size_mb:.1f}MB. Try a more focused source URL.]"
|
||||
|
||||
# Skip processing if content is too short
|
||||
if content_len < min_length:
|
||||
logger.debug("Content too short (%d < %d chars), skipping LLM processing", content_len, min_length)
|
||||
return None
|
||||
|
||||
# Create context information
|
||||
context_info = []
|
||||
if title:
|
||||
context_info.append(f"Title: {title}")
|
||||
if url:
|
||||
context_info.append(f"Source: {url}")
|
||||
context_str = "\n".join(context_info) + "\n\n" if context_info else ""
|
||||
|
||||
# Check if we need chunked processing
|
||||
if content_len > CHUNK_THRESHOLD:
|
||||
logger.info("Content large (%d chars). Using chunked processing...", content_len)
|
||||
return await _process_large_content_chunked(
|
||||
content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE
|
||||
)
|
||||
|
||||
# Standard single-pass processing for normal content
|
||||
logger.info("Processing content with LLM (%d characters)", content_len)
|
||||
|
||||
processed_content = await _call_summarizer_llm(content, context_str, model)
|
||||
|
||||
if processed_content:
|
||||
# Enforce output cap
|
||||
if len(processed_content) > MAX_OUTPUT_SIZE:
|
||||
processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]"
|
||||
|
||||
# Log compression metrics
|
||||
processed_length = len(processed_content)
|
||||
compression_ratio = processed_length / content_len if content_len > 0 else 1.0
|
||||
logger.info("Content processed: %d -> %d chars (%.1f%%)", content_len, processed_length, compression_ratio * 100)
|
||||
|
||||
return processed_content
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"web_extract LLM summarization failed (%s). "
|
||||
"Tip: increase auxiliary.web_extract.timeout in config.yaml "
|
||||
"or switch to a faster auxiliary model.",
|
||||
str(e)[:120],
|
||||
configured = _load_web_config().get("extract_char_limit")
|
||||
if configured is not None:
|
||||
value = int(configured)
|
||||
# Floor at 2k (below that the footer dominates), no hard ceiling
|
||||
# beyond a generous guard so a typo can't blow up context.
|
||||
return max(2000, min(value, 500_000))
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
return DEFAULT_EXTRACT_CHAR_LIMIT
|
||||
|
||||
|
||||
# Compiled once at import time rather than on every call.
# Markdown image whose source is an inline base64 data URI; captures the alt text.
_MD_BASE64_IMAGE_RE = re.compile(
    r"!\[(?P<alt>[^\]]*)\]\(\s*data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+\)"
)
# Parenthesised base64 data URI that is not part of a markdown image.
_PAREN_BASE64_IMAGE_RE = re.compile(
    r"\(\s*data:image/[^;]+;base64,[A-Za-z0-9+/=\s]+\)"
)
# Bare base64 data URI (no surrounding parentheses, no embedded whitespace).
_BARE_BASE64_IMAGE_RE = re.compile(r"data:image/[^;]+;base64,[A-Za-z0-9+/=]+")


def convert_base64_images_to_links(text: str) -> str:
    """Replace inline base64 image blobs with short ``[IMAGE]`` placeholders.

    base64 image payloads are token bombs (a single inline PNG can be tens of
    thousands of characters), so the raw bytes are never kept. The fact that
    an image was present, and its alt text when there is one, is preserved as
    a placeholder. Markdown images that point at anything other than a
    ``data:image/...;base64,`` URI (e.g. real http/https links) are left
    untouched.

    Despite the function name, the output contains placeholders, not links.

    Transformations:
        ``![alt](data:image/png;base64,AAAA...)`` -> ``[IMAGE: alt]``
        ``![](data:image/png;base64,AAAA...)``    -> ``[IMAGE]``
        ``(data:image/png;base64,AAAA...)``       -> ``[IMAGE]``
        bare ``data:image/...;base64,AAAA...``    -> ``[IMAGE]``

    Args:
        text: Page text that may contain inline base64 images.

    Returns:
        ``text`` with every matched base64 image replaced by a placeholder.
    """

    def _md_repl(m: "re.Match[str]") -> str:
        # Keep the (stripped) alt text when present; drop the blob entirely.
        alt = (m.group("alt") or "").strip()
        return f"[IMAGE: {alt}]" if alt else "[IMAGE]"

    # Order matters: the markdown form must be handled first so its alt text
    # is kept, then the parenthesised form, and finally any bare data URI
    # that the first two passes did not consume.
    out = _MD_BASE64_IMAGE_RE.sub(_md_repl, text)
    out = _PAREN_BASE64_IMAGE_RE.sub("[IMAGE]", out)
    out = _BARE_BASE64_IMAGE_RE.sub("[IMAGE]", out)
    return out
|
||||
|
||||
|
||||
def _store_full_text(url: str, content: str) -> Optional[str]:
    """Persist the full extracted page under the ``cache/web`` directory.

    The file name is built from a sanitised form of the URL's hostname
    (at most 60 characters) plus the first 10 hex digits of the SHA-256 of
    the full URL, with a ``.md`` suffix. Storing the same URL again
    overwrites the earlier file.

    Storage is best-effort: any failure is logged at debug level and
    reported as ``None`` rather than raised.

    Args:
        url: Source URL of the page; used only to derive the file name.
        content: Full page text to write, encoded as UTF-8.

    Returns:
        The stored file's path as a string, or ``None`` if storing failed.
    """
    try:
        import hashlib
        from urllib.parse import urlparse
        from hermes_constants import get_hermes_dir

        target_dir = get_hermes_dir("cache/web", "web_cache")
        target_dir.mkdir(parents=True, exist_ok=True)

        # Fall back to "page" when the URL has no hostname or when
        # sanitising leaves nothing usable.
        hostname = urlparse(url).hostname or "page"
        sanitized = re.sub(r"[^A-Za-z0-9._-]", "-", hostname.replace(":", "_"))
        stem = sanitized[:60].strip("-") or "page"
        url_hash = hashlib.sha256(url.encode("utf-8")).hexdigest()[:10]

        destination = target_dir / f"{stem}-{url_hash}.md"
        destination.write_text(content, encoding="utf-8")
        return str(destination)
    except Exception as exc:  # noqa: BLE001
        logger.debug("Failed to store full web_extract text for %s: %s", url, exc)
        return None
|
||||
|
||||
|
||||
def _truncate_with_footer(
|
||||
content: str,
|
||||
url: str,
|
||||
char_limit: int,
|
||||
) -> tuple[str, bool]:
|
||||
"""Return (model_text, was_truncated) for one page's clean content.
|
||||
|
||||
Pages at or under ``char_limit`` are returned whole. Larger pages get a
|
||||
head+tail window (~75% head / ~25% tail) cut on a markdown line boundary
|
||||
where possible, plus an explicit footer telling the model exactly how much
|
||||
it is seeing, where the full text is stored, and which read_file call pages
|
||||
in the omitted middle. Deterministic — no model involvement.
|
||||
"""
|
||||
if len(content) <= char_limit:
|
||||
return content, False
|
||||
|
||||
head_budget = int(char_limit * 0.75)
|
||||
tail_budget = char_limit - head_budget
|
||||
|
||||
head = content[:head_budget]
|
||||
tail = content[-tail_budget:]
|
||||
# Snap the head cut back to the last newline so we don't slice mid-line.
|
||||
nl = head.rfind("\n")
|
||||
if nl > head_budget * 0.5:
|
||||
head = head[:nl]
|
||||
# Snap the tail cut forward to the next newline for the same reason.
|
||||
nl = tail.find("\n")
|
||||
if 0 <= nl < tail_budget * 0.5:
|
||||
tail = tail[nl + 1:]
|
||||
|
||||
total = len(content)
|
||||
stored_path = _store_full_text(url, content)
|
||||
shown = len(head) + len(tail)
|
||||
|
||||
footer_lines = [
|
||||
"",
|
||||
"─" * 8 + " [TRUNCATED] " + "─" * 8,
|
||||
f"Showing {len(head):,} chars (head) + {len(tail):,} chars (tail) "
|
||||
f"of {total:,} total clean characters.",
|
||||
]
|
||||
if stored_path:
|
||||
footer_lines.append(f"Full text saved to: {stored_path}")
|
||||
footer_lines.append(
|
||||
f'To read the omitted middle: read_file path="{stored_path}" '
|
||||
f"offset=<line> limit=<n> (the file is the complete page)."
|
||||
)
|
||||
# Fall back to truncated raw content instead of returning a useless
|
||||
# error message. The first ~5000 chars are almost always more useful
|
||||
# to the model than "[Failed to process content: ...]".
|
||||
truncated = content[:MAX_OUTPUT_SIZE]
|
||||
if len(content) > MAX_OUTPUT_SIZE:
|
||||
truncated += (
|
||||
f"\n\n[Content truncated — showing first {MAX_OUTPUT_SIZE:,} of "
|
||||
f"{len(content):,} chars. LLM summarization timed out. "
|
||||
f"To fix: increase auxiliary.web_extract.timeout in config.yaml, "
|
||||
f"or use a faster auxiliary model. Use browser_navigate for the full page.]"
|
||||
)
|
||||
return truncated
|
||||
|
||||
|
||||
async def _call_summarizer_llm(
|
||||
content: str,
|
||||
context_str: str,
|
||||
model: Optional[str],
|
||||
max_tokens: int = 20000,
|
||||
is_chunk: bool = False,
|
||||
chunk_info: str = ""
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Make a single LLM call to summarize content.
|
||||
|
||||
Args:
|
||||
content: The content to summarize
|
||||
context_str: Context information (title, URL)
|
||||
model: Model to use
|
||||
max_tokens: Maximum output tokens
|
||||
is_chunk: Whether this is a chunk of a larger document
|
||||
chunk_info: Information about chunk position (e.g., "Chunk 2/5")
|
||||
|
||||
Returns:
|
||||
Summarized content or None on failure
|
||||
"""
|
||||
if is_chunk:
|
||||
# Chunk-specific prompt - aware that this is partial content
|
||||
system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.
|
||||
|
||||
Important guidelines for chunk processing:
|
||||
1. Do NOT write introductions or conclusions - this is a partial document
|
||||
2. Focus on extracting ALL key facts, figures, data points, and insights from this section
|
||||
3. Preserve important quotes, code snippets, and specific details verbatim
|
||||
4. Use bullet points and structured formatting for easy synthesis later
|
||||
5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them
|
||||
|
||||
Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow."""
|
||||
|
||||
user_prompt = f"""Extract key information from this SECTION of a larger document:
|
||||
|
||||
{context_str}{chunk_info}
|
||||
|
||||
SECTION CONTENT:
|
||||
{content}
|
||||
|
||||
Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions."""
|
||||
|
||||
else:
|
||||
# Standard full-document prompt
|
||||
system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.
|
||||
footer_lines.append(
|
||||
"Full text could not be stored; re-run web_extract on a more "
|
||||
"specific URL or use browser_navigate for the complete page."
|
||||
)
|
||||
footer_lines.append("─" * 29)
|
||||
|
||||
Create a well-structured markdown summary that includes:
|
||||
1. Key excerpts (quotes, code snippets, important facts) in their original format
|
||||
2. Comprehensive summary of all other important information
|
||||
3. Proper markdown formatting with headers, bullets, and emphasis
|
||||
model_text = head + "\n\n[... middle omitted — see footer ...]\n\n" + tail
|
||||
model_text += "\n" + "\n".join(footer_lines)
|
||||
return model_text, True
|
||||
|
||||
Your goal is to preserve ALL important information while reducing length. Never lose key facts, figures, insights, or actionable information. Make it scannable and well-organized."""
|
||||
|
||||
user_prompt = f"""Please process this web content and create a comprehensive markdown summary:
|
||||
|
||||
{context_str}CONTENT TO PROCESS:
|
||||
{content}
|
||||
|
||||
Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
|
||||
|
||||
# Call the LLM with retry logic — keep retries low since summarization
|
||||
# is a nice-to-have; the caller falls back to truncated content on failure.
|
||||
max_retries = 2
|
||||
retry_delay = 2
|
||||
last_error = None
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
|
||||
if aux_client is None or not effective_model:
|
||||
logger.warning("No auxiliary model available for web content processing")
|
||||
return None
|
||||
call_kwargs = {
|
||||
"task": "web_extract",
|
||||
"model": effective_model,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": max_tokens,
|
||||
# No explicit timeout — async_call_llm reads auxiliary.web_extract.timeout
|
||||
# from config.yaml. Fresh configs ship with 360s; if the key is absent
|
||||
# the runtime default is 30s (_DEFAULT_AUX_TIMEOUT in
|
||||
# agent/auxiliary_client.py). Users with slow local models should set
|
||||
# or increase auxiliary.web_extract.timeout in config.yaml.
|
||||
}
|
||||
if extra_body:
|
||||
call_kwargs["extra_body"] = extra_body
|
||||
response = await async_call_llm(**call_kwargs)
|
||||
content = extract_content_or_reasoning(response)
|
||||
if content:
|
||||
return content
|
||||
# Reasoning-only / empty response — let the retry loop handle it
|
||||
logger.warning("LLM returned empty content (attempt %d/%d), retrying", attempt + 1, max_retries)
|
||||
if attempt < max_retries - 1:
|
||||
await asyncio.sleep(retry_delay)
|
||||
retry_delay = min(retry_delay * 2, 60)
|
||||
continue
|
||||
return content # Return whatever we got after exhausting retries
|
||||
except RuntimeError:
|
||||
logger.warning("No auxiliary model available for web content processing")
|
||||
return None
|
||||
except Exception as api_error:
|
||||
last_error = api_error
|
||||
if attempt < max_retries - 1:
|
||||
logger.warning("LLM API call failed (attempt %d/%d): %s", attempt + 1, max_retries, str(api_error)[:100])
|
||||
logger.warning("Retrying in %ds...", retry_delay)
|
||||
await asyncio.sleep(retry_delay)
|
||||
retry_delay = min(retry_delay * 2, 60)
|
||||
else:
|
||||
raise last_error
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def _process_large_content_chunked(
    content: str,
    context_str: str,
    model: Optional[str],
    chunk_size: int,
    max_output_size: int
) -> Optional[str]:
    """
    Summarize oversized content by chunking it, summarizing the chunks in
    parallel, then synthesizing the per-chunk summaries into one document.

    Args:
        content: The large content to process
        context_str: Context information prepended to the prompts
        model: Model to use (passed through to the auxiliary resolver)
        chunk_size: Size of each chunk in characters
        max_output_size: Maximum final output size in characters (a short
            truncation marker may be appended beyond this cap)

    Returns:
        The synthesized summary when synthesis succeeds. If only one chunk
        produced a summary, that summary. If synthesis is unavailable, returns
        empty content twice, or raises, the chunk summaries joined together.
        Each of these is cut to ``max_output_size`` characters with a
        truncation marker when too long. If no chunk could be summarized, a
        bracketed failure message string is returned (not None).
    """
    def _join_capped(parts, suffix="\n\n[... truncated ...]"):
        """Join summaries with blank lines and cap the result, adding *suffix* when cut."""
        joined = "\n\n".join(parts)
        if len(joined) > max_output_size:
            joined = joined[:max_output_size] + suffix
        return joined

    # Split content into fixed-size character chunks
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]

    logger.info("Split into %d chunks of ~%d chars each", len(chunks), chunk_size)

    # Summarize each chunk in parallel
    async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]:
        """Summarize a single chunk; returns (index, summary-or-None)."""
        try:
            chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]"
            summary = await _call_summarizer_llm(
                chunk_content,
                context_str,
                model,
                max_tokens=10000,
                is_chunk=True,
                chunk_info=chunk_info
            )
            if summary:
                logger.info("Chunk %d/%d summarized: %d -> %d chars", chunk_idx + 1, len(chunks), len(chunk_content), len(summary))
            return chunk_idx, summary
        except Exception as e:
            logger.warning("Chunk %d/%d failed: %s", chunk_idx + 1, len(chunks), str(e)[:50])
            return chunk_idx, None

    # Run all chunk summarizations in parallel
    tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
    # Use return_exceptions=True so a single task failure does not discard
    # all other successfully summarized chunks.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Filter out exceptions, then collect successful summaries in order
    successful_results = []
    for result_item in results:
        if isinstance(result_item, BaseException):
            logger.warning("Chunk summarization task failed: %s", result_item)
            continue
        successful_results.append(result_item)

    summaries = []
    for chunk_idx, summary in sorted(successful_results, key=lambda x: x[0]):
        if summary:
            summaries.append(f"## Section {chunk_idx + 1}\n{summary}")

    if not summaries:
        logger.debug("All chunk summarizations failed")
        return "[Failed to process large content: all chunk summarizations failed]"

    logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks))

    # If only one chunk succeeded, just return it (with cap)
    if len(summaries) == 1:
        return _join_capped(summaries)

    # Synthesize the summaries into a final summary
    logger.info("Synthesizing %d summaries...", len(summaries))

    combined_summaries = "\n\n---\n\n".join(summaries)

    synthesis_prompt = f"""You have been given summaries of different sections of a large document.
Synthesize these into ONE cohesive, comprehensive summary that:
1. Removes redundancy between sections
2. Preserves all key facts, figures, and actionable information
3. Is well-organized with clear structure
4. Is under {max_output_size} characters

{context_str}SECTION SUMMARIES:
{combined_summaries}

Create a single, unified markdown summary."""

    try:
        aux_client, effective_model, extra_body = _resolve_web_extract_auxiliary(model)
        if aux_client is None or not effective_model:
            logger.warning("No auxiliary model for synthesis, concatenating summaries")
            return _join_capped(summaries)

        call_kwargs = {
            "task": "web_extract",
            "model": effective_model,
            "messages": [
                {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
                {"role": "user", "content": synthesis_prompt},
            ],
            "temperature": 0.1,
            "max_tokens": 20000,
        }
        if extra_body:
            call_kwargs["extra_body"] = extra_body
        response = await async_call_llm(**call_kwargs)
        final_summary = extract_content_or_reasoning(response)

        # Retry once on empty content (reasoning-only response)
        if not final_summary:
            logger.warning("Synthesis LLM returned empty content, retrying once")
            response = await async_call_llm(**call_kwargs)
            final_summary = extract_content_or_reasoning(response)

        # If still empty after retry, fall back to concatenated summaries
        if not final_summary:
            logger.warning("Synthesis failed after retry — concatenating chunk summaries")
            return _join_capped(summaries)

        # Enforce hard cap
        if len(final_summary) > max_output_size:
            final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"

        original_len = len(content)
        final_len = len(final_summary)
        compression = final_len / original_len if original_len > 0 else 1.0

        logger.info("Synthesis complete: %d -> %d chars (%.2f%%)", original_len, final_len, compression * 100)
        return final_summary

    except Exception as e:
        logger.warning("Synthesis failed: %s", str(e)[:100])
        # Fall back to concatenated summaries with truncation
        return _join_capped(summaries, "\n\n[... truncated due to synthesis failure ...]")
|
||||
|
||||
|
||||
# One inline base64 image data URI: media type, optional ";name=value"
# parameters, the ";base64," marker, then the payload. The media type and
# parameters are limited to token characters so the match cannot run across
# spaces, newlines or unrelated prose.
_BASE64_IMAGE_URI = r'data:image/[\w.+-]+(?:;[\w.+=-]+)*;base64,[A-Za-z0-9+/=]+'

# Prefer the parenthesized form (markdown link target) so the surrounding
# parentheses are removed together with the payload; otherwise match it bare.
_BASE64_IMAGE_RE = re.compile(rf'\({_BASE64_IMAGE_URI}\)|{_BASE64_IMAGE_URI}')


def clean_base64_images(text: str) -> str:
    """
    Replace base64 encoded images in text with a short placeholder.

    Handles inline image data URIs in these forms:
    - (data:image/png;base64,...)
    - (data:image/jpeg;base64,...)
    - (data:image/svg+xml;base64,...)
    - data:image/[type];base64,... (without parentheses)
    - any of the above with extra parameters, e.g.
      data:image/svg+xml;charset=utf-8;base64,...

    Text that merely mentions ``data:image/`` and later happens to contain
    ``;base64,`` is left untouched, because the media type must be a single
    token directly after ``data:image/``.

    Args:
        text: The text content to clean

    Returns:
        The text with every matched data URI (including wrapping parentheses,
        when present) replaced by ``[BASE64_IMAGE_REMOVED]``
    """
    return _BASE64_IMAGE_RE.sub('[BASE64_IMAGE_REMOVED]', text)
|
||||
|
||||
|
||||
# ─── Exa / Parallel inline helpers — moved into plugins ──────────────────────
|
||||
|
|
@ -894,29 +588,32 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
|||
async def web_extract_tool(
|
||||
urls: List[str],
|
||||
format: str = None,
|
||||
use_llm_processing: bool = True,
|
||||
model: Optional[str] = None,
|
||||
min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION
|
||||
char_limit: Optional[int] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Extract content from specific web pages using available extraction API backend.
|
||||
|
||||
This function provides a generic interface for web content extraction that
|
||||
can work with multiple backends. Currently uses Firecrawl.
|
||||
Returns clean page content (markdown/text) with NO LLM summarization. The
|
||||
extract backends (Firecrawl, Tavily, Exa, Parallel) already return clean,
|
||||
boilerplate-stripped content, so we return it directly and fast. Pages over
|
||||
``char_limit`` are head+tail truncated with an explicit footer; the full
|
||||
text is stored under cache/web and the footer tells the model how to
|
||||
read_file the omitted middle. Inline base64 images are replaced with
|
||||
``[IMAGE: alt]`` placeholders (real image URLs are preserved as links).
|
||||
|
||||
Args:
|
||||
urls (List[str]): List of URLs to extract content from
|
||||
format (str): Desired output format ("markdown" or "html", optional)
|
||||
use_llm_processing (bool): Whether to process content with LLM for summarization (default: True)
|
||||
model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model)
|
||||
min_length (int): Minimum content length to trigger LLM processing (default: 5000)
|
||||
char_limit (Optional[int]): Per-page char budget sent to the model
|
||||
(default: web.extract_char_limit or 15000). Larger pages truncate.
|
||||
|
||||
Security: URLs are checked for embedded secrets before fetching.
|
||||
|
||||
|
||||
Returns:
|
||||
str: JSON string containing extracted content. If LLM processing is enabled and successful,
|
||||
the 'content' field will contain the processed markdown summary instead of raw content.
|
||||
|
||||
str: JSON string with a ``results`` list; each entry has
|
||||
``url``, ``title``, ``content``, ``error``. ``content`` is the
|
||||
(possibly truncated) clean page text.
|
||||
|
||||
Raises:
|
||||
Exception: If extraction fails or API key is not set
|
||||
"""
|
||||
|
|
@ -944,16 +641,14 @@ async def web_extract_tool(
|
|||
"parameters": {
|
||||
"urls": normalized_urls,
|
||||
"format": format,
|
||||
"use_llm_processing": use_llm_processing,
|
||||
"model": model,
|
||||
"min_length": min_length
|
||||
"char_limit": char_limit,
|
||||
},
|
||||
"error": None,
|
||||
"pages_extracted": 0,
|
||||
"pages_processed_with_llm": 0,
|
||||
"pages_truncated": 0,
|
||||
"original_response_size": 0,
|
||||
"final_response_size": 0,
|
||||
"compression_metrics": [],
|
||||
"truncation_metrics": [],
|
||||
"processing_applied": []
|
||||
}
|
||||
|
||||
|
|
@ -1053,91 +748,39 @@ async def web_extract_tool(
|
|||
|
||||
debug_call_data["pages_extracted"] = pages_extracted
|
||||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
effective_model = model or _get_default_summarizer_model()
|
||||
auxiliary_available = check_auxiliary_model()
|
||||
|
||||
# Process each result with LLM if enabled
|
||||
if use_llm_processing and auxiliary_available:
|
||||
logger.info("Processing extracted content with LLM (parallel)...")
|
||||
debug_call_data["processing_applied"].append("llm_processing")
|
||||
|
||||
# Prepare tasks for parallel processing
|
||||
async def process_single_result(result):
|
||||
"""Process a single result with LLM and return updated result with metrics."""
|
||||
url = result.get('url', 'Unknown URL')
|
||||
title = result.get('title', '')
|
||||
raw_content = result.get('raw_content', '') or result.get('content', '')
|
||||
|
||||
if not raw_content:
|
||||
return result, None, "no_content"
|
||||
|
||||
original_size = len(raw_content)
|
||||
|
||||
# Process content with LLM
|
||||
processed = await process_content_with_llm(
|
||||
raw_content, url, title, effective_model, min_length
|
||||
)
|
||||
|
||||
if processed:
|
||||
processed_size = len(processed)
|
||||
compression_ratio = processed_size / original_size if original_size > 0 else 1.0
|
||||
|
||||
# Update result with processed content
|
||||
result['content'] = processed
|
||||
result['raw_content'] = raw_content
|
||||
|
||||
metrics = {
|
||||
"url": url,
|
||||
"original_size": original_size,
|
||||
"processed_size": processed_size,
|
||||
"compression_ratio": compression_ratio,
|
||||
"model_used": effective_model
|
||||
}
|
||||
return result, metrics, "processed"
|
||||
else:
|
||||
metrics = {
|
||||
"url": url,
|
||||
"original_size": original_size,
|
||||
"processed_size": original_size,
|
||||
"compression_ratio": 1.0,
|
||||
"model_used": None,
|
||||
"reason": "content_too_short"
|
||||
}
|
||||
return result, metrics, "too_short"
|
||||
|
||||
# Run all LLM processing in parallel
|
||||
results_list = response.get('results', [])
|
||||
tasks = [process_single_result(result) for result in results_list]
|
||||
# Use return_exceptions=True so a single task failure does not
|
||||
# discard all other successfully processed results.
|
||||
processed_results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
# Collect metrics and print results
|
||||
for result_item in processed_results:
|
||||
if isinstance(result_item, BaseException):
|
||||
logger.warning("Web result processing task failed: %s", result_item)
|
||||
continue
|
||||
result, metrics, status = result_item
|
||||
url = result.get('url', 'Unknown URL')
|
||||
if status == "processed":
|
||||
debug_call_data["compression_metrics"].append(metrics)
|
||||
debug_call_data["pages_processed_with_llm"] += 1
|
||||
logger.info("%s (processed)", url)
|
||||
elif status == "too_short":
|
||||
debug_call_data["compression_metrics"].append(metrics)
|
||||
logger.info("%s (no processing - content too short)", url)
|
||||
else:
|
||||
logger.warning("%s (no content to process)", url)
|
||||
else:
|
||||
if use_llm_processing and not auxiliary_available:
|
||||
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
|
||||
debug_call_data["processing_applied"].append("llm_processing_unavailable")
|
||||
# Print summary of extracted pages for debugging (original behavior)
|
||||
for result in response.get('results', []):
|
||||
url = result.get('url', 'Unknown URL')
|
||||
content_length = len(result.get('raw_content', ''))
|
||||
logger.info("%s (%d characters)", url, content_length)
|
||||
|
||||
effective_char_limit = char_limit if char_limit is not None else _get_extract_char_limit()
|
||||
try:
|
||||
effective_char_limit = max(2000, min(int(effective_char_limit), 500_000))
|
||||
except (TypeError, ValueError):
|
||||
effective_char_limit = DEFAULT_EXTRACT_CHAR_LIMIT
|
||||
|
||||
# Truncate-and-store: no LLM. For each result, convert inline base64
|
||||
# images to labeled placeholders (keeping alt text + real image URLs),
|
||||
# then return the clean content directly if within budget, or a
|
||||
# head+tail window plus a footer pointing at the stored full text.
|
||||
debug_call_data["processing_applied"].append("truncate_and_store")
|
||||
for result in response.get("results", []):
|
||||
if result.get("error"):
|
||||
continue
|
||||
url = result.get("url", "")
|
||||
raw_content = result.get("raw_content", "") or result.get("content", "")
|
||||
if not raw_content:
|
||||
continue
|
||||
clean = convert_base64_images_to_links(raw_content)
|
||||
model_text, truncated = _truncate_with_footer(clean, url, effective_char_limit)
|
||||
result["content"] = model_text
|
||||
if truncated:
|
||||
debug_call_data["pages_truncated"] += 1
|
||||
debug_call_data["truncation_metrics"].append({
|
||||
"url": url,
|
||||
"original_size": len(clean),
|
||||
"sent_size": len(model_text),
|
||||
})
|
||||
logger.info("%s (truncated %d -> %d chars)", url, len(clean), len(model_text))
|
||||
else:
|
||||
logger.info("%s (%d chars, whole)", url, len(clean))
|
||||
|
||||
# Trim output to minimal fields per entry: title, content, error
|
||||
trimmed_results = [
|
||||
{
|
||||
|
|
@ -1153,16 +796,16 @@ async def web_extract_tool(
|
|||
|
||||
if trimmed_response.get("results") == []:
|
||||
result_json = tool_error("Content was inaccessible or not found")
|
||||
|
||||
cleaned_result = clean_base64_images(result_json)
|
||||
|
||||
else:
|
||||
result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False)
|
||||
|
||||
cleaned_result = clean_base64_images(result_json)
|
||||
|
||||
|
||||
# base64 images were already converted to placeholders per-result above;
|
||||
# this is a belt-and-suspenders sweep over the serialized JSON in case a
|
||||
# provider tucked a blob somewhere unexpected (e.g. metadata).
|
||||
cleaned_result = convert_base64_images_to_links(result_json)
|
||||
|
||||
debug_call_data["final_response_size"] = len(cleaned_result)
|
||||
debug_call_data["processing_applied"].append("base64_image_removal")
|
||||
debug_call_data["processing_applied"].append("base64_image_conversion")
|
||||
|
||||
# Log debug information
|
||||
_debug.log_call("web_extract_tool", debug_call_data)
|
||||
|
|
@ -1193,28 +836,18 @@ def check_web_api_key() -> bool:
|
|||
)
|
||||
|
||||
|
||||
def check_auxiliary_model() -> bool:
    """Report whether the web-extract auxiliary resolver yields a usable client.

    Returns:
        True when the first element returned by
        ``_resolve_web_extract_auxiliary()`` is not None, False otherwise.
    """
    # The resolver returns a 3-tuple; only the client matters here.
    aux_client, _model_name, _extra_body = _resolve_web_extract_auxiliary()
    if aux_client is None:
        return False
    return True
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Simple test/demo when run directly
|
||||
"""
|
||||
print("🌐 Standalone Web Tools Module")
|
||||
print("=" * 40)
|
||||
|
||||
|
||||
# Check if API keys are available
|
||||
web_available = check_web_api_key()
|
||||
tool_gateway_available = _is_tool_gateway_ready()
|
||||
firecrawl_key_available = bool(os.getenv("FIRECRAWL_API_KEY", "").strip())
|
||||
firecrawl_url_available = bool(os.getenv("FIRECRAWL_API_URL", "").strip())
|
||||
nous_available = check_auxiliary_model()
|
||||
default_summarizer_model = _get_default_summarizer_model()
|
||||
|
||||
if web_available:
|
||||
backend = _get_backend()
|
||||
|
|
@ -1246,29 +879,20 @@ if __name__ == "__main__":
|
|||
f"{_firecrawl_backend_help_suffix()}"
|
||||
)
|
||||
|
||||
if not nous_available:
|
||||
print("❌ No auxiliary model available for LLM content processing")
|
||||
print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
|
||||
print("⚠️ Without an auxiliary model, LLM content processing will be disabled")
|
||||
else:
|
||||
print(f"✅ Auxiliary model available: {default_summarizer_model}")
|
||||
|
||||
if not web_available:
|
||||
sys.exit(1)
|
||||
|
||||
print("🛠️ Web tools ready for use!")
|
||||
|
||||
if nous_available:
|
||||
print(f"🧠 LLM content processing available with {default_summarizer_model}")
|
||||
print(f" Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")
|
||||
|
||||
print(f" Extract char limit: {_get_extract_char_limit()} chars "
|
||||
"(pages over this are truncated; full text stored in cache/web)")
|
||||
|
||||
# Show debug mode status
|
||||
if _debug.active:
|
||||
print(f"🐛 Debug mode ENABLED - Session ID: {_debug.session_id}")
|
||||
print(f" Debug logs will be saved to: {_debug.log_dir}/web_tools_debug_{_debug.session_id}.json")
|
||||
else:
|
||||
print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)")
|
||||
|
||||
|
||||
print("\nBasic usage:")
|
||||
print(" from web_tools import web_search_tool, web_extract_tool")
|
||||
print(" import asyncio")
|
||||
|
|
@ -1276,37 +900,16 @@ if __name__ == "__main__":
|
|||
print(" # Search (synchronous)")
|
||||
print(" results = web_search_tool('Python tutorials')")
|
||||
print("")
|
||||
print(" # Extract (asynchronous)")
|
||||
print(" # Extract (asynchronous, no LLM — truncate-and-store)")
|
||||
print(" async def main():")
|
||||
print(" content = await web_extract_tool(['https://example.com'])")
|
||||
print(" # bigger budget for one call:")
|
||||
print(" content = await web_extract_tool(['https://docs.python.org'], char_limit=40000)")
|
||||
print(" asyncio.run(main())")
|
||||
|
||||
if nous_available:
|
||||
print("\nLLM-enhanced usage:")
|
||||
print(" # Content automatically processed for pages >5000 chars (default)")
|
||||
print(" content = await web_extract_tool(['https://python.org/about/'])")
|
||||
print("")
|
||||
print(" # Customize processing parameters")
|
||||
print(" content = await web_extract_tool(")
|
||||
print(" ['https://docs.python.org'],")
|
||||
print(" model='google/gemini-3-flash-preview',")
|
||||
print(" min_length=3000")
|
||||
print(" )")
|
||||
print("")
|
||||
print(" # Disable LLM processing")
|
||||
print(" raw_content = await web_extract_tool(['https://example.com'], use_llm_processing=False)")
|
||||
|
||||
|
||||
print("\nDebug mode:")
|
||||
print(" # Enable debug logging")
|
||||
print(" export WEB_TOOLS_DEBUG=true")
|
||||
print(" # Debug logs capture:")
|
||||
print(" # - All tool calls with parameters")
|
||||
print(" # - Original API responses")
|
||||
print(" # - LLM compression metrics")
|
||||
print(" # - Final processed results")
|
||||
print(" # Logs saved to: ./logs/web_tools_debug_UUID.json")
|
||||
|
||||
print("\n📝 Run 'python test_web_tools_llm.py' to test LLM processing capabilities")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -1338,7 +941,7 @@ WEB_SEARCH_SCHEMA = {
|
|||
|
||||
WEB_EXTRACT_SCHEMA = {
|
||||
"name": "web_extract",
|
||||
"description": "Extract content from web page URLs. Returns page content in markdown format. Also works with PDF URLs (arxiv papers, documents, etc.) — pass the PDF link directly and it converts to markdown text. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
|
||||
"description": "Extract content from web page URLs. Returns clean page content in markdown/text (no LLM summarization — fast). Also works with PDF URLs (arxiv papers, documents) — pass the PDF link directly. Pages within the char budget (default 15000) return whole; larger pages return a head+tail window with a footer telling you the full text's saved file path and the read_file call to page through the omitted middle. Inline images appear as [IMAGE: alt] placeholders; real image URLs are kept as links. If a URL fails or times out, use the browser tool instead.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -1347,6 +950,11 @@ WEB_EXTRACT_SCHEMA = {
|
|||
"items": {"type": "string"},
|
||||
"description": "List of URLs to extract content from (max 5 URLs per call)",
|
||||
"maxItems": 5
|
||||
},
|
||||
"char_limit": {
|
||||
"type": "integer",
|
||||
"description": "Optional per-page character budget sent back (default 15000). Pages larger than this are head+tail truncated with the full text stored to disk. Raise it when you need more of a long page inline.",
|
||||
"minimum": 2000
|
||||
}
|
||||
},
|
||||
"required": ["urls"]
|
||||
|
|
@ -1368,7 +976,10 @@ registry.register(
|
|||
toolset="web",
|
||||
schema=WEB_EXTRACT_SCHEMA,
|
||||
handler=lambda args, **kw: web_extract_tool(
|
||||
args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [], "markdown"),
|
||||
args.get("urls", [])[:5] if isinstance(args.get("urls"), list) else [],
|
||||
"markdown",
|
||||
char_limit=args.get("char_limit"),
|
||||
),
|
||||
check_fn=check_web_api_key,
|
||||
requires_env=_web_requires_env(),
|
||||
is_async=True,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue