hermes-agent/tests/tools/test_web_tools_truncate.py
Teknium ee8cbfdc03
feat(web_extract): truncate-and-store instead of LLM summarization (#54843)
* feat(web_extract): truncate-and-store instead of LLM summarization

web_extract no longer runs an auxiliary LLM over scraped pages. The extract
backends (Firecrawl/Tavily/Exa/Parallel) already return clean, boilerplate-
stripped markdown, so we return it directly: pages within a char budget
(default 15000, web.extract_char_limit) come back whole; larger pages get a
head+tail window plus an explicit footer giving the stored full-text path and
the read_file call to page through the omitted middle. The full clean text is
written to cache/web (mounted read-only into remote backends like the other
cache dirs), so nothing is lost.

Inline base64 images are converted to [IMAGE: alt] placeholders (token bombs
dropped) while real http(s) image URLs are preserved as links so the agent can
still web_extract/vision_analyze them.

Removes process_content_with_llm + the chunked summarizer + check_auxiliary_model
+ _resolve_web_extract_auxiliary. context_references._default_url_fetcher is
updated to the truncate path and its stale data.documents shape read is fixed
to results (it was silently returning empty).

Live before/after eval (firecrawl, 4 URLs): 11.7x faster overall (176.6s ->
15.1s); 10-60x on large pages. Quality identical; findability 4/4 (answer
recoverable from stored full text on every truncated page). web_search is
unchanged.

No own scraper added; no changes to web_search.

* fix(web_extract): add char_limit to execute_code web_extract stub

The new web_extract char_limit param must appear in the code_execution_tool
_TOOL_STUBS signature (and doc line) or test_stubs_cover_all_schema_params
fails — the stub schema must cover every real schema param.
2026-06-29 10:00:49 -07:00

142 lines
5.8 KiB
Python

"""Unit tests for the truncate-and-store web_extract path (no LLM).
Covers convert_base64_images_to_links, _truncate_with_footer, _store_full_text,
_get_extract_char_limit, and the end-to-end web_extract_tool truncation behavior.
"""
import asyncio
import json
import os
from unittest.mock import patch
import pytest
import tools.web_tools as wt
class TestImageConversion:
def test_markdown_base64_image_keeps_alt_drops_blob(self):
blob = "A" * 5000
text = f"before ![a cat]( data:image/png;base64,{blob}) after"
out = wt.convert_base64_images_to_links(text)
assert "[IMAGE: a cat]" in out
assert "base64" not in out
assert blob not in out
assert "before" in out and "after" in out
def test_markdown_base64_image_no_alt(self):
out = wt.convert_base64_images_to_links("x ![](data:image/jpeg;base64,QQ==) y")
assert "[IMAGE]" in out
assert "base64" not in out
def test_real_http_image_links_preserved(self):
text = "see ![logo](https://example.com/logo.png) here"
out = wt.convert_base64_images_to_links(text)
# Real image URLs must survive so the agent can inspect them.
assert "![logo](https://example.com/logo.png)" in out
def test_bare_and_parenthesised_base64_become_placeholder(self):
blob = "Z" * 3000
bare = wt.convert_base64_images_to_links(f"data:image/gif;base64,{blob}")
assert bare == "[IMAGE]"
paren = wt.convert_base64_images_to_links(f"(data:image/gif;base64,{blob})")
assert paren == "[IMAGE]"
class TestTruncation:
def test_short_content_returned_whole(self):
content = "# Title\n\nshort body\n"
out, truncated = wt._truncate_with_footer(content, "https://e.com", 15000)
assert out == content
assert truncated is False
def test_long_content_truncated_with_footer(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
body = "\n".join(f"line {i} " + "x" * 50 for i in range(2000))
out, truncated = wt._truncate_with_footer(body, "https://example.com/page", 4000)
assert truncated is True
assert "[TRUNCATED]" in out
assert "Full text saved to:" in out
assert "read_file" in out
# Head and tail are both present (first and last lines survive).
assert "line 0 " in out
assert "line 1999 " in out
# The omitted middle is gone.
assert "line 1000 " not in out
# Sent text is bounded near the budget (+ footer overhead).
assert len(out) < 4000 + 2000
def test_truncation_stores_full_text_readable(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
body = "UNIQUE_MIDDLE_MARKER\n" + ("\n".join(f"row {i}" for i in range(5000)))
out, truncated = wt._truncate_with_footer(body, "https://example.com/doc", 3000)
assert truncated is True
# Extract the stored path from the footer and confirm full text is there.
path_line = next(ln for ln in out.splitlines() if "Full text saved to:" in ln)
stored_path = path_line.split("Full text saved to:", 1)[1].strip()
assert os.path.exists(stored_path)
full = open(stored_path).read()
assert "UNIQUE_MIDDLE_MARKER" in full
assert "row 2500" in full # the omitted-middle row is in the stored file
class TestCharLimitConfig:
def test_default_when_unset(self):
with patch("tools.web_tools._load_web_config", return_value={}):
assert wt._get_extract_char_limit() == wt.DEFAULT_EXTRACT_CHAR_LIMIT
def test_config_override(self):
with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": 40000}):
assert wt._get_extract_char_limit() == 40000
def test_clamps_floor(self):
with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": 100}):
assert wt._get_extract_char_limit() == 2000
def test_bad_value_falls_back(self):
with patch("tools.web_tools._load_web_config", return_value={"extract_char_limit": "nope"}):
assert wt._get_extract_char_limit() == wt.DEFAULT_EXTRACT_CHAR_LIMIT
class TestEndToEnd:
def test_web_extract_truncates_large_page_no_llm(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
big = "\n".join(f"para {i} " + "y" * 80 for i in range(3000))
class FakeProvider:
name = "fake"
display_name = "Fake"
def supports_extract(self):
return True
async def extract(self, urls, **kwargs):
return [{"url": urls[0], "title": "Big Page", "content": big,
"raw_content": big, "metadata": {}}]
with patch("tools.web_tools._ensure_web_plugins_loaded"), \
patch("tools.web_tools._get_extract_backend", return_value="fake"), \
patch("tools.web_tools.async_is_safe_url", new=_AsyncTrue()), \
patch("agent.web_search_registry.get_provider", return_value=FakeProvider()):
result = json.loads(asyncio.new_event_loop().run_until_complete(
wt.web_extract_tool(["https://example.com/big"], char_limit=5000)
))
assert "results" in result
content = result["results"][0]["content"]
assert "[TRUNCATED]" in content
assert "Full text saved to:" in content
# No LLM was involved: para 0 (head) and the last para (tail) are verbatim.
assert "para 0 " in content
assert "para 2999 " in content
def _make_awaitable(value):
async def _coro(*a, **k):
return value
return _coro()
class _AsyncTrue:
"""Async callable that always returns True (re-awaitable per call)."""
async def __call__(self, *a, **k):
return True