mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(web_extract): bound stored full-text size + give concrete read_file offset
Two robustness gaps from the #54843 truncate-store path: - _store_full_text wrote the full clean page to cache/web with no upper bound (path.write_text(content)); a multi-MB page → unbounded per-extract disk write. Cap at MAX_STORED_TEXT_CHARS (2MB, the pre-truncate-store refusal ceiling) with a marker when capped. - The truncation footer told the model 'read_file ... offset=<line>' — a literal placeholder it had to guess. Compute the real starting line of the omitted middle (head line count + 1) so the first read_file lands in the gap.
This commit is contained in:
parent
c1b9de73f5
commit
c9269fbfb6
2 changed files with 79 additions and 1 deletions
54
tests/tools/test_web_extract_robustness.py
Normal file
54
tests/tools/test_web_extract_robustness.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
"""Tests for web_extract truncate-store robustness (findings from #54843 review).
|
||||
|
||||
Covers two robustness gaps left unaddressed when #54843 merged:
|
||||
1. _store_full_text bounded by MAX_STORED_TEXT_CHARS (no unbounded disk write).
|
||||
2. _truncate_with_footer emits a CONCRETE read_file offset for the omitted
|
||||
middle (was a literal `offset=<line>` placeholder the model had to guess).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
import tools.web_tools as wt
|
||||
|
||||
|
||||
def test_store_full_text_is_bounded(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
# Force the cache dir under the temp home.
|
||||
from hermes_constants import get_hermes_dir # noqa: F401
|
||||
huge = "x\n" * (wt.MAX_STORED_TEXT_CHARS) # > MAX_STORED_TEXT_CHARS chars
|
||||
assert len(huge) > wt.MAX_STORED_TEXT_CHARS
|
||||
path = wt._store_full_text("https://example.com/big", huge)
|
||||
assert path is not None
|
||||
stored = open(path, encoding="utf-8").read()
|
||||
# Stored copy capped (+ short marker), not the full unbounded blob.
|
||||
assert len(stored) <= wt.MAX_STORED_TEXT_CHARS + 200
|
||||
assert "stored copy truncated" in stored
|
||||
|
||||
|
||||
def test_truncate_footer_gives_concrete_offset(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
# Build content well over the limit with many lines so head has a known count.
|
||||
content = "\n".join(f"line {i}" for i in range(5000))
|
||||
model_text, truncated = wt._truncate_with_footer(
|
||||
content, "https://example.com/page", char_limit=4000
|
||||
)
|
||||
assert truncated
|
||||
# Footer must contain a real integer offset, NOT the <line> placeholder.
|
||||
assert "offset=<line>" not in model_text
|
||||
m = re.search(r"offset=(\d+) limit=\d+", model_text)
|
||||
assert m, f"no concrete offset in footer: {model_text[-400:]}"
|
||||
offset = int(m.group(1))
|
||||
# Offset should point past the head we showed (head is ~75% of 4000 chars).
|
||||
assert offset > 1
|
||||
|
||||
|
||||
def test_small_page_not_truncated_no_footer(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
content = "short page\nwith a few lines\n"
|
||||
model_text, truncated = wt._truncate_with_footer(
|
||||
content, "https://example.com/s", char_limit=15000
|
||||
)
|
||||
assert not truncated
|
||||
assert model_text == content
|
||||
assert "[TRUNCATED]" not in model_text
|
||||
|
|
@ -307,6 +307,15 @@ def _web_requires_env() -> list[str]:
|
|||
# Override via web.extract_char_limit in config.yaml.
|
||||
DEFAULT_EXTRACT_CHAR_LIMIT = 15000
|
||||
|
||||
# Hard ceiling on the full-text file written to cache/web. The truncate-store
|
||||
# path otherwise calls path.write_text(content) with no upper bound, so a
|
||||
# multi-MB page (some backends return very large markdown) writes unbounded
|
||||
# bytes to disk on every extract. Cap the stored copy; the model only ever
|
||||
# sees char_limit anyway, and a 2MB page is already far more than any single
|
||||
# read_file paging session needs. Mirrors the pre-truncate-store era's 2MB
|
||||
# refusal ceiling, but stores (capped) instead of refusing.
|
||||
MAX_STORED_TEXT_CHARS = 2_000_000
|
||||
|
||||
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
|
||||
|
||||
|
||||
|
|
@ -374,6 +383,15 @@ def _store_full_text(url: str, content: str) -> Optional[str]:
|
|||
slug = re.sub(r"[^A-Za-z0-9._-]", "-", host)[:60].strip("-") or "page"
|
||||
digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:10]
|
||||
path = cache_dir / f"{slug}-{digest}.md"
|
||||
# Bound the stored copy so a pathologically large page can't write
|
||||
# unbounded bytes to disk. If capped, append a marker so a reader of
|
||||
# the file knows it isn't the literal complete page.
|
||||
if len(content) > MAX_STORED_TEXT_CHARS:
|
||||
content = (
|
||||
content[:MAX_STORED_TEXT_CHARS]
|
||||
+ f"\n\n[... stored copy truncated at {MAX_STORED_TEXT_CHARS:,} chars "
|
||||
f"of {len(content):,}; re-extract a more specific URL for the rest ...]"
|
||||
)
|
||||
path.write_text(content, encoding="utf-8")
|
||||
return str(path)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
|
|
@ -422,10 +440,16 @@ def _truncate_with_footer(
|
|||
f"of {total:,} total clean characters.",
|
||||
]
|
||||
if stored_path:
|
||||
# The omitted middle begins right after the head we're showing. Give
|
||||
# the model a concrete starting line (head line count + 1) so its first
|
||||
# read_file lands in the gap instead of guessing <line>. read_file is
|
||||
# 1-indexed; +1 moves past the last head line we already showed.
|
||||
middle_start_line = head.count("\n") + 2
|
||||
footer_lines.append(f"Full text saved to: {stored_path}")
|
||||
footer_lines.append(
|
||||
f'To read the omitted middle: read_file path="{stored_path}" '
|
||||
f"offset=<line> limit=<n> (the file is the complete page)."
|
||||
f"offset={middle_start_line} limit=200 (the file is the complete page; "
|
||||
f"raise/lower offset to page through it)."
|
||||
)
|
||||
else:
|
||||
footer_lines.append(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue