From c9269fbfb6896793553b684eb6b5f3a79250fda7 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Tue, 30 Jun 2026 03:06:27 +0530 Subject: [PATCH] fix(web_extract): bound stored full-text size + give concrete read_file offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two robustness gaps from the #54843 truncate-store path: - _store_full_text wrote the full clean page to cache/web with no upper bound (path.write_text(content)); a multi-MB page → unbounded per-extract disk write. Cap at MAX_STORED_TEXT_CHARS (2MB, the pre-truncate-store refusal ceiling) with a marker when capped. - The truncation footer told the model 'read_file ... offset=' — a literal placeholder it had to guess. Compute the real starting line of the omitted middle (head line count + 1) so the first read_file lands in the gap. --- tests/tools/test_web_extract_robustness.py | 54 ++++++++++++++++++++++ tools/web_tools.py | 26 ++++++++++- 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 tests/tools/test_web_extract_robustness.py diff --git a/tests/tools/test_web_extract_robustness.py b/tests/tools/test_web_extract_robustness.py new file mode 100644 index 00000000000..daf4a879f3e --- /dev/null +++ b/tests/tools/test_web_extract_robustness.py @@ -0,0 +1,54 @@ +"""Tests for web_extract truncate-store robustness (findings from #54843 review). + +Covers two robustness gaps left unaddressed when #54843 merged: + 1. _store_full_text bounded by MAX_STORED_TEXT_CHARS (no unbounded disk write). + 2. _truncate_with_footer emits a CONCRETE read_file offset for the omitted + middle (was a literal `offset=` placeholder the model had to guess). +""" +from __future__ import annotations + +import re + +import tools.web_tools as wt + + +def test_store_full_text_is_bounded(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + # Force the cache dir under the temp home. + from hermes_constants import get_hermes_dir # noqa: F401 + huge = "x\n" * (wt.MAX_STORED_TEXT_CHARS) # > MAX_STORED_TEXT_CHARS chars + assert len(huge) > wt.MAX_STORED_TEXT_CHARS + path = wt._store_full_text("https://example.com/big", huge) + assert path is not None + stored = open(path, encoding="utf-8").read() + # Stored copy capped (+ short marker), not the full unbounded blob. + assert len(stored) <= wt.MAX_STORED_TEXT_CHARS + 200 + assert "stored copy truncated" in stored + + +def test_truncate_footer_gives_concrete_offset(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + # Build content well over the limit with many lines so head has a known count. + content = "\n".join(f"line {i}" for i in range(5000)) + model_text, truncated = wt._truncate_with_footer( + content, "https://example.com/page", char_limit=4000 + ) + assert truncated + # Footer must contain a real integer offset, NOT the placeholder. + assert "offset=" not in model_text + m = re.search(r"offset=(\d+) limit=\d+", model_text) + assert m, f"no concrete offset in footer: {model_text[-400:]}" + offset = int(m.group(1)) + # Offset should point past the head we showed (head is ~75% of 4000 chars). + assert offset > 1 + + +def test_small_page_not_truncated_no_footer(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + content = "short page\nwith a few lines\n" + model_text, truncated = wt._truncate_with_footer( + content, "https://example.com/s", char_limit=15000 + ) + assert not truncated + assert model_text == content + assert "[TRUNCATED]" not in model_text diff --git a/tools/web_tools.py b/tools/web_tools.py index 0635b23f5d4..e5b83c77ad1 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -307,6 +307,15 @@ def _web_requires_env() -> list[str]: # Override via web.extract_char_limit in config.yaml. DEFAULT_EXTRACT_CHAR_LIMIT = 15000 +# Hard ceiling on the full-text file written to cache/web. The truncate-store +# path otherwise calls path.write_text(content) with no upper bound, so a +# multi-MB page (some backends return very large markdown) writes unbounded +# bytes to disk on every extract. Cap the stored copy; the model only ever +# sees char_limit anyway, and a 2MB page is already far more than any single +# read_file paging session needs. Mirrors the pre-truncate-store era's 2MB +# refusal ceiling, but stores (capped) instead of refusing. +MAX_STORED_TEXT_CHARS = 2_000_000 + _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") @@ -374,6 +383,15 @@ def _store_full_text(url: str, content: str) -> Optional[str]: slug = re.sub(r"[^A-Za-z0-9._-]", "-", host)[:60].strip("-") or "page" digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:10] path = cache_dir / f"{slug}-{digest}.md" + # Bound the stored copy so a pathologically large page can't write + # unbounded bytes to disk. If capped, append a marker so a reader of + # the file knows it isn't the literal complete page. + if len(content) > MAX_STORED_TEXT_CHARS: + content = ( + content[:MAX_STORED_TEXT_CHARS] + + f"\n\n[... stored copy truncated at {MAX_STORED_TEXT_CHARS:,} chars " + f"of {len(content):,}; re-extract a more specific URL for the rest ...]" + ) path.write_text(content, encoding="utf-8") return str(path) except Exception as exc: # noqa: BLE001 @@ -422,10 +440,16 @@ def _truncate_with_footer( f"of {total:,} total clean characters.", ] if stored_path: + # The omitted middle begins right after the head we're showing. Give + # the model a concrete starting line (head line count + 1) so its first + # read_file lands in the gap instead of guessing . read_file is + # 1-indexed; +1 moves past the last head line we already showed. + middle_start_line = head.count("\n") + 2 footer_lines.append(f"Full text saved to: {stored_path}") footer_lines.append( f'To read the omitted middle: read_file path="{stored_path}" ' - f"offset= limit= (the file is the complete page)." + f"offset={middle_start_line} limit=200 (the file is the complete page; " + f"raise/lower offset to page through it)." ) else: footer_lines.append(