From 22b6942fc2fb00d44f4f7c77d85c2a7ce144845c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 17 Jun 2026 13:45:25 -0700 Subject: [PATCH] feat(search_files): headroom compression evaluation report + lossless densification (#47866) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(search_files): path-grouped lossless densification of content matches Content-mode search_files results repeat the {path,line,content} JSON keys and the full path string for every match. Group consecutive same-path matches under one path header with indented '<line>: <content>' rows — lossless (every path/line/content byte preserved, apart from trailing whitespace on content, which is rstrip'd), self-describing (matches_format key), and readable by the model with no decode step. 57.8% mean token reduction on real search_files content outputs (422-output corpus), fires on 97% of them. Gated at >=5 matches; below that the verbose array is left untouched. Default to_dict(densify=False) is unchanged, so no other caller is affected. ripgrep emits matches path-ordered, so consecutive grouping never reorders results. * test: accept densify kwarg in _FakeSearchResult.to_dict The search loop-detection tests stub SearchResult with a fake whose to_dict() must mirror the real signature now that it takes densify=. * test(search_files): edge-case losslessness battery for densification Adversarial single-line content (colons, indentation, unicode/emoji, empty, trailing whitespace, quotes+commas), paths with spaces, and an explicit one-line-per-match invariant documenting the ripgrep contract the format relies on (0/6775 real match contents contained a newline). 
--- tests/tools/test_file_operations.py | 139 ++++++++++++++++++++++++ tests/tools/test_read_loop_detection.py | 2 +- tools/file_operations.py | 55 +++++++++- tools/file_tools.py | 2 +- 4 files changed, 190 insertions(+), 8 deletions(-) diff --git a/tests/tools/test_file_operations.py b/tests/tools/test_file_operations.py index fb8ae552a87..eda5140b016 100644 --- a/tests/tools/test_file_operations.py +++ b/tests/tools/test_file_operations.py @@ -1,6 +1,7 @@ """Tests for tools/file_operations.py — deny list, result dataclasses, helpers.""" import os +import re import pytest import subprocess from pathlib import Path @@ -270,6 +271,144 @@ class TestSearchResult: assert d["truncated"] is True +class TestSearchResultDensify: + """Path-grouped densification of content-mode matches (lossless).""" + + def _matches(self, n, paths=None): + # Real ripgrep output is path-ordered: all matches in a file are + # consecutive (verified against live search_files corpus). The fixture + # mirrors that — group by path, then enumerate lines within each. + paths = paths or ["a.py"] + out = [] + per = max(1, n // len(paths)) + ln = 0 + for p in paths: + for _ in range(per): + ln += 1 + out.append(SearchMatch(path=p, line_number=ln, + content=f"line content {ln}")) + # pad remainder onto the last path + while len(out) < n: + ln += 1 + out.append(SearchMatch(path=paths[-1], line_number=ln, + content=f"line content {ln}")) + return out + + def test_densify_off_by_default(self): + # The model-facing default must be unchanged for callers that don't + # opt in: verbose array, no matches_text key. + r = SearchResult(matches=self._matches(10), total_count=10) + d = r.to_dict() + assert "matches" in d + assert "matches_text" not in d + + def test_densify_below_threshold_keeps_verbose(self): + # Too few matches: the grouping header would cost more than it saves, + # so we fall back to the verbose array even with densify=True. 
+ r = SearchResult(matches=self._matches(4), total_count=4) + d = r.to_dict(densify=True) + assert "matches" in d + assert "matches_text" not in d + + def test_densify_emits_path_grouped_text(self): + r = SearchResult(matches=self._matches(6, paths=["a.py", "b.py"]), + total_count=6) + d = r.to_dict(densify=True) + assert "matches" not in d + assert "matches_text" in d + assert "matches_format" in d # self-describing + text = d["matches_text"] + # Each path appears once as a group header, not repeated per match. + assert text.count("a.py") == 1 + assert text.count("b.py") == 1 + + def test_densify_is_lossless(self): + # Every path, line number, and content byte must be recoverable from + # the dense form. + import re + matches = [ + SearchMatch(path="src/x.py", line_number=12, content=" def foo():"), + SearchMatch(path="src/x.py", line_number=45, content=" return bar"), + SearchMatch(path="src/y.py", line_number=3, content="import os"), + SearchMatch(path="src/y.py", line_number=99, content="x = 1 # tail"), + SearchMatch(path="src/z.py", line_number=7, content="class Z:"), + ] + r = SearchResult(matches=matches, total_count=5) + text = r.to_dict(densify=True)["matches_text"] + # Reconstruct (path, line, content) triples from the grouped text. + recovered = [] + cur = None + for ln in text.split("\n"): + row = re.match(r"^ (\d+): (.*)$", ln) + if row: + recovered.append((cur, int(row.group(1)), row.group(2))) + else: + cur = ln + assert len(recovered) == 5 + for orig, rec in zip(matches, recovered): + assert rec[0] == orig.path + assert rec[1] == orig.line_number + # content is rstrip'd in the dense form; originals here have no + # trailing whitespace, so they must match exactly. 
+ assert rec[2] == orig.content + + def test_densify_smaller_than_verbose(self): + import json + matches = self._matches(40, paths=["pkg/module_one.py", "pkg/module_two.py"]) + r = SearchResult(matches=matches, total_count=40) + verbose = json.dumps(r.to_dict(densify=False), ensure_ascii=False) + dense = json.dumps(r.to_dict(densify=True), ensure_ascii=False) + assert len(dense) < len(verbose) + + @pytest.mark.parametrize("content", [ + "x = {'k': 1, 'url': 'http://h:8080'}", # colons in content + " deeply.indented(call)", # leading indentation preserved + "# \u65e5\u672c\u8a9e comment \U0001f525", # unicode + emoji + "", # empty content + "trailing spaces ", # rstrip'd (see note below) + 'mix "quotes" and , commas', # punctuation that breaks naive CSV + ]) + def test_densify_content_is_lossless(self, content): + # Every realistic single-line match content must round-trip exactly + # (trailing whitespace is the one documented transform — rstrip). + matches = [SearchMatch(path=f"f{i}.py", line_number=i + 1, content=content) + for i in range(6)] + r = SearchResult(matches=matches, total_count=6) + text = r.to_dict(densify=True)["matches_text"] + recovered = [] + cur = None + for ln in text.split("\n"): + row = re.match(r"^ (\d+): (.*)$", ln) + if row: + recovered.append(row.group(2)) + else: + cur = ln + assert len(recovered) == 6 + for got in recovered: + assert got == content.rstrip() + + def test_densify_assumes_single_line_matches(self): + # The path-grouped format puts one match per line, so it relies on + # ripgrep's one-line-per-match contract (verified: 0/6775 real match + # contents contained a newline). This test documents that assumption: + # a (synthetic, never-produced-by-rg) multiline content would split + # across rows. If search ever emits multiline content, densify must + # escape newlines first. 
+ matches = [SearchMatch(path="a.py", line_number=i + 1, content="single line") + for i in range(6)] + text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"] + # one header + six rows == 7 lines, no row spans multiple lines + body_rows = [ln for ln in text.split("\n") if re.match(r"^ \d+: ", ln)] + assert len(body_rows) == 6 + + def test_densify_paths_with_spaces(self): + matches = [SearchMatch(path="my dir/a b.py", line_number=i + 1, content=f"x{i}") + for i in range(6)] + text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"] + # path with spaces survives as a header line verbatim + assert "my dir/a b.py" in text.split("\n")[0] + + class TestLintResult: def test_skipped(self): r = LintResult(skipped=True, message="No linter for .md files") diff --git a/tests/tools/test_read_loop_detection.py b/tests/tools/test_read_loop_detection.py index 5b7e9f25f30..0cac304f9d7 100644 --- a/tests/tools/test_read_loop_detection.py +++ b/tests/tools/test_read_loop_detection.py @@ -46,7 +46,7 @@ class _FakeSearchResult: def __init__(self): self.matches = [] - def to_dict(self): + def to_dict(self, densify=False): return {"matches": [{"file": "test.py", "line": 1, "text": "match"}]} diff --git a/tools/file_operations.py b/tools/file_operations.py index 1d523d70312..c9374a4eff9 100644 --- a/tools/file_operations.py +++ b/tools/file_operations.py @@ -30,7 +30,7 @@ import re import difflib from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, ClassVar from pathlib import Path from tools.binary_extensions import BINARY_EXTENSIONS @@ -244,13 +244,56 @@ class SearchResult: limit_reason: Optional[str] = None error: Optional[str] = None - def to_dict(self) -> dict: + # Densify content-mode matches into a path-grouped text block above this + # many matches. 
Below it, the verbose array is already compact enough that + # the path-grouping header costs more than it saves. + _DENSIFY_MIN_MATCHES: ClassVar[int] = 5 + + def _densify_matches(self) -> Optional[str]: + """Render content-mode matches as a compact, path-grouped text block. + + The verbose form repeats the ``{"path","line","content"}`` keys and the + full path string for every match. This groups consecutive matches by + path (path printed once, then `` : `` rows), which is + lossless — every path, line number, and content byte is preserved — and + readable by the model without any decode step. + + Returns ``None`` when densification is not worthwhile (too few matches), + so the caller falls back to the verbose array. + """ + if len(self.matches) < self._DENSIFY_MIN_MATCHES: + return None + # ripgrep emits matches path-ordered (all hits in a file are + # consecutive), so grouping on path change collapses each file to a + # single header without reordering results. + lines: list[str] = [] + current_path: Optional[str] = None + for m in self.matches: + if m.path != current_path: + lines.append(m.path) + current_path = m.path + # rstrip trailing whitespace only; leading indentation in code is + # meaningful and preserved verbatim after the ": " prefix. + lines.append(f" {m.line_number}: {m.content.rstrip()}") + return "\n".join(lines) + + def to_dict(self, densify: bool = False) -> dict: result: dict[str, object] = {"total_count": self.total_count} if self.matches: - result["matches"] = [ - {"path": m.path, "line": m.line_number, "content": m.content} - for m in self.matches - ] + dense = self._densify_matches() if densify else None + if dense is not None: + # Self-describing: the format key tells the model how to read + # the block so it never has to guess the shape. 
+ result["matches_format"] = ( + "path-grouped: each file path on its own line, followed by " + "indented ': ' rows for matches in that file" + ) + result["matches_text"] = dense + else: + result["matches"] = [ + {"path": m.path, "line": m.line_number, "content": m.content} + for m in self.matches + ] if self.files: result["files"] = self.files if self.counts: diff --git a/tools/file_tools.py b/tools/file_tools.py index 0eb7b2cb174..1fc778e0d6c 100644 --- a/tools/file_tools.py +++ b/tools/file_tools.py @@ -1478,7 +1478,7 @@ def search_tool(pattern: str, target: str = "content", path: str = ".", for m in result.matches: if hasattr(m, 'content') and m.content: m.content = redact_sensitive_text(m.content, code_file=True) - result_dict = result.to_dict() + result_dict = result.to_dict(densify=True) if count >= 3: result_dict["_warning"] = (