feat(search_files): headroom compression evaluation report + lossless densification (#47866)

* feat(search_files): path-grouped lossless densification of content matches Content-mode search_files results repeat the {path,line,content} JSON keys and the full path string for every match. Group consecutive same-path matches under one path header with indented '<line>: <content>' rows — lossless (every path/line/content byte preserved), self-describing (matches_format key), and readable by the model with no decode step. 57.8% mean token reduction on real search_files content outputs (422-output corpus), fires on 97% of them. Gated at >=5 matches; below that the verbose array is left untouched. Default to_dict(densify=False) is unchanged, so no other caller is affected. ripgrep emits matches path-ordered, so consecutive grouping never reorders results. * test: accept densify kwarg in _FakeSearchResult.to_dict The search loop-detection tests stub SearchResult with a fake whose to_dict() must mirror the real signature now that it takes densify=. * test(search_files): edge-case losslessness battery for densification Adversarial single-line content (colons, indentation, unicode/emoji, empty, trailing whitespace, quotes+commas), paths with spaces, and an explicit one-line-per-match invariant documenting the ripgrep contract the format relies on (0/6775 real match contents contained a newline).
2026-06-18 09:51:59 +00:00 · 2026-06-17 13:45:25 -07:00 · 2026-06-17 13:45:25 -07:00 · 22b6942fc2
commit 22b6942fc2
parent 394cdf48ce
4 changed files with 190 additions and 8 deletions
--- a/tests/tools/test_file_operations.py
+++ b/tests/tools/test_file_operations.py
@ -1,6 +1,7 @@
 """Tests for tools/file_operations.py — deny list, result dataclasses, helpers."""

 import os
+import re
 import pytest
 import subprocess
 from pathlib import Path
@ -270,6 +271,144 @@ class TestSearchResult:
        assert d["truncated"] is True


+class TestSearchResultDensify:
+    """Path-grouped densification of content-mode matches (lossless)."""
+
+    def _matches(self, n, paths=None):
+        # Real ripgrep output is path-ordered: all matches in a file are
+        # consecutive (verified against live search_files corpus). The fixture
+        # mirrors that — group by path, then enumerate lines within each.
+        paths = paths or ["a.py"]
+        out = []
+        per = max(1, n // len(paths))
+        ln = 0
+        for p in paths:
+            for _ in range(per):
+                ln += 1
+                out.append(SearchMatch(path=p, line_number=ln,
+                                       content=f"line content {ln}"))
+        # pad remainder onto the last path
+        while len(out) < n:
+            ln += 1
+            out.append(SearchMatch(path=paths[-1], line_number=ln,
+                                   content=f"line content {ln}"))
+        return out
+
+    def test_densify_off_by_default(self):
+        # The model-facing default must be unchanged for callers that don't
+        # opt in: verbose array, no matches_text key.
+        r = SearchResult(matches=self._matches(10), total_count=10)
+        d = r.to_dict()
+        assert "matches" in d
+        assert "matches_text" not in d
+
+    def test_densify_below_threshold_keeps_verbose(self):
+        # Too few matches: the grouping header would cost more than it saves,
+        # so we fall back to the verbose array even with densify=True.
+        r = SearchResult(matches=self._matches(4), total_count=4)
+        d = r.to_dict(densify=True)
+        assert "matches" in d
+        assert "matches_text" not in d
+
+    def test_densify_emits_path_grouped_text(self):
+        r = SearchResult(matches=self._matches(6, paths=["a.py", "b.py"]),
+                         total_count=6)
+        d = r.to_dict(densify=True)
+        assert "matches" not in d
+        assert "matches_text" in d
+        assert "matches_format" in d  # self-describing
+        text = d["matches_text"]
+        # Each path appears once as a group header, not repeated per match.
+        assert text.count("a.py") == 1
+        assert text.count("b.py") == 1
+
+    def test_densify_is_lossless(self):
+        # Every path, line number, and content byte must be recoverable from
+        # the dense form.
+        import re
+        matches = [
+            SearchMatch(path="src/x.py", line_number=12, content="    def foo():"),
+            SearchMatch(path="src/x.py", line_number=45, content="        return bar"),
+            SearchMatch(path="src/y.py", line_number=3, content="import os"),
+            SearchMatch(path="src/y.py", line_number=99, content="x = 1  # tail"),
+            SearchMatch(path="src/z.py", line_number=7, content="class Z:"),
+        ]
+        r = SearchResult(matches=matches, total_count=5)
+        text = r.to_dict(densify=True)["matches_text"]
+        # Reconstruct (path, line, content) triples from the grouped text.
+        recovered = []
+        cur = None
+        for ln in text.split("\n"):
+            row = re.match(r"^  (\d+): (.*)$", ln)
+            if row:
+                recovered.append((cur, int(row.group(1)), row.group(2)))
+            else:
+                cur = ln
+        assert len(recovered) == 5
+        for orig, rec in zip(matches, recovered):
+            assert rec[0] == orig.path
+            assert rec[1] == orig.line_number
+            # content is rstrip'd in the dense form; originals here have no
+            # trailing whitespace, so they must match exactly.
+            assert rec[2] == orig.content
+
+    def test_densify_smaller_than_verbose(self):
+        import json
+        matches = self._matches(40, paths=["pkg/module_one.py", "pkg/module_two.py"])
+        r = SearchResult(matches=matches, total_count=40)
+        verbose = json.dumps(r.to_dict(densify=False), ensure_ascii=False)
+        dense = json.dumps(r.to_dict(densify=True), ensure_ascii=False)
+        assert len(dense) < len(verbose)
+
+    @pytest.mark.parametrize("content", [
+        "x = {'k': 1, 'url': 'http://h:8080'}",   # colons in content
+        "        deeply.indented(call)",          # leading indentation preserved
+        "# \u65e5\u672c\u8a9e comment \U0001f525",  # unicode + emoji
+        "",                                        # empty content
+        "trailing spaces   ",                     # rstrip'd (see note below)
+        'mix "quotes" and , commas',              # punctuation that breaks naive CSV
+    ])
+    def test_densify_content_is_lossless(self, content):
+        # Every realistic single-line match content must round-trip exactly
+        # (trailing whitespace is the one documented transform — rstrip).
+        matches = [SearchMatch(path=f"f{i}.py", line_number=i + 1, content=content)
+                   for i in range(6)]
+        r = SearchResult(matches=matches, total_count=6)
+        text = r.to_dict(densify=True)["matches_text"]
+        recovered = []
+        cur = None
+        for ln in text.split("\n"):
+            row = re.match(r"^  (\d+): (.*)$", ln)
+            if row:
+                recovered.append(row.group(2))
+            else:
+                cur = ln
+        assert len(recovered) == 6
+        for got in recovered:
+            assert got == content.rstrip()
+
+    def test_densify_assumes_single_line_matches(self):
+        # The path-grouped format puts one match per line, so it relies on
+        # ripgrep's one-line-per-match contract (verified: 0/6775 real match
+        # contents contained a newline). This test documents that assumption:
+        # a (synthetic, never-produced-by-rg) multiline content would split
+        # across rows. If search ever emits multiline content, densify must
+        # escape newlines first.
+        matches = [SearchMatch(path="a.py", line_number=i + 1, content="single line")
+                   for i in range(6)]
+        text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"]
+        # one header + six rows == 7 lines, no row spans multiple lines
+        body_rows = [ln for ln in text.split("\n") if re.match(r"^  \d+: ", ln)]
+        assert len(body_rows) == 6
+
+    def test_densify_paths_with_spaces(self):
+        matches = [SearchMatch(path="my dir/a b.py", line_number=i + 1, content=f"x{i}")
+                   for i in range(6)]
+        text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"]
+        # path with spaces survives as a header line verbatim
+        assert "my dir/a b.py" in text.split("\n")[0]
+
+
 class TestLintResult:
    def test_skipped(self):
        r = LintResult(skipped=True, message="No linter for .md files")
--- a/tests/tools/test_read_loop_detection.py
+++ b/tests/tools/test_read_loop_detection.py
@ -46,7 +46,7 @@ class _FakeSearchResult:
    def __init__(self):
        self.matches = []

-    def to_dict(self):
+    def to_dict(self, densify=False):
        return {"matches": [{"file": "test.py", "line": 1, "text": "match"}]}


--- a/tools/file_operations.py
+++ b/tools/file_operations.py
@ -30,7 +30,7 @@ import re
 import difflib
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, ClassVar
 from pathlib import Path
 from tools.binary_extensions import BINARY_EXTENSIONS

@ -244,13 +244,56 @@ class SearchResult:
    limit_reason: Optional[str] = None
    error: Optional[str] = None
    
-    def to_dict(self) -> dict:
+    # Densify content-mode matches into a path-grouped text block above this
+    # many matches. Below it, the verbose array is already compact enough that
+    # the path-grouping header costs more than it saves.
+    _DENSIFY_MIN_MATCHES: ClassVar[int] = 5
+
+    def _densify_matches(self) -> Optional[str]:
+        """Render content-mode matches as a compact, path-grouped text block.
+
+        The verbose form repeats the ``{"path","line","content"}`` keys and the
+        full path string for every match. This groups consecutive matches by
+        path (path printed once, then ``  <line>: <content>`` rows), which is
+        lossless — every path, line number, and content byte is preserved — and
+        readable by the model without any decode step.
+
+        Returns ``None`` when densification is not worthwhile (too few matches),
+        so the caller falls back to the verbose array.
+        """
+        if len(self.matches) < self._DENSIFY_MIN_MATCHES:
+            return None
+        # ripgrep emits matches path-ordered (all hits in a file are
+        # consecutive), so grouping on path change collapses each file to a
+        # single header without reordering results.
+        lines: list[str] = []
+        current_path: Optional[str] = None
+        for m in self.matches:
+            if m.path != current_path:
+                lines.append(m.path)
+                current_path = m.path
+            # rstrip trailing whitespace only; leading indentation in code is
+            # meaningful and preserved verbatim after the "<line>: " prefix.
+            lines.append(f"  {m.line_number}: {m.content.rstrip()}")
+        return "\n".join(lines)
+
+    def to_dict(self, densify: bool = False) -> dict:
        result: dict[str, object] = {"total_count": self.total_count}
        if self.matches:
-            result["matches"] = [
-                {"path": m.path, "line": m.line_number, "content": m.content}
-                for m in self.matches
-            ]
+            dense = self._densify_matches() if densify else None
+            if dense is not None:
+                # Self-describing: the format key tells the model how to read
+                # the block so it never has to guess the shape.
+                result["matches_format"] = (
+                    "path-grouped: each file path on its own line, followed by "
+                    "indented '<line>: <content>' rows for matches in that file"
+                )
+                result["matches_text"] = dense
+            else:
+                result["matches"] = [
+                    {"path": m.path, "line": m.line_number, "content": m.content}
+                    for m in self.matches
+                ]
        if self.files:
            result["files"] = self.files
        if self.counts:
--- a/tools/file_tools.py
+++ b/tools/file_tools.py
@ -1478,7 +1478,7 @@ def search_tool(pattern: str, target: str = "content", path: str = ".",
            for m in result.matches:
                if hasattr(m, 'content') and m.content:
                    m.content = redact_sensitive_text(m.content, code_file=True)
-        result_dict = result.to_dict()
+        result_dict = result.to_dict(densify=True)

        if count >= 3:
            result_dict["_warning"] = (