feat(search_files): headroom compression evaluation report + lossless densification (#47866)

* feat(search_files): path-grouped lossless densification of content matches

Content-mode search_files results repeat the {path,line,content} JSON keys
and the full path string for every match. Group consecutive same-path matches
under one path header with indented '<line>: <content>' rows. The result is
lossless (every path, line number, and content byte is preserved, except that
trailing whitespace is stripped from content), self-describing (via the
matches_format key), and readable by the model with no decode step.

57.8% mean token reduction on real search_files content outputs (422-output
corpus), fires on 97% of them. Gated at >=5 matches; below that the verbose
array is left untouched. Default to_dict(densify=False) is unchanged, so no
other caller is affected.

ripgrep emits matches path-ordered, so consecutive grouping never reorders
results.

* test: accept densify kwarg in _FakeSearchResult.to_dict

The search loop-detection tests stub SearchResult with a fake whose
to_dict() must mirror the real signature now that it takes densify=.

* test(search_files): edge-case losslessness battery for densification

Adversarial single-line content (colons, indentation, unicode/emoji, empty,
trailing whitespace, quotes+commas), paths with spaces, and an explicit
one-line-per-match invariant documenting the ripgrep contract the format
relies on (0/6775 real match contents contained a newline).
This commit is contained in:
Teknium 2026-06-17 13:45:25 -07:00 committed by GitHub
parent 394cdf48ce
commit 22b6942fc2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 190 additions and 8 deletions

View file

@ -1,6 +1,7 @@
"""Tests for tools/file_operations.py — deny list, result dataclasses, helpers."""
import os
import re
import pytest
import subprocess
from pathlib import Path
@ -270,6 +271,144 @@ class TestSearchResult:
assert d["truncated"] is True
class TestSearchResultDensify:
    """Path-grouped densification of content-mode matches (lossless)."""

    # One dense row: a single leading space, the line number, ": ", then the
    # content verbatim. Any line that does not match is a path header.
    _ROW_RE = re.compile(r"^ (\d+): (.*)$")

    @classmethod
    def _parse_dense(cls, text):
        """Recover ``(path, line_number, content)`` triples from dense text.

        Header lines set the current path; row lines are attributed to the
        most recently seen header.
        """
        triples = []
        current_path = None
        for raw in text.split("\n"):
            row = cls._ROW_RE.match(raw)
            if row:
                triples.append((current_path, int(row.group(1)), row.group(2)))
            else:
                current_path = raw
        return triples

    def _matches(self, n, paths=None):
        """Build exactly ``n`` path-ordered matches spread over ``paths``."""
        # Real ripgrep output is path-ordered: all matches in a file are
        # consecutive (verified against live search_files corpus). The fixture
        # mirrors that — group by path, then enumerate lines within each.
        paths = paths or ["a.py"]
        out = []
        per = max(1, n // len(paths))
        ln = 0
        for p in paths:
            for _ in range(per):
                ln += 1
                out.append(SearchMatch(path=p, line_number=ln,
                                       content=f"line content {ln}"))
        # pad remainder onto the last path
        while len(out) < n:
            ln += 1
            out.append(SearchMatch(path=paths[-1], line_number=ln,
                                   content=f"line content {ln}"))
        # max(1, ...) above can overshoot when n < len(paths); never return
        # more matches than the caller asked for.
        return out[:n]

    def test_densify_off_by_default(self):
        # The model-facing default must be unchanged for callers that don't
        # opt in: verbose array, no matches_text key.
        r = SearchResult(matches=self._matches(10), total_count=10)
        d = r.to_dict()
        assert "matches" in d
        assert "matches_text" not in d

    def test_densify_below_threshold_keeps_verbose(self):
        # Too few matches: the grouping header would cost more than it saves,
        # so we fall back to the verbose array even with densify=True.
        r = SearchResult(matches=self._matches(4), total_count=4)
        d = r.to_dict(densify=True)
        assert "matches" in d
        assert "matches_text" not in d

    def test_densify_emits_path_grouped_text(self):
        r = SearchResult(matches=self._matches(6, paths=["a.py", "b.py"]),
                         total_count=6)
        d = r.to_dict(densify=True)
        assert "matches" not in d
        assert "matches_text" in d
        assert "matches_format" in d  # self-describing
        text = d["matches_text"]
        # Each path appears once as a group header, not repeated per match.
        assert text.count("a.py") == 1
        assert text.count("b.py") == 1

    def test_densify_is_lossless(self):
        # Every path, line number, and content byte must be recoverable from
        # the dense form.
        matches = [
            SearchMatch(path="src/x.py", line_number=12, content=" def foo():"),
            SearchMatch(path="src/x.py", line_number=45, content=" return bar"),
            SearchMatch(path="src/y.py", line_number=3, content="import os"),
            SearchMatch(path="src/y.py", line_number=99, content="x = 1 # tail"),
            SearchMatch(path="src/z.py", line_number=7, content="class Z:"),
        ]
        r = SearchResult(matches=matches, total_count=5)
        text = r.to_dict(densify=True)["matches_text"]
        # Reconstruct (path, line, content) triples from the grouped text.
        recovered = self._parse_dense(text)
        assert len(recovered) == 5
        # content is rstrip'd in the dense form; originals here have no
        # trailing whitespace, so they must match exactly.
        assert recovered == [(m.path, m.line_number, m.content) for m in matches]

    def test_densify_smaller_than_verbose(self):
        import json
        matches = self._matches(40, paths=["pkg/module_one.py", "pkg/module_two.py"])
        r = SearchResult(matches=matches, total_count=40)
        verbose = json.dumps(r.to_dict(densify=False), ensure_ascii=False)
        dense = json.dumps(r.to_dict(densify=True), ensure_ascii=False)
        assert len(dense) < len(verbose)

    @pytest.mark.parametrize("content", [
        "x = {'k': 1, 'url': 'http://h:8080'}",  # colons in content
        " deeply.indented(call)",  # leading indentation preserved
        "# \u65e5\u672c\u8a9e comment \U0001f525",  # unicode + emoji
        "",  # empty content
        "trailing spaces ",  # rstrip'd (see note below)
        'mix "quotes" and , commas',  # punctuation that breaks naive CSV
    ])
    def test_densify_content_is_lossless(self, content):
        # Every realistic single-line match content must round-trip exactly
        # (trailing whitespace is the one documented transform — rstrip).
        matches = [SearchMatch(path=f"f{i}.py", line_number=i + 1, content=content)
                   for i in range(6)]
        r = SearchResult(matches=matches, total_count=6)
        text = r.to_dict(densify=True)["matches_text"]
        recovered = [row_content for _, _, row_content in self._parse_dense(text)]
        assert len(recovered) == 6
        for got in recovered:
            assert got == content.rstrip()

    def test_densify_assumes_single_line_matches(self):
        # The path-grouped format puts one match per line, so it relies on
        # ripgrep's one-line-per-match contract (verified: 0/6775 real match
        # contents contained a newline). This test documents that assumption:
        # a (synthetic, never-produced-by-rg) multiline content would split
        # across rows. If search ever emits multiline content, densify must
        # escape newlines first.
        matches = [SearchMatch(path="a.py", line_number=i + 1, content="single line")
                   for i in range(6)]
        text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"]
        all_lines = text.split("\n")
        # one header + six rows == 7 lines, no row spans multiple lines
        assert len(all_lines) == 7
        body_rows = [ln for ln in all_lines if re.match(r"^ \d+: ", ln)]
        assert len(body_rows) == 6

    def test_densify_paths_with_spaces(self):
        matches = [SearchMatch(path="my dir/a b.py", line_number=i + 1, content=f"x{i}")
                   for i in range(6)]
        text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"]
        # path with spaces survives as a header line verbatim
        assert text.split("\n")[0] == "my dir/a b.py"
class TestLintResult:
def test_skipped(self):
r = LintResult(skipped=True, message="No linter for .md files")

View file

@ -46,7 +46,7 @@ class _FakeSearchResult:
def __init__(self):
self.matches = []
def to_dict(self):
def to_dict(self, densify=False):
return {"matches": [{"file": "test.py", "line": 1, "text": "match"}]}

View file

@ -30,7 +30,7 @@ import re
import difflib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, ClassVar
from pathlib import Path
from tools.binary_extensions import BINARY_EXTENSIONS
@ -244,13 +244,56 @@ class SearchResult:
limit_reason: Optional[str] = None
error: Optional[str] = None
def to_dict(self) -> dict:
# Densify content-mode matches into a path-grouped text block once there are
# at least this many matches. Below it, the verbose array is already compact
# enough that the path-grouping header costs more than it saves.
_DENSIFY_MIN_MATCHES: ClassVar[int] = 5
def _densify_matches(self) -> Optional[str]:
"""Render content-mode matches as a compact, path-grouped text block.
The verbose form repeats the ``{"path","line","content"}`` keys and the
full path string for every match. This groups consecutive matches by
path (path printed once, then `` <line>: <content>`` rows), which is
lossless every path, line number, and content byte is preserved and
readable by the model without any decode step.
Returns ``None`` when densification is not worthwhile (too few matches),
so the caller falls back to the verbose array.
"""
if len(self.matches) < self._DENSIFY_MIN_MATCHES:
return None
# ripgrep emits matches path-ordered (all hits in a file are
# consecutive), so grouping on path change collapses each file to a
# single header without reordering results.
lines: list[str] = []
current_path: Optional[str] = None
for m in self.matches:
if m.path != current_path:
lines.append(m.path)
current_path = m.path
# rstrip trailing whitespace only; leading indentation in code is
# meaningful and preserved verbatim after the "<line>: " prefix.
lines.append(f" {m.line_number}: {m.content.rstrip()}")
return "\n".join(lines)
def to_dict(self, densify: bool = False) -> dict:
result: dict[str, object] = {"total_count": self.total_count}
if self.matches:
result["matches"] = [
{"path": m.path, "line": m.line_number, "content": m.content}
for m in self.matches
]
dense = self._densify_matches() if densify else None
if dense is not None:
# Self-describing: the format key tells the model how to read
# the block so it never has to guess the shape.
result["matches_format"] = (
"path-grouped: each file path on its own line, followed by "
"indented '<line>: <content>' rows for matches in that file"
)
result["matches_text"] = dense
else:
result["matches"] = [
{"path": m.path, "line": m.line_number, "content": m.content}
for m in self.matches
]
if self.files:
result["files"] = self.files
if self.counts:

View file

@ -1478,7 +1478,7 @@ def search_tool(pattern: str, target: str = "content", path: str = ".",
for m in result.matches:
if hasattr(m, 'content') and m.content:
m.content = redact_sensitive_text(m.content, code_file=True)
result_dict = result.to_dict()
result_dict = result.to_dict(densify=True)
if count >= 3:
result_dict["_warning"] = (