mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-18 09:51:59 +00:00
feat(search_files): headroom compression evaluation report + lossless densification (#47866)
* feat(search_files): path-grouped lossless densification of content matches
Content-mode search_files results repeat the {path,line,content} JSON keys
and the full path string for every match. Group consecutive same-path matches
under one path header with indented '<line>: <content>' rows — lossless (every
path, line number, and content byte preserved, with one documented exception:
trailing whitespace on content is rstrip'd), self-describing (matches_format
key), and readable by the model with no decode step.
57.8% mean token reduction on real search_files content outputs (422-output
corpus), fires on 97% of them. Gated at >=5 matches; below that the verbose
array is left untouched. Default to_dict(densify=False) is unchanged, so no
other caller is affected.
ripgrep emits matches path-ordered, so consecutive grouping never reorders
results.
* test: accept densify kwarg in _FakeSearchResult.to_dict
The search loop-detection tests stub SearchResult with a fake whose
to_dict() must mirror the real signature now that it takes densify=.
* test(search_files): edge-case losslessness battery for densification
Adversarial single-line content (colons, indentation, unicode/emoji, empty,
trailing whitespace, quotes+commas), paths with spaces, and an explicit
one-line-per-match invariant documenting the ripgrep contract the format
relies on (0/6775 real match contents contained a newline).
This commit is contained in:
parent
394cdf48ce
commit
22b6942fc2
4 changed files with 190 additions and 8 deletions
|
|
@ -1,6 +1,7 @@
|
|||
"""Tests for tools/file_operations.py — deny list, result dataclasses, helpers."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pytest
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
|
@ -270,6 +271,144 @@ class TestSearchResult:
|
|||
assert d["truncated"] is True
|
||||
|
||||
|
||||
class TestSearchResultDensify:
    """Path-grouped densification of content-mode matches.

    The dense form is expected to preserve every path, line number, and
    content byte, except that trailing whitespace on content is rstrip'd.
    """

    # Shape of one match row in the dense text: "<indent><line>: <content>".
    _ROW = re.compile(r"^ (\d+): (.*)$")

    @classmethod
    def _parse_dense(cls, text):
        """Rebuild ``(path, line_number, content)`` triples from dense text.

        Any line that is not an indented ``<line>: <content>`` row is treated
        as the path header for the rows that follow it.
        """
        triples = []
        path = None
        for ln in text.split("\n"):
            row = cls._ROW.match(ln)
            if row:
                triples.append((path, int(row.group(1)), row.group(2)))
            else:
                path = ln
        return triples

    def _matches(self, n, paths=None):
        """Build ``n`` synthetic matches spread over ``paths`` in path order."""
        # Real ripgrep output is path-ordered: all matches in a file are
        # consecutive (verified against live search_files corpus). The fixture
        # mirrors that — group by path, then enumerate lines within each.
        paths = paths or ["a.py"]
        out = []
        per = max(1, n // len(paths))
        ln = 0
        for p in paths:
            for _ in range(per):
                ln += 1
                out.append(SearchMatch(path=p, line_number=ln,
                                       content=f"line content {ln}"))
        # pad remainder onto the last path
        while len(out) < n:
            ln += 1
            out.append(SearchMatch(path=paths[-1], line_number=ln,
                                   content=f"line content {ln}"))
        return out

    def test_densify_off_by_default(self):
        # The model-facing default must be unchanged for callers that don't
        # opt in: verbose array, no matches_text key.
        r = SearchResult(matches=self._matches(10), total_count=10)
        d = r.to_dict()
        assert "matches" in d
        assert "matches_text" not in d

    def test_densify_below_threshold_keeps_verbose(self):
        # Too few matches: the grouping header would cost more than it saves,
        # so we fall back to the verbose array even with densify=True.
        r = SearchResult(matches=self._matches(4), total_count=4)
        d = r.to_dict(densify=True)
        assert "matches" in d
        assert "matches_text" not in d

    def test_densify_emits_path_grouped_text(self):
        r = SearchResult(matches=self._matches(6, paths=["a.py", "b.py"]),
                         total_count=6)
        d = r.to_dict(densify=True)
        assert "matches" not in d
        assert "matches_text" in d
        assert "matches_format" in d  # self-describing
        text = d["matches_text"]
        # Each path appears once as a group header, not repeated per match.
        assert text.count("a.py") == 1
        assert text.count("b.py") == 1

    def test_densify_is_lossless(self):
        # Every path, line number, and content byte must be recoverable from
        # the dense form.
        matches = [
            SearchMatch(path="src/x.py", line_number=12, content=" def foo():"),
            SearchMatch(path="src/x.py", line_number=45, content=" return bar"),
            SearchMatch(path="src/y.py", line_number=3, content="import os"),
            SearchMatch(path="src/y.py", line_number=99, content="x = 1 # tail"),
            SearchMatch(path="src/z.py", line_number=7, content="class Z:"),
        ]
        r = SearchResult(matches=matches, total_count=5)
        text = r.to_dict(densify=True)["matches_text"]
        # Reconstruct (path, line, content) triples from the grouped text.
        recovered = self._parse_dense(text)
        assert len(recovered) == 5
        for orig, rec in zip(matches, recovered):
            assert rec[0] == orig.path
            assert rec[1] == orig.line_number
            # content is rstrip'd in the dense form; originals here have no
            # trailing whitespace, so they must match exactly.
            assert rec[2] == orig.content

    def test_densify_smaller_than_verbose(self):
        import json
        matches = self._matches(40, paths=["pkg/module_one.py", "pkg/module_two.py"])
        r = SearchResult(matches=matches, total_count=40)
        verbose = json.dumps(r.to_dict(densify=False), ensure_ascii=False)
        dense = json.dumps(r.to_dict(densify=True), ensure_ascii=False)
        assert len(dense) < len(verbose)

    @pytest.mark.parametrize("content", [
        "x = {'k': 1, 'url': 'http://h:8080'}",  # colons in content
        " deeply.indented(call)",  # leading indentation preserved
        "# \u65e5\u672c\u8a9e comment \U0001f525",  # unicode + emoji
        "",  # empty content
        "trailing spaces ",  # rstrip'd (see note below)
        'mix "quotes" and , commas',  # punctuation that breaks naive CSV
    ])
    def test_densify_content_is_lossless(self, content):
        # Every realistic single-line match content must round-trip exactly
        # (trailing whitespace is the one documented transform — rstrip).
        matches = [SearchMatch(path=f"f{i}.py", line_number=i + 1, content=content)
                   for i in range(6)]
        r = SearchResult(matches=matches, total_count=6)
        text = r.to_dict(densify=True)["matches_text"]
        recovered = self._parse_dense(text)
        assert len(recovered) == 6
        for orig, (path, line_number, got) in zip(matches, recovered):
            # Awkward content must not disturb the header/row association.
            assert path == orig.path
            assert line_number == orig.line_number
            assert got == content.rstrip()

    def test_densify_assumes_single_line_matches(self):
        # The path-grouped format puts one match per line, so it relies on
        # ripgrep's one-line-per-match contract (verified: 0/6775 real match
        # contents contained a newline). This test documents that assumption:
        # a (synthetic, never-produced-by-rg) multiline content would split
        # across rows. If search ever emits multiline content, densify must
        # escape newlines first.
        matches = [SearchMatch(path="a.py", line_number=i + 1, content="single line")
                   for i in range(6)]
        text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"]
        # one header + six rows == 7 lines, no row spans multiple lines
        assert len(text.split("\n")) == 7
        body_rows = [ln for ln in text.split("\n") if re.match(r"^ \d+: ", ln)]
        assert len(body_rows) == 6

    def test_densify_paths_with_spaces(self):
        matches = [SearchMatch(path="my dir/a b.py", line_number=i + 1, content=f"x{i}")
                   for i in range(6)]
        text = SearchResult(matches=matches, total_count=6).to_dict(densify=True)["matches_text"]
        # path with spaces survives as a header line verbatim
        assert "my dir/a b.py" in text.split("\n")[0]
|
||||
|
||||
|
||||
class TestLintResult:
|
||||
def test_skipped(self):
|
||||
r = LintResult(skipped=True, message="No linter for .md files")
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ class _FakeSearchResult:
|
|||
def __init__(self):
|
||||
self.matches = []
|
||||
|
||||
def to_dict(self):
|
||||
def to_dict(self, densify=False):
|
||||
return {"matches": [{"file": "test.py", "line": 1, "text": "match"}]}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ import re
|
|||
import difflib
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List, Dict, Any, ClassVar
|
||||
from pathlib import Path
|
||||
from tools.binary_extensions import BINARY_EXTENSIONS
|
||||
|
||||
|
|
@ -244,13 +244,56 @@ class SearchResult:
|
|||
limit_reason: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
# Densify content-mode matches into a path-grouped text block above this
|
||||
# many matches. Below it, the verbose array is already compact enough that
|
||||
# the path-grouping header costs more than it saves.
|
||||
_DENSIFY_MIN_MATCHES: ClassVar[int] = 5
|
||||
|
||||
def _densify_matches(self) -> Optional[str]:
    """Render content-mode matches as a compact, path-grouped text block.

    The verbose form repeats the ``{"path","line","content"}`` keys and the
    full path string for every match. This groups consecutive matches by
    path (path printed once, then `` <line>: <content>`` rows), readable by
    the model without any decode step. Paths, line numbers, and content are
    emitted verbatim with one documented transform: trailing whitespace on
    content is stripped.

    Returns:
        The grouped text block, or ``None`` when densification is not
        worthwhile (fewer than ``_DENSIFY_MIN_MATCHES`` matches) or not safe
        (a path or content contains a newline, which would break the
        one-row-per-match layout). On ``None`` the caller falls back to the
        verbose array.
    """
    if len(self.matches) < self._DENSIFY_MIN_MATCHES:
        return None
    # Grouping happens on path change only, so results are never reordered;
    # a file collapses to a single header when its hits are consecutive.
    lines: list[str] = []
    current_path: Optional[str] = None
    for m in self.matches:
        # rstrip trailing whitespace only; leading indentation in code is
        # meaningful and preserved verbatim after the "<line>: " prefix.
        content = m.content.rstrip()
        # The format is one match per line. An embedded newline would make a
        # row spill into something that reads as a header or another row, so
        # refuse to densify rather than emit an ambiguous block.
        if "\n" in content or "\n" in m.path:
            return None
        if m.path != current_path:
            lines.append(m.path)
            current_path = m.path
        lines.append(f" {m.line_number}: {content}")
    return "\n".join(lines)
|
||||
|
||||
def to_dict(self, densify: bool = False) -> dict:
|
||||
result: dict[str, object] = {"total_count": self.total_count}
|
||||
if self.matches:
|
||||
result["matches"] = [
|
||||
{"path": m.path, "line": m.line_number, "content": m.content}
|
||||
for m in self.matches
|
||||
]
|
||||
dense = self._densify_matches() if densify else None
|
||||
if dense is not None:
|
||||
# Self-describing: the format key tells the model how to read
|
||||
# the block so it never has to guess the shape.
|
||||
result["matches_format"] = (
|
||||
"path-grouped: each file path on its own line, followed by "
|
||||
"indented '<line>: <content>' rows for matches in that file"
|
||||
)
|
||||
result["matches_text"] = dense
|
||||
else:
|
||||
result["matches"] = [
|
||||
{"path": m.path, "line": m.line_number, "content": m.content}
|
||||
for m in self.matches
|
||||
]
|
||||
if self.files:
|
||||
result["files"] = self.files
|
||||
if self.counts:
|
||||
|
|
|
|||
|
|
@ -1478,7 +1478,7 @@ def search_tool(pattern: str, target: str = "content", path: str = ".",
|
|||
for m in result.matches:
|
||||
if hasattr(m, 'content') and m.content:
|
||||
m.content = redact_sensitive_text(m.content, code_file=True)
|
||||
result_dict = result.to_dict()
|
||||
result_dict = result.to_dict(densify=True)
|
||||
|
||||
if count >= 3:
|
||||
result_dict["_warning"] = (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue