mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
design: compression eval harness — add three scrubbed fixtures + scrubber
Adds scripts/compression_eval/ with a design doc, README, a placeholder run_eval.py, and three checked-in scrubbed session fixtures. No working eval yet — PR is for design review before implementation. Motivation: we edit agent/context_compressor.py prompts and _template_sections by hand and ship without any automated check that compression still preserves file paths, error codes, or the active task. Factory.ai's Dec 2025 write-up (https://factory.ai/news/evaluating-compression) documents a probe-based eval scored on six dimensions. We adopt the methodology; we do not publish scores. Contents: - DESIGN.md — fixture format, probe format (recall / artifact / continuation / decision), six grading dimensions, report format, cost expectations, scrubber pipeline, open questions, and staged follow-up PR plan. - README.md — short 'what this is / when to run it' page. - run_eval.py — placeholder that prints 'not implemented, see DESIGN.md' and exits 1. - scrub_fixtures.py — reproducible pipeline that converts real sessions from ~/.hermes/sessions/*.jsonl into public-safe JSON fixtures. Applies: redact_sensitive_text, username path normalization, personal handle scrubbing, email and git-author normalization, reasoning scratchpad / <think> stripping, platform user-mention scrubbing, first-user paraphrase, system-prompt placeholder, orphan-message pruning, and tool-output size truncation for fixture readability. - fixtures/feature-impl-context-priority.json — 75 msgs / ~17k tokens. Investigate → patch → test → PR → merge. - fixtures/debug-session-feishu-id-model.json — 59 msgs / ~13k tokens. PR triage + upstream docs + decision. - fixtures/config-build-competitive-scouts.json — 61 msgs / ~23k tokens. Iterative config accumulation (11 cron jobs across 7 weekdays). PII audit: zero matches across the three fixtures for the maintainer's handle (all case variants), personal email domains, and known contributor emails. Only 'contributor@example.com' placeholder remains. 
Why scripts/: requires API credentials, costs ~$1 per run, LLM-graded (non-deterministic), must not run in CI. scripts/sample_and_compress.py is the existing precedent for offline credentialed tooling.
This commit is contained in:
parent
c6b734e24d
commit
9f5c13f874
10 changed files with 2642 additions and 0 deletions
370
scripts/compression_eval/scrub_fixtures.py
Executable file
370
scripts/compression_eval/scrub_fixtures.py
Executable file
|
|
@ -0,0 +1,370 @@
|
|||
"""One-shot fixture scrubber for scripts/compression_eval/fixtures/.

Source: ~/.hermes/sessions/<file>.jsonl
Output: fixtures/<name>.json next to this script (resolved via __file__)

Scrubbing passes (implemented in _scrub_text / _scrub_message / build_fixture):

 1. agent.redact.redact_sensitive_text — API keys, tokens, connection strings
 2. Username paths — /home/teknium/ → /home/user/, ~/.hermes/ preserved as-is
    (that path is universal)
 3. Personal handles — "Teknium"/"teknium"/"teknium1" → "user"
 4. Messaging-platform user mentions — <@123…> / <@***> → <@user>
 5. Emails — git "Author: Name <addr>" header lines normalised first, then any
    remaining address replaced with contributor@example.com
 6. Reasoning scratchpads — strip <REASONING_SCRATCHPAD>...</REASONING_SCRATCHPAD>
    blocks and <think>...</think> tags from assistant content (personality
    leakage risk); tool outputs are left alone
 7. session_meta line — drop entirely, we only need the messages
 8. User message personality — lightly paraphrase the first user message to keep
    task intent while removing "vibe"; subsequent user turns kept verbatim
    (after the redactions above) since they're short instructions
 9. System prompt — replaced with a generic public-safe placeholder
10. Cleanup — scratchpad-only (now empty) assistant turns and trailing orphan
    tool messages pruned; tool outputs beyond a size cap truncated with a note

The fixture format matches DESIGN.md:

    {
      "name": "...",
      "description": "...",
      "model": "...",            # best guess from original session
      "context_length": 200000,
      "messages": [...],         # OpenAI-format, only role/content/tool_calls/tool_call_id/tool_name
      "source": "~/.hermes/sessions/...",   # plus truncated_to, scrubbed_at,
    }                                       # and scrubbing_passes provenance keys
"""
|
||||
from __future__ import annotations

import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List
|
||||
|
||||
# Resolve the hermes-agent checkout relative to this script so agent.redact
# imports cleanly whether we run from a worktree or a main clone.
# parents[2]: scripts/compression_eval/scrub_fixtures.py -> repo root.
_REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(_REPO_ROOT))
# Import must happen after the sys.path insert, hence the E402 suppression.
from agent.redact import redact_sensitive_text  # noqa: E402
|
||||
|
||||
|
||||
# Raw sessions are read from the user's local hermes state directory.
SESSION_DIR = Path.home() / ".hermes" / "sessions"
# Resolve FIXTURES_DIR relative to this script so the scrubber runs the
# same way inside a worktree, a main checkout, or from a contributor's
# clone at a different path.
FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
|
||||
|
||||
# (source_file, output_name, description, user_first_paraphrase, model_guess, context_length, truncate_at)
# truncate_at: keep messages[:truncate_at] (None = keep all). Applied BEFORE
# orphan-empty-assistant cleanup.
SPECS = [
    # Fixture 1: feature implementation with a clear artifact trail.
    (
        "20260321_060441_fef7be92.jsonl",
        "feature-impl-context-priority",
        "~75-turn feature-impl: user asks how multiple project-context files "
        "(.hermes.md / AGENTS.md / CLAUDE.md / .cursorrules) are handled when "
        "all are present; agent investigates the codebase, designs a priority "
        "order, patches the loader + tests, live-tests with a scenario "
        "directory, commits to a feature branch, opens a PR, and merges after "
        "approval. Exercises investigate → decide → implement → verify → "
        "ship flow with clear artifact trail (2 files modified, 1 PR).",
        (
            "If .hermes.md, AGENTS.md, CLAUDE.md, and .cursorrules all exist in "
            "the same directory, does the agent load all of them or pick one? "
            "Use the hermes-agent-dev skill to check."
        ),
        "anthropic/claude-sonnet-4.6",
        200000,
        74,  # cut at "Merged and pulled. Main is current." — drops trailing unrelated cron-delivery messages
    ),
    # Fixture 2: long tool-heavy debug/triage session ending in a decision.
    (
        "20260412_233741_3f2119a8.jsonl",
        "debug-session-feishu-id-model",
        "~60-turn debug/triage PR-review session: a third-party bug report "
        "says the gateway's Feishu adapter misuses the open_id / union_id / "
        "user_id identity model (open_id is app-scoped, not the bot's "
        "canonical ID). An open community PR (#8388) tries to fix it. Agent "
        "reviews the PR against current main, fetches upstream Feishu/Lark "
        "identity docs, and produces a decision. Exercises long tool-heavy "
        "context with PR diffs, upstream docs, and a clear decision at the "
        "end — the classic 'can the summary still name the PR number, the "
        "root cause, and the decision?' scenario.",
        (
            "A community user reports the Feishu/Lark adapter gets the identity "
            "model wrong — open_id is app-scoped, not the bot's canonical ID. "
            "There's an open PR #8388 trying to fix it. Use the hermes-agent-dev "
            "skill and the pr-triage-salvage skill to review it."
        ),
        "anthropic/claude-sonnet-4.6",
        200000,
        58,  # end at "Here's my review: ..." — clean decision point before the "close it, implement cleaner" pivot
    ),
    # Fixture 3: iterative state accumulation across many config turns.
    (
        "20260328_160817_77bd258b.jsonl",
        "config-build-competitive-scouts",
        "~60-turn iterative config/build session: user wants a set of weekly "
        "cron jobs that scan competing AI coding agents (openclaw, nanoclaw, "
        "ironclaw, codex, opencode, claude-code, kilo-code, gemini-cli, "
        "cline, aider, roo) for merged PRs or web updates worth porting to "
        "hermes-agent. User adds one target per turn; agent creates each cron "
        "job and re-states the accumulated schedule. Exercises artifact trail "
        "(which jobs are configured, which days) and iterative state "
        "accumulation — the canonical case for iterative-merge summarization.",
        (
            "Set up a cron job for the agent every Sunday to scan all PRs "
            "merged into openclaw that week, decide which are worth adding to "
            "hermes-agent, and open PRs porting those features."
        ),
        "anthropic/claude-sonnet-4.6",
        200000,
        None,
    ),
]
|
||||
|
||||
|
||||
# Tool outputs beyond this size in chars are replaced with a short
|
||||
# placeholder — a 16KB skill_view dump or 5KB web_extract result
|
||||
# doesn't contribute to the compression eval signal but bloats the
|
||||
# fixture size and PR diff readability. The compressor sees the
|
||||
# placeholder and still knows the tool was called and returned
|
||||
# something useful.
|
||||
_TOOL_OUTPUT_MAX = 2000
|
||||
|
||||
|
||||
def _maybe_truncate_tool_output(text: str, tool_name: str) -> str:
|
||||
if not text or len(text) <= _TOOL_OUTPUT_MAX:
|
||||
return text
|
||||
keep = _TOOL_OUTPUT_MAX - 200
|
||||
head = text[:keep]
|
||||
return (
|
||||
head
|
||||
+ f"\n\n[... tool output truncated for fixture — original was {len(text)} chars"
|
||||
+ (f" from {tool_name}" if tool_name else "")
|
||||
+ "]"
|
||||
)
|
||||
|
||||
|
||||
# Maintainer home-dir paths -> /home/user (\b stops the match at a word
# boundary); ~/.hermes/ is deliberately NOT rewritten — that path is universal.
_PATH_RE = re.compile(r"/home/teknium\b")
# All case variants of the maintainer handle; "teknium1" is listed first so
# the longer token wins before the bare-handle alternates can match its prefix.
_USER_RE = re.compile(r"\bteknium1\b|\bTeknium\b|\bteknium\b", re.IGNORECASE)
# Only strip scratchpads in ASSISTANT content, not tool results (might be legit)
_SCRATCH_RE = re.compile(
    r"<REASONING_SCRATCHPAD>.*?</REASONING_SCRATCHPAD>\s*", re.DOTALL
)
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
# Discord/Telegram user mention leakage in messaging-platform sessions
_USER_MENTION_RE = re.compile(r"<@\*{3}>|<@\d+>")
# Contributor emails (from git show output etc) — anything@domain.tld.
# NOTE(review): this matches every address, including noreply@github-style
# ones — all of them become the contributor placeholder after substitution.
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# "Author: Name <email>" git-show headers — rewrite the whole line
_GIT_AUTHOR_RE = re.compile(r"Author:\s*[^<\n]+<[^>]+>")
|
||||
|
||||
|
||||
def _scrub_text(text: str, *, drop_scratchpads: bool = False) -> str:
    """Run the full scrubbing pipeline over one raw string.

    drop_scratchpads only affects assistant messages — tool outputs that
    happen to contain similar markers are left alone.
    """
    if not text:
        return text
    if drop_scratchpads:
        # Remove hidden-reasoning blocks first so no later pass sees them.
        text = _THINK_RE.sub("", _SCRATCH_RE.sub("", text))
    # Identity normalisation.
    text = _PATH_RE.sub("/home/user", text)
    text = _USER_RE.sub("user", text)
    text = _USER_MENTION_RE.sub("<@user>", text)
    # Rewrite git "Author: Name <email>" lines before generic email replace
    text = _GIT_AUTHOR_RE.sub("Author: contributor <contributor@example.com>", text)
    text = _EMAIL_RE.sub("contributor@example.com", text)
    # Secret material (keys, tokens, connection strings) goes last.
    return redact_sensitive_text(text)
|
||||
|
||||
|
||||
def _content_to_str(content: Any) -> str:
|
||||
if content is None:
|
||||
return ""
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, list):
|
||||
parts = []
|
||||
for p in content:
|
||||
if isinstance(p, dict) and "text" in p:
|
||||
parts.append(p["text"])
|
||||
elif isinstance(p, str):
|
||||
parts.append(p)
|
||||
return "\n".join(parts)
|
||||
return str(content)
|
||||
|
||||
|
||||
def _scrub_tool_calls(tool_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Rebuild tool_call entries, keeping only id/type/function and
    scrubbing string arguments through the text pipeline.

    Non-dict entries are dropped; missing fields get safe defaults.
    """
    cleaned: List[Dict[str, Any]] = []
    for call in tool_calls or []:
        if not isinstance(call, dict):
            continue
        fn = call.get("function", {}) or {}
        arguments = fn.get("arguments", "")
        if isinstance(arguments, str):
            arguments = _scrub_text(arguments)
        cleaned.append(
            {
                "id": call.get("id", ""),
                "type": call.get("type", "function"),
                "function": {
                    "name": fn.get("name", ""),
                    "arguments": arguments,
                },
            }
        )
    return cleaned
|
||||
|
||||
|
||||
def _scrub_message(m: Dict[str, Any], *, first_user_paraphrase: str | None, user_turn_idx: List[int]) -> Dict[str, Any] | None:
    """Scrub one raw session message into fixture form.

    Returns None for messages that should be dropped outright (role-less
    entries and session_meta lines). user_turn_idx is a one-element counter
    shared across calls so that only the very first user turn receives
    first_user_paraphrase; later user turns go through the normal pipeline.
    """
    role = m.get("role")
    if role is None or role == "session_meta":
        return None

    content = _content_to_str(m.get("content"))

    if role == "assistant":
        # Assistant turns are the only place hidden reasoning can leak.
        content = _scrub_text(content, drop_scratchpads=True)
    elif role == "user":
        user_turn_idx[0] += 1
        if user_turn_idx[0] == 1 and first_user_paraphrase is not None:
            # Hand-written paraphrase replaces the first user turn verbatim.
            content = first_user_paraphrase
        else:
            content = _scrub_text(content)
    else:
        content = _scrub_text(content)

    # Large tool outputs are clipped after scrubbing.
    if role == "tool":
        tool_name = m.get("tool_name") or m.get("name") or ""
        content = _maybe_truncate_tool_output(content, tool_name)

    scrubbed: Dict[str, Any] = {"role": role, "content": content}

    if role == "assistant":
        calls = m.get("tool_calls") or []
        if calls:
            scrubbed["tool_calls"] = _scrub_tool_calls(calls)
    if role == "tool":
        if m.get("tool_call_id"):
            scrubbed["tool_call_id"] = m["tool_call_id"]
        if m.get("tool_name") or m.get("name"):
            scrubbed["tool_name"] = m.get("tool_name") or m.get("name")

    return scrubbed
|
||||
|
||||
|
||||
def build_fixture(
    source_file: str,
    output_name: str,
    description: str,
    first_user_paraphrase: str,
    model_guess: str,
    context_length: int,
    truncate_at: int | None = None,
) -> Dict[str, Any]:
    """Load one raw session from SESSION_DIR and build a scrubbed fixture dict.

    Parameters
    ----------
    source_file: jsonl file name under SESSION_DIR, one message per line.
    output_name: becomes the fixture "name" field (and the output file stem).
    description / model_guess / context_length: copied into the fixture.
    first_user_paraphrase: replaces the first user turn verbatim.
    truncate_at: keep only messages[:truncate_at] (None = keep all), counted
        after session_meta lines are dropped and before orphan cleanup.

    Returns the fixture dict in the DESIGN.md format.

    Raises FileNotFoundError if the source session does not exist.
    """
    src = SESSION_DIR / source_file
    raw_msgs: List[Dict[str, Any]] = []
    # Session files can contain blank or truncated trailing lines; skip any
    # line that isn't valid JSON (or isn't a message object) rather than
    # aborting the whole fixture. Narrowed from a bare `except Exception`
    # so real I/O errors still surface.
    with src.open(encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                raw_msgs.append(parsed)

    # Skip session_meta lines up front so truncate_at counts real messages
    raw_msgs = [m for m in raw_msgs if m.get("role") != "session_meta"]
    if truncate_at is not None:
        raw_msgs = raw_msgs[:truncate_at]

    # One-element counter shared across _scrub_message calls so only the
    # first user turn gets the paraphrase.
    user_turn_counter = [0]
    scrubbed: List[Dict[str, Any]] = []
    for m in raw_msgs:
        new = _scrub_message(
            m,
            first_user_paraphrase=first_user_paraphrase,
            user_turn_idx=user_turn_counter,
        )
        if new is not None:
            scrubbed.append(new)

    # Drop empty-content assistant messages that have no tool_calls
    # (artifact of scratchpad-only turns post-scrub)
    pruned: List[Dict[str, Any]] = []
    for m in scrubbed:
        if (
            m["role"] == "assistant"
            and not (m.get("content") or "").strip()
            and not m.get("tool_calls")
        ):
            continue
        pruned.append(m)
    # Trim trailing orphan tool messages (no matching assistant)
    while pruned and pruned[-1]["role"] == "tool":
        pruned.pop()
    scrubbed = pruned

    # Inject a synthetic public-safe system message so the compressor has
    # a head to anchor on. The real system prompts embed personality and
    # platform-specific content we don't want checked in.
    system_msg = {
        "role": "system",
        "content": (
            "You are a helpful AI coding assistant with access to tools "
            "(terminal, file editing, search, web, etc.). You operate in a "
            "conversational loop: the user gives you a task, you call tools "
            "to accomplish it, and you report back concisely."
        ),
    }
    if scrubbed and scrubbed[0].get("role") == "system":
        scrubbed[0] = system_msg
    else:
        scrubbed.insert(0, system_msg)

    fixture = {
        "name": output_name,
        "description": description,
        "model": model_guess,
        "context_length": context_length,
        "source": f"~/.hermes/sessions/{source_file}",
        "truncated_to": truncate_at,
        # Use an aware UTC clock: the "Z" suffix claims UTC, so stamping
        # naive local time here (the old behavior) produced a wrong instant.
        "scrubbed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "scrubbing_passes": [
            "redact_sensitive_text (agent.redact)",
            "username paths replaced with /home/user",
            "personal handles (all case variants of the maintainer name) replaced with 'user'",
            "email addresses replaced with contributor@example.com",
            "git 'Author: Name <addr>' header lines normalised",
            "reasoning scratchpad blocks stripped from assistant content",
            "think tag blocks stripped from assistant content",
            "messaging-platform user mentions replaced with <@user>",
            "first user message paraphrased to remove personal voice",
            "subsequent user messages kept verbatim (after above redactions)",
            "system prompt replaced with generic public-safe placeholder",
            "orphan empty-assistant messages and trailing tool messages dropped",
            f"tool outputs longer than {_TOOL_OUTPUT_MAX} chars truncated with a note",
        ],
        "messages": scrubbed,
    }
    return fixture
|
||||
|
||||
|
||||
def main() -> int:
    """Rebuild every fixture listed in SPECS and report size/message counts.

    Returns 0 on success (any failure raises and propagates to sys.exit).
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    for spec in SPECS:
        source_file, output_name, description, paraphrase, model, ctx, truncate = spec
        fixture = build_fixture(
            source_file=source_file,
            output_name=output_name,
            description=description,
            first_user_paraphrase=paraphrase,
            model_guess=model,
            context_length=ctx,
            truncate_at=truncate,
        )
        out_path = FIXTURES_DIR / f"{output_name}.json"
        # Explicit UTF-8: fixtures contain non-ASCII (em dashes, arrows) and
        # are dumped with ensure_ascii=False, so the platform default
        # encoding (e.g. cp1252 on Windows) would crash or corrupt them.
        with out_path.open("w", encoding="utf-8") as fh:
            json.dump(fixture, fh, indent=2, ensure_ascii=False)
            fh.write("\n")  # checked-in text files should end with a newline
        size_kb = out_path.stat().st_size / 1024
        print(f" {output_name}.json {size_kb:.1f} KB {len(fixture['messages'])} msgs")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue