mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Adds scripts/compression_eval/ with a design doc, README, a placeholder run_eval.py, and three checked-in scrubbed session fixtures. No working eval yet — PR is for design review before implementation. Motivation: we edit agent/context_compressor.py prompts and _template_sections by hand and ship without any automated check that compression still preserves file paths, error codes, or the active task. Factory.ai's Dec 2025 write-up (https://factory.ai/news/evaluating-compression) documents a probe-based eval scored on six dimensions. We adopt the methodology; we do not publish scores. Contents: - DESIGN.md — fixture format, probe format (recall / artifact / continuation / decision), six grading dimensions, report format, cost expectations, scrubber pipeline, open questions, and staged follow-up PR plan. - README.md — short 'what this is / when to run it' page. - run_eval.py — placeholder that prints 'not implemented, see DESIGN.md' and exits 1. - scrub_fixtures.py — reproducible pipeline that converts real sessions from ~/.hermes/sessions/*.jsonl into public-safe JSON fixtures. Applies: redact_sensitive_text, username path normalization, personal handle scrubbing, email and git-author normalization, reasoning scratchpad / <think> stripping, platform user-mention scrubbing, first-user paraphrase, system-prompt placeholder, orphan-message pruning, and tool-output size truncation for fixture readability. - fixtures/feature-impl-context-priority.json — 75 msgs / ~17k tokens. Investigate → patch → test → PR → merge. - fixtures/debug-session-feishu-id-model.json — 59 msgs / ~13k tokens. PR triage + upstream docs + decision. - fixtures/config-build-competitive-scouts.json — 61 msgs / ~23k tokens. Iterative config accumulation (11 cron jobs across 7 weekdays). PII audit: zero matches across the three fixtures for the maintainer's handle (all case variants), personal email domains, and known contributor emails. Only 'contributor@example.com' placeholder remains. Why scripts/: requires API credentials, costs ~\$1 per run, LLM-graded (non-deterministic), must not run in CI. scripts/sample_and_compress.py is the existing precedent for offline credentialed tooling.
370 lines
14 KiB
Python
Executable file
370 lines
14 KiB
Python
Executable file
"""One-shot fixture scrubber for scripts/compression_eval/fixtures/.
|
|
|
|
Source: ~/.hermes/sessions/<file>.jsonl
|
|
Output: .worktrees/.../scripts/compression_eval/fixtures/<name>.json
|
|
|
|
Scrubbing passes:
|
|
1. agent.redact.redact_sensitive_text — API keys, tokens, connection strings
|
|
2. Username paths — /home/teknium/ → /home/user/, ~/.hermes/ preserved as-is
|
|
(that path is universal)
|
|
3. Personal handles — "Teknium"/"teknium"/"teknium1" → "user"
|
|
4. Reasoning scratchpads — strip <REASONING_SCRATCHPAD>...</REASONING_SCRATCHPAD>
|
|
blocks and <think>...</think> tags (personality leakage risk)
|
|
5. session_meta line — drop entirely, we only need the messages
|
|
6. User message personality — lightly paraphrase the first user message to keep
|
|
task intent while removing "vibe"; subsequent user turns kept verbatim
|
|
since they're short instructions
|
|
|
|
The fixture format matches DESIGN.md:
|
|
{
|
|
"name": "...",
|
|
"description": "...",
|
|
"model": "...", # best guess from original session
|
|
"context_length": 200000,
|
|
"messages": [...], # OpenAI-format, only role/content/tool_calls/tool_call_id/tool_name
|
|
"notes": "Scrubbed from ~/.hermes/sessions/... on 2026-04-24"
|
|
}
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List
|
|
|
|
# Resolve the hermes-agent checkout relative to this script so agent.redact
|
|
# imports cleanly whether we run from a worktree or a main clone.
|
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
|
sys.path.insert(0, str(_REPO_ROOT))
|
|
from agent.redact import redact_sensitive_text # noqa: E402
|
|
|
|
|
|
SESSION_DIR = Path.home() / ".hermes" / "sessions"
|
|
# Resolve FIXTURES_DIR relative to this script so the scrubber runs the
|
|
# same way inside a worktree, a main checkout, or from a contributor's
|
|
# clone at a different path.
|
|
FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
|
|
|
|
# (source_file, output_name, description, user_first_paraphrase, model_guess, context_length, truncate_at)
|
|
# truncate_at: keep messages[:truncate_at] (None = keep all). Applied BEFORE
|
|
# orphan-empty-assistant cleanup.
|
|
SPECS = [
|
|
(
|
|
"20260321_060441_fef7be92.jsonl",
|
|
"feature-impl-context-priority",
|
|
"~75-turn feature-impl: user asks how multiple project-context files "
|
|
"(.hermes.md / AGENTS.md / CLAUDE.md / .cursorrules) are handled when "
|
|
"all are present; agent investigates the codebase, designs a priority "
|
|
"order, patches the loader + tests, live-tests with a scenario "
|
|
"directory, commits to a feature branch, opens a PR, and merges after "
|
|
"approval. Exercises investigate → decide → implement → verify → "
|
|
"ship flow with clear artifact trail (2 files modified, 1 PR).",
|
|
(
|
|
"If .hermes.md, AGENTS.md, CLAUDE.md, and .cursorrules all exist in "
|
|
"the same directory, does the agent load all of them or pick one? "
|
|
"Use the hermes-agent-dev skill to check."
|
|
),
|
|
"anthropic/claude-sonnet-4.6",
|
|
200000,
|
|
74, # cut at "Merged and pulled. Main is current." — drops trailing unrelated cron-delivery messages
|
|
),
|
|
(
|
|
"20260412_233741_3f2119a8.jsonl",
|
|
"debug-session-feishu-id-model",
|
|
"~60-turn debug/triage PR-review session: a third-party bug report "
|
|
"says the gateway's Feishu adapter misuses the open_id / union_id / "
|
|
"user_id identity model (open_id is app-scoped, not the bot's "
|
|
"canonical ID). An open community PR (#8388) tries to fix it. Agent "
|
|
"reviews the PR against current main, fetches upstream Feishu/Lark "
|
|
"identity docs, and produces a decision. Exercises long tool-heavy "
|
|
"context with PR diffs, upstream docs, and a clear decision at the "
|
|
"end — the classic 'can the summary still name the PR number, the "
|
|
"root cause, and the decision?' scenario.",
|
|
(
|
|
"A community user reports the Feishu/Lark adapter gets the identity "
|
|
"model wrong — open_id is app-scoped, not the bot's canonical ID. "
|
|
"There's an open PR #8388 trying to fix it. Use the hermes-agent-dev "
|
|
"skill and the pr-triage-salvage skill to review it."
|
|
),
|
|
"anthropic/claude-sonnet-4.6",
|
|
200000,
|
|
58, # end at "Here's my review: ..." — clean decision point before the "close it, implement cleaner" pivot
|
|
),
|
|
(
|
|
"20260328_160817_77bd258b.jsonl",
|
|
"config-build-competitive-scouts",
|
|
"~60-turn iterative config/build session: user wants a set of weekly "
|
|
"cron jobs that scan competing AI coding agents (openclaw, nanoclaw, "
|
|
"ironclaw, codex, opencode, claude-code, kilo-code, gemini-cli, "
|
|
"cline, aider, roo) for merged PRs or web updates worth porting to "
|
|
"hermes-agent. User adds one target per turn; agent creates each cron "
|
|
"job and re-states the accumulated schedule. Exercises artifact trail "
|
|
"(which jobs are configured, which days) and iterative state "
|
|
"accumulation — the canonical case for iterative-merge summarization.",
|
|
(
|
|
"Set up a cron job for the agent every Sunday to scan all PRs "
|
|
"merged into openclaw that week, decide which are worth adding to "
|
|
"hermes-agent, and open PRs porting those features."
|
|
),
|
|
"anthropic/claude-sonnet-4.6",
|
|
200000,
|
|
None,
|
|
),
|
|
]
|
|
|
|
|
|
# Tool outputs beyond this size in chars are replaced with a short
|
|
# placeholder — a 16KB skill_view dump or 5KB web_extract result
|
|
# doesn't contribute to the compression eval signal but bloats the
|
|
# fixture size and PR diff readability. The compressor sees the
|
|
# placeholder and still knows the tool was called and returned
|
|
# something useful.
|
|
_TOOL_OUTPUT_MAX = 2000
|
|
|
|
|
|
def _maybe_truncate_tool_output(text: str, tool_name: str) -> str:
|
|
if not text or len(text) <= _TOOL_OUTPUT_MAX:
|
|
return text
|
|
keep = _TOOL_OUTPUT_MAX - 200
|
|
head = text[:keep]
|
|
return (
|
|
head
|
|
+ f"\n\n[... tool output truncated for fixture — original was {len(text)} chars"
|
|
+ (f" from {tool_name}" if tool_name else "")
|
|
+ "]"
|
|
)
|
|
|
|
|
|
_PATH_RE = re.compile(r"/home/teknium\b")
|
|
_USER_RE = re.compile(r"\bteknium1\b|\bTeknium\b|\bteknium\b", re.IGNORECASE)
|
|
# Only strip scratchpads in ASSISTANT content, not tool results (might be legit)
|
|
_SCRATCH_RE = re.compile(
|
|
r"<REASONING_SCRATCHPAD>.*?</REASONING_SCRATCHPAD>\s*", re.DOTALL
|
|
)
|
|
_THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
|
# Discord/Telegram user mention leakage in messaging-platform sessions
|
|
_USER_MENTION_RE = re.compile(r"<@\*{3}>|<@\d+>")
|
|
# Contributor emails (from git show output etc) — anything@domain.tld
|
|
# Keep noreply@github-style placeholders obvious; real personal emails get
|
|
# replaced with a contributor placeholder.
|
|
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
|
|
# "Author: Name <email>" git-show headers — rewrite the whole line
|
|
_GIT_AUTHOR_RE = re.compile(r"Author:\s*[^<\n]+<[^>]+>")
|
|
|
|
|
|
def _scrub_text(text: str, *, drop_scratchpads: bool = False) -> str:
|
|
"""Apply the pipeline to a raw text string.
|
|
|
|
drop_scratchpads only affects assistant messages — tool outputs that
|
|
happen to contain similar markers are left alone.
|
|
"""
|
|
if not text:
|
|
return text
|
|
if drop_scratchpads:
|
|
text = _SCRATCH_RE.sub("", text)
|
|
text = _THINK_RE.sub("", text)
|
|
text = _PATH_RE.sub("/home/user", text)
|
|
text = _USER_RE.sub("user", text)
|
|
text = _USER_MENTION_RE.sub("<@user>", text)
|
|
# Rewrite git "Author: Name <email>" lines before generic email replace
|
|
text = _GIT_AUTHOR_RE.sub("Author: contributor <contributor@example.com>", text)
|
|
text = _EMAIL_RE.sub("contributor@example.com", text)
|
|
text = redact_sensitive_text(text)
|
|
return text
|
|
|
|
|
|
def _content_to_str(content: Any) -> str:
|
|
if content is None:
|
|
return ""
|
|
if isinstance(content, str):
|
|
return content
|
|
if isinstance(content, list):
|
|
parts = []
|
|
for p in content:
|
|
if isinstance(p, dict) and "text" in p:
|
|
parts.append(p["text"])
|
|
elif isinstance(p, str):
|
|
parts.append(p)
|
|
return "\n".join(parts)
|
|
return str(content)
|
|
|
|
|
|
def _scrub_tool_calls(tool_calls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
out = []
|
|
for tc in tool_calls or []:
|
|
if not isinstance(tc, dict):
|
|
continue
|
|
fn = tc.get("function", {}) or {}
|
|
args = fn.get("arguments", "")
|
|
if isinstance(args, str):
|
|
args = _scrub_text(args)
|
|
new_tc = {
|
|
"id": tc.get("id", ""),
|
|
"type": tc.get("type", "function"),
|
|
"function": {
|
|
"name": fn.get("name", ""),
|
|
"arguments": args,
|
|
},
|
|
}
|
|
out.append(new_tc)
|
|
return out
|
|
|
|
|
|
def _scrub_message(m: Dict[str, Any], *, first_user_paraphrase: str | None, user_turn_idx: List[int]) -> Dict[str, Any] | None:
|
|
role = m.get("role")
|
|
if role in (None, "session_meta"):
|
|
return None
|
|
|
|
content = _content_to_str(m.get("content"))
|
|
|
|
if role == "assistant":
|
|
content = _scrub_text(content, drop_scratchpads=True)
|
|
elif role == "user":
|
|
# Use paraphrase for the very first user turn only
|
|
user_turn_idx[0] += 1
|
|
if user_turn_idx[0] == 1 and first_user_paraphrase is not None:
|
|
content = first_user_paraphrase
|
|
else:
|
|
content = _scrub_text(content)
|
|
else:
|
|
content = _scrub_text(content)
|
|
# Truncate large tool outputs
|
|
if role == "tool":
|
|
tn = m.get("tool_name") or m.get("name") or ""
|
|
content = _maybe_truncate_tool_output(content, tn)
|
|
|
|
new_msg: Dict[str, Any] = {"role": role, "content": content}
|
|
|
|
if role == "assistant":
|
|
tcs = m.get("tool_calls") or []
|
|
if tcs:
|
|
new_msg["tool_calls"] = _scrub_tool_calls(tcs)
|
|
if role == "tool":
|
|
if m.get("tool_call_id"):
|
|
new_msg["tool_call_id"] = m["tool_call_id"]
|
|
if m.get("tool_name") or m.get("name"):
|
|
new_msg["tool_name"] = m.get("tool_name") or m.get("name")
|
|
|
|
return new_msg
|
|
|
|
|
|
def build_fixture(
|
|
source_file: str,
|
|
output_name: str,
|
|
description: str,
|
|
first_user_paraphrase: str,
|
|
model_guess: str,
|
|
context_length: int,
|
|
truncate_at: int | None = None,
|
|
) -> Dict[str, Any]:
|
|
src = SESSION_DIR / source_file
|
|
raw_msgs: List[Dict[str, Any]] = []
|
|
with src.open() as fh:
|
|
for line in fh:
|
|
try:
|
|
raw_msgs.append(json.loads(line))
|
|
except Exception:
|
|
pass
|
|
|
|
# Skip session_meta lines up front so truncate_at counts real messages
|
|
raw_msgs = [m for m in raw_msgs if m.get("role") != "session_meta"]
|
|
if truncate_at is not None:
|
|
raw_msgs = raw_msgs[:truncate_at]
|
|
|
|
user_turn_counter = [0]
|
|
scrubbed: List[Dict[str, Any]] = []
|
|
for m in raw_msgs:
|
|
new = _scrub_message(
|
|
m,
|
|
first_user_paraphrase=first_user_paraphrase,
|
|
user_turn_idx=user_turn_counter,
|
|
)
|
|
if new is not None:
|
|
scrubbed.append(new)
|
|
|
|
# Drop empty-content assistant messages that have no tool_calls
|
|
# (artifact of scratchpad-only turns post-scrub)
|
|
pruned: List[Dict[str, Any]] = []
|
|
for m in scrubbed:
|
|
if (
|
|
m["role"] == "assistant"
|
|
and not (m.get("content") or "").strip()
|
|
and not m.get("tool_calls")
|
|
):
|
|
continue
|
|
pruned.append(m)
|
|
# Trim trailing orphan tool messages (no matching assistant)
|
|
while pruned and pruned[-1]["role"] == "tool":
|
|
pruned.pop()
|
|
scrubbed = pruned
|
|
|
|
# Inject a synthetic public-safe system message so the compressor has
|
|
# a head to anchor on. The real system prompts embed personality and
|
|
# platform-specific content we don't want checked in.
|
|
system_msg = {
|
|
"role": "system",
|
|
"content": (
|
|
"You are a helpful AI coding assistant with access to tools "
|
|
"(terminal, file editing, search, web, etc.). You operate in a "
|
|
"conversational loop: the user gives you a task, you call tools "
|
|
"to accomplish it, and you report back concisely."
|
|
),
|
|
}
|
|
if scrubbed and scrubbed[0].get("role") == "system":
|
|
scrubbed[0] = system_msg
|
|
else:
|
|
scrubbed.insert(0, system_msg)
|
|
|
|
fixture = {
|
|
"name": output_name,
|
|
"description": description,
|
|
"model": model_guess,
|
|
"context_length": context_length,
|
|
"source": f"~/.hermes/sessions/{source_file}",
|
|
"truncated_to": truncate_at,
|
|
"scrubbed_at": datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
"scrubbing_passes": [
|
|
"redact_sensitive_text (agent.redact)",
|
|
"username paths replaced with /home/user",
|
|
"personal handles (all case variants of the maintainer name) replaced with 'user'",
|
|
"email addresses replaced with contributor@example.com",
|
|
"git 'Author: Name <addr>' header lines normalised",
|
|
"reasoning scratchpad blocks stripped from assistant content",
|
|
"think tag blocks stripped from assistant content",
|
|
"messaging-platform user mentions replaced with <@user>",
|
|
"first user message paraphrased to remove personal voice",
|
|
"subsequent user messages kept verbatim (after above redactions)",
|
|
"system prompt replaced with generic public-safe placeholder",
|
|
"orphan empty-assistant messages and trailing tool messages dropped",
|
|
f"tool outputs longer than {_TOOL_OUTPUT_MAX} chars truncated with a note",
|
|
],
|
|
"messages": scrubbed,
|
|
}
|
|
return fixture
|
|
|
|
|
|
def main() -> int:
|
|
FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
|
|
for spec in SPECS:
|
|
source_file, output_name, description, paraphrase, model, ctx, truncate = spec
|
|
fixture = build_fixture(
|
|
source_file=source_file,
|
|
output_name=output_name,
|
|
description=description,
|
|
first_user_paraphrase=paraphrase,
|
|
model_guess=model,
|
|
context_length=ctx,
|
|
truncate_at=truncate,
|
|
)
|
|
out_path = FIXTURES_DIR / f"{output_name}.json"
|
|
with out_path.open("w") as fh:
|
|
json.dump(fixture, fh, indent=2, ensure_ascii=False)
|
|
size_kb = out_path.stat().st_size / 1024
|
|
print(f" {output_name}.json {size_kb:.1f} KB {len(fixture['messages'])} msgs")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|