mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-28 11:32:22 +00:00
Merge pull request #52285 from NousResearch/bb/verify-ledger
feat(agent): record coding verification evidence
This commit is contained in:
commit
da0320bf40
4 changed files with 933 additions and 7 deletions
313
tests/agent/test_verification_evidence.py
Normal file
313
tests/agent/test_verification_evidence.py
Normal file
|
|
@@ -0,0 +1,313 @@
|
|||
import json
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from agent.verification_evidence import (
|
||||
classify_verification_command,
|
||||
mark_workspace_edited,
|
||||
record_terminal_result,
|
||||
verification_status,
|
||||
)
|
||||
|
||||
|
||||
def _node_project(root: Path) -> None:
    """Populate *root* with a minimal pnpm-style Node project fixture.

    Writes a ``package.json`` declaring ``test``, ``lint`` and ``dev``
    scripts, an empty ``pnpm-lock.yaml``, and a ``scripts/run_tests.sh``
    stub that contains only a shebang line.
    """
    manifest = {"scripts": {"test": "vitest", "lint": "eslint .", "dev": "vite"}}
    (root / "package.json").write_text(json.dumps(manifest))
    (root / "pnpm-lock.yaml").write_text("")

    # The runner script lives in its own directory, which must exist first.
    script_dir = root / "scripts"
    script_dir.mkdir()
    (script_dir / "run_tests.sh").write_text("#!/bin/sh\n")
|
||||
|
||||
|
||||
def _python_project(root: Path) -> None:
    """Create a ``pyproject.toml`` in *root* holding only a pytest options table header."""
    pyproject = root / "pyproject.toml"
    pyproject.write_text("[tool.pytest.ini_options]\n")
|
||||
|
||||
|
||||
def test_classifies_targeted_project_verify_command(tmp_path, monkeypatch):
    """The project test script run against one test file is targeted, passing test evidence."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    command = "scripts/run_tests.sh tests/test_widget.py -q"
    result = classify_verification_command(
        command,
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
        output="1 passed",
    )

    assert result is not None
    # The trailing test path and flag are not part of the canonical command.
    assert result.canonical_command == "scripts/run_tests.sh"
    assert result.kind == "test"
    assert result.scope == "targeted"
    assert result.status == "passed"
|
||||
|
||||
|
||||
def test_classifies_python_module_pytest_as_detected_pytest(tmp_path, monkeypatch):
    """``python -m pytest <node id>`` with exit code 1 is failed, targeted pytest evidence."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _python_project(tmp_path)

    command = "python -m pytest tests/test_calc.py::test_even -q"
    result = classify_verification_command(
        command,
        cwd=tmp_path,
        session_id="s1",
        exit_code=1,
        output="failed",
    )

    assert result is not None
    assert result.canonical_command == "pytest"
    assert result.kind == "test"
    assert result.scope == "targeted"
    # A non-zero exit code is expected to surface as a failed status.
    assert result.status == "failed"
|
||||
|
||||
|
||||
def test_records_passed_then_marks_stale_after_edit(tmp_path, monkeypatch):
    """A recorded pass reads as ``passed`` until an edit is marked, then as ``stale``."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    recorded = record_terminal_result(
        command="scripts/run_tests.sh",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
        output="all green",
    )
    assert recorded is not None

    before_edit = verification_status(session_id="s1", cwd=tmp_path)
    assert before_edit["status"] == "passed"

    edited_path = str(tmp_path / "src" / "app.ts")
    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[edited_path])

    after_edit = verification_status(session_id="s1", cwd=tmp_path)
    assert after_edit["status"] == "stale"
    # The edited path is reported back exactly as it was passed in.
    assert after_edit["changed_paths"] == [edited_path]
|
||||
|
||||
|
||||
def test_lint_and_typecheck_are_not_reported_as_full_tests(tmp_path, monkeypatch):
    """``pnpm run lint`` is full-scope lint; ``pnpm run test -- <file>`` is a targeted test."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    # Both classifications share the same workspace, session and exit code.
    shared = {"cwd": tmp_path, "session_id": "s1", "exit_code": 0}
    lint_evidence = classify_verification_command("pnpm run lint", **shared)
    test_evidence = classify_verification_command(
        "pnpm run test -- tests/button.test.tsx",
        **shared,
    )

    assert lint_evidence is not None
    assert lint_evidence.kind == "lint"
    assert lint_evidence.scope == "full"

    assert test_evidence is not None
    assert test_evidence.kind == "test"
    assert test_evidence.scope == "targeted"
|
||||
|
||||
|
||||
def test_package_script_shorthand_matches_canonical_verify_command(tmp_path, monkeypatch):
    """The ``pnpm test`` shorthand is canonicalised to ``pnpm run test``."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    shorthand = "pnpm test -- tests/button.test.tsx"
    result = classify_verification_command(
        shorthand,
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )

    assert result is not None
    assert result.canonical_command == "pnpm run test"
    assert result.scope == "targeted"
|
||||
|
||||
|
||||
def test_shell_wrappers_match_but_echo_does_not(tmp_path, monkeypatch):
    """``env ... bash <script>`` counts as verification; echoing the same text does not."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    shared = {"cwd": tmp_path, "session_id": "s1", "exit_code": 0}
    via_wrapper = classify_verification_command(
        "env CI=1 bash scripts/run_tests.sh tests/test_widget.py",
        **shared,
    )
    via_echo = classify_verification_command(
        "echo scripts/run_tests.sh tests/test_widget.py",
        **shared,
    )

    assert via_wrapper is not None
    assert via_wrapper.canonical_command == "scripts/run_tests.sh"
    assert via_wrapper.scope == "targeted"
    # Merely printing the command line must not produce evidence.
    assert via_echo is None
|
||||
|
||||
|
||||
def test_uv_run_pytest_matches_detected_pytest(tmp_path, monkeypatch):
    """``uv run pytest <file>`` is classified as targeted evidence for canonical ``pytest``."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _python_project(tmp_path)

    command = "uv run pytest tests/test_calc.py"
    result = classify_verification_command(
        command,
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )

    assert result is not None
    assert result.canonical_command == "pytest"
    assert result.scope == "targeted"
|
||||
|
||||
|
||||
def test_status_is_unverified_without_evidence(tmp_path, monkeypatch):
    """With nothing recorded, the reported status is ``unverified``."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    report = verification_status(session_id="s1", cwd=tmp_path)
    assert report["status"] == "unverified"
|
||||
|
||||
|
||||
def test_edit_without_prior_evidence_stays_unverified(tmp_path, monkeypatch):
    """An edit with no earlier evidence leaves the status ``unverified`` but lists the path."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    edited_path = str(tmp_path / "src" / "app.ts")
    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[edited_path])

    report = verification_status(session_id="s1", cwd=tmp_path)
    # There was never a pass to invalidate, so the status is not "stale".
    assert report["status"] == "unverified"
    assert report["changed_paths"] == [edited_path]
|
||||
|
||||
|
||||
def test_file_tool_stales_evidence_by_session_id_for_absolute_edit(tmp_path, monkeypatch):
    """Writing via the file tool stales evidence under ``session_id``, not ``task_id``."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    source_dir = tmp_path / "src"
    target = source_dir / "app.ts"
    source_dir.mkdir()

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="conversation",
        exit_code=0,
        output="green",
    )

    # NOTE(review): imported lazily inside the test, presumably so the module
    # loads after HERMES_HOME has been patched - confirm before hoisting.
    from tools.file_tools import write_file_tool

    raw_response = write_file_tool(
        str(target),
        "export const ok = true\n",
        task_id="turn",
        session_id="conversation",
    )
    response = json.loads(raw_response)

    assert response["files_modified"] == [str(target.resolve())]

    conversation_report = verification_status(session_id="conversation", cwd=tmp_path)
    assert conversation_report["status"] == "stale"

    # The task id must not be used as the evidence key.
    turn_report = verification_status(session_id="turn", cwd=tmp_path)
    assert turn_report["status"] == "unverified"
|
||||
|
||||
|
||||
def test_recording_prunes_old_events_but_keeps_latest_state(tmp_path, monkeypatch):
    """Recording 120 results leaves 100 event rows, with the newest one retained.

    After 120 passing ``pnpm test`` recordings for one session, the test
    asserts that ``verification_events`` holds exactly 100 rows, that the
    row with the highest ``id`` carries the last output summary, and that
    the session still reports ``passed``.
    """
    # Local import keeps this block self-contained; ``closing`` guarantees the
    # connection is closed. sqlite3's own connection context manager only
    # commits or rolls back the transaction and leaves the handle open.
    from contextlib import closing

    home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(home))
    _node_project(tmp_path)

    for index in range(120):
        record_terminal_result(
            command="pnpm test",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output=f"green {index}",
        )

    with closing(sqlite3.connect(home / "verification_evidence.db")) as conn:
        event_count = conn.execute("SELECT COUNT(*) FROM verification_events").fetchone()[0]
        latest_summary = conn.execute(
            """
            SELECT output_summary
            FROM verification_events
            ORDER BY id DESC
            LIMIT 1
            """
        ).fetchone()[0]

    assert event_count == 100
    assert latest_summary == "green 119"
    assert verification_status(session_id="s1", cwd=tmp_path)["status"] == "passed"
|
||||
|
||||
|
||||
def test_recording_expires_old_current_evidence(tmp_path, monkeypatch):
    """Evidence backdated by 31 days is gone once a newer result is recorded.

    The test records a pass for ``old-session``, rewrites every event's
    ``created_at`` to 31 days ago, then records a pass for ``new-session``.
    Afterwards ``old-session`` must report ``unverified`` and have no rows
    left in ``verification_events``, while ``new-session`` reports ``passed``.
    """
    # Local import keeps this block self-contained; ``closing`` guarantees each
    # connection is closed. sqlite3's own connection context manager only
    # commits or rolls back the transaction and leaves the handle open.
    from contextlib import closing

    home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(home))
    _node_project(tmp_path)
    db_path = home / "verification_evidence.db"

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="old-session",
        exit_code=0,
        output="old green",
    )

    # NOTE(review): 31 days presumably exceeds the ledger's retention window;
    # confirm against the implementation.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=31)).isoformat()
    with closing(sqlite3.connect(db_path)) as conn:
        conn.execute("UPDATE verification_events SET created_at = ?", (cutoff,))
        # ``closing`` does not commit, so the explicit commit is required.
        conn.commit()

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="new-session",
        exit_code=0,
        output="new green",
    )

    assert verification_status(session_id="old-session", cwd=tmp_path)["status"] == "unverified"
    assert verification_status(session_id="new-session", cwd=tmp_path)["status"] == "passed"

    with closing(sqlite3.connect(db_path)) as conn:
        old_rows = conn.execute(
            "SELECT COUNT(*) FROM verification_events WHERE session_id = 'old-session'"
        ).fetchone()[0]
    assert old_rows == 0
|
||||
|
||||
|
||||
def test_recording_expires_old_edit_only_state(tmp_path, monkeypatch):
    """Edit-only state backdated by 31 days is cleared by a later recording.

    The test marks an edit for ``old-session`` with no evidence recorded,
    rewrites ``verification_state.last_edit_at`` to 31 days ago, then
    records a pass for ``new-session``. Afterwards ``old-session`` must
    report ``unverified`` with an empty ``changed_paths`` list.
    """
    # Local import keeps this block self-contained; ``closing`` guarantees the
    # connection is closed. sqlite3's own connection context manager only
    # commits or rolls back the transaction and leaves the handle open.
    from contextlib import closing

    home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(home))
    _node_project(tmp_path)

    mark_workspace_edited(
        session_id="old-session",
        cwd=tmp_path,
        paths=[str(tmp_path / "src" / "app.ts")],
    )

    # NOTE(review): 31 days presumably exceeds the ledger's retention window;
    # confirm against the implementation.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=31)).isoformat()
    with closing(sqlite3.connect(home / "verification_evidence.db")) as conn:
        conn.execute("UPDATE verification_state SET last_edit_at = ?", (cutoff,))
        # ``closing`` does not commit, so the explicit commit is required.
        conn.commit()

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="new-session",
        exit_code=0,
        output="new green",
    )

    status = verification_status(session_id="old-session", cwd=tmp_path)
    assert status["status"] == "unverified"
    assert status["changed_paths"] == []
|
||||
Loading…
Add table
Add a link
Reference in a new issue