mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
Exercise command classification, session scoping, stale edits, bounded retention, and natural expiry for recorded verification evidence.
313 lines
9.4 KiB
Python
313 lines
9.4 KiB
Python
import json
|
|
import sqlite3
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
|
|
from agent.verification_evidence import (
|
|
classify_verification_command,
|
|
mark_workspace_edited,
|
|
record_terminal_result,
|
|
verification_status,
|
|
)
|
|
|
|
|
|
def _node_project(root: Path) -> None:
    """Populate *root* with a minimal pnpm-style Node project fixture.

    Writes a ``package.json`` declaring ``test``, ``lint`` and ``dev``
    scripts, an empty ``pnpm-lock.yaml``, and a ``scripts/run_tests.sh``
    that contains only a shebang line.
    """
    package_scripts = {"test": "vitest", "lint": "eslint .", "dev": "vite"}
    manifest = json.dumps({"scripts": package_scripts})
    (root / "package.json").write_text(manifest)
    (root / "pnpm-lock.yaml").write_text("")

    script_dir = root / "scripts"
    script_dir.mkdir()
    (script_dir / "run_tests.sh").write_text("#!/bin/sh\n")
|
|
|
|
|
|
def _python_project(root: Path) -> None:
    """Write a stub ``pyproject.toml`` with a pytest options table into *root*."""
    pyproject = root / "pyproject.toml"
    pyproject.write_text("[tool.pytest.ini_options]\n")
|
|
|
|
|
|
def test_classifies_targeted_project_verify_command(tmp_path, monkeypatch):
    """Running the project's test script on one file is a passed, targeted test.

    The script is invoked directly with a test-file argument and exit code 0;
    the classifier is expected to report the bare script path as the
    canonical command.
    """
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    result = classify_verification_command(
        "scripts/run_tests.sh tests/test_widget.py -q",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
        output="1 passed",
    )

    assert result is not None
    assert result.canonical_command == "scripts/run_tests.sh"
    assert result.kind == "test"
    assert result.scope == "targeted"
    assert result.status == "passed"
|
|
|
|
|
|
def test_classifies_python_module_pytest_as_detected_pytest(tmp_path, monkeypatch):
    """``python -m pytest <node-id>`` canonicalises to ``pytest``.

    The run targets a single test node and exits with code 1, so the evidence
    is expected to be a failed, targeted test.
    """
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _python_project(tmp_path)

    result = classify_verification_command(
        "python -m pytest tests/test_calc.py::test_even -q",
        cwd=tmp_path,
        session_id="s1",
        exit_code=1,
        output="failed",
    )

    assert result is not None
    assert result.canonical_command == "pytest"
    assert result.kind == "test"
    assert result.scope == "targeted"
    assert result.status == "failed"
|
|
|
|
|
|
def test_records_passed_then_marks_stale_after_edit(tmp_path, monkeypatch):
    """A recorded pass turns ``stale`` once the same session edits a file.

    After the edit, the status must also list the edited path under
    ``changed_paths``.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    edited_path = str(tmp_path / "src" / "app.ts")

    recorded = record_terminal_result(
        command="scripts/run_tests.sh",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
        output="all green",
    )

    assert recorded is not None
    before_edit = verification_status(session_id="s1", cwd=tmp_path)
    assert before_edit["status"] == "passed"

    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[edited_path])

    after_edit = verification_status(session_id="s1", cwd=tmp_path)
    assert after_edit["status"] == "stale"
    assert after_edit["changed_paths"] == [edited_path]
|
|
|
|
|
|
def test_lint_and_typecheck_are_not_reported_as_full_tests(tmp_path, monkeypatch):
    """The lint script is classified as ``lint``, distinct from a test run.

    ``pnpm run lint`` must come back as a full-scope lint, while
    ``pnpm run test`` with a file argument must come back as a targeted test.
    NOTE(review): despite the name, only lint is exercised here; no typecheck
    command is classified.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)

    lint_evidence = classify_verification_command(
        "pnpm run lint",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )
    test_evidence = classify_verification_command(
        "pnpm run test -- tests/button.test.tsx",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )

    assert lint_evidence is not None
    assert lint_evidence.kind == "lint"
    assert lint_evidence.scope == "full"

    assert test_evidence is not None
    assert test_evidence.kind == "test"
    assert test_evidence.scope == "targeted"
|
|
|
|
|
|
def test_package_script_shorthand_matches_canonical_verify_command(tmp_path, monkeypatch):
    """``pnpm test`` shorthand canonicalises to ``pnpm run test``.

    A file argument after ``--`` makes the run targeted.
    """
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    result = classify_verification_command(
        "pnpm test -- tests/button.test.tsx",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )

    assert result is not None
    assert result.canonical_command == "pnpm run test"
    assert result.scope == "targeted"
|
|
|
|
|
|
def test_shell_wrappers_match_but_echo_does_not(tmp_path, monkeypatch):
    """``env``/``bash`` wrappers still classify; ``echo`` of the command does not.

    The wrapped invocation must resolve to the bare script path as a targeted
    run, whereas echoing the same text must yield no evidence at all.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)

    via_wrapper = classify_verification_command(
        "env CI=1 bash scripts/run_tests.sh tests/test_widget.py",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )
    via_echo = classify_verification_command(
        "echo scripts/run_tests.sh tests/test_widget.py",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )

    assert via_wrapper is not None
    assert via_wrapper.canonical_command == "scripts/run_tests.sh"
    assert via_wrapper.scope == "targeted"
    assert via_echo is None
|
|
|
|
|
|
def test_uv_run_pytest_matches_detected_pytest(tmp_path, monkeypatch):
    """``uv run pytest <file>`` canonicalises to ``pytest`` as a targeted run."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _python_project(tmp_path)

    result = classify_verification_command(
        "uv run pytest tests/test_calc.py",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
    )

    assert result is not None
    assert result.canonical_command == "pytest"
    assert result.scope == "targeted"
|
|
|
|
|
|
def test_status_is_unverified_without_evidence(tmp_path, monkeypatch):
    """With nothing recorded, a session's status is ``unverified``."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)

    current = verification_status(session_id="s1", cwd=tmp_path)
    assert current["status"] == "unverified"
|
|
|
|
|
|
def test_edit_without_prior_evidence_stays_unverified(tmp_path, monkeypatch):
    """An edit with no earlier evidence reports ``unverified``, not ``stale``.

    The edited path must still be listed under ``changed_paths``.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    edited_path = str(tmp_path / "src" / "app.ts")

    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[edited_path])

    current = verification_status(session_id="s1", cwd=tmp_path)
    assert current["status"] == "unverified"
    assert current["changed_paths"] == [edited_path]
|
|
|
|
|
|
def test_file_tool_stales_evidence_by_session_id_for_absolute_edit(tmp_path, monkeypatch):
    """A file-tool write stales evidence under ``session_id``, not ``task_id``.

    Evidence is recorded for session ``"conversation"``. The file is then
    written through ``write_file_tool`` with an absolute path,
    ``task_id="turn"`` and ``session_id="conversation"``. Only the
    ``"conversation"`` status may become ``stale``; ``"turn"`` must remain
    ``unverified``.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    target = tmp_path / "src" / "app.ts"
    target.parent.mkdir()

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="conversation",
        exit_code=0,
        output="green",
    )

    # Imported inside the test, after HERMES_HOME is patched, as in the original.
    from tools.file_tools import write_file_tool

    raw_result = write_file_tool(
        str(target),
        "export const ok = true\n",
        task_id="turn",
        session_id="conversation",
    )
    payload = json.loads(raw_result)

    assert payload["files_modified"] == [str(target.resolve())]

    conversation_status = verification_status(session_id="conversation", cwd=tmp_path)
    assert conversation_status["status"] == "stale"

    turn_status = verification_status(session_id="turn", cwd=tmp_path)
    assert turn_status["status"] == "unverified"
|
|
|
|
|
|
def test_recording_prunes_old_events_but_keeps_latest_state(tmp_path, monkeypatch):
    """Recording many runs keeps the event table bounded and the state current.

    After 120 passing runs in one session, the ``verification_events`` table
    must hold exactly 100 rows, the newest row must be the last run
    (``"green 119"``), and the session status must still be ``passed``.
    """
    home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(home))
    _node_project(tmp_path)

    for index in range(120):
        record_terminal_result(
            command="pnpm test",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output=f"green {index}",
        )

    # ``with sqlite3.connect(...)`` only scopes a transaction and never closes
    # the connection, so close it explicitly to avoid leaking the handle.
    conn = sqlite3.connect(home / "verification_evidence.db")
    try:
        event_count = conn.execute(
            "SELECT COUNT(*) FROM verification_events"
        ).fetchone()[0]
        latest_summary = conn.execute(
            """
            SELECT output_summary
            FROM verification_events
            ORDER BY id DESC
            LIMIT 1
            """
        ).fetchone()[0]
    finally:
        conn.close()

    assert event_count == 100
    assert latest_summary == "green 119"
    assert verification_status(session_id="s1", cwd=tmp_path)["status"] == "passed"
|
|
|
|
|
|
def test_recording_expires_old_current_evidence(tmp_path, monkeypatch):
    """Backdated evidence is dropped when a later recording happens.

    A pass is recorded for ``old-session`` and its event timestamps are
    rewritten to 31 days ago (presumably beyond the retention window defined
    in ``agent.verification_evidence`` -- confirm there). Recording for
    ``new-session`` must then leave ``old-session`` as ``unverified`` with no
    event rows, while ``new-session`` reports ``passed``.
    """
    home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(home))
    _node_project(tmp_path)
    db_path = home / "verification_evidence.db"

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="old-session",
        exit_code=0,
        output="old green",
    )

    backdated = (datetime.now(timezone.utc) - timedelta(days=31)).isoformat()
    # ``with sqlite3.connect(...)`` only scopes a transaction and never closes
    # the connection, so commit and close explicitly to avoid leaking it.
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE verification_events SET created_at = ?", (backdated,))
        conn.commit()
    finally:
        conn.close()

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="new-session",
        exit_code=0,
        output="new green",
    )

    assert verification_status(session_id="old-session", cwd=tmp_path)["status"] == "unverified"
    assert verification_status(session_id="new-session", cwd=tmp_path)["status"] == "passed"

    conn = sqlite3.connect(db_path)
    try:
        old_rows = conn.execute(
            "SELECT COUNT(*) FROM verification_events WHERE session_id = 'old-session'"
        ).fetchone()[0]
    finally:
        conn.close()

    assert old_rows == 0
|
|
|
|
|
|
def test_recording_expires_old_edit_only_state(tmp_path, monkeypatch):
    """Backdated edit-only state is dropped when a later recording happens.

    ``old-session`` only has an edit marker (no evidence). Its
    ``last_edit_at`` is rewritten to 31 days ago (presumably beyond the
    retention window defined in ``agent.verification_evidence`` -- confirm
    there). After a recording for ``new-session``, ``old-session`` must
    report ``unverified`` with an empty ``changed_paths`` list.
    """
    home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(home))
    _node_project(tmp_path)

    mark_workspace_edited(
        session_id="old-session",
        cwd=tmp_path,
        paths=[str(tmp_path / "src" / "app.ts")],
    )

    backdated = (datetime.now(timezone.utc) - timedelta(days=31)).isoformat()
    # ``with sqlite3.connect(...)`` only scopes a transaction and never closes
    # the connection, so commit and close explicitly to avoid leaking it.
    conn = sqlite3.connect(home / "verification_evidence.db")
    try:
        conn.execute("UPDATE verification_state SET last_edit_at = ?", (backdated,))
        conn.commit()
    finally:
        conn.close()

    record_terminal_result(
        command="pnpm test",
        cwd=tmp_path,
        session_id="new-session",
        exit_code=0,
        output="new green",
    )

    status = verification_status(session_id="old-session", cwd=tmp_path)
    assert status["status"] == "unverified"
    assert status["changed_paths"] == []
|