Merge pull request #52297 from NousResearch/bb/ad-hoc-verify

Support ad-hoc verification scripts
This commit is contained in:
brooklyn! 2026-06-24 23:10:15 -05:00 committed by GitHub
commit 380d660cab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 200 additions and 12 deletions

View file

@ -11,6 +11,7 @@ import json
import re
import shlex
import sqlite3
import tempfile
import threading
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
@ -25,6 +26,7 @@ _MAX_OUTPUT_SUMMARY_CHARS = 2000
_MAX_EVIDENCE_AGE_DAYS = 30
_MAX_EVENTS_PER_SESSION_ROOT = 100
_MAX_TOTAL_UNREFERENCED_EVENTS = 10_000
_AD_HOC_SCRIPT_NAME_PREFIXES = ("hermes-verify-", "hermes-ad-hoc-")
_VERIFY_SCHEMA_VERSION = 1
_SHELL_SPLIT_RE = re.compile(r"\s*(?:&&|\|\||;)\s*")
@ -240,6 +242,69 @@ def _scope_for_args(args: list[str]) -> str:
return "targeted" if any(_looks_like_target(arg) for arg in args) else "full"
def _is_under_temp_dir(token: str) -> bool:
if not token or token.startswith("-"):
return False
try:
path = Path(token).expanduser()
if not path.is_absolute():
return False
resolved = path.resolve()
temp_root = Path(tempfile.gettempdir()).resolve()
return resolved == temp_root or temp_root in resolved.parents
except Exception:
return False
def _is_under_root(token: str, root: str | Path | None) -> bool:
if not root:
return False
try:
path = Path(token).expanduser().resolve()
root_path = Path(root).expanduser().resolve()
return path == root_path or root_path in path.parents
except Exception:
return False
def _is_temp_script_path(token: str, root: str | Path | None) -> bool:
    """Return True when *token* names an ad-hoc verification script.

    A token qualifies only if all of the following hold, checked in order:
    its file name starts with one of ``_AD_HOC_SCRIPT_NAME_PREFIXES``, it
    lies under the temp directory, and it does not lie under *root*.
    """
    try:
        file_name = Path(token).expanduser().name
    except Exception:
        return False
    if not file_name.startswith(_AD_HOC_SCRIPT_NAME_PREFIXES):
        return False
    if not _is_under_temp_dir(token):
        return False
    # A script inside the project root is not treated as a throwaway check.
    return not _is_under_root(token, root)
def _ad_hoc_script_args(tokens: list[str], root: str | Path | None) -> Optional[list[str]]:
    """Return the arguments following an ad-hoc temp script, or None.

    After ``_strip_command_prefix`` is applied, two shapes are recognised:
    the temp script invoked directly as the command, or one of a fixed set
    of interpreters followed by option-like tokens and then the temp script.
    The returned list holds the tokens after the script path. ``None`` means
    the tokens do not run an ad-hoc temp script.
    """
    argv = _strip_command_prefix(tokens)
    if not argv:
        return None

    executable = argv[0]
    if _is_temp_script_path(executable, root):
        return argv[1:]
    if executable not in {"python", "python3", "node", "bash", "sh", "ruby", "perl"}:
        return None

    # ``after`` is the index of the token following the one being examined.
    for after, word in enumerate(argv[1:], start=2):
        if word == "--":
            continue
        if _is_temp_script_path(word, root):
            return argv[after:]
        if not word.startswith("-"):
            # First non-option token is something other than a temp script.
            return None
    return None
def _find_ad_hoc_match(command: str, root: str | Path | None) -> Optional[list[str]]:
    """Return the trailing args of the first ad-hoc script segment in *command*.

    Each token list produced by ``_split_segment_tokens`` is checked in order
    with ``_ad_hoc_script_args``; the first non-None result is returned, and
    ``None`` is returned when no segment matches.
    """
    per_segment = (
        _ad_hoc_script_args(segment_tokens, root)
        for segment_tokens in _split_segment_tokens(command)
    )
    # Lazy generator: later segments are not examined once one matches.
    return next((found for found in per_segment if found is not None), None)
def _summarize_output(output: str) -> str:
text = (output or "").strip()
if len(text) <= _MAX_OUTPUT_SUMMARY_CHARS:
@ -338,6 +403,12 @@ def classify_verification_command(
verify_commands = list(facts.get("verifyCommands") or [])
match = _find_canonical_match(command, verify_commands)
is_ad_hoc = False
if match is None and not verify_commands:
ad_hoc_args = _find_ad_hoc_match(command, facts.get("root"))
if ad_hoc_args is not None:
match = ("ad-hoc verification script", ad_hoc_args)
is_ad_hoc = True
if match is None:
return None
@ -345,8 +416,8 @@ def classify_verification_command(
return VerificationEvidence(
command=command,
canonical_command=canonical,
kind=_kind_for_command(canonical),
scope=_scope_for_args(trailing_args),
kind="ad_hoc" if is_ad_hoc else _kind_for_command(canonical),
scope="targeted" if is_ad_hoc else _scope_for_args(trailing_args),
status="passed" if int(exit_code) == 0 else "failed",
exit_code=int(exit_code),
cwd=str(Path(cwd or ".").resolve()),

View file

@ -8,6 +8,7 @@ finish immediately after editing code without fresh evidence.
from __future__ import annotations
import os
import tempfile
from pathlib import Path
from typing import Any, Iterable
@ -127,26 +128,36 @@ def build_verify_on_stop_nudge(
for cmd in (facts.get("verifyCommands") or [])
if str(cmd).strip()
]
if not verify_commands:
return None
state = str(status.get("status") or "unverified")
if state == "passed":
return None
command_hint = ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
if len(verify_commands) > 3:
command_hint += ", ..."
if verify_commands:
command_instruction = (
"Run the relevant verification command now ("
+ ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
+ (", ..." if len(verify_commands) > 3 else "")
+ "), read any failure, repair the code, and summarize what passed."
)
else:
temp_dir = tempfile.gettempdir()
command_instruction = (
"No canonical test/lint/build command was detected. Create a focused "
f"temporary verification script under `{temp_dir}` using an OS-safe "
"`tempfile` path with a `hermes-verify-` filename prefix, run it "
"against the changed behavior, clean it up when possible, and "
"summarize it explicitly as ad-hoc verification rather than suite "
"green."
)
return (
"[System: You edited code in this turn, but the workspace does not have "
"fresh passing verification evidence yet.\n\n"
f"Verification status: {_status_detail(status)}\n\n"
f"Changed paths:\n{_format_changed_paths(paths)}\n\n"
f"Run the relevant verification command now ({command_hint}), read any "
"failure, repair the code, and summarize what passed. If verification "
"is not possible, explain the concrete blocker instead of claiming the "
"work is fully verified.]"
f"{command_instruction} If verification is not possible, explain the "
"concrete blocker instead of claiming the work is fully verified.]"
)

View file

@ -1,5 +1,6 @@
import json
import sqlite3
import tempfile
from datetime import datetime, timedelta, timezone
from pathlib import Path
@ -169,6 +170,85 @@ def test_uv_run_pytest_matches_detected_pytest(tmp_path, monkeypatch):
assert evidence.scope == "targeted"
def test_temp_script_records_ad_hoc_evidence_without_canonical_suite(tmp_path, monkeypatch):
    """A prefixed script in the temp dir is classified as passing ad-hoc evidence."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    # NOTE(review): an empty package.json presumably yields no detected
    # verify commands — confirm against the project detector.
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")

    script_path = Path(tempfile.gettempdir(), f"hermes-ad-hoc-{tmp_path.name}.py")
    script_path.write_text("print('ok')\n", encoding="utf-8")
    try:
        recorded = classify_verification_command(
            f"python {script_path}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        # The script lives in the shared temp dir, so always remove it.
        script_path.unlink(missing_ok=True)

    assert recorded is not None
    assert recorded.canonical_command == "ad-hoc verification script"
    assert recorded.kind == "ad_hoc"
    assert recorded.scope == "targeted"
    assert recorded.status == "passed"
def test_unprefixed_temp_script_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
    """A temp-dir script without a recognised name prefix yields no evidence."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")

    # Same location as the accepted case, but the name lacks the hermes- prefix.
    script_path = Path(tempfile.gettempdir(), f"random-check-{tmp_path.name}.py")
    script_path.write_text("print('ok')\n", encoding="utf-8")
    try:
        recorded = classify_verification_command(
            f"python {script_path}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        script_path.unlink(missing_ok=True)

    assert recorded is None
def test_temp_script_does_not_replace_detected_suite(tmp_path, monkeypatch):
    """A prefixed temp script is ignored in a project set up by ``_node_project``."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    # NOTE(review): _node_project presumably creates a project with detectable
    # verify commands — confirm against the helper's definition.
    _node_project(tmp_path)

    script_path = Path(tempfile.gettempdir(), f"hermes-ad-hoc-{tmp_path.name}.py")
    script_path.write_text("print('ok')\n", encoding="utf-8")
    try:
        recorded = classify_verification_command(
            f"python {script_path}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        script_path.unlink(missing_ok=True)

    assert recorded is None
def test_non_temp_script_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
    """A script stored inside the project directory yields no evidence."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")

    scripts_dir = tmp_path / "scripts"
    scripts_dir.mkdir()
    script_path = scripts_dir / "repro.py"
    script_path.write_text("print('ok')\n", encoding="utf-8")

    recorded = classify_verification_command(
        f"python {script_path}",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
        output="ok",
    )

    assert recorded is None
def test_status_is_unverified_without_evidence(tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
_node_project(tmp_path)

View file

@ -1,4 +1,5 @@
import json
import tempfile
from pathlib import Path
from agent.verification_evidence import (
@ -117,11 +118,36 @@ def test_nudge_includes_failed_output_summary(tmp_path, monkeypatch):
assert "repair the code" in nudge
def test_no_nudge_without_canonical_verify_command(tmp_path, monkeypatch):
def test_no_suite_nudge_requests_temp_script(tmp_path, monkeypatch):
    """Without evidence, the nudge names the temp dir and asks for ad-hoc verification."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")
    edited_file = str(tmp_path / "src" / "app.ts")

    message = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])

    assert message is not None
    # The nudge must point at the temp dir and label the check as ad-hoc.
    for expected_fragment in (tempfile.gettempdir(), "ad-hoc verification", "suite green"):
        assert expected_fragment in message
def test_ad_hoc_pass_satisfies_no_suite_stop_loop(tmp_path, monkeypatch):
    """Recording a passing ad-hoc temp script suppresses the stop nudge."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")
    edited_file = str(tmp_path / "src" / "app.ts")

    script_path = Path(tempfile.gettempdir(), f"hermes-ad-hoc-stop-{tmp_path.name}.py")
    script_path.write_text("print('ok')\n", encoding="utf-8")
    try:
        record_terminal_result(
            command=f"python {script_path}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        # The script lives in the shared temp dir, so always remove it.
        script_path.unlink(missing_ok=True)

    remaining_nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])
    assert remaining_nudge is None