mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
Merge pull request #52297 from NousResearch/bb/ad-hoc-verify
Support ad-hoc verification scripts
This commit is contained in:
commit
380d660cab
4 changed files with 200 additions and 12 deletions
|
|
@ -11,6 +11,7 @@ import json
|
|||
import re
|
||||
import shlex
|
||||
import sqlite3
|
||||
import tempfile
|
||||
import threading
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
|
@ -25,6 +26,7 @@ _MAX_OUTPUT_SUMMARY_CHARS = 2000
|
|||
_MAX_EVIDENCE_AGE_DAYS = 30
|
||||
_MAX_EVENTS_PER_SESSION_ROOT = 100
|
||||
_MAX_TOTAL_UNREFERENCED_EVENTS = 10_000
|
||||
_AD_HOC_SCRIPT_NAME_PREFIXES = ("hermes-verify-", "hermes-ad-hoc-")
|
||||
_VERIFY_SCHEMA_VERSION = 1
|
||||
_SHELL_SPLIT_RE = re.compile(r"\s*(?:&&|\|\||;)\s*")
|
||||
|
||||
|
|
@ -240,6 +242,69 @@ def _scope_for_args(args: list[str]) -> str:
|
|||
return "targeted" if any(_looks_like_target(arg) for arg in args) else "full"
|
||||
|
||||
|
||||
def _is_under_temp_dir(token: str) -> bool:
|
||||
if not token or token.startswith("-"):
|
||||
return False
|
||||
try:
|
||||
path = Path(token).expanduser()
|
||||
if not path.is_absolute():
|
||||
return False
|
||||
resolved = path.resolve()
|
||||
temp_root = Path(tempfile.gettempdir()).resolve()
|
||||
return resolved == temp_root or temp_root in resolved.parents
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _is_under_root(token: str, root: str | Path | None) -> bool:
|
||||
if not root:
|
||||
return False
|
||||
try:
|
||||
path = Path(token).expanduser().resolve()
|
||||
root_path = Path(root).expanduser().resolve()
|
||||
return path == root_path or root_path in path.parents
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _is_temp_script_path(token: str, root: str | Path | None) -> bool:
    """Return True when *token* names an ad-hoc verification script.

    Three conditions must all hold: the file name starts with one of
    ``_AD_HOC_SCRIPT_NAME_PREFIXES``, the path lies under the system temp
    directory, and the path does not lie under *root*.
    """
    try:
        basename = Path(token).expanduser().name
    except Exception:
        # A token that cannot be read as a path cannot be a script path.
        return False
    if not basename.startswith(_AD_HOC_SCRIPT_NAME_PREFIXES):
        return False
    if not _is_under_temp_dir(token):
        return False
    return not _is_under_root(token, root)
|
||||
|
||||
|
||||
def _ad_hoc_script_args(tokens: list[str], root: str | Path | None) -> Optional[list[str]]:
    """Return the arguments following an ad-hoc temp script, else ``None``.

    *tokens* is first passed through ``_strip_command_prefix``. Two invocation
    shapes are then recognised:

    * the script is the command itself, or
    * the command is one of a fixed set of interpreters and the script is the
      first token after it that is not option-like; ``--`` separators are
      skipped.

    ``None`` means the tokens do not run an ad-hoc temp script.
    """
    stripped = _strip_command_prefix(tokens)
    if not stripped:
        return None

    head, rest = stripped[0], stripped[1:]
    if _is_temp_script_path(head, root):
        return rest
    if head not in {"python", "python3", "node", "bash", "sh", "ruby", "perl"}:
        return None

    for position, arg in enumerate(rest):
        if arg == "--":
            continue
        if _is_temp_script_path(arg, root):
            return rest[position + 1:]
        if not arg.startswith("-"):
            # First non-option token is some other program or script: no match.
            break
    return None
|
||||
|
||||
|
||||
def _find_ad_hoc_match(command: str, root: str | Path | None) -> Optional[list[str]]:
    """Return the trailing args of the first ad-hoc script segment in *command*.

    Each token list produced by ``_split_segment_tokens(command)`` is checked
    in order with ``_ad_hoc_script_args``; the first non-``None`` result wins.
    ``None`` is returned when no segment matches.
    """
    per_segment = (
        _ad_hoc_script_args(segment_tokens, root)
        for segment_tokens in _split_segment_tokens(command)
    )
    # Lazy evaluation: segments after the first match are never inspected.
    return next((args for args in per_segment if args is not None), None)
|
||||
|
||||
|
||||
def _summarize_output(output: str) -> str:
|
||||
text = (output or "").strip()
|
||||
if len(text) <= _MAX_OUTPUT_SUMMARY_CHARS:
|
||||
|
|
@ -338,6 +403,12 @@ def classify_verification_command(
|
|||
|
||||
verify_commands = list(facts.get("verifyCommands") or [])
|
||||
match = _find_canonical_match(command, verify_commands)
|
||||
is_ad_hoc = False
|
||||
if match is None and not verify_commands:
|
||||
ad_hoc_args = _find_ad_hoc_match(command, facts.get("root"))
|
||||
if ad_hoc_args is not None:
|
||||
match = ("ad-hoc verification script", ad_hoc_args)
|
||||
is_ad_hoc = True
|
||||
if match is None:
|
||||
return None
|
||||
|
||||
|
|
@ -345,8 +416,8 @@ def classify_verification_command(
|
|||
return VerificationEvidence(
|
||||
command=command,
|
||||
canonical_command=canonical,
|
||||
kind=_kind_for_command(canonical),
|
||||
scope=_scope_for_args(trailing_args),
|
||||
kind="ad_hoc" if is_ad_hoc else _kind_for_command(canonical),
|
||||
scope="targeted" if is_ad_hoc else _scope_for_args(trailing_args),
|
||||
status="passed" if int(exit_code) == 0 else "failed",
|
||||
exit_code=int(exit_code),
|
||||
cwd=str(Path(cwd or ".").resolve()),
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ finish immediately after editing code without fresh evidence.
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
|
|
@ -127,26 +128,36 @@ def build_verify_on_stop_nudge(
|
|||
for cmd in (facts.get("verifyCommands") or [])
|
||||
if str(cmd).strip()
|
||||
]
|
||||
if not verify_commands:
|
||||
return None
|
||||
|
||||
state = str(status.get("status") or "unverified")
|
||||
if state == "passed":
|
||||
return None
|
||||
|
||||
command_hint = ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
|
||||
if len(verify_commands) > 3:
|
||||
command_hint += ", ..."
|
||||
if verify_commands:
|
||||
command_instruction = (
|
||||
"Run the relevant verification command now ("
|
||||
+ ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
|
||||
+ (", ..." if len(verify_commands) > 3 else "")
|
||||
+ "), read any failure, repair the code, and summarize what passed."
|
||||
)
|
||||
else:
|
||||
temp_dir = tempfile.gettempdir()
|
||||
command_instruction = (
|
||||
"No canonical test/lint/build command was detected. Create a focused "
|
||||
f"temporary verification script under `{temp_dir}` using an OS-safe "
|
||||
"`tempfile` path with a `hermes-verify-` filename prefix, run it "
|
||||
"against the changed behavior, clean it up when possible, and "
|
||||
"summarize it explicitly as ad-hoc verification rather than suite "
|
||||
"green."
|
||||
)
|
||||
|
||||
return (
|
||||
"[System: You edited code in this turn, but the workspace does not have "
|
||||
"fresh passing verification evidence yet.\n\n"
|
||||
f"Verification status: {_status_detail(status)}\n\n"
|
||||
f"Changed paths:\n{_format_changed_paths(paths)}\n\n"
|
||||
f"Run the relevant verification command now ({command_hint}), read any "
|
||||
"failure, repair the code, and summarize what passed. If verification "
|
||||
"is not possible, explain the concrete blocker instead of claiming the "
|
||||
"work is fully verified.]"
|
||||
f"{command_instruction} If verification is not possible, explain the "
|
||||
"concrete blocker instead of claiming the work is fully verified.]"
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import json
|
||||
import sqlite3
|
||||
import tempfile
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -169,6 +170,85 @@ def test_uv_run_pytest_matches_detected_pytest(tmp_path, monkeypatch):
|
|||
assert evidence.scope == "targeted"
|
||||
|
||||
|
||||
def test_temp_script_records_ad_hoc_evidence_without_canonical_suite(tmp_path, monkeypatch):
    """A prefixed temp-dir script yields passed ad-hoc evidence when no suite exists."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    manifest = tmp_path / "package.json"
    manifest.write_text("{}", encoding="utf-8")

    # The script lives directly in the system temp dir, outside the project.
    temp_script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-{tmp_path.name}.py"
    temp_script.write_text("print('ok')\n", encoding="utf-8")
    try:
        result = classify_verification_command(
            f"python {temp_script}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        temp_script.unlink(missing_ok=True)

    assert result is not None
    observed = (result.canonical_command, result.kind, result.scope, result.status)
    assert observed == ("ad-hoc verification script", "ad_hoc", "targeted", "passed")
|
||||
|
||||
|
||||
def test_unprefixed_temp_script_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
    """A temp-dir script without a recognised name prefix produces no evidence."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    manifest = tmp_path / "package.json"
    manifest.write_text("{}", encoding="utf-8")

    temp_script = Path(tempfile.gettempdir()) / f"random-check-{tmp_path.name}.py"
    temp_script.write_text("print('ok')\n", encoding="utf-8")
    try:
        result = classify_verification_command(
            f"python {temp_script}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        temp_script.unlink(missing_ok=True)

    assert result is None
|
||||
|
||||
|
||||
def test_temp_script_does_not_replace_detected_suite(tmp_path, monkeypatch):
    """With a project set up by ``_node_project``, a temp script yields no evidence."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    _node_project(tmp_path)

    temp_script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-{tmp_path.name}.py"
    temp_script.write_text("print('ok')\n", encoding="utf-8")
    try:
        result = classify_verification_command(
            f"python {temp_script}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        temp_script.unlink(missing_ok=True)

    assert result is None
|
||||
|
||||
|
||||
def test_non_temp_script_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
    """A script stored inside the project tree is not classified as ad-hoc evidence."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    manifest = tmp_path / "package.json"
    manifest.write_text("{}", encoding="utf-8")

    scripts_dir = tmp_path / "scripts"
    scripts_dir.mkdir()
    project_script = scripts_dir / "repro.py"
    project_script.write_text("print('ok')\n", encoding="utf-8")

    result = classify_verification_command(
        f"python {project_script}",
        cwd=tmp_path,
        session_id="s1",
        exit_code=0,
        output="ok",
    )

    assert result is None
|
||||
|
||||
|
||||
def test_status_is_unverified_without_evidence(tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
|
||||
_node_project(tmp_path)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from agent.verification_evidence import (
|
||||
|
|
@ -117,11 +118,36 @@ def test_nudge_includes_failed_output_summary(tmp_path, monkeypatch):
|
|||
assert "repair the code" in nudge
|
||||
|
||||
|
||||
def test_no_nudge_without_canonical_verify_command(tmp_path, monkeypatch):
|
||||
def test_no_suite_nudge_requests_temp_script(tmp_path, monkeypatch):
    """Without a canonical command the nudge asks for a temp-dir ad-hoc script."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    manifest = tmp_path / "package.json"
    manifest.write_text("{}", encoding="utf-8")
    edited_file = str(tmp_path / "src" / "app.ts")

    message = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])

    assert message is not None
    # The nudge must name the temp dir and frame the run as ad-hoc, not suite green.
    for fragment in (tempfile.gettempdir(), "ad-hoc verification", "suite green"):
        assert fragment in message
|
||||
|
||||
|
||||
def test_ad_hoc_pass_satisfies_no_suite_stop_loop(tmp_path, monkeypatch):
    """After a passing ad-hoc script run is recorded, no stop nudge is built."""
    hermes_home = tmp_path / ".hermes"
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    manifest = tmp_path / "package.json"
    manifest.write_text("{}", encoding="utf-8")
    edited_file = str(tmp_path / "src" / "app.ts")

    temp_script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-stop-{tmp_path.name}.py"
    temp_script.write_text("print('ok')\n", encoding="utf-8")
    try:
        record_terminal_result(
            command=f"python {temp_script}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        temp_script.unlink(missing_ok=True)

    message = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])
    assert message is None
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue