diff --git a/agent/verification_evidence.py b/agent/verification_evidence.py
index f86b115c060..9849cdd73a9 100644
--- a/agent/verification_evidence.py
+++ b/agent/verification_evidence.py
@@ -11,6 +11,7 @@ import json
 import re
 import shlex
 import sqlite3
+import tempfile
 import threading
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
@@ -25,6 +26,15 @@ _MAX_OUTPUT_SUMMARY_CHARS = 2000
 _MAX_EVIDENCE_AGE_DAYS = 30
 _MAX_EVENTS_PER_SESSION_ROOT = 100
 _MAX_TOTAL_UNREFERENCED_EVENTS = 10_000
+_AD_HOC_SCRIPT_NAME_PREFIXES = ("hermes-verify-", "hermes-ad-hoc-")
+_AD_HOC_INTERPRETERS = frozenset(
+    {"python", "python3", "node", "bash", "sh", "ruby", "perl"}
+)
+# Flags that make at least one supported interpreter print help/version or
+# parse without running the script, so a zero exit code proves nothing.
+_NON_EXECUTING_INTERPRETER_FLAGS = frozenset(
+    {"-h", "--help", "-V", "--version", "-n", "--check"}
+)
 _VERIFY_SCHEMA_VERSION = 1
 
 _SHELL_SPLIT_RE = re.compile(r"\s*(?:&&|\|\||;)\s*")
@@ -240,6 +250,100 @@ def _scope_for_args(args: list[str]) -> str:
     return "targeted" if any(_looks_like_target(arg) for arg in args) else "full"
 
 
+def _is_under_temp_dir(token: str) -> bool:
+    """Return True if *token* is an absolute path at or below the OS temp dir.
+
+    Empty or option-like tokens, relative paths, and paths that cannot be
+    resolved all yield False.
+    """
+    if not token or token.startswith("-"):
+        return False
+    try:
+        path = Path(token).expanduser()
+        if not path.is_absolute():
+            return False
+        resolved = path.resolve()
+        temp_root = Path(tempfile.gettempdir()).resolve()
+        return resolved == temp_root or temp_root in resolved.parents
+    except Exception:
+        # Best effort: a path that cannot be resolved is simply not a match.
+        return False
+
+
+def _is_under_root(token: str, root: str | Path | None) -> bool:
+    """Return True if *token* resolves to *root* or to a path below it.
+
+    A falsy *root* or a path that cannot be resolved yields False.
+    """
+    if not root:
+        return False
+    try:
+        path = Path(token).expanduser().resolve()
+        root_path = Path(root).expanduser().resolve()
+        return path == root_path or root_path in path.parents
+    except Exception:
+        # Best effort: a path that cannot be resolved is simply not a match.
+        return False
+
+
+def _is_temp_script_path(token: str, root: str | Path | None) -> bool:
+    """Return True if *token* names an ad-hoc verification script.
+
+    The file name must start with one of ``_AD_HOC_SCRIPT_NAME_PREFIXES`` and
+    the path must be under the OS temp dir but not under *root*.
+    """
+    try:
+        name = Path(token).expanduser().name
+    except Exception:
+        return False
+    return (
+        name.startswith(_AD_HOC_SCRIPT_NAME_PREFIXES)
+        and _is_under_temp_dir(token)
+        and not _is_under_root(token, root)
+    )
+
+
+def _ad_hoc_script_args(tokens: list[str], root: str | Path | None) -> Optional[list[str]]:
+    """Return the args that follow an ad-hoc temp script in *tokens*, or None.
+
+    Matches a direct invocation of the script, or one through a command in
+    ``_AD_HOC_INTERPRETERS``. Interpreter options before the script are
+    skipped, but a flag in ``_NON_EXECUTING_INTERPRETER_FLAGS`` rejects the
+    match: the exit code would then say nothing about the script itself.
+    """
+    candidate_tokens = _strip_command_prefix(tokens)
+    if not candidate_tokens:
+        return None
+    command = candidate_tokens[0]
+    if _is_temp_script_path(command, root):
+        return candidate_tokens[1:]
+    if command not in _AD_HOC_INTERPRETERS:
+        return None
+    for idx, token in enumerate(candidate_tokens[1:], start=1):
+        if token == "--":
+            continue
+        if _is_temp_script_path(token, root):
+            return candidate_tokens[idx + 1:]
+        if not token.startswith("-") or token in _NON_EXECUTING_INTERPRETER_FLAGS:
+            # Either some other positional argument came first, or the
+            # interpreter was told not to run what follows.
+            return None
+    return None
+
+
+def _find_ad_hoc_match(command: str, root: str | Path | None) -> Optional[list[str]]:
+    """Return the trailing args of the first ad-hoc script run in *command*.
+
+    Each token list from ``_split_segment_tokens`` is checked in order; None
+    means no segment invokes an ad-hoc temp script.
+    """
+    for tokens in _split_segment_tokens(command):
+        trailing_args = _ad_hoc_script_args(tokens, root)
+        if trailing_args is not None:
+            return trailing_args
+    return None
+
+
 def _summarize_output(output: str) -> str:
     text = (output or "").strip()
     if len(text) <= _MAX_OUTPUT_SUMMARY_CHARS:
@@ -338,6 +442,12 @@ def classify_verification_command(
 
     verify_commands = list(facts.get("verifyCommands") or [])
     match = _find_canonical_match(command, verify_commands)
+    is_ad_hoc = False
+    if match is None and not verify_commands:
+        ad_hoc_args = _find_ad_hoc_match(command, facts.get("root"))
+        if ad_hoc_args is not None:
+            match = ("ad-hoc verification script", ad_hoc_args)
+            is_ad_hoc = True
     if match is None:
         return None
 
@@ -345,8 +455,8 @@ def classify_verification_command(
     return VerificationEvidence(
         command=command,
         canonical_command=canonical,
-        kind=_kind_for_command(canonical),
-        scope=_scope_for_args(trailing_args),
+        kind="ad_hoc" if is_ad_hoc else _kind_for_command(canonical),
+        scope="targeted" if is_ad_hoc else _scope_for_args(trailing_args),
         status="passed" if int(exit_code) == 0 else "failed",
         exit_code=int(exit_code),
         cwd=str(Path(cwd or ".").resolve()),
diff --git a/agent/verification_stop.py b/agent/verification_stop.py
index 80cb4aa9a3e..e19cb22bc4e 100644
--- a/agent/verification_stop.py
+++ b/agent/verification_stop.py
@@ -8,6 +8,7 @@ finish immediately after editing code without fresh evidence.
 from __future__ import annotations
 
 import os
+import tempfile
 from pathlib import Path
 from typing import Any, Iterable
 
@@ -127,26 +128,36 @@ def build_verify_on_stop_nudge(
         for cmd in (facts.get("verifyCommands") or [])
         if str(cmd).strip()
     ]
-    if not verify_commands:
-        return None
 
     state = str(status.get("status") or "unverified")
     if state == "passed":
         return None
 
-    command_hint = ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
-    if len(verify_commands) > 3:
-        command_hint += ", ..."
+    if verify_commands:
+        command_instruction = (
+            "Run the relevant verification command now ("
+            + ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
+            + (", ..." if len(verify_commands) > 3 else "")
+            + "), read any failure, repair the code, and summarize what passed."
+        )
+    else:
+        temp_dir = tempfile.gettempdir()
+        command_instruction = (
+            "No canonical test/lint/build command was detected. Create a focused "
+            f"temporary verification script under `{temp_dir}` using an OS-safe "
+            "`tempfile` path with a `hermes-verify-` filename prefix, run it "
+            "against the changed behavior, clean it up when possible, and "
+            "summarize it explicitly as ad-hoc verification rather than suite "
+            "green."
+        )
 
     return (
         "[System: You edited code in this turn, but the workspace does not have "
         "fresh passing verification evidence yet.\n\n"
         f"Verification status: {_status_detail(status)}\n\n"
         f"Changed paths:\n{_format_changed_paths(paths)}\n\n"
-        f"Run the relevant verification command now ({command_hint}), read any "
-        "failure, repair the code, and summarize what passed. If verification "
-        "is not possible, explain the concrete blocker instead of claiming the "
-        "work is fully verified.]"
+        f"{command_instruction} If verification is not possible, explain the "
+        "concrete blocker instead of claiming the work is fully verified.]"
     )
 
 
diff --git a/tests/agent/test_verification_evidence.py b/tests/agent/test_verification_evidence.py
index 809c0ccd017..5f957f54efb 100644
--- a/tests/agent/test_verification_evidence.py
+++ b/tests/agent/test_verification_evidence.py
@@ -1,5 +1,6 @@
 import json
 import sqlite3
+import tempfile
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 
@@ -169,6 +170,104 @@ def test_uv_run_pytest_matches_detected_pytest(tmp_path, monkeypatch):
     assert evidence.scope == "targeted"
 
 
+def test_temp_script_records_ad_hoc_evidence_without_canonical_suite(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-{tmp_path.name}.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    try:
+        evidence = classify_verification_command(
+            f"python {script}",
+            cwd=tmp_path,
+            session_id="s1",
+            exit_code=0,
+            output="ok",
+        )
+    finally:
+        script.unlink(missing_ok=True)
+
+    assert evidence is not None
+    assert evidence.canonical_command == "ad-hoc verification script"
+    assert evidence.kind == "ad_hoc"
+    assert evidence.scope == "targeted"
+    assert evidence.status == "passed"
+
+
+def test_unprefixed_temp_script_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    script = Path(tempfile.gettempdir()) / f"random-check-{tmp_path.name}.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    try:
+        evidence = classify_verification_command(
+            f"python {script}",
+            cwd=tmp_path,
+            session_id="s1",
+            exit_code=0,
+            output="ok",
+        )
+    finally:
+        script.unlink(missing_ok=True)
+
+    assert evidence is None
+
+
+def test_temp_script_does_not_replace_detected_suite(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+    _node_project(tmp_path)
+    script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-{tmp_path.name}.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    try:
+        evidence = classify_verification_command(
+            f"python {script}",
+            cwd=tmp_path,
+            session_id="s1",
+            exit_code=0,
+            output="ok",
+        )
+    finally:
+        script.unlink(missing_ok=True)
+
+    assert evidence is None
+
+
+def test_non_temp_script_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    script = tmp_path / "scripts" / "repro.py"
+    script.parent.mkdir()
+    script.write_text("print('ok')\n", encoding="utf-8")
+
+    evidence = classify_verification_command(
+        f"python {script}",
+        cwd=tmp_path,
+        session_id="s1",
+        exit_code=0,
+        output="ok",
+    )
+
+    assert evidence is None
+
+
+def test_non_executing_interpreter_flag_is_not_ad_hoc_evidence(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    script = Path(tempfile.gettempdir()) / f"hermes-verify-{tmp_path.name}.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    try:
+        evidence = classify_verification_command(
+            f"python --version {script}",
+            cwd=tmp_path,
+            session_id="s1",
+            exit_code=0,
+            output="Python 3.12.0",
+        )
+    finally:
+        script.unlink(missing_ok=True)
+
+    assert evidence is None
+
+
 def test_status_is_unverified_without_evidence(tmp_path, monkeypatch):
     monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
     _node_project(tmp_path)
diff --git a/tests/agent/test_verification_stop.py b/tests/agent/test_verification_stop.py
index a325a434eb1..600fa5bf714 100644
--- a/tests/agent/test_verification_stop.py
+++ b/tests/agent/test_verification_stop.py
@@ -1,4 +1,5 @@
 import json
+import tempfile
 from pathlib import Path
 
 from agent.verification_evidence import (
@@ -117,11 +118,36 @@ def test_nudge_includes_failed_output_summary(tmp_path, monkeypatch):
     assert "repair the code" in nudge
 
 
-def test_no_nudge_without_canonical_verify_command(tmp_path, monkeypatch):
+def test_no_suite_nudge_requests_temp_script(tmp_path, monkeypatch):
     monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
     (tmp_path / "package.json").write_text("{}", encoding="utf-8")
     changed = str(tmp_path / "src" / "app.ts")
 
+    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[changed])
+
+    assert nudge is not None
+    assert tempfile.gettempdir() in nudge
+    assert "ad-hoc verification" in nudge
+    assert "suite green" in nudge
+
+
+def test_ad_hoc_pass_satisfies_no_suite_stop_loop(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+    (tmp_path / "package.json").write_text("{}", encoding="utf-8")
+    changed = str(tmp_path / "src" / "app.ts")
+    script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-stop-{tmp_path.name}.py"
+    script.write_text("print('ok')\n", encoding="utf-8")
+    try:
+        record_terminal_result(
+            command=f"python {script}",
+            cwd=tmp_path,
+            session_id="s1",
+            exit_code=0,
+            output="ok",
+        )
+    finally:
+        script.unlink(missing_ok=True)
+
     assert build_verify_on_stop_nudge(session_id="s1", changed_paths=[changed]) is None
 
 