mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
The verify-on-stop guard fired too eagerly — including on doc/markdown/skill edits with nothing to verify, where it pushed a pointless /tmp verification script. Three changes: 1. Default OFF for new installs: agent.verify_on_stop defaults to false (was the "auto" surface-aware sentinel). _config_version bumped 30 -> 31. 2. One-time migration (v30 -> v31): existing installs are switched off once, but only when the value is missing or still the "auto" sentinel — an explicit true/false the user set is preserved. 3. Path filter: build_verify_on_stop_nudge() now drops documentation/prose paths (.md/.mdx/.rst/.txt/LICENSE/CHANGELOG/...) so even when explicitly enabled, a doc-only turn never nudges. Mixed doc+code turns still nudge on the code paths. The legacy "auto" sentinel is still honored when set explicitly (ON for interactive coding surfaces, OFF for messaging). HERMES_VERIFY_ON_STOP env override unchanged.
341 lines
12 KiB
Python
341 lines
12 KiB
Python
import json
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from agent.verification_evidence import (
|
|
mark_workspace_edited,
|
|
record_terminal_result,
|
|
)
|
|
from agent.verification_stop import (
|
|
build_verify_on_stop_nudge,
|
|
verify_on_stop_enabled,
|
|
)
|
|
|
|
|
|
def _node_project(root: Path) -> None:
    """Populate ``root`` with a minimal Node project fixture.

    Writes a ``package.json`` that declares ``test`` and ``lint`` scripts and
    an empty ``pnpm-lock.yaml`` next to it. ``root`` is written into directly,
    so it has to exist before this is called.
    """
    manifest = {"scripts": {"test": "vitest", "lint": "eslint ."}}
    root.joinpath("package.json").write_text(json.dumps(manifest), encoding="utf-8")
    root.joinpath("pnpm-lock.yaml").write_text("", encoding="utf-8")
|
|
|
|
|
|
def _make_project(root: Path) -> None:
    """Create the directory ``root`` and fill it via ``_node_project``.

    The directory must not exist yet and its parent must: ``mkdir`` is called
    with its default (strict) settings.
    """
    root.mkdir(parents=False, exist_ok=False)
    _node_project(root)
|
|
|
|
|
|
@pytest.fixture
def clear_verify_env(monkeypatch):
    """Remove every env signal that verify_on_stop_enabled consults.

    Each test then sets only the variable it exercises, mirroring how the
    CLI/TUI set HERMES_SESSION_SOURCE and the gateway sets
    HERMES_SESSION_PLATFORM. The ``monkeypatch`` fixture itself is returned so
    tests can call ``setenv`` on it directly.
    """
    consulted_vars = [
        "HERMES_VERIFY_ON_STOP",
        "HERMES_PLATFORM",
        "HERMES_SESSION_PLATFORM",
        "HERMES_SESSION_SOURCE",
    ]
    for name in consulted_vars:
        # raising=False: a variable that is already unset is not an error.
        monkeypatch.delenv(name, raising=False)
    return monkeypatch
|
|
|
|
|
|
def test_verify_on_stop_default_is_off(clear_verify_env):
    """No env and no explicit config value resolves OFF (new default as of v31)."""
    empty_agent_config = {"agent": {}}
    enabled = verify_on_stop_enabled(empty_agent_config)
    assert enabled is False
|
|
|
|
|
|
def test_verify_on_stop_missing_agent_section_is_off(clear_verify_env):
    """A config with no ``agent`` section at all also resolves OFF."""
    enabled = verify_on_stop_enabled({})
    assert enabled is False
|
|
|
|
|
|
def test_verify_on_stop_auto_sentinel_resolves_to_surface_default(clear_verify_env):
    """The legacy "auto" sentinel is still honored when set explicitly.

    It falls through to the surface-aware default: ON with no messaging
    platform set, OFF once HERMES_SESSION_PLATFORM names a messaging surface.
    """
    without_platform = verify_on_stop_enabled({"agent": {"verify_on_stop": "auto"}})
    assert without_platform is True

    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", "telegram")
    on_telegram = verify_on_stop_enabled({"agent": {"verify_on_stop": "auto"}})
    assert on_telegram is False
|
|
|
|
|
|
def test_verify_on_stop_env_can_disable(clear_verify_env):
    """HERMES_VERIFY_ON_STOP=0 switches the guard off even when config says True."""
    clear_verify_env.setenv("HERMES_VERIFY_ON_STOP", "0")
    enabled_in_config = {"agent": {"verify_on_stop": True}}
    assert verify_on_stop_enabled(enabled_in_config) is False
|
|
|
|
|
|
def test_verify_on_stop_env_can_enable(clear_verify_env):
    """HERMES_VERIFY_ON_STOP=1 wins over the default-off config."""
    clear_verify_env.setenv("HERMES_VERIFY_ON_STOP", "1")
    default_config = {"agent": {}}
    assert verify_on_stop_enabled(default_config) is True
|
|
|
|
|
|
def test_verify_on_stop_config_true_enables(clear_verify_env):
    """An explicit ``verify_on_stop: True`` in config turns the guard on."""
    opted_in = {"agent": {"verify_on_stop": True}}
    assert verify_on_stop_enabled(opted_in) is True
|
|
|
|
|
|
def test_verify_on_stop_config_can_disable(clear_verify_env):
    """An explicit ``verify_on_stop: False`` in config keeps the guard off."""
    opted_out = {"agent": {"verify_on_stop": False}}
    assert verify_on_stop_enabled(opted_out) is False
|
|
|
|
|
|
def test_verify_on_stop_auto_off_on_gateway_messaging_platform(clear_verify_env):
    """With explicit "auto", a real Telegram turn resolves OFF."""
    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", "telegram")
    auto_config = {"agent": {"verify_on_stop": "auto"}}
    assert verify_on_stop_enabled(auto_config) is False
|
|
|
|
|
|
@pytest.mark.parametrize(
    "platform",
    ["discord", "whatsapp_cloud", "signal", "slack", "matrix", "email", "sms"],
)
def test_verify_on_stop_auto_off_for_each_messaging_platform(clear_verify_env, platform):
    """Under "auto", each listed messaging platform resolves OFF."""
    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", platform)
    auto_config = {"agent": {"verify_on_stop": "auto"}}
    assert verify_on_stop_enabled(auto_config) is False
|
|
|
|
|
|
def test_verify_on_stop_auto_messaging_platform_is_case_insensitive(clear_verify_env):
    """A padded, mixed-case platform value still counts as a messaging surface."""
    padded_mixed_case = " Telegram "
    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", padded_mixed_case)
    auto_config = {"agent": {"verify_on_stop": "auto"}}
    assert verify_on_stop_enabled(auto_config) is False
|
|
|
|
|
|
def test_verify_on_stop_auto_uses_hermes_platform_override(clear_verify_env):
    """HERMES_PLATFORM alone also flags a messaging surface under "auto".

    HERMES_PLATFORM mirrors the sibling platform resolution, so setting only
    that variable must resolve OFF just like HERMES_SESSION_PLATFORM does.
    """
    clear_verify_env.setenv("HERMES_PLATFORM", "discord")
    auto_config = {"agent": {"verify_on_stop": "auto"}}
    assert verify_on_stop_enabled(auto_config) is False
|
|
|
|
|
|
@pytest.mark.parametrize("source", ["cli", "tui", "desktop", "codex", "local"])
def test_verify_on_stop_auto_on_for_interactive_surfaces(clear_verify_env, source):
    """Under "auto", CLI/TUI/desktop coding surfaces resolve ON."""
    clear_verify_env.setenv("HERMES_SESSION_SOURCE", source)
    auto_config = {"agent": {"verify_on_stop": "auto"}}
    assert verify_on_stop_enabled(auto_config) is True
|
|
|
|
|
|
@pytest.mark.parametrize("platform", ["api_server", "webhook", "msgraph_webhook"])
def test_verify_on_stop_auto_on_for_programmatic_surfaces(clear_verify_env, platform):
    """Under "auto", the listed programmatic platforms resolve ON."""
    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", platform)
    auto_config = {"agent": {"verify_on_stop": "auto"}}
    assert verify_on_stop_enabled(auto_config) is True
|
|
|
|
|
|
def test_default_off_overrides_interactive_surface(clear_verify_env):
    """The new default is OFF even on an interactive coding surface.

    Only an explicit "auto"/true turns it back on, so an empty ``agent``
    section on the CLI must still resolve OFF.
    """
    clear_verify_env.setenv("HERMES_SESSION_SOURCE", "cli")
    default_config = {"agent": {}}
    assert verify_on_stop_enabled(default_config) is False
|
|
|
|
|
|
def test_env_forces_verify_on_stop_on_for_messaging(clear_verify_env):
    """HERMES_VERIFY_ON_STOP=1 enables the guard even on a messaging platform."""
    env_overrides = (
        ("HERMES_SESSION_PLATFORM", "telegram"),
        ("HERMES_VERIFY_ON_STOP", "1"),
    )
    for name, value in env_overrides:
        clear_verify_env.setenv(name, value)
    assert verify_on_stop_enabled({"agent": {}}) is True
|
|
|
|
|
|
def test_config_forces_verify_on_stop_on_for_messaging(clear_verify_env):
    """An explicit config True enables the guard even on a messaging platform."""
    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", "telegram")
    opted_in = {"agent": {"verify_on_stop": True}}
    assert verify_on_stop_enabled(opted_in) is True
|
|
|
|
|
|
def test_verify_on_stop_default_path_through_load_config(tmp_path, clear_verify_env):
    """E2E: resolve the default through load_config() rather than a literal dict.

    The sole production caller passes no config, so verify_on_stop_enabled
    resolves through load_config() + DEFAULT_CONFIG. The default is now the
    boolean False, so even an interactive surface resolves OFF without an
    explicit opt-in. This is the path the unit-level tests cannot exercise.
    """
    hermes_home = tmp_path / ".hermes"
    # Point HERMES_HOME at a fresh location before the config module is
    # imported and the config is loaded.
    clear_verify_env.setenv("HERMES_HOME", str(hermes_home))

    from hermes_cli.config import load_config

    loaded = load_config()
    assert loaded["agent"]["verify_on_stop"] is False

    # Interactive surface still resolves OFF through the real loader.
    clear_verify_env.setenv("HERMES_SESSION_SOURCE", "cli")
    on_cli = verify_on_stop_enabled()
    assert on_cli is False

    # A messaging platform also resolves OFF.
    clear_verify_env.setenv("HERMES_SESSION_PLATFORM", "telegram")
    on_telegram = verify_on_stop_enabled()
    assert on_telegram is False
|
|
|
|
|
|
def test_no_nudge_after_fresh_pass(tmp_path, monkeypatch):
    """A recorded passing ``pnpm test`` run means no nudge is built."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    edited_file = str(tmp_path.joinpath("src", "app.ts"))

    passing_run = {
        "command": "pnpm test",
        "cwd": tmp_path,
        "session_id": "s1",
        "exit_code": 0,
        "output": "green",
    }
    record_terminal_result(**passing_run)

    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])
    assert nudge is None
|
|
|
|
|
|
def test_nudge_checks_all_edited_workspaces(tmp_path, monkeypatch):
    """A pass recorded in one workspace does not cover an edit in another.

    Project ``a`` gets a passing run; project ``b`` is only marked as edited.
    With changed paths from both, a nudge is still expected.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    verified_root = tmp_path / "a"
    unverified_root = tmp_path / "b"
    for root in (verified_root, unverified_root):
        _make_project(root)
    verified_file = str(verified_root / "src" / "app.ts")
    unverified_file = str(unverified_root / "src" / "app.ts")

    record_terminal_result(
        command="pnpm test",
        cwd=verified_root,
        session_id="s1",
        exit_code=0,
        output="green",
    )
    mark_workspace_edited(
        session_id="s1",
        cwd=unverified_root,
        paths=[unverified_file],
    )

    nudge = build_verify_on_stop_nudge(
        session_id="s1",
        changed_paths=[verified_file, unverified_file],
    )

    assert nudge is not None
    assert "fresh passing verification evidence" in nudge
|
|
|
|
|
|
def test_nudge_after_unverified_edit_with_known_command(tmp_path, monkeypatch):
    """An edited but unverified workspace yields a nudge naming command and file."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    edited_file = str(tmp_path.joinpath("src", "app.ts"))
    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[edited_file])

    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])

    assert nudge is not None
    expected_fragments = (
        "fresh passing verification evidence",
        "`pnpm run test`",
        edited_file,
    )
    for fragment in expected_fragments:
        assert fragment in nudge
|
|
|
|
|
|
def test_nudge_includes_failed_output_summary(tmp_path, monkeypatch):
    """After a failing run, the nudge reports the failure and its output."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    edited_file = str(tmp_path.joinpath("src", "app.ts"))

    failing_run = {
        "command": "pnpm test",
        "cwd": tmp_path,
        "session_id": "s1",
        "exit_code": 1,
        "output": "expected 1 got 2",
    }
    record_terminal_result(**failing_run)

    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])

    assert nudge is not None
    for fragment in ("failed", "expected 1 got 2", "repair the code"):
        assert fragment in nudge
|
|
|
|
|
|
def test_no_suite_nudge_requests_temp_script(tmp_path, monkeypatch):
    """With an empty ``package.json`` the nudge asks for an ad-hoc temp-dir check."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    # An empty manifest: no scripts are declared for this project.
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")
    edited_file = str(tmp_path.joinpath("src", "app.ts"))

    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])

    assert nudge is not None
    expected_fragments = (
        tempfile.gettempdir(),
        "ad-hoc verification",
        "suite green",
    )
    for fragment in expected_fragments:
        assert fragment in nudge
|
|
|
|
|
|
def test_ad_hoc_pass_satisfies_no_suite_stop_loop(tmp_path, monkeypatch):
    """A passing run of a temp-dir script suppresses the nudge.

    The script is created in the system temp directory, named after
    ``tmp_path`` so each test run uses its own file, and is removed again
    whether or not recording the result succeeds.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    tmp_path.joinpath("package.json").write_text("{}", encoding="utf-8")
    edited_file = str(tmp_path.joinpath("src", "app.ts"))
    ad_hoc_script = Path(tempfile.gettempdir()) / f"hermes-ad-hoc-stop-{tmp_path.name}.py"
    ad_hoc_script.write_text("print('ok')\n", encoding="utf-8")
    try:
        record_terminal_result(
            command=f"python {ad_hoc_script}",
            cwd=tmp_path,
            session_id="s1",
            exit_code=0,
            output="ok",
        )
    finally:
        ad_hoc_script.unlink(missing_ok=True)

    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[edited_file])
    assert nudge is None
|
|
|
|
|
|
def test_nudge_attempts_are_bounded(tmp_path, monkeypatch):
    """No nudge is built once ``attempts`` has reached ``max_attempts``."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    edited_file = str(tmp_path.joinpath("src", "app.ts"))
    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[edited_file])

    nudge = build_verify_on_stop_nudge(
        session_id="s1",
        changed_paths=[edited_file],
        attempts=2,
        max_attempts=2,
    )
    assert nudge is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix C: documentation/prose edits carry no verifiable behavior and must never
|
|
# trip the nudge, even on an unverified workspace.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize(
    "doc_name",
    [
        "SKILL.md",
        "README.md",
        "guide.markdown",
        "page.mdx",
        "manual.rst",
        "notes.txt",
        "data.csv",
        "LICENSE",
        "CHANGELOG",
    ],
)
def test_doc_only_edit_does_not_nudge(tmp_path, monkeypatch, doc_name):
    """A turn whose only edit is a documentation/prose file never nudges.

    The workspace is marked edited and has no passing run recorded, but the
    sole changed path is a doc, so there is nothing to verify.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    doc_path = str(tmp_path.joinpath(doc_name))
    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[doc_path])

    nudge = build_verify_on_stop_nudge(session_id="s1", changed_paths=[doc_path])
    assert nudge is None
|
|
|
|
|
|
def test_mixed_doc_and_code_edit_still_nudges(tmp_path, monkeypatch):
    """A turn touching both a doc and a code file nudges on the code path only."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
    _node_project(tmp_path)
    doc_path = str(tmp_path.joinpath("README.md"))
    code_path = str(tmp_path.joinpath("src", "app.ts"))
    mark_workspace_edited(session_id="s1", cwd=tmp_path, paths=[code_path])

    nudge = build_verify_on_stop_nudge(
        session_id="s1",
        changed_paths=[doc_path, code_path],
    )

    assert nudge is not None
    # The doc path is filtered out of the reported set; the code path remains.
    assert code_path in nudge
    assert doc_path not in nudge
|
|
|
|
|
|
def test_is_non_code_path_classification():
    """Pin ``_is_non_code_path`` results for representative doc and code paths."""
    from agent.verification_stop import _is_non_code_path

    expected_by_path = [
        ("docs/SKILL.md", True),
        # README has no extension and isn't in the prose-filename set.
        ("README", False),
        ("LICENSE", True),
        ("src/app.ts", False),
        ("config.yaml", False),
        ("run_agent.py", False),
    ]
    for path, expected in expected_by_path:
        assert _is_non_code_path(path) is expected
|