mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
feat(agent): per-turn file-mutation verifier footer (#24498)
Detect when write_file / patch calls fail during a turn and are never superseded by a successful write to the same path. When the final text response is delivered, append an advisory footer listing the files that did NOT change — so models that over-claim 'patched 5 files' after 4 silent failures can't hide the lie. Catches the failure mode reported in Ben Eng's llm-wiki session: grok-4.1-fast issued batches of parallel patches, half failed with 'Could not find old_string', and the agent summarised the turn claiming every file was edited. The user had to manually run 'git status' each turn to catch it. The verifier is a pure post-hoc check on tool results — no new LLM calls, no synthetic messages injected into history (prompt cache preserved), no changes to tool argument dispatch. Per-turn state is keyed by path; a later successful write to the same path clears the failure entry so single-file retry recovery is not flagged. Wired into both _execute_tool_calls_concurrent and _execute_tool_calls_sequential, so batched parallel patches and one-at-a-time edits are both covered. Footer emission happens after the agent loop exits, before transform_llm_output / post_llm_call plugin hooks run, so plugins still see (and can modify) the augmented text. Config: display.file_mutation_verifier (bool, default true) + HERMES_FILE_MUTATION_VERIFIER env override. 31 unit tests in tests/run_agent/test_file_mutation_verifier.py cover target extraction (write_file, patch-replace, patch-v4a single and multi-file), error-preview extraction (JSON .error field and plain string), per-turn state transitions (first-error-wins on repeated failure, success supersedes failure), footer rendering (truncation at 10 entries, user-actionable hint), and env/config precedence. Companion docs updated: user-guide/configuration.md + reference/environment-variables.md.
This commit is contained in:
parent
dd0923bb89
commit
c594a23047
5 changed files with 552 additions and 0 deletions
|
|
@ -917,6 +917,14 @@ DEFAULT_CONFIG = {
|
|||
"persistent_output": True,
|
||||
"persistent_output_max_lines": 200,
|
||||
"inline_diffs": True, # Show inline diff previews for write actions (write_file, patch, skill_manage)
|
||||
# File-mutation verifier footer. When true (default), the agent
|
||||
# appends a short advisory footer to its final response whenever a
|
||||
# write_file / patch call failed during the turn and was never
|
||||
# superseded by a successful write to the same path. This catches
|
||||
# the "batch of parallel patches, half fail, model claims success"
|
||||
# class of over-claim that otherwise forces users to run
|
||||
# `git status` to verify edits landed. Set false to suppress.
|
||||
"file_mutation_verifier": True,
|
||||
"show_cost": False, # Show $ cost in the status bar (off by default)
|
||||
"skin": "default",
|
||||
# UI language for static user-facing messages (approval prompts, a
|
||||
|
|
|
|||
219
run_agent.py
219
run_agent.py
|
|
@ -347,6 +347,10 @@ _PARALLEL_SAFE_TOOLS = frozenset({
|
|||
# Path-scoped file tools: safe to run in parallel as long as the calls
# target independent paths.
_PATH_SCOPED_TOOLS = frozenset(("read_file", "write_file", "patch"))

# Tools that change files on disk. The per-turn verifier watches these so
# silently-failed edits are surfaced and the model can't over-claim success.
_FILE_MUTATING_TOOLS = frozenset(("write_file", "patch"))

# Upper bound on worker threads used for parallel tool execution.
_MAX_TOOL_WORKERS = 8
|
||||
|
||||
|
|
@ -524,6 +528,68 @@ def _append_subdir_hint_to_multimodal(value: Dict[str, Any], hint: str) -> None:
|
|||
value["text_summary"] = value["text_summary"] + hint
|
||||
|
||||
|
||||
def _extract_file_mutation_targets(tool_name: str, args: Dict[str, Any]) -> List[str]:
    """List the file paths a file-mutating tool call is aimed at.

    ``write_file`` and ``patch`` in replace mode (also the mode assumed when
    ``mode`` is missing or empty) name a single file via ``args["path"]``.
    ``patch`` in V4A ``patch`` mode may span several files, so the patch
    body is scanned for ``*** Update File:`` / ``*** Add File:`` /
    ``*** Delete File:`` headers and each named path is returned in order
    of appearance.

    Returns an empty list for tools outside ``_FILE_MUTATING_TOOLS``, for
    calls that carry no usable path, and for unrecognised patch modes.
    """
    if tool_name not in _FILE_MUTATING_TOOLS:
        return []

    # ``write_file`` always addresses exactly one path; only ``patch``
    # carries a mode switch.
    if tool_name == "write_file":
        mode = "replace"
    else:
        mode = args.get("mode") or "replace"

    if mode == "replace":
        target = args.get("path")
        return [str(target)] if target else []

    if mode != "patch":
        return []

    body = args.get("patch") or ""
    if not (isinstance(body, str) and body):
        return []

    import re as _re

    header = _re.compile(
        r'^\*\*\*\s+(?:Update|Add|Delete)\s+File:\s*(.+)$',
        _re.MULTILINE,
    )
    candidates = (found.group(1).strip() for found in header.finditer(body))
    return [candidate for candidate in candidates if candidate]
|
||||
|
||||
|
||||
def _extract_error_preview(result: Any, max_len: int = 180) -> str:
    """Pull a one-line error summary out of a tool result for footer display.

    Args:
        result: Raw tool result. ``None`` yields an empty preview; anything
            else is first reduced to text via ``_multimodal_text_summary``.
        max_len: Maximum length of the returned preview, including the
            trailing ellipsis added on truncation. Non-positive values
            yield an empty string.

    Returns:
        The ``error`` field of a JSON-object result when it is a string,
        otherwise the raw text; whitespace is collapsed to single spaces
        and the result is truncated to ``max_len`` characters.
    """
    # A non-positive budget cannot hold even the ellipsis; without this
    # guard ``text[: max_len - 1]`` becomes a negative slice and the
    # "truncated" preview ends up longer than the requested limit.
    if max_len <= 0:
        return ""

    text = _multimodal_text_summary(result) if result is not None else ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            # Best-effort: an object with a broken __str__ gets no preview.
            return ""

    # Tool handlers return ``{"success": false, "error": "..."}``; prefer the
    # ``error`` field and fall back to the raw string if parsing fails.
    stripped = text.strip()
    if stripped.startswith("{"):
        import json as _json

        try:
            data = _json.loads(stripped)
        except (ValueError, RecursionError):
            data = None
        if isinstance(data, dict) and isinstance(data.get("error"), str):
            text = data["error"]

    # Collapse all whitespace runs (including newlines) so the preview fits
    # on one footer line, then trim to the budget.
    text = " ".join(text.split())
    if len(text) > max_len:
        text = text[: max_len - 1] + "…"
    return text
|
||||
|
||||
|
||||
def _trajectory_normalize_msg(msg: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Strip image blobs from a message for trajectory saving.
|
||||
|
||||
|
|
@ -5346,6 +5412,103 @@ class AIAgent:
|
|||
self._pending_steer = None
|
||||
return text
|
||||
|
||||
def _record_file_mutation_result(
    self,
    tool_name: str,
    args: Dict[str, Any],
    result: Any,
    is_error: bool,
) -> None:
    """Record a ``write_file`` / ``patch`` outcome for the turn-end verifier.

    On failure, store ``{path: {"tool", "error_preview"}}`` for every
    target path that is not already tracked (the first error for a path
    wins). On success, drop any tracked failure for the same paths, since
    the model recovered within the turn.

    Paths are matched after lexical normalisation (``os.path.normpath``),
    so ``docs/a.md``, ``./docs/a.md`` and ``docs//a.md`` count as the same
    file. The dict key stays the spelling first seen, which is what the
    footer shows. No filesystem access is performed.

    Silently no-ops when the tool is not file-mutating, when no target
    path can be extracted, or when ``self._turn_failed_file_mutations``
    has not been initialised.

    Args:
        tool_name: Name of the tool that was executed.
        args: Argument dict the tool was called with.
        result: Raw tool result, used only to build the error preview.
        is_error: Whether the call is considered to have failed.
    """
    if tool_name not in _FILE_MUTATING_TOOLS:
        return
    state = getattr(self, "_turn_failed_file_mutations", None)
    if state is None:
        return
    targets = _extract_file_mutation_targets(tool_name, args)
    if not targets:
        return

    import os as _os

    def _norm(path: Any) -> str:
        # Purely lexical: collapses "./", "//" and "a/../" without
        # resolving symlinks or touching the disk.
        return _os.path.normpath(str(path))

    if is_error:
        preview = _extract_error_preview(result)
        tracked = {_norm(existing) for existing in state}
        for path in targets:
            key = _norm(path)
            # Keep the FIRST error seen for a file unless a later success
            # clears it; a repeated failure must not overwrite the
            # original root cause.
            if key in tracked:
                continue
            tracked.add(key)
            state[path] = {
                "tool": tool_name,
                "error_preview": preview,
            }
    else:
        recovered = {_norm(path) for path in targets}
        # Collect keys first: the dict cannot be mutated while iterated.
        for existing in [k for k in state if _norm(k) in recovered]:
            del state[existing]
|
||||
|
||||
def _file_mutation_verifier_enabled(self) -> bool:
    """Check whether the per-turn file-mutation verifier footer is on.

    Precedence:

    1. ``HERMES_FILE_MUTATION_VERIFIER`` env var, when set to a non-blank
       value: ``0`` / ``false`` / ``no`` / ``off`` (case-insensitive)
       disable, anything else enables. A blank value is treated as unset
       so an empty export does not silently override the config file.
    2. ``display.file_mutation_verifier`` from the persisted config.
       String values are parsed with the same off-words as the env var,
       so a quoted ``"false"`` disables; other values use ``bool()``.
    3. Default: enabled.

    Exposed as a method so tests can patch a single seam without reaching
    into the private ``_turn_failed_file_mutations`` state dict.

    Returns:
        ``True`` when the footer should be appended, ``False`` otherwise.
    """
    import os as _os

    off_words = ("0", "false", "no", "off")

    env = _os.environ.get("HERMES_FILE_MUTATION_VERIFIER")
    if env is not None and env.strip():
        return env.strip().lower() not in off_words

    # Read from the persisted config so gateway and CLI share the same
    # setting. Import lazily to avoid a startup-time cycle. Any failure to
    # import or load falls back to the default: this is a display toggle
    # and must never break the turn.
    try:
        from hermes_cli.config import load_config as _load_config

        _cfg = _load_config() or {}
    except Exception:
        _cfg = {}

    _display = _cfg.get("display") if isinstance(_cfg, dict) else None
    if isinstance(_display, dict) and "file_mutation_verifier" in _display:
        value = _display["file_mutation_verifier"]
        if isinstance(value, str):
            token = value.strip().lower()
            # An empty string stays falsy, as it was under plain bool().
            return bool(token) and token not in off_words
        return bool(value)

    return True  # safe default: verifier on
|
||||
|
||||
@staticmethod
def _format_file_mutation_failure_footer(failed: Dict[str, Dict[str, Any]]) -> str:
    """Turn the per-turn failed-mutation dict into a user-facing footer.

    The footer is a header line with the total count, then one bullet per
    path (at most 10) showing the tool and its first error preview, then a
    final "and N more" bullet when entries were left out. An empty dict
    produces an empty string, so callers can concatenate unconditionally.
    """
    if not failed:
        return ""

    max_listed = 10
    header = (
        "⚠️ File-mutation verifier: "
        f"{len(failed)} file(s) were NOT modified this turn despite any "
        "wording above that may suggest otherwise. Run `git status` or "
        "`read_file` to confirm."
    )

    entries = list(failed.items())
    bullets = []
    for target, details in entries[:max_listed]:
        summary = (details.get("error_preview") or "").strip()
        origin = details.get("tool") or "patch"
        # With no preview available, fall back to a bare "failed".
        outcome = summary or "failed"
        bullets.append(f" • {target} — [{origin}] {outcome}")

    hidden = len(entries) - len(bullets)
    if hidden > 0:
        bullets.append(f" • … and {hidden} more")

    return "\n".join([header, *bullets])
|
||||
|
||||
def _apply_pending_steer_to_tool_results(self, messages: list, num_tool_msgs: int) -> None:
|
||||
"""Append any pending /steer text to the last tool result in this turn.
|
||||
|
||||
|
|
@ -10872,6 +11035,17 @@ class AIAgent:
|
|||
result_preview = _err_text[:200] if len(_err_text) > 200 else _err_text
|
||||
logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
|
||||
|
||||
# Track file-mutation outcome for the turn-end verifier.
|
||||
# `blocked` calls never actually ran — don't let a guardrail
|
||||
# block count as either a failure or a success.
|
||||
if not blocked:
|
||||
try:
|
||||
self._record_file_mutation_result(
|
||||
function_name, function_args, function_result, is_error,
|
||||
)
|
||||
except Exception as _ver_err:
|
||||
logging.debug("file-mutation verifier record failed: %s", _ver_err)
|
||||
|
||||
if not blocked and self.tool_progress_callback:
|
||||
try:
|
||||
self.tool_progress_callback(
|
||||
|
|
@ -11298,6 +11472,18 @@ class AIAgent:
|
|||
else:
|
||||
logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, _result_len)
|
||||
|
||||
# Track file-mutation outcome for the turn-end verifier. See
|
||||
# the concurrent path for the rationale; both paths must feed
|
||||
# the same state so the footer reflects every tool call in the
|
||||
# turn, not just the parallel ones.
|
||||
if not _execution_blocked:
|
||||
try:
|
||||
self._record_file_mutation_result(
|
||||
function_name, function_args, function_result, _is_error_result,
|
||||
)
|
||||
except Exception as _ver_err:
|
||||
logging.debug("file-mutation verifier record failed: %s", _ver_err)
|
||||
|
||||
if not _execution_blocked and self.tool_progress_callback:
|
||||
try:
|
||||
self.tool_progress_callback(
|
||||
|
|
@ -11995,6 +12181,14 @@ class AIAgent:
|
|||
truncated_response_prefix = ""
|
||||
compression_attempts = 0
|
||||
_turn_exit_reason = "unknown" # Diagnostic: why the loop ended
|
||||
|
||||
# Per-turn file-mutation verifier state. Keyed by the path string from the tool call (not resolved on disk);
|
||||
# each failed ``write_file`` / ``patch`` call records the error
|
||||
# preview. Later successful writes to the same path remove the
|
||||
# entry (the model recovered). At end-of-turn, any entries still
|
||||
# present are surfaced in an advisory footer so the model cannot
|
||||
# over-claim success while the file is actually unchanged on disk.
|
||||
self._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
# Record the execution thread so interrupt()/clear_interrupt() can
|
||||
# scope the tool-level interrupt signal to THIS agent's thread only.
|
||||
|
|
@ -15310,6 +15504,31 @@ class AIAgent:
|
|||
else:
|
||||
logger.info(_diag_msg, *_diag_args)
|
||||
|
||||
# File-mutation verifier footer.
|
||||
# If one or more ``write_file`` / ``patch`` calls failed during this
|
||||
# turn and were never superseded by a successful write to the same
|
||||
# path, append an advisory footer to the assistant response. This
|
||||
# catches the specific case — reported by Ben Eng (#15524-adjacent)
|
||||
# — where a model issues a batch of parallel patches, half of them
|
||||
# fail with "Could not find old_string", and the model summarises
|
||||
# the turn claiming every file was edited. The user then has to
|
||||
# manually run ``git status`` to catch the lie. With this footer
|
||||
# the truth is surfaced on every turn, so over-claiming is
|
||||
# structurally impossible past the model.
|
||||
#
|
||||
# Gate: only applied when a real text response exists for this
|
||||
# turn and the user didn't interrupt. Empty/interrupted turns
|
||||
# already have other surface text that shouldn't be augmented.
|
||||
if final_response and not interrupted:
|
||||
try:
|
||||
_failed = getattr(self, "_turn_failed_file_mutations", None) or {}
|
||||
if _failed and self._file_mutation_verifier_enabled():
|
||||
footer = self._format_file_mutation_failure_footer(_failed)
|
||||
if footer:
|
||||
final_response = final_response.rstrip() + "\n\n" + footer
|
||||
except Exception as _ver_err:
|
||||
logger.debug("file-mutation verifier footer failed: %s", _ver_err)
|
||||
|
||||
# Plugin hook: transform_llm_output
|
||||
# Fired once per turn after the tool-calling loop completes.
|
||||
# Plugins can transform the LLM's output text before it's returned.
|
||||
|
|
|
|||
308
tests/run_agent/test_file_mutation_verifier.py
Normal file
308
tests/run_agent/test_file_mutation_verifier.py
Normal file
|
|
@ -0,0 +1,308 @@
|
|||
"""Tests for the per-turn file-mutation verifier footer.
|
||||
|
||||
Covers the three moving pieces:
|
||||
|
||||
1. ``_extract_file_mutation_targets`` — pulls file paths from write_file /
|
||||
patch (replace + V4A) tool-call argument dicts.
|
||||
2. ``AIAgent._record_file_mutation_result`` — builds the per-turn state
|
||||
dict, removing entries when a later success supersedes an earlier
|
||||
failure for the same path.
|
||||
3. ``AIAgent._format_file_mutation_failure_footer`` — renders the dict
|
||||
as a user-visible advisory.
|
||||
|
||||
Regression target: the "Ben Eng llm-wiki" session where grok-4.1-fast
|
||||
batched parallel patches, half failed, and the model summarised the
|
||||
turn claiming every file was edited. This verifier makes over-claiming
|
||||
structurally impossible past the model: the user always sees the real
|
||||
list of files that did NOT change.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from run_agent import (
|
||||
AIAgent,
|
||||
_FILE_MUTATING_TOOLS,
|
||||
_extract_error_preview,
|
||||
_extract_file_mutation_targets,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _extract_file_mutation_targets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExtractFileMutationTargets:
    """Path extraction from ``write_file`` / ``patch`` argument dicts."""

    def test_non_mutating_tool_returns_empty(self):
        # Tools outside the file-mutating set never yield targets.
        for tool, call_args in (
            ("read_file", {"path": "/x"}),
            ("terminal", {"command": "ls"}),
        ):
            assert _extract_file_mutation_targets(tool, call_args) == []

    def test_write_file_returns_single_path(self):
        call_args = {"path": "/tmp/a.md", "content": "x"}
        assert _extract_file_mutation_targets("write_file", call_args) == ["/tmp/a.md"]

    def test_write_file_missing_path_returns_empty(self):
        targets = _extract_file_mutation_targets("write_file", {"content": "x"})
        assert targets == []

    def test_patch_replace_mode_returns_path(self):
        call_args = {
            "mode": "replace",
            "path": "/tmp/a.md",
            "old_string": "x",
            "new_string": "y",
        }
        targets = _extract_file_mutation_targets("patch", call_args)
        assert targets == ["/tmp/a.md"]

    def test_patch_default_mode_is_replace(self):
        # No explicit mode: the schema default ``replace`` applies.
        call_args = {"path": "/tmp/a.md", "old_string": "x", "new_string": "y"}
        targets = _extract_file_mutation_targets("patch", call_args)
        assert targets == ["/tmp/a.md"]

    def test_patch_v4a_single_file(self):
        patch_lines = [
            "*** Begin Patch",
            "*** Update File: /tmp/a.md",
            "@@ ctx @@",
            " line1",
            "-bad",
            "+good",
            "*** End Patch",
        ]
        patch_body = "\n".join(patch_lines) + "\n"
        targets = _extract_file_mutation_targets(
            "patch", {"mode": "patch", "patch": patch_body},
        )
        assert targets == ["/tmp/a.md"]

    def test_patch_v4a_multi_file(self):
        patch_lines = [
            "*** Begin Patch",
            "*** Update File: /tmp/a.md",
            "@@ @@",
            "-a",
            "+b",
            "*** Add File: /tmp/new.md",
            "+fresh",
            "*** Delete File: /tmp/old.md",
            "*** End Patch",
        ]
        patch_body = "\n".join(patch_lines) + "\n"
        targets = _extract_file_mutation_targets(
            "patch", {"mode": "patch", "patch": patch_body},
        )
        # Order follows the headers' order of appearance in the patch.
        assert targets == ["/tmp/a.md", "/tmp/new.md", "/tmp/old.md"]

    def test_patch_v4a_missing_body_returns_empty(self):
        for call_args in ({"mode": "patch"}, {"mode": "patch", "patch": ""}):
            assert _extract_file_mutation_targets("patch", call_args) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _extract_error_preview
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExtractErrorPreview:
    """One-line error summaries derived from raw tool results."""

    def test_json_error_field_preferred(self):
        message = "Could not find old_string in /tmp/x"
        payload = json.dumps({"success": False, "error": message})
        assert _extract_error_preview(payload) == message

    def test_plain_string_falls_through(self):
        plain = "Error executing tool: boom"
        assert _extract_error_preview(plain) == plain

    def test_long_preview_truncated(self):
        preview = _extract_error_preview("x" * 500, max_len=50)
        assert len(preview) <= 50
        assert preview.endswith("…")

    def test_none_returns_empty(self):
        preview = _extract_error_preview(None)
        assert preview == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _record_file_mutation_result — state transitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _bare_agent() -> AIAgent:
    """Build an ``AIAgent`` without running ``__init__``.

    ``AIAgent.__init__`` takes ~60 parameters and touches network, auth,
    and the filesystem, none of which these tests need. They only exercise
    ``_record_file_mutation_result`` and
    ``_format_file_mutation_failure_footer``, so the instance is created
    with ``object.__new__`` (the gateway-test pattern from the agent
    pitfalls list) and given just an empty per-turn state dict.
    """
    stub = object.__new__(AIAgent)
    stub._turn_failed_file_mutations = {}
    return stub
|
||||
|
||||
|
||||
class TestRecordFileMutationResult:
    """State transitions of the per-turn failed-mutation dict."""

    def test_non_mutating_tool_ignored(self):
        agent = _bare_agent()
        agent._record_file_mutation_result(
            "read_file", {"path": "/tmp/x"}, "{}", is_error=True,
        )
        assert agent._turn_failed_file_mutations == {}

    def test_failure_recorded(self):
        agent = _bare_agent()
        call_args = {
            "mode": "replace",
            "path": "/tmp/a.md",
            "old_string": "x",
            "new_string": "y",
        }
        payload = json.dumps({"success": False, "error": "Could not find old_string"})
        agent._record_file_mutation_result("patch", call_args, payload, is_error=True)

        tracked = agent._turn_failed_file_mutations
        assert "/tmp/a.md" in tracked
        entry = tracked["/tmp/a.md"]
        assert entry["tool"] == "patch"
        assert "Could not find old_string" in entry["error_preview"]

    def test_success_removes_prior_failure(self):
        agent = _bare_agent()

        # Attempt 1: the patch fails and is tracked.
        failing_args = {
            "mode": "replace",
            "path": "/tmp/a.md",
            "old_string": "x",
            "new_string": "y",
        }
        agent._record_file_mutation_result(
            "patch", failing_args, json.dumps({"error": "not found"}), is_error=True,
        )
        assert "/tmp/a.md" in agent._turn_failed_file_mutations

        # Attempt 2: corrected old_string succeeds and clears the entry.
        passing_args = {
            "mode": "replace",
            "path": "/tmp/a.md",
            "old_string": "real",
            "new_string": "fixed",
        }
        agent._record_file_mutation_result(
            "patch",
            passing_args,
            json.dumps({"success": True, "diff": "..."}),
            is_error=False,
        )
        assert agent._turn_failed_file_mutations == {}

    def test_repeated_failure_keeps_first_error(self):
        agent = _bare_agent()
        attempts = (("v1", "first error"), ("v2", "second error"))
        for old_string, message in attempts:
            agent._record_file_mutation_result(
                "patch",
                {
                    "mode": "replace",
                    "path": "/tmp/a.md",
                    "old_string": old_string,
                    "new_string": "y",
                },
                json.dumps({"error": message}),
                is_error=True,
            )
        # The original error must survive: replacing it with the latest
        # one would hide the initial root cause.
        kept = agent._turn_failed_file_mutations["/tmp/a.md"]["error_preview"]
        assert "first error" in kept

    def test_v4a_multi_file_all_tracked(self):
        agent = _bare_agent()
        patch_body = "".join(
            [
                "*** Begin Patch\n",
                "*** Update File: /tmp/a.md\n@@ @@\n-a\n+b\n",
                "*** Update File: /tmp/b.md\n@@ @@\n-a\n+b\n",
                "*** End Patch\n",
            ]
        )
        agent._record_file_mutation_result(
            "patch",
            {"mode": "patch", "patch": patch_body},
            json.dumps({"error": "parse failure"}),
            is_error=True,
        )
        assert set(agent._turn_failed_file_mutations) == {"/tmp/a.md", "/tmp/b.md"}

    def test_no_state_dict_silent_noop(self):
        """Recording without an initialised state dict must not raise.

        Outside ``run_conversation`` the per-turn dict is absent. A tool
        dispatched from, say, a direct ``chat()`` call should not blow up
        the call site just because the verifier state was never set up.
        """
        stateless = object.__new__(AIAgent)  # deliberately no state attached
        # Passing means the call returns without raising.
        stateless._record_file_mutation_result(
            "patch",
            {"mode": "replace", "path": "/tmp/a.md"},
            json.dumps({"error": "x"}),
            is_error=True,
        )

    def test_missing_path_arg_recorded_nowhere(self):
        agent = _bare_agent()
        pathless_args = {"mode": "replace"}  # no path
        agent._record_file_mutation_result(
            "patch", pathless_args, json.dumps({"error": "path required"}), is_error=True,
        )
        # With no path there is nothing to key on. The per-turn state is
        # about file paths, not individual tool-call IDs.
        assert agent._turn_failed_file_mutations == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _format_file_mutation_failure_footer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFormatFooter:
    """Rendering of the failed-mutation dict as an advisory footer."""

    def test_empty_returns_empty_string(self):
        footer = AIAgent._format_file_mutation_failure_footer({})
        assert footer == ""

    def test_single_failure(self):
        failures = {
            "/tmp/a.md": {"tool": "patch", "error_preview": "Could not find old_string"},
        }
        footer = AIAgent._format_file_mutation_failure_footer(failures)
        for fragment in (
            "1 file(s) were NOT modified",
            "/tmp/a.md",
            "Could not find old_string",
            "git status",  # user-actionable hint
        ):
            assert fragment in footer

    def test_truncation_at_10_entries(self):
        failures = {}
        for index in range(15):
            failures[f"/tmp/f{index}.md"] = {"tool": "patch", "error_preview": "err"}

        footer = AIAgent._format_file_mutation_failure_footer(failures)
        assert "15 file(s) were NOT modified" in footer
        assert "… and 5 more" in footer

        # Header + ten file bullets + one "and X more" bullet.
        bullets = [
            row for row in footer.split("\n") if row.lstrip().startswith("•")
        ]
        assert len(bullets) == 11  # 10 shown + 1 summary
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _file_mutation_verifier_enabled — env + config precedence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestVerifierEnabled:
    """Env-var and config precedence for the verifier toggle."""

    def test_default_is_enabled(self, monkeypatch):
        monkeypatch.delenv("HERMES_FILE_MUTATION_VERIFIER", raising=False)
        agent = _bare_agent()
        # No env and no config means the safe default, True. Stub
        # load_config because some environments would otherwise surface a
        # real user config.yaml.
        import hermes_cli.config as _cfg_mod

        def _empty_config():
            return {}

        monkeypatch.setattr(_cfg_mod, "load_config", _empty_config)
        assert agent._file_mutation_verifier_enabled() is True

    @pytest.mark.parametrize("value", ["0", "false", "FALSE", "no", "off"])
    def test_env_disables(self, monkeypatch, value):
        monkeypatch.setenv("HERMES_FILE_MUTATION_VERIFIER", value)
        enabled = _bare_agent()._file_mutation_verifier_enabled()
        assert enabled is False

    def test_env_enables_over_config(self, monkeypatch):
        monkeypatch.setenv("HERMES_FILE_MUTATION_VERIFIER", "1")
        import hermes_cli.config as _cfg_mod

        disabled_config = {"display": {"file_mutation_verifier": False}}
        monkeypatch.setattr(_cfg_mod, "load_config", lambda: disabled_config)
        enabled = _bare_agent()._file_mutation_verifier_enabled()
        assert enabled is True

    def test_config_disables_when_no_env(self, monkeypatch):
        monkeypatch.delenv("HERMES_FILE_MUTATION_VERIFIER", raising=False)
        import hermes_cli.config as _cfg_mod

        disabled_config = {"display": {"file_mutation_verifier": False}}
        monkeypatch.setattr(_cfg_mod, "load_config", lambda: disabled_config)
        enabled = _bare_agent()._file_mutation_verifier_enabled()
        assert enabled is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level invariants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_file_mutating_tools_set_shape():
    """The verifier tracks exactly ``write_file`` and ``patch``.

    Guard rail: anyone adding a third file-mutating tool (e.g. a new
    ``append_file``) should also audit whether the verifier needs to track
    it. This test fails loudly on unilateral additions.
    """
    expected = frozenset(("write_file", "patch"))
    assert _FILE_MUTATING_TOOLS == expected
|
||||
|
|
@ -490,6 +490,7 @@ Advanced per-platform knobs for throttling the outbound message batcher. Most us
|
|||
| `HERMES_GATEWAY_PLATFORM_CONNECT_TIMEOUT` | Per-platform connect timeout during gateway startup (seconds). |
|
||||
| `HERMES_GATEWAY_BUSY_INPUT_MODE` | Default gateway busy-input behavior: `queue`, `steer`, or `interrupt`. Can be overridden per chat with `/busy`. |
|
||||
| `HERMES_GATEWAY_BUSY_ACK_ENABLED` | Whether the gateway sends an acknowledgment message (⚡/⏳/⏩) when a user sends input while the agent is busy (default: `true`). Set to `false` to suppress these messages entirely — the input is still queued/steered/interrupts as normal, only the chat reply is silenced. Bridged from `display.busy_ack_enabled` in `config.yaml`. |
|
||||
| `HERMES_FILE_MUTATION_VERIFIER` | Enable the per-turn file-mutation verifier footer (default: `true`). When enabled, Hermes appends an advisory listing any `write_file` / `patch` calls that failed during the turn and were not superseded by a successful write. Set to `0`, `false`, `no`, or `off` to suppress. Mirrors `display.file_mutation_verifier` in `config.yaml`; the env var wins when set. |
|
||||
| `HERMES_CRON_TIMEOUT` | Inactivity timeout for cron job agent runs in seconds (default: `600`). The agent can run indefinitely while actively calling tools or receiving stream tokens — this only triggers when idle. Set to `0` for unlimited. |
|
||||
| `HERMES_CRON_SCRIPT_TIMEOUT` | Timeout for pre-run scripts attached to cron jobs in seconds (default: `120`). Override for scripts that need longer execution (e.g., randomized delays for anti-bot timing). Also configurable via `cron.script_timeout_seconds` in `config.yaml`. |
|
||||
| `HERMES_CRON_MAX_PARALLEL` | Max cron jobs run in parallel per tick (default: `4`). |
|
||||
|
|
|
|||
|
|
@ -1204,9 +1204,25 @@ display:
|
|||
runtime_footer: # Gateway: append a runtime-context footer to final replies
|
||||
enabled: false
|
||||
fields: ["model", "context_pct", "cwd"]
|
||||
file_mutation_verifier: true # Append an advisory footer when write_file/patch calls failed this turn
|
||||
language: en # UI language for static messages (approval prompts, some gateway replies). en | zh | ja | de | es | fr | tr | uk
|
||||
```
|
||||
|
||||
### File-mutation verifier
|
||||
|
||||
When `display.file_mutation_verifier` is `true` (default), Hermes appends an advisory footer to the assistant's final response whenever a `write_file` or `patch` call failed during the turn and was never superseded by a successful write to the same path. This catches the "batch of parallel patches, half silently fail, model summarises success" class of over-claim without requiring you to manually run `git status` after every edit.
|
||||
|
||||
Example footer:
|
||||
|
||||
```
|
||||
⚠️ File-mutation verifier: 3 file(s) were NOT modified this turn despite any wording above that may suggest otherwise. Run `git status` or `read_file` to confirm.
|
||||
• concepts/automatic-organization.md — [patch] Could not find match for old_string
|
||||
• concepts/lora.md — [patch] Could not find match for old_string
|
||||
• concepts/rag-pipeline.md — [patch] Could not find match for old_string
|
||||
```
|
||||
|
||||
Set `file_mutation_verifier: false` (or `HERMES_FILE_MUTATION_VERIFIER=0`) to suppress the footer. The verifier only fires when real failures are outstanding at turn end — a model that retries a failed patch and succeeds within the same turn will not trigger it for that file.
|
||||
|
||||
### UI language for static messages
|
||||
|
||||
The `display.language` setting translates a small set of static user-facing messages — the CLI approval prompt, a handful of gateway slash-command replies (e.g. restart-drain notices, "approval expired", "goal cleared"). It does **not** translate agent responses, log lines, tool output, error tracebacks, or slash-command descriptions — those stay in English. If you want the agent itself to reply in another language, just tell it in your prompt or system message.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue