From 8fa44b17247efa8cae6b0f155e036e1bdf4d7da8 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:42:44 -0700 Subject: [PATCH] fix(guardrails): preserve display _detect_tool_failure semantics The initial guardrail PR consolidated failure classification by pointing display._detect_tool_failure at the new classify_tool_failure helper, which was strictly broader: it flagged any JSON result with "success": false / "failed": true / non-empty "error", plus plain-text "traceback" and "error:" prefixes. That would uptick the user-visible [error] tag on tools that return {"success": false} as a benign signal (memory fullness, todo state, etc.) and feed the failure-streak counter at the same time. Restore display._detect_tool_failure to its pre-PR semantics verbatim. Tighten classify_tool_failure (the guardrail's internal safety-fallback used only when callers don't pass failed=) to match _detect_tool_failure exactly, so the two never disagree. Production callers in run_agent.py already pass an explicit failed= derived from _detect_tool_failure, so the guardrail counter is driven by the same signal the CLI shows. --- agent/display.py | 26 ++++++++++++++++++++++++-- agent/tool_guardrails.py | 32 +++++++++++++------------------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/agent/display.py b/agent/display.py index 43b35ed301..474595d76c 100644 --- a/agent/display.py +++ b/agent/display.py @@ -14,7 +14,6 @@ from difflib import unified_diff from pathlib import Path from utils import safe_json_loads -from agent.tool_guardrails import classify_tool_failure # ANSI escape codes for coloring tool failure indicators _RED = "\033[31m" @@ -809,7 +808,30 @@ def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str] like ``" [exit 1]"`` for terminal failures, or ``" [error]"`` for generic failures. On success, returns ``(False, "")``. """ - return classify_tool_failure(tool_name, result) + if result is None: + return False, "" + + if tool_name == "terminal": + data = safe_json_loads(result) + if isinstance(data, dict): + exit_code = data.get("exit_code") + if exit_code is not None and exit_code != 0: + return True, f" [exit {exit_code}]" + return False, "" + + # Memory-specific: distinguish "full" from real errors + if tool_name == "memory": + data = safe_json_loads(result) + if isinstance(data, dict): + if data.get("success") is False and "exceed the limit" in data.get("error", ""): + return True, " [full]" + + # Generic heuristic for non-terminal tools + lower = result[:500].lower() + if '"error"' in lower or '"failed"' in lower or result.startswith("Error"): + return True, " [error]" + + return False, "" def get_cute_tool_message( diff --git a/agent/tool_guardrails.py b/agent/tool_guardrails.py index b3237bc390..3c85d78209 100644 --- a/agent/tool_guardrails.py +++ b/agent/tool_guardrails.py @@ -186,7 +186,14 @@ def canonical_tool_args(args: Mapping[str, Any]) -> str: def classify_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]: - """Classify a tool result using shared display/runtime semantics.""" + """Safety-fallback classifier used only when callers don't pass ``failed``. + + Mirrors ``agent.display._detect_tool_failure`` exactly so the guardrail + never disagrees with the CLI's user-visible ``[error]`` tag. Production + callers in ``run_agent.py`` always pass an explicit ``failed=`` derived + from ``_detect_tool_failure``; this function exists so standalone callers + (tests, tooling) still get consistent behavior. + """ if result is None: return False, "" @@ -196,31 +203,18 @@ def classify_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str exit_code = data.get("exit_code") if exit_code is not None and exit_code != 0: return True, f" [exit {exit_code}]" - if data.get("success") is False or data.get("failed") is True: - return True, " [error]" - error = data.get("error") - if error is not None and error != "": - return True, " [error]" return False, "" - data = safe_json_loads(result) - if isinstance(data, dict): - if tool_name == "memory": - error = data.get("error", "") - if data.get("success") is False and isinstance(error, str) and "exceed the limit" in error: + if tool_name == "memory": + data = safe_json_loads(result) + if isinstance(data, dict): + if data.get("success") is False and "exceed the limit" in data.get("error", ""): return True, " [full]" - if data.get("success") is False or data.get("failed") is True: - return True, " [error]" - error = data.get("error") - if error is not None and error != "": - return True, " [error]" - return False, "" lower = result[:500].lower() - if "traceback" in lower or lower.startswith("error:"): - return True, " [error]" if '"error"' in lower or '"failed"' in lower or result.startswith("Error"): return True, " [error]" + return False, ""