mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-26 11:12:03 +00:00
fix(security): add circuit breaker for tirith crashes to prevent agent hangs (#41400)
This commit is contained in:
parent
ca82d0accc
commit
07cc567dfa
2 changed files with 67 additions and 7 deletions
|
|
@ -17,17 +17,20 @@ from tools.tirith_security import check_command_security, ensure_installed
|
|||
@pytest.fixture(autouse=True)
|
||||
def _reset_resolved_path():
|
||||
"""Pre-set cached path to skip auto-install in scan tests.
|
||||
|
||||
Tests that specifically test ensure_installed / resolve behavior
|
||||
reset this to None themselves.
|
||||
"""
|
||||
_tirith_mod._resolved_path = "tirith"
|
||||
_tirith_mod._install_thread = None
|
||||
_tirith_mod._install_failure_reason = ""
|
||||
_tirith_mod._crash_count = 0
|
||||
_tirith_mod._circuit_open = False
|
||||
yield
|
||||
_tirith_mod._resolved_path = None
|
||||
_tirith_mod._install_thread = None
|
||||
_tirith_mod._install_failure_reason = ""
|
||||
_tirith_mod._crash_count = 0
|
||||
_tirith_mod._circuit_open = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -1216,12 +1219,17 @@ class TestSpawnWarningDedup:
|
|||
_tirith_mod._reset_spawn_warning_state()
|
||||
|
||||
with caplog.at_level("WARNING", logger="tools.tirith_security"):
|
||||
for _ in range(15):
|
||||
for i in range(15):
|
||||
result = check_command_security("echo hi")
|
||||
# Behavior must remain the same on every call —
|
||||
# fail-open allow, with the exception captured in summary.
|
||||
assert result["action"] == "allow"
|
||||
assert "unavailable" in result["summary"]
|
||||
if i < _tirith_mod._CRASH_LIMIT:
|
||||
# Before circuit breaker opens, summary has the exception
|
||||
assert "unavailable" in result["summary"]
|
||||
else:
|
||||
# After circuit breaker opens, summary is generic
|
||||
assert "circuit breaker" in result["summary"]
|
||||
|
||||
spawn_warnings = [
|
||||
rec for rec in caplog.records
|
||||
|
|
@ -1237,7 +1245,11 @@ class TestSpawnWarningDedup:
|
|||
def test_distinct_exception_types_each_log_once(self, mock_cfg, mock_run, caplog):
|
||||
"""``FileNotFoundError`` and ``PermissionError`` are distinct
|
||||
failure modes and each deserves its own first-occurrence log
|
||||
line; the dedupe key includes the exception class."""
|
||||
line; the dedupe key includes the exception class.
|
||||
|
||||
After _CRASH_LIMIT consecutive failures the circuit breaker opens
|
||||
and subsequent calls short-circuit without spawning, so we only
|
||||
see the warnings from the first batch."""
|
||||
mock_cfg.return_value = {
|
||||
"tirith_enabled": True, "tirith_path": "tirith",
|
||||
"tirith_timeout": 5, "tirith_fail_open": True,
|
||||
|
|
@ -1248,6 +1260,9 @@ class TestSpawnWarningDedup:
|
|||
mock_run.side_effect = FileNotFoundError("[WinError 2]")
|
||||
for _ in range(3):
|
||||
check_command_security("a")
|
||||
# Circuit breaker is now open — switching to PermissionError
|
||||
# won't generate a new warning because the function returns
|
||||
# before reaching subprocess.run.
|
||||
mock_run.side_effect = PermissionError("denied")
|
||||
for _ in range(3):
|
||||
check_command_security("b")
|
||||
|
|
@ -1256,8 +1271,8 @@ class TestSpawnWarningDedup:
|
|||
rec for rec in caplog.records
|
||||
if "tirith spawn failed" in rec.message
|
||||
]
|
||||
assert len(spawn_warnings) == 2, (
|
||||
f"expected 2 distinct first-occurrence warnings, "
|
||||
assert len(spawn_warnings) == 1, (
|
||||
f"expected 1 warning before circuit breaker opens, "
|
||||
f"got {len(spawn_warnings)}"
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -97,6 +97,35 @@ _resolved_path: str | None | bool = None
|
|||
_INSTALL_FAILED = False # sentinel: distinct from "not yet tried"
|
||||
_install_failure_reason: str = "" # reason tag when _resolved_path is _INSTALL_FAILED
|
||||
|
||||
# Circuit breaker: after _CRASH_LIMIT consecutive spawn/execution failures,
|
||||
# disable tirith for the rest of the process to prevent agent hangs (#41400).
|
||||
# Reset on successful execution (see _record_tirith_crash / check_command_security).
|
||||
#
|
||||
# Thread safety: _crash_count and _circuit_open are module-level globals
|
||||
# mutated without a lock. check_command_security can be called from
|
||||
# concurrent agent threads (gateway multi-session). The race is benign —
|
||||
# at worst two threads both increment past _CRASH_LIMIT and both set
|
||||
# _circuit_open = True, opening the breaker one call early. No data
|
||||
# corruption or security bypass is possible. This intentionally matches
|
||||
# the lock-free style of error counters in mcp_tool.py rather than the
|
||||
# locked _warn_once pattern, because the worst case is harmless.
|
||||
_CRASH_LIMIT = 3
|
||||
_crash_count: int = 0
|
||||
_circuit_open: bool = False
|
||||
|
||||
|
||||
def _record_tirith_crash() -> None:
|
||||
"""Increment the crash counter and open the circuit breaker if needed."""
|
||||
global _crash_count, _circuit_open
|
||||
_crash_count += 1
|
||||
if _crash_count >= _CRASH_LIMIT:
|
||||
_circuit_open = True
|
||||
logger.warning(
|
||||
"tirith circuit breaker opened after %d consecutive failures; "
|
||||
"disabling for the rest of the process",
|
||||
_crash_count,
|
||||
)
|
||||
|
||||
# Background install thread coordination
|
||||
_install_lock = threading.Lock()
|
||||
_install_thread: threading.Thread | None = None
|
||||
|
|
@ -708,11 +737,21 @@ def check_command_security(command: str) -> dict:
|
|||
Returns:
|
||||
{"action": "allow"|"warn"|"block", "findings": [...], "summary": str}
|
||||
"""
|
||||
global _crash_count, _circuit_open
|
||||
|
||||
cfg = _load_security_config()
|
||||
|
||||
if not cfg["tirith_enabled"]:
|
||||
return {"action": "allow", "findings": [], "summary": ""}
|
||||
|
||||
# Circuit breaker: if tirith has crashed _CRASH_LIMIT times in a row,
|
||||
# stop trying for the rest of the process. Without this, a corrupted
|
||||
# or missing binary causes every tool call to hit the same spawn failure
|
||||
# → fail-open → agent retry loop, hanging the user for 20+ minutes
|
||||
# (issue #41400).
|
||||
if _circuit_open:
|
||||
return {"action": "allow", "findings": [], "summary": "tirith disabled (circuit breaker)"}
|
||||
|
||||
# Unsupported platform (Windows etc.) — tirith has no binary here and
|
||||
# never will. Skip the resolver entirely so we don't even try to spawn.
|
||||
# Pattern-matching guards still run via the rest of approval.py.
|
||||
|
|
@ -750,6 +789,7 @@ def check_command_security(command: str) -> dict:
|
|||
# install marked failed for the day).
|
||||
spawn_key = f"tirith_spawn_failed:{type(exc).__name__}:{getattr(exc, 'errno', '')}"
|
||||
_warn_once(spawn_key, "tirith spawn failed: %s", exc)
|
||||
_record_tirith_crash()
|
||||
if fail_open:
|
||||
return {"action": "allow", "findings": [], "summary": f"tirith unavailable: {exc}"}
|
||||
return {"action": "block", "findings": [], "summary": f"tirith spawn failed (fail-closed): {exc}"}
|
||||
|
|
@ -759,6 +799,7 @@ def check_command_security(command: str) -> dict:
|
|||
"tirith timed out after %ds",
|
||||
timeout,
|
||||
)
|
||||
_record_tirith_crash()
|
||||
if fail_open:
|
||||
return {"action": "allow", "findings": [], "summary": f"tirith timed out ({timeout}s)"}
|
||||
return {"action": "block", "findings": [], "summary": "tirith timed out (fail-closed)"}
|
||||
|
|
@ -767,13 +808,17 @@ def check_command_security(command: str) -> dict:
|
|||
exit_code = result.returncode
|
||||
if exit_code == 0:
|
||||
action = "allow"
|
||||
# Successful execution — reset circuit breaker
|
||||
_crash_count = 0
|
||||
elif exit_code == 1:
|
||||
action = "block"
|
||||
elif exit_code == 2:
|
||||
action = "warn"
|
||||
else:
|
||||
# Unknown exit code — respect fail_open
|
||||
# Unknown exit code (includes signal-killed processes like -11/SIGSEGV)
|
||||
# — respect fail_open
|
||||
logger.warning("tirith returned unexpected exit code %d", exit_code)
|
||||
_record_tirith_crash()
|
||||
if fail_open:
|
||||
return {"action": "allow", "findings": [], "summary": f"tirith exit code {exit_code} (fail-open)"}
|
||||
return {"action": "block", "findings": [], "summary": f"tirith exit code {exit_code} (fail-closed)"}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue