fix(security): add circuit breaker for tirith crashes to prevent agent hangs (#41400)

This commit is contained in:
kyssta-exe 2026-06-07 20:02:57 +00:00 committed by kshitij
parent ca82d0accc
commit 07cc567dfa
2 changed files with 67 additions and 7 deletions

View file

@ -17,17 +17,20 @@ from tools.tirith_security import check_command_security, ensure_installed
@pytest.fixture(autouse=True)
def _reset_resolved_path():
"""Pre-set cached path to skip auto-install in scan tests.
Tests that specifically test ensure_installed / resolve behavior
reset this to None themselves.
"""
_tirith_mod._resolved_path = "tirith"
_tirith_mod._install_thread = None
_tirith_mod._install_failure_reason = ""
_tirith_mod._crash_count = 0
_tirith_mod._circuit_open = False
yield
_tirith_mod._resolved_path = None
_tirith_mod._install_thread = None
_tirith_mod._install_failure_reason = ""
_tirith_mod._crash_count = 0
_tirith_mod._circuit_open = False
# ---------------------------------------------------------------------------
@ -1216,12 +1219,17 @@ class TestSpawnWarningDedup:
_tirith_mod._reset_spawn_warning_state()
with caplog.at_level("WARNING", logger="tools.tirith_security"):
for _ in range(15):
for i in range(15):
result = check_command_security("echo hi")
# Behavior must remain the same on every call —
# fail-open allow, with the exception captured in summary.
assert result["action"] == "allow"
assert "unavailable" in result["summary"]
if i < _tirith_mod._CRASH_LIMIT:
# Before circuit breaker opens, summary has the exception
assert "unavailable" in result["summary"]
else:
# After circuit breaker opens, summary is generic
assert "circuit breaker" in result["summary"]
spawn_warnings = [
rec for rec in caplog.records
@ -1237,7 +1245,11 @@ class TestSpawnWarningDedup:
def test_distinct_exception_types_each_log_once(self, mock_cfg, mock_run, caplog):
"""``FileNotFoundError`` and ``PermissionError`` are distinct
failure modes and each deserves its own first-occurrence log
line; the dedupe key includes the exception class."""
line; the dedupe key includes the exception class.
After _CRASH_LIMIT consecutive failures the circuit breaker opens
and subsequent calls short-circuit without spawning, so we only
see the warnings from the first batch."""
mock_cfg.return_value = {
"tirith_enabled": True, "tirith_path": "tirith",
"tirith_timeout": 5, "tirith_fail_open": True,
@ -1248,6 +1260,9 @@ class TestSpawnWarningDedup:
mock_run.side_effect = FileNotFoundError("[WinError 2]")
for _ in range(3):
check_command_security("a")
# Circuit breaker is now open — switching to PermissionError
# won't generate a new warning because the function returns
# before reaching subprocess.run.
mock_run.side_effect = PermissionError("denied")
for _ in range(3):
check_command_security("b")
@ -1256,8 +1271,8 @@ class TestSpawnWarningDedup:
rec for rec in caplog.records
if "tirith spawn failed" in rec.message
]
assert len(spawn_warnings) == 2, (
f"expected 2 distinct first-occurrence warnings, "
assert len(spawn_warnings) == 1, (
f"expected 1 warning before circuit breaker opens, "
f"got {len(spawn_warnings)}"
)

View file

@ -97,6 +97,35 @@ _resolved_path: str | None | bool = None
_INSTALL_FAILED = False # sentinel: distinct from "not yet tried"
_install_failure_reason: str = "" # reason tag when _resolved_path is _INSTALL_FAILED
# Circuit breaker: after _CRASH_LIMIT consecutive spawn/execution failures,
# disable tirith for the rest of the process to prevent agent hangs (#41400).
# Reset on successful execution (see _record_tirith_crash / check_command_security).
#
# Thread safety: _crash_count and _circuit_open are module-level globals
# mutated without a lock. check_command_security can be called from
# concurrent agent threads (gateway multi-session). The race is benign —
# at worst two threads both increment past _CRASH_LIMIT and both set
# _circuit_open = True, opening the breaker one call early. No data
# corruption or security bypass is possible. This intentionally matches
# the lock-free style of error counters in mcp_tool.py rather than the
# locked _warn_once pattern, because the worst case is harmless.
_CRASH_LIMIT = 3
_crash_count: int = 0
_circuit_open: bool = False
def _record_tirith_crash() -> None:
"""Increment the crash counter and open the circuit breaker if needed."""
global _crash_count, _circuit_open
_crash_count += 1
if _crash_count >= _CRASH_LIMIT:
_circuit_open = True
logger.warning(
"tirith circuit breaker opened after %d consecutive failures; "
"disabling for the rest of the process",
_crash_count,
)
# Background install thread coordination
_install_lock = threading.Lock()
_install_thread: threading.Thread | None = None
@ -708,11 +737,21 @@ def check_command_security(command: str) -> dict:
Returns:
{"action": "allow"|"warn"|"block", "findings": [...], "summary": str}
"""
global _crash_count, _circuit_open
cfg = _load_security_config()
if not cfg["tirith_enabled"]:
return {"action": "allow", "findings": [], "summary": ""}
# Circuit breaker: if tirith has crashed _CRASH_LIMIT times in a row,
# stop trying for the rest of the process. Without this, a corrupted
# or missing binary causes every tool call to hit the same spawn failure
# → fail-open → agent retry loop, hanging the user for 20+ minutes
# (issue #41400).
if _circuit_open:
return {"action": "allow", "findings": [], "summary": "tirith disabled (circuit breaker)"}
# Unsupported platform (Windows etc.) — tirith has no binary here and
# never will. Skip the resolver entirely so we don't even try to spawn.
# Pattern-matching guards still run via the rest of approval.py.
@ -750,6 +789,7 @@ def check_command_security(command: str) -> dict:
# install marked failed for the day).
spawn_key = f"tirith_spawn_failed:{type(exc).__name__}:{getattr(exc, 'errno', '')}"
_warn_once(spawn_key, "tirith spawn failed: %s", exc)
_record_tirith_crash()
if fail_open:
return {"action": "allow", "findings": [], "summary": f"tirith unavailable: {exc}"}
return {"action": "block", "findings": [], "summary": f"tirith spawn failed (fail-closed): {exc}"}
@ -759,6 +799,7 @@ def check_command_security(command: str) -> dict:
"tirith timed out after %ds",
timeout,
)
_record_tirith_crash()
if fail_open:
return {"action": "allow", "findings": [], "summary": f"tirith timed out ({timeout}s)"}
return {"action": "block", "findings": [], "summary": "tirith timed out (fail-closed)"}
@ -767,13 +808,17 @@ def check_command_security(command: str) -> dict:
exit_code = result.returncode
if exit_code == 0:
action = "allow"
# Successful execution — reset circuit breaker
_crash_count = 0
elif exit_code == 1:
action = "block"
elif exit_code == 2:
action = "warn"
else:
# Unknown exit code — respect fail_open
# Unknown exit code (includes signal-killed processes like -11/SIGSEGV)
# — respect fail_open
logger.warning("tirith returned unexpected exit code %d", exit_code)
_record_tirith_crash()
if fail_open:
return {"action": "allow", "findings": [], "summary": f"tirith exit code {exit_code} (fail-open)"}
return {"action": "block", "findings": [], "summary": f"tirith exit code {exit_code} (fail-closed)"}