diff --git a/tests/tools/test_tirith_security.py b/tests/tools/test_tirith_security.py index 34e4d8479e6..27202ca63eb 100644 --- a/tests/tools/test_tirith_security.py +++ b/tests/tools/test_tirith_security.py @@ -17,17 +17,20 @@ from tools.tirith_security import check_command_security, ensure_installed @pytest.fixture(autouse=True) def _reset_resolved_path(): """Pre-set cached path to skip auto-install in scan tests. - Tests that specifically test ensure_installed / resolve behavior reset this to None themselves. """ _tirith_mod._resolved_path = "tirith" _tirith_mod._install_thread = None _tirith_mod._install_failure_reason = "" + _tirith_mod._crash_count = 0 + _tirith_mod._circuit_open = False yield _tirith_mod._resolved_path = None _tirith_mod._install_thread = None _tirith_mod._install_failure_reason = "" + _tirith_mod._crash_count = 0 + _tirith_mod._circuit_open = False # --------------------------------------------------------------------------- @@ -1216,12 +1219,17 @@ class TestSpawnWarningDedup: _tirith_mod._reset_spawn_warning_state() with caplog.at_level("WARNING", logger="tools.tirith_security"): - for _ in range(15): + for i in range(15): result = check_command_security("echo hi") # Behavior must remain the same on every call — # fail-open allow, with the exception captured in summary. assert result["action"] == "allow" - assert "unavailable" in result["summary"] + if i < _tirith_mod._CRASH_LIMIT: + # Before circuit breaker opens, summary has the exception + assert "unavailable" in result["summary"] + else: + # After circuit breaker opens, summary is generic + assert "circuit breaker" in result["summary"] spawn_warnings = [ rec for rec in caplog.records @@ -1237,7 +1245,11 @@ class TestSpawnWarningDedup: def test_distinct_exception_types_each_log_once(self, mock_cfg, mock_run, caplog): """``FileNotFoundError`` and ``PermissionError`` are distinct failure modes and each deserves its own first-occurrence log - line; the dedupe key includes the exception class.""" + line; the dedupe key includes the exception class. + + After _CRASH_LIMIT consecutive failures the circuit breaker opens + and subsequent calls short-circuit without spawning, so we only + see the warnings from the first batch.""" mock_cfg.return_value = { "tirith_enabled": True, "tirith_path": "tirith", "tirith_timeout": 5, "tirith_fail_open": True, @@ -1248,6 +1260,9 @@ class TestSpawnWarningDedup: mock_run.side_effect = FileNotFoundError("[WinError 2]") for _ in range(3): check_command_security("a") + # Circuit breaker is now open — switching to PermissionError + # won't generate a new warning because the function returns + # before reaching subprocess.run. mock_run.side_effect = PermissionError("denied") for _ in range(3): check_command_security("b") @@ -1256,8 +1271,8 @@ class TestSpawnWarningDedup: rec for rec in caplog.records if "tirith spawn failed" in rec.message ] - assert len(spawn_warnings) == 2, ( - f"expected 2 distinct first-occurrence warnings, " + assert len(spawn_warnings) == 1, ( + f"expected 1 warning before circuit breaker opens, " f"got {len(spawn_warnings)}" ) diff --git a/tools/tirith_security.py b/tools/tirith_security.py index f20c96458fe..93509604131 100644 --- a/tools/tirith_security.py +++ b/tools/tirith_security.py @@ -97,6 +97,35 @@ _resolved_path: str | None | bool = None _INSTALL_FAILED = False # sentinel: distinct from "not yet tried" _install_failure_reason: str = "" # reason tag when _resolved_path is _INSTALL_FAILED +# Circuit breaker: after _CRASH_LIMIT consecutive spawn/execution failures, +# disable tirith for the rest of the process to prevent agent hangs (#41400). +# Reset on successful execution (see _record_tirith_crash / check_command_security). +# +# Thread safety: _crash_count and _circuit_open are module-level globals +# mutated without a lock. check_command_security can be called from +# concurrent agent threads (gateway multi-session). The race is benign — +# at worst two threads both increment past _CRASH_LIMIT and both set +# _circuit_open = True, opening the breaker one call early. No data +# corruption or security bypass is possible. This intentionally matches +# the lock-free style of error counters in mcp_tool.py rather than the +# locked _warn_once pattern, because the worst case is harmless. +_CRASH_LIMIT = 3 +_crash_count: int = 0 +_circuit_open: bool = False + + +def _record_tirith_crash() -> None: + """Increment the crash counter and open the circuit breaker if needed.""" + global _crash_count, _circuit_open + _crash_count += 1 + if _crash_count >= _CRASH_LIMIT: + _circuit_open = True + logger.warning( + "tirith circuit breaker opened after %d consecutive failures; " + "disabling for the rest of the process", + _crash_count, + ) + # Background install thread coordination _install_lock = threading.Lock() _install_thread: threading.Thread | None = None @@ -708,11 +737,21 @@ def check_command_security(command: str) -> dict: Returns: {"action": "allow"|"warn"|"block", "findings": [...], "summary": str} """ + global _crash_count, _circuit_open + cfg = _load_security_config() if not cfg["tirith_enabled"]: return {"action": "allow", "findings": [], "summary": ""} + # Circuit breaker: if tirith has crashed _CRASH_LIMIT times in a row, + # stop trying for the rest of the process. Without this, a corrupted + # or missing binary causes every tool call to hit the same spawn failure + # → fail-open → agent retry loop, hanging the user for 20+ minutes + # (issue #41400). + if _circuit_open: + return {"action": "allow", "findings": [], "summary": "tirith disabled (circuit breaker)"} + # Unsupported platform (Windows etc.) — tirith has no binary here and # never will. Skip the resolver entirely so we don't even try to spawn. # Pattern-matching guards still run via the rest of approval.py. @@ -750,6 +789,7 @@ def check_command_security(command: str) -> dict: # install marked failed for the day). spawn_key = f"tirith_spawn_failed:{type(exc).__name__}:{getattr(exc, 'errno', '')}" _warn_once(spawn_key, "tirith spawn failed: %s", exc) + _record_tirith_crash() if fail_open: return {"action": "allow", "findings": [], "summary": f"tirith unavailable: {exc}"} return {"action": "block", "findings": [], "summary": f"tirith spawn failed (fail-closed): {exc}"} @@ -759,6 +799,7 @@ def check_command_security(command: str) -> dict: "tirith timed out after %ds", timeout, ) + _record_tirith_crash() if fail_open: return {"action": "allow", "findings": [], "summary": f"tirith timed out ({timeout}s)"} return {"action": "block", "findings": [], "summary": "tirith timed out (fail-closed)"} @@ -767,13 +808,17 @@ def check_command_security(command: str) -> dict: exit_code = result.returncode if exit_code == 0: action = "allow" + # Successful execution — reset circuit breaker + _crash_count = 0 elif exit_code == 1: action = "block" elif exit_code == 2: action = "warn" else: - # Unknown exit code — respect fail_open + # Unknown exit code (includes signal-killed processes like -11/SIGSEGV) + # — respect fail_open logger.warning("tirith returned unexpected exit code %d", exit_code) + _record_tirith_crash() if fail_open: return {"action": "allow", "findings": [], "summary": f"tirith exit code {exit_code} (fail-open)"} return {"action": "block", "findings": [], "summary": f"tirith exit code {exit_code} (fail-closed)"}