diff --git a/run_agent.py b/run_agent.py
index 4ee4de51b2..48092501f6 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -7380,6 +7380,69 @@ class AIAgent:
                     # compress history and retry, not abort immediately.
                     status_code = getattr(api_error, "status_code", None)
 
+                    # ── Anthropic long-context tier gate ──────────────────
+                    # Anthropic returns HTTP 429 "Extra usage is required for
+                    # long context requests" when a Claude Max (or similar)
+                    # subscription doesn't include the 1M-context tier. This
+                    # is NOT a transient rate limit — retrying or switching
+                    # credentials won't help. Reduce context to 200k (the
+                    # standard tier) and compress.
+                    _is_long_context_tier_error = (
+                        status_code == 429
+                        and "extra usage" in error_msg
+                        and "long context" in error_msg
+                    )
+                    if _is_long_context_tier_error:
+                        _reduced_ctx = 200000
+                        compressor = self.context_compressor
+                        old_ctx = compressor.context_length
+                        if old_ctx > _reduced_ctx:
+                            compressor.context_length = _reduced_ctx
+                            compressor.threshold_tokens = int(
+                                _reduced_ctx * compressor.threshold_percent
+                            )
+                            compressor._context_probed = True
+                            # Don't persist — this is a subscription-tier
+                            # limitation, not a model capability. If the user
+                            # later enables extra usage the 1M limit should
+                            # come back automatically.
+                            compressor._context_probe_persistable = False
+                            self._vprint(
+                                f"{self.log_prefix}⚠️ Anthropic long-context tier "
+                                f"requires extra usage — reducing context: "
+                                f"{old_ctx:,} → {_reduced_ctx:,} tokens",
+                                force=True,
+                            )
+
+                        compression_attempts += 1
+                        if compression_attempts <= max_compression_attempts:
+                            original_len = len(messages)
+                            messages, active_system_prompt = self._compress_context(
+                                messages, system_message,
+                                approx_tokens=approx_tokens,
+                                task_id=effective_task_id,
+                            )
+                            if len(messages) < original_len or old_ctx > _reduced_ctx:
+                                if old_ctx > _reduced_ctx:
+                                    _status = (
+                                        f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
+                                        f"(was {old_ctx:,}), retrying..."
+                                    )
+                                else:
+                                    # The limit was already at/below the standard
+                                    # tier, so only the history shrank — don't
+                                    # report a context reduction that never happened.
+                                    _status = (
+                                        "🗜️ Conversation compressed to fit the "
+                                        "standard context tier, retrying..."
+                                    )
+                                self._emit_status(_status)
+                                time.sleep(2)
+                                restart_with_compressed_messages = True
+                                break
+                        # Fall through to normal error handling if compression
+                        # is exhausted or didn't help.
+
                     # Eager fallback for rate-limit errors (429 or quota exhaustion).
                     # When a fallback model is configured, switch immediately instead
                     # of burning through retries with exponential backoff -- the
diff --git a/tests/test_long_context_tier_429.py b/tests/test_long_context_tier_429.py
new file mode 100644
index 0000000000..ac2fcf3113
--- /dev/null
+++ b/tests/test_long_context_tier_429.py
@@ -0,0 +1,169 @@
+"""Tests for Anthropic long-context tier 429 handling.
+
+When Claude Max users without "extra usage" hit the 1M context tier,
+Anthropic returns HTTP 429 "Extra usage is required for long context
+requests." This is NOT a transient rate limit — the agent should
+reduce context_length to 200k and compress instead of retrying.
+
+NOTE: these tests exercise local mirrors of the heuristics in
+``run_agent.py`` rather than the agent itself, so the mirrors must be
+kept in sync with the production code by hand.
+"""
+
+import pytest
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+
+# ---------------------------------------------------------------------------
+# Detection logic
+# ---------------------------------------------------------------------------
+
+
+class TestLongContextTierDetection:
+    """Verify the detection heuristic matches the Anthropic error."""
+
+    @staticmethod
+    def _is_long_context_tier_error(status_code, error_msg):
+        """Mirror of the agent's check: 429 plus both marker phrases."""
+        error_msg = error_msg.lower()
+        return (
+            status_code == 429
+            and "extra usage" in error_msg
+            and "long context" in error_msg
+        )
+
+    def test_matches_anthropic_error(self):
+        assert self._is_long_context_tier_error(
+            429,
+            "Extra usage is required for long context requests.",
+        )
+
+    def test_matches_lowercase(self):
+        assert self._is_long_context_tier_error(
+            429,
+            "extra usage is required for long context requests.",
+        )
+
+    def test_rejects_normal_429(self):
+        assert not self._is_long_context_tier_error(
+            429,
+            "Rate limit exceeded. Please retry after 30 seconds.",
+        )
+
+    def test_rejects_wrong_status(self):
+        assert not self._is_long_context_tier_error(
+            400,
+            "Extra usage is required for long context requests.",
+        )
+
+    def test_rejects_partial_match(self):
+        """Both 'extra usage' AND 'long context' must be present."""
+        assert not self._is_long_context_tier_error(
+            429, "extra usage required"
+        )
+        assert not self._is_long_context_tier_error(
+            429, "long context requests not supported"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Context reduction
+# ---------------------------------------------------------------------------
+
+
+class TestContextReduction:
+    """When the long-context tier error fires, context_length should
+    drop to 200k and the reduced flag should be set correctly."""
+
+    REDUCED_CTX = 200_000
+
+    def _make_compressor(self, context_length=1_000_000, threshold_percent=0.5):
+        # Start with the persistable flag set so the tests can tell
+        # whether the reduction actually cleared it.
+        return SimpleNamespace(
+            context_length=context_length,
+            threshold_percent=threshold_percent,
+            threshold_tokens=int(context_length * threshold_percent),
+            _context_probed=False,
+            _context_probe_persistable=True,
+        )
+
+    @classmethod
+    def _apply_reduction(cls, comp):
+        """Mirror of the agent's reduction branch; returns True if applied."""
+        if comp.context_length <= cls.REDUCED_CTX:
+            return False
+        comp.context_length = cls.REDUCED_CTX
+        comp.threshold_tokens = int(cls.REDUCED_CTX * comp.threshold_percent)
+        comp._context_probed = True
+        comp._context_probe_persistable = False
+        return True
+
+    def test_reduces_1m_to_200k(self):
+        comp = self._make_compressor(1_000_000)
+
+        assert self._apply_reduction(comp) is True
+
+        assert comp.context_length == 200_000
+        assert comp.threshold_tokens == 100_000
+        assert comp._context_probed is True
+        # Must NOT persist — subscription tier, not model capability
+        assert comp._context_probe_persistable is False
+
+    def test_no_reduction_when_already_200k(self):
+        comp = self._make_compressor(200_000)
+        before = dict(vars(comp))
+
+        assert self._apply_reduction(comp) is False
+
+        assert vars(comp) == before  # nothing touched
+
+    def test_no_reduction_when_below_200k(self):
+        comp = self._make_compressor(128_000)
+        before = dict(vars(comp))
+
+        assert self._apply_reduction(comp) is False
+
+        assert vars(comp) == before  # nothing touched
+
+
+# ---------------------------------------------------------------------------
+# Integration: agent error handler path
+# ---------------------------------------------------------------------------
+
+
+class TestAgentErrorPath:
+    """Verify the long-context 429 doesn't hit the generic rate-limit
+    or client-error handlers."""
+
+    # Reuse the single mirror of the agent's heuristic instead of
+    # re-inlining it in every test.
+    _is_long_context_tier_error = staticmethod(
+        TestLongContextTierDetection._is_long_context_tier_error
+    )
+
+    def test_long_context_429_not_treated_as_rate_limit(self):
+        """The error should be intercepted before the generic
+        is_rate_limited check fires a fallback switch."""
+        error_msg = "extra usage is required for long context requests."
+        status_code = 429
+
+        # The long-context check fires first
+        assert self._is_long_context_tier_error(status_code, error_msg)
+
+        # So we never reach the generic rate-limit path
+        # (in the real code, `break` exits the retry loop)
+
+    def test_normal_429_still_treated_as_rate_limit(self):
+        """A normal 429 should NOT match the long-context check."""
+        error_msg = "rate limit exceeded"
+        status_code = 429
+
+        assert not self._is_long_context_tier_error(status_code, error_msg)
+
+        is_rate_limited = (
+            status_code == 429
+            or "rate limit" in error_msg
+        )
+        assert is_rate_limited