diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index 49ce9dbb376..9d78918c267 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -3080,7 +3080,10 @@ def run_conversation(
                     if is_client_error:
                         # Try fallback before aborting — a different provider
                         # may not have the same issue (rate limit, auth, etc.)
-                        agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
+                        if classified.reason == FailoverReason.content_policy_blocked:
+                            agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
+                        else:
+                            agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
                         if agent._try_activate_fallback():
                             retry_count = 0
                             compression_attempts = 0
@@ -3093,10 +3096,16 @@ def run_conversation(
                         # Terminal — flush buffered context so the user sees
                         # what was tried before the abort.
                         agent._flush_status_buffer()
-                        agent._emit_status(
-                            f"❌ Non-retryable error (HTTP {status_code}): "
-                            f"{agent._summarize_api_error(api_error)}"
-                        )
+                        if classified.reason == FailoverReason.content_policy_blocked:
+                            agent._emit_status(
+                                f"❌ Provider safety filter blocked this request: "
+                                f"{agent._summarize_api_error(api_error)}"
+                            )
+                        else:
+                            agent._emit_status(
+                                f"❌ Non-retryable error (HTTP {status_code}): "
+                                f"{agent._summarize_api_error(api_error)}"
+                            )
                         agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
                         agent._vprint(f"{agent.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True)
                         agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True)
@@ -3143,6 +3152,28 @@ def run_conversation(
                                     agent._vprint(f"{agent.log_prefix} • Check credits: https://openrouter.ai/settings/credits", force=True)
                         else:
                             agent._vprint(f"{agent.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True)
+                        # Content-policy blocks deserve their own actionable
+                        # guidance — neither "fix your API key" nor "retry won't
+                        # help" tells the user what to actually do. The provider
+                        # has refused this specific prompt, so the recovery is
+                        # either a rephrase or routing to a different model.
+                        if classified.reason == FailoverReason.content_policy_blocked:
+                            agent._vprint(
+                                f"{agent.log_prefix} 💡 The provider's safety filter rejected this specific prompt.",
+                                force=True,
+                            )
+                            agent._vprint(
+                                f"{agent.log_prefix} • Try rephrasing the request, narrowing the context, or splitting into smaller steps.",
+                                force=True,
+                            )
+                            agent._vprint(
+                                f"{agent.log_prefix} • Configure a fallback provider so future blocks route automatically:",
+                                force=True,
+                            )
+                            agent._vprint(
+                                f"{agent.log_prefix} hermes fallback add (interactive picker — same as `hermes model`)",
+                                force=True,
+                            )
                         logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
                         # Skip session persistence when the error is likely
                         # context-overflow related (status 400 + large session).
@@ -3157,6 +3188,23 @@ def run_conversation(
                             )
                         else:
                             agent._persist_session(messages, conversation_history)
+                        if classified.reason == FailoverReason.content_policy_blocked:
+                            _summary = agent._summarize_api_error(api_error)
+                            _policy_response = (
+                                f"⚠️ The model provider's safety filter blocked this request "
+                                f"(not a Hermes/gateway failure).\n\n"
+                                f"Provider message: {_summary}\n\n"
+                                f"Try rephrasing the request, narrowing the context, or "
+                                f"adding a fallback provider with `hermes fallback add`."
+                            )
+                            return {
+                                "final_response": _policy_response,
+                                "messages": messages,
+                                "api_calls": api_call_count,
+                                "completed": False,
+                                "failed": True,
+                                "error": f"content_policy_blocked: {_summary}",
+                            }
                         return {
                             "final_response": None,
                             "messages": messages,
diff --git a/agent/error_classifier.py b/agent/error_classifier.py
index 4949d1878d4..e8a44866b28 100644
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -44,9 +44,10 @@ class FailoverReason(enum.Enum):
     payload_too_large = "payload_too_large"  # 413 — compress payload
     image_too_large = "image_too_large"  # Native image part exceeds provider's per-image limit — shrink and retry
 
-    # Model
+    # Model / provider policy
     model_not_found = "model_not_found"  # 404 or invalid model — fallback to different model
     provider_policy_blocked = "provider_policy_blocked"  # Aggregator (e.g. OpenRouter) blocked the only endpoint due to account data/privacy policy
+    content_policy_blocked = "content_policy_blocked"  # Provider safety filter rejected this prompt — deterministic per-request, don't retry unchanged
 
     # Request format
     format_error = "format_error"  # 400 bad request — abort or strip + retry
@@ -289,6 +290,45 @@ _PROVIDER_POLICY_BLOCKED_PATTERNS = [
     "no endpoints found matching your data policy",
 ]
 
+# Provider content-policy / safety-filter blocks. Distinct from
+# ``provider_policy_blocked`` above (which is an OpenRouter *account*-level
+# data/privacy guardrail) — these are *per-prompt* safety decisions made by
+# the upstream model provider. They are deterministic for the unchanged
+# request, so retrying the same prompt three times just reproduces the same
+# block and burns paid attempts on a refusal. The recovery is to switch to a
+# configured fallback model/provider immediately, or surface the block to
+# the user with actionable guidance if no fallback exists.
+#
+# Patterns are intentionally narrow — each phrase is a verbatim string from
+# a specific provider's safety pipeline, not a generic word like "policy" or
+# "violation" that could collide with billing/auth/format errors:
+#   • OpenAI Codex cybersecurity refusal (gpt-5.5, the case from #18028)
+#   • OpenAI moderation refusal ("violates our usage policies", with
+#     "usage policies" disambiguating from billing's "exceeded ... policy")
+#   • Anthropic safety refusal ("prompt was flagged by ... safety system")
+#   • Azure OpenAI / OpenAI Responses content-filter tokens
+_CONTENT_POLICY_BLOCKED_PATTERNS = [
+    # OpenAI Codex (#18028) — message may arrive without an HTTP status
+    "flagged for possible cybersecurity risk",
+    "trusted access for cyber",
+    # OpenAI moderation — chat completions / responses
+    "violates our usage policies",
+    "violates openai's usage policies",
+    "your request was flagged by",
+    # Anthropic safety system
+    "prompt was flagged by our safety",
+    "responses cannot be generated due to safety",
+    # Generic content-filter wording seen on Azure / OpenAI Responses.
+    # ``content_filter`` (underscore) is the OpenAI-standard error/finish
+    # token surfaced verbatim by their SDKs when a request is blocked.
+    # ``responsibleaipolicyviolation`` is Azure OpenAI's error code.
+    # Deliberately NOT matching the space variant ("content filter") — it
+    # appears in benign config descriptions and tooltip text that providers
+    # echo back; the underscore form is provider-specific enough.
+    "content_filter",
+    "responsibleaipolicyviolation",
+]
+
 # Auth patterns (non-status-code signals)
 _AUTH_PATTERNS = [
     "invalid api key",
@@ -492,6 +532,20 @@ def classify_api_error(
 
     # ── 1. Provider-specific patterns (highest priority) ────────────
 
+    # Provider content-policy / safety-filter block. The provider has made a
+    # deterministic refusal decision about THIS prompt — retrying unchanged
+    # just reproduces the same refusal and burns paid attempts. Must run
+    # before status-based classification so a 400 safety block isn't
+    # downgraded to a generic ``format_error`` and a status-less block
+    # (OpenAI Codex SDK can raise without one) isn't left in the retryable
+    # ``unknown`` bucket. See issue #18028.
+    if any(p in error_msg for p in _CONTENT_POLICY_BLOCKED_PATTERNS):
+        return _result(
+            FailoverReason.content_policy_blocked,
+            retryable=False,
+            should_fallback=True,
+        )
+
     # Anthropic thinking block signature invalid (400).
     # Don't gate on provider — OpenRouter proxies Anthropic errors, so the
     # provider may be "openrouter" even though the error is Anthropic-specific.
diff --git a/scripts/release.py b/scripts/release.py
index 55d59b42470..6539f2f1de7 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -46,6 +46,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
 # Auto-extracted from noreply emails + manual overrides
 AUTHOR_MAP = {
     "9592417+adam91holt@users.noreply.github.com": "adam91holt",
+    "kchuang1015@users.noreply.github.com": "kchuang1015",
     "45688690+fujinice@users.noreply.github.com": "fujinice",
     "276689385+carltonawong@users.noreply.github.com": "carltonawong",
     "195255660+EvilHumphrey@users.noreply.github.com": "EvilHumphrey",
diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py
index 5bf259ba9bd..b98fbe5beb9 100644
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@@ -59,6 +59,7 @@ class TestFailoverReason:
             "invalid_encrypted_content",
             "multimodal_tool_content_unsupported",
             "provider_policy_blocked",
+            "content_policy_blocked",
             "thinking_signature", "long_context_tier",
             "oauth_long_context_beta_forbidden",
             "llama_cpp_grammar_pattern",
@@ -466,6 +467,79 @@ class TestClassifyApiError:
         result = classify_api_error(e)
         assert result.reason == FailoverReason.provider_policy_blocked
 
+    # ── Provider content-policy block (per-prompt safety filter) ──
+    #
+    # Distinct from ``provider_policy_blocked`` above — these are upstream
+    # model-provider safety refusals for THIS prompt, not OpenRouter
+    # account-level data policy. Recovery is fallback model, not config fix.
+    # See issue #18028 — OpenAI Codex was burning 3 retries on identical
+    # refusals before users saw "API failed after 3 retries" on Telegram.
+
+    def test_message_only_cyber_content_policy_blocked(self):
+        # OpenAI Codex returns this without an HTTP status. Retrying the
+        # same prompt three times only repeats the same policy decision, so
+        # the classifier must jump straight to fallback / abort instead of
+        # leaving it in the retryable ``unknown`` bucket.
+        e = Exception(
+            "This content was flagged for possible cybersecurity risk. If this "
+            "seems wrong, try rephrasing your request. To get authorized for "
+            "security work, join the Trusted Access for Cyber program."
+        )
+        result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+        assert result.should_compress is False
+
+    def test_400_cyber_content_policy_blocked(self):
+        # When the SDK does attach a status (e.g. 400), the safety pattern
+        # must still beat the format_error fallthrough.
+        e = MockAPIError(
+            "This content was flagged for possible cybersecurity risk",
+            status_code=400,
+        )
+        result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
+    def test_openai_usage_policy_violation_content_policy_blocked(self):
+        # OpenAI moderation refusal wording from chat completions / responses.
+        e = MockAPIError(
+            "Your request was flagged by the moderation system as potentially "
+            "violating OpenAI's usage policies.",
+            status_code=400,
+        )
+        result = classify_api_error(e, provider="openai", model="gpt-4o")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
+    def test_anthropic_safety_system_content_policy_blocked(self):
+        # Anthropic safety refusal — distinct phrasing from OpenAI.
+        e = Exception(
+            "Your prompt was flagged by our safety system. Please rephrase "
+            "and try again."
+        )
+        result = classify_api_error(e, provider="anthropic", model="claude-3-5-sonnet")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
+    def test_azure_content_filter_content_policy_blocked(self):
+        # Azure OpenAI returns ``content_filter`` finish reason / error code
+        # and ``ResponsibleAIPolicyViolation`` in error bodies — both narrow
+        # tokens, not the generic English phrase.
+        e = MockAPIError(
+            "The response was filtered: ResponsibleAIPolicyViolation "
+            "(finish_reason=content_filter).",
+            status_code=400,
+        )
+        result = classify_api_error(e, provider="azure", model="gpt-4o")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
     def test_404_model_not_found_still_works(self):
         # Regression guard: the new policy-block check must not swallow
         # genuine model_not_found 404s.
diff --git a/tests/run_agent/test_18028_content_policy_blocked.py b/tests/run_agent/test_18028_content_policy_blocked.py
new file mode 100644
index 00000000000..1edf16b87ca
--- /dev/null
+++ b/tests/run_agent/test_18028_content_policy_blocked.py
@@ -0,0 +1,152 @@
+"""Regression guard for #18028: provider content-policy / safety-filter
+blocks must classify as ``content_policy_blocked``, be non-retryable, and
+trigger the ``is_client_error`` abort path so the loop jumps straight to a
+configured fallback or surfaces a clear policy-block message — instead of
+burning ``api_max_retries`` paid attempts on a deterministic refusal and
+delivering "API failed after 3 retries" to Telegram/cron with no provider
+context.
+
+Real-world symptom from the issue:
+    ``API call failed after 3 retries — This content was flagged for
+    possible cybersecurity risk... | provider=openai-codex model=gpt-5.5``
+repeating across cron jobs and gateway sessions, with the user unable to
+tell whether the gateway was broken, the model was down, or their prompt
+was the problem.
+"""
+from __future__ import annotations
+
+
+class TestContentPolicyBlockedClassification:
+    """Verify classify_api_error returns the right shape so downstream
+    recovery (fallback activation, final_response wording) fires correctly.
+    """
+
+    def test_openai_codex_cybersecurity_no_status(self):
+        """The reported #18028 case — SDK raises without a status code."""
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        e = Exception(
+            "This content was flagged for possible cybersecurity risk. "
+            "If this seems wrong, try rephrasing your request. To get "
+            "authorized for security work, join the Trusted Access for "
+            "Cyber program."
+        )
+        result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
+        # Must NOT fall into the retryable ``unknown`` bucket — that's what
+        # caused the 3x retry burn.
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        # Recovery is fallback model, not credential rotation or compression.
+        assert result.should_fallback is True
+        assert result.should_compress is False
+        assert result.should_rotate_credential is False
+
+
+class TestContentPolicyTriggersClientErrorAbort:
+    """Mirror the ``is_client_error`` predicate in
+    ``agent/conversation_loop.py`` and verify
+    ``FailoverReason.content_policy_blocked`` resolves to True so the loop
+    aborts (after attempting fallback) instead of falling into the
+    retry-backoff path.
+    """
+
+    def _mirror_is_client_error(
+        self,
+        *,
+        classified_retryable: bool,
+        classified_reason,
+        classified_should_compress: bool = False,
+        is_local_validation_error: bool = False,
+        is_context_length_error: bool = False,
+    ) -> bool:
+        """Exact shape of conversation_loop.py's is_client_error check.
+
+        Kept in lock-step with the source. If you change one, change both.
+        """
+        from agent.error_classifier import FailoverReason
+
+        return (
+            is_local_validation_error
+            or (
+                not classified_retryable
+                and not classified_should_compress
+                and classified_reason not in {
+                    FailoverReason.rate_limit,
+                    FailoverReason.overloaded,
+                    FailoverReason.context_overflow,
+                    FailoverReason.payload_too_large,
+                    FailoverReason.long_context_tier,
+                    FailoverReason.thinking_signature,
+                }
+            )
+        ) and not is_context_length_error
+
+    def test_content_policy_blocked_triggers_abort(self):
+        """Safety-filter block must reach is_client_error → fallback/abort."""
+        from agent.error_classifier import FailoverReason
+
+        # What classify_api_error returns for a content-policy block:
+        #   reason=content_policy_blocked, retryable=False, should_compress=False
+        assert self._mirror_is_client_error(
+            classified_retryable=False,
+            classified_reason=FailoverReason.content_policy_blocked,
+        ), (
+            "FailoverReason.content_policy_blocked must trigger the "
+            "is_client_error path so fallback fires immediately instead of "
+            "burning api_max_retries paid attempts on a deterministic "
+            "safety refusal — see #18028."
+        )
+
+
+class TestContentPolicyPatternsAreNarrow:
+    """Defensive guard: the safety-filter patterns must not collide with
+    benign error wording from billing / format / generic 400 errors. If
+    these regress to ``content_policy_blocked``, recovery will route to
+    the wrong code path (fallback model instead of credential rotation).
+    """
+
+    def test_generic_400_format_error_not_misclassified(self):
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        class _Err(Exception):
+            def __init__(self, msg, status_code):
+                super().__init__(msg)
+                self.status_code = status_code
+
+        e = _Err("Invalid request: messages must be a non-empty list", status_code=400)
+        result = classify_api_error(e, provider="openai", model="gpt-4o")
+        assert result.reason != FailoverReason.content_policy_blocked
+
+    def test_billing_402_not_misclassified(self):
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        class _Err(Exception):
+            def __init__(self, msg, status_code):
+                super().__init__(msg)
+                self.status_code = status_code
+
+        e = _Err("Insufficient credits. Top up your balance.", status_code=402)
+        result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
+        assert result.reason == FailoverReason.billing
+
+    def test_openrouter_account_policy_block_stays_distinct(self):
+        """``provider_policy_blocked`` (OpenRouter account-level data
+        policy) must remain a separate classification from
+        ``content_policy_blocked`` (upstream model safety filter) — they
+        have different recovery strategies.
+        """
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        class _Err(Exception):
+            def __init__(self, msg, status_code):
+                super().__init__(msg)
+                self.status_code = status_code
+
+        e = _Err(
+            "No endpoints available matching your guardrail restrictions "
+            "and data policy",
+            status_code=404,
+        )
+        result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
+        assert result.reason == FailoverReason.provider_policy_blocked
+        assert result.reason != FailoverReason.content_policy_blocked