fix(agent): fallback immediately on provider content-policy blocks (#33883)

* fix(agent): fallback immediately on provider content-policy blocks Provider safety-filter refusals (e.g. OpenAI Codex 'flagged for possible cybersecurity risk', OpenAI moderation 'violates our usage policies', Anthropic safety-system rejections, Azure content_filter) are deterministic decisions about a specific prompt. Retrying the same prompt up to api_max_retries times just reproduces the same refusal and burns paid attempts before surfacing the generic 'API failed after 3 retries — <provider message>' to Telegram / cron with no indication that the failure came from the model provider rather than Hermes itself. Classify these as a new FailoverReason.content_policy_blocked (non-retryable, should_fallback=True) and route them through the existing is_client_error path so the loop: - skips the 3x retry backoff - activates a configured fallback model immediately - emits a clear provider-safety message to the user (not the generic 'Non-retryable error (HTTP None)') and surfaces actionable guidance when no fallback is configured (rephrase, narrow context, or add a fallback provider with `hermes fallback add`) - returns a final_response that explicitly tells the user this came from the model provider, so gateway delivery is unambiguous and cron last_status reflects the safety block rather than a vague 'agent reported failure' Patterns are intentionally narrow — verbatim refusal phrasings keyed to specific provider safety pipelines, not generic words like 'policy' or 'violation' that would collide with billing / format / auth errors. Regression guards in test_18028_content_policy_blocked.py verify billing 402s, generic 400s, and OpenRouter account-level provider_policy_blocked remain distinct classifications. 
Salvaged from #18164 onto current main (file restructure: loop logic moved from run_agent.py to agent/conversation_loop.py, _emit_status → _buffer_status), broadened patterns beyond the original OpenAI Codex cybersecurity case to cover OpenAI moderation, Anthropic safety system, and Azure content_filter; added user-actionable guidance and a clear final_response so cron/gateway surfaces the policy block instead of a generic non-retryable error, and added a regression-guard test module mirroring the is_client_error predicate. Addresses #18028. Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com> * chore: add kchuang1015 to AUTHOR_MAP --------- Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com>
2026-07-18 14:52:04 +00:00 · 2026-05-28 07:28:24 -07:00 · 2026-05-28 07:28:24 -07:00 · 0554ef1aa3
commit 0554ef1aa3
parent a82c88bac0
5 changed files with 334 additions and 6 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -3080,7 +3080,10 @@ def run_conversation(
                if is_client_error:
                    # Try fallback before aborting — a different provider
                    # may not have the same issue (rate limit, auth, etc.)
-                    agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
+                    else:
+                        agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
@ -3093,10 +3096,16 @@ def run_conversation(
                    # Terminal — flush buffered context so the user sees
                    # what was tried before the abort.
                    agent._flush_status_buffer()
-                    agent._emit_status(
-                        f"❌ Non-retryable error (HTTP {status_code}): "
-                        f"{agent._summarize_api_error(api_error)}"
-                    )
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._emit_status(
+                            f"❌ Provider safety filter blocked this request: "
+                            f"{agent._summarize_api_error(api_error)}"
+                        )
+                    else:
+                        agent._emit_status(
+                            f"❌ Non-retryable error (HTTP {status_code}): "
+                            f"{agent._summarize_api_error(api_error)}"
+                        )
                    agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
                    agent._vprint(f"{agent.log_prefix}   🔌 Provider: {_provider}  Model: {_model}", force=True)
                    agent._vprint(f"{agent.log_prefix}   🌐 Endpoint: {_base}", force=True)
@ -3143,6 +3152,28 @@ def run_conversation(
                                agent._vprint(f"{agent.log_prefix}      • Check credits: https://openrouter.ai/settings/credits", force=True)
                    else:
                        agent._vprint(f"{agent.log_prefix}   💡 This type of error won't be fixed by retrying.", force=True)
+                    # Content-policy blocks get extra actionable guidance on
+                    # top of the generic hint printed above — neither "fix
+                    # your API key" nor "retry won't help" tells the user what
+                    # to do. The provider refused this specific prompt, so the
+                    # recovery is a rephrase or routing to a different model.
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._vprint(
+                            f"{agent.log_prefix}   💡 The provider's safety filter rejected this specific prompt.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      • Try rephrasing the request, narrowing the context, or splitting into smaller steps.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      • Configure a fallback provider so future blocks route automatically:",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}        hermes fallback add   (interactive picker — same as `hermes model`)",
+                            force=True,
+                        )
                    logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
                    # Skip session persistence when the error is likely
                    # context-overflow related (status 400 + large session).
@ -3157,6 +3188,23 @@ def run_conversation(
                        )
                    else:
                        agent._persist_session(messages, conversation_history)
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        _summary = agent._summarize_api_error(api_error)
+                        _policy_response = (
+                            f"⚠️  The model provider's safety filter blocked this request "
+                            f"(not a Hermes/gateway failure).\n\n"
+                            f"Provider message: {_summary}\n\n"
+                            f"Try rephrasing the request, narrowing the context, or "
+                            f"adding a fallback provider with `hermes fallback add`."
+                        )
+                        return {
+                            "final_response": _policy_response,
+                            "messages": messages,
+                            "api_calls": api_call_count,
+                            "completed": False,
+                            "failed": True,
+                            "error": f"content_policy_blocked: {_summary}",
+                        }
                    return {
                        "final_response": None,
                        "messages": messages,
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@ -44,9 +44,10 @@ class FailoverReason(enum.Enum):
    payload_too_large = "payload_too_large"  # 413 — compress payload
    image_too_large = "image_too_large"   # Native image part exceeds provider's per-image limit — shrink and retry

-    # Model
+    # Model / provider policy
    model_not_found = "model_not_found"  # 404 or invalid model — fallback to different model
    provider_policy_blocked = "provider_policy_blocked"  # Aggregator (e.g. OpenRouter) blocked the only endpoint due to account data/privacy policy
+    content_policy_blocked = "content_policy_blocked"  # Provider safety filter rejected this prompt — deterministic per-request, don't retry unchanged

    # Request format
    format_error = "format_error"        # 400 bad request — abort or strip + retry
@ -289,6 +290,45 @@ _PROVIDER_POLICY_BLOCKED_PATTERNS = [
    "no endpoints found matching your data policy",
 ]

+# Provider content-policy / safety-filter blocks. Distinct from
+# ``provider_policy_blocked`` above (which is an OpenRouter *account*-level
+# data/privacy guardrail) — these are *per-prompt* safety decisions made by
+# the upstream model provider. They are deterministic for the unchanged
+# request, so retrying the same prompt three times just reproduces the same
+# block and burns paid attempts on a refusal. The recovery is to switch to a
+# configured fallback model/provider immediately, or surface the block to
+# the user with actionable guidance if no fallback exists.
+#
+# Patterns are intentionally narrow — each phrase is a verbatim string from
+# a specific provider's safety pipeline, not a generic word like "policy" or
+# "violation" that could collide with billing/auth/format errors:
+#   • OpenAI Codex cybersecurity refusal (gpt-5.5, the case from #18028)
+#   • OpenAI moderation refusal ("violates our usage policies", with
+#     "usage policies" disambiguating from billing's "exceeded ... policy")
+#   • Anthropic safety refusal ("prompt was flagged by ... safety system")
+#   • Azure OpenAI / OpenAI Responses content filter (``content_filter``)
+_CONTENT_POLICY_BLOCKED_PATTERNS = [
+    # OpenAI Codex (#18028) — message may arrive without an HTTP status
+    "flagged for possible cybersecurity risk",
+    "trusted access for cyber",
+    # OpenAI moderation — chat completions / responses
+    "violates our usage policies",
+    "violates openai's usage policies",
+    "your request was flagged by",
+    # Anthropic safety system
+    "prompt was flagged by our safety",
+    "responses cannot be generated due to safety",
+    # Generic content-filter wording seen on Azure / OpenAI Responses.
+    # ``content_filter`` (underscore) is the OpenAI-standard error/finish
+    # token surfaced verbatim by their SDKs when a request is blocked.
+    # ``responsibleaipolicyviolation`` is Azure OpenAI's error code.
+    # Deliberately NOT matching the space variant ("content filter") — it
+    # appears in benign config descriptions and tooltip text that providers
+    # echo back; the underscore form is provider-specific enough.
+    "content_filter",
+    "responsibleaipolicyviolation",
+]
+
 # Auth patterns (non-status-code signals)
 _AUTH_PATTERNS = [
    "invalid api key",
@ -492,6 +532,20 @@ def classify_api_error(

    # ── 1. Provider-specific patterns (highest priority) ────────────

+    # Provider content-policy / safety-filter block. The provider has made a
+    # deterministic refusal decision about THIS prompt — retrying unchanged
+    # just reproduces the same refusal and burns paid attempts. Must run
+    # before status-based classification so a 400 safety block isn't
+    # downgraded to a generic ``format_error`` and a status-less block
+    # (OpenAI Codex SDK can raise without one) isn't left in the retryable
+    # ``unknown`` bucket. See issue #18028.
+    if any(p in error_msg for p in _CONTENT_POLICY_BLOCKED_PATTERNS):
+        return _result(
+            FailoverReason.content_policy_blocked,
+            retryable=False,
+            should_fallback=True,
+        )
+
    # Anthropic thinking block signature invalid (400).
    # Don't gate on provider — OpenRouter proxies Anthropic errors, so the
    # provider may be "openrouter" even though the error is Anthropic-specific.
--- a/scripts/release.py
+++ b/scripts/release.py
@ -46,6 +46,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
 # Auto-extracted from noreply emails + manual overrides
 AUTHOR_MAP = {
    "9592417+adam91holt@users.noreply.github.com": "adam91holt",
+    "kchuang1015@users.noreply.github.com": "kchuang1015",
    "45688690+fujinice@users.noreply.github.com": "fujinice",
    "276689385+carltonawong@users.noreply.github.com": "carltonawong",
    "195255660+EvilHumphrey@users.noreply.github.com": "EvilHumphrey",
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@ -59,6 +59,7 @@ class TestFailoverReason:
            "invalid_encrypted_content",
            "multimodal_tool_content_unsupported",
            "provider_policy_blocked",
+            "content_policy_blocked",
            "thinking_signature", "long_context_tier",
            "oauth_long_context_beta_forbidden",
            "llama_cpp_grammar_pattern",
@ -466,6 +467,78 @@ class TestClassifyApiError:
        result = classify_api_error(e)
        assert result.reason == FailoverReason.provider_policy_blocked

+    # ── Provider content-policy block (per-prompt safety filter) ──
+    #
+    # Distinct from ``provider_policy_blocked`` above — these are upstream
+    # model-provider safety refusals for THIS prompt, not OpenRouter
+    # account-level data policy. Recovery is fallback model, not config fix.
+    # See issue #18028 — the agent was burning 3 retries on identical OpenAI
+    # Codex refusals before users saw "API failed after 3 retries" on Telegram.
+
+    def test_message_only_cyber_content_policy_blocked(self):
+        # OpenAI Codex returns this without an HTTP status. Retrying the
+        # same prompt three times only repeats the same policy decision, so
+        # the classifier must jump straight to fallback / abort instead of
+        # leaving it in the retryable ``unknown`` bucket.
+        e = Exception(
+            "This content was flagged for possible cybersecurity risk. If this "
+            "seems wrong, try rephrasing your request. To get authorized for "
+            "security work, join the Trusted Access for Cyber program."
+        )
+        result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+        assert result.should_compress is False
+
+    def test_400_cyber_content_policy_blocked(self):
+        # When the SDK does attach a status (e.g. 400), the safety pattern
+        # must still beat the format_error fallthrough.
+        e = MockAPIError(
+            "This content was flagged for possible cybersecurity risk",
+            status_code=400,
+        )
+        result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
+    def test_openai_usage_policy_violation_content_policy_blocked(self):
+        # OpenAI moderation refusal wording from chat completions / responses.
+        e = MockAPIError(
+            "Your request was flagged by the moderation system as potentially "
+            "violating OpenAI's usage policies.",
+            status_code=400,
+        )
+        result = classify_api_error(e, provider="openai", model="gpt-4o")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
+    def test_anthropic_safety_system_content_policy_blocked(self):
+        # Anthropic safety refusal — distinct phrasing from OpenAI.
+        e = Exception(
+            "Your prompt was flagged by our safety system. Please rephrase "
+            "and try again."
+        )
+        result = classify_api_error(e, provider="anthropic", model="claude-3-5-sonnet")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        assert result.should_fallback is True
+
+    def test_azure_content_filter_content_policy_blocked(self):
+        # Azure OpenAI returns ``content_filter`` finish reason / error code
+        # and ``ResponsibleAIPolicyViolation`` in error bodies — both narrow
+        # tokens, not the generic English phrase.
+        e = MockAPIError(
+            "The response was filtered: ResponsibleAIPolicyViolation "
+            "(finish_reason=content_filter).",
+            status_code=400,
+        )
+        result = classify_api_error(e, provider="azure", model="gpt-4o")
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+
    def test_404_model_not_found_still_works(self):
        # Regression guard: the new policy-block check must not swallow
        # genuine model_not_found 404s.
--- a/tests/run_agent/test_18028_content_policy_blocked.py
+++ b/tests/run_agent/test_18028_content_policy_blocked.py
@ -0,0 +1,152 @@
+"""Regression guard for #18028: provider content-policy / safety-filter
+blocks must classify as ``content_policy_blocked``, be non-retryable, and
+trigger the ``is_client_error`` abort path so the loop jumps straight to a
+configured fallback or surfaces a clear policy-block message — instead of
+burning ``api_max_retries`` paid attempts on a deterministic refusal and
+delivering "API failed after 3 retries" to Telegram/cron with no provider
+context.
+
+Real-world symptom from the issue:
+    ``API call failed after 3 retries — This content was flagged for
+    possible cybersecurity risk... | provider=openai-codex model=gpt-5.5``
+repeating across cron jobs and gateway sessions, with the user unable to
+tell whether the gateway was broken, the model was down, or their prompt
+was the problem.
+"""
+from __future__ import annotations
+
+
+class TestContentPolicyBlockedClassification:
+    """Verify classify_api_error returns the right shape so downstream
+    recovery (fallback activation, final_response wording) fires correctly.
+    """
+
+    def test_openai_codex_cybersecurity_no_status(self):
+        """The reported #18028 case — SDK raises without a status code."""
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        e = Exception(
+            "This content was flagged for possible cybersecurity risk. "
+            "If this seems wrong, try rephrasing your request. To get "
+            "authorized for security work, join the Trusted Access for "
+            "Cyber program."
+        )
+        result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
+        # Must NOT fall into the retryable ``unknown`` bucket — that's what
+        # caused the 3x retry burn.
+        assert result.reason == FailoverReason.content_policy_blocked
+        assert result.retryable is False
+        # Recovery is fallback model, not credential rotation or compression.
+        assert result.should_fallback is True
+        assert result.should_compress is False
+        assert result.should_rotate_credential is False
+
+
+class TestContentPolicyTriggersClientErrorAbort:
+    """Mirror the ``is_client_error`` predicate in
+    ``agent/conversation_loop.py`` and verify
+    ``FailoverReason.content_policy_blocked`` resolves to True so the loop
+    aborts (after attempting fallback) instead of falling into the
+    retry-backoff path.
+    """
+
+    def _mirror_is_client_error(
+        self,
+        *,
+        classified_retryable: bool,
+        classified_reason,
+        classified_should_compress: bool = False,
+        is_local_validation_error: bool = False,
+        is_context_length_error: bool = False,
+    ) -> bool:
+        """Exact shape of conversation_loop.py's is_client_error check.
+
+        Kept in lock-step with the source. If you change one, change both.
+        """
+        from agent.error_classifier import FailoverReason
+
+        return (
+            is_local_validation_error
+            or (
+                not classified_retryable
+                and not classified_should_compress
+                and classified_reason not in {
+                    FailoverReason.rate_limit,
+                    FailoverReason.overloaded,
+                    FailoverReason.context_overflow,
+                    FailoverReason.payload_too_large,
+                    FailoverReason.long_context_tier,
+                    FailoverReason.thinking_signature,
+                }
+            )
+        ) and not is_context_length_error
+
+    def test_content_policy_blocked_triggers_abort(self):
+        """Safety-filter block must reach is_client_error → fallback/abort."""
+        from agent.error_classifier import FailoverReason
+
+        # What classify_api_error returns for a content-policy block:
+        #   reason=content_policy_blocked, retryable=False, should_compress=False
+        assert self._mirror_is_client_error(
+            classified_retryable=False,
+            classified_reason=FailoverReason.content_policy_blocked,
+        ), (
+            "FailoverReason.content_policy_blocked must trigger the "
+            "is_client_error path so fallback fires immediately instead of "
+            "burning api_max_retries paid attempts on a deterministic "
+            "safety refusal — see #18028."
+        )
+
+
+class TestContentPolicyPatternsAreNarrow:
+    """Defensive guard: the safety-filter patterns must not collide with
+    benign error wording from billing / format / generic 400 errors. If
+    these regress to ``content_policy_blocked``, recovery will route to
+    the wrong code path (model fallback instead of their intended handling).
+    """
+
+    def test_generic_400_format_error_not_misclassified(self):
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        class _Err(Exception):
+            def __init__(self, msg, status_code):
+                super().__init__(msg)
+                self.status_code = status_code
+
+        e = _Err("Invalid request: messages must be a non-empty list", status_code=400)
+        result = classify_api_error(e, provider="openai", model="gpt-4o")
+        assert result.reason != FailoverReason.content_policy_blocked
+
+    def test_billing_402_not_misclassified(self):
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        class _Err(Exception):
+            def __init__(self, msg, status_code):
+                super().__init__(msg)
+                self.status_code = status_code
+
+        e = _Err("Insufficient credits. Top up your balance.", status_code=402)
+        result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
+        assert result.reason == FailoverReason.billing
+
+    def test_openrouter_account_policy_block_stays_distinct(self):
+        """``provider_policy_blocked`` (OpenRouter account-level data
+        policy) must remain a separate classification from
+        ``content_policy_blocked`` (upstream model safety filter) — they
+        have different recovery strategies.
+        """
+        from agent.error_classifier import classify_api_error, FailoverReason
+
+        class _Err(Exception):
+            def __init__(self, msg, status_code):
+                super().__init__(msg)
+                self.status_code = status_code
+
+        e = _Err(
+            "No endpoints available matching your guardrail restrictions "
+            "and data policy",
+            status_code=404,
+        )
+        result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
+        assert result.reason == FailoverReason.provider_policy_blocked
+        assert result.reason != FailoverReason.content_policy_blocked