fix(agent): fallback immediately on provider content-policy blocks (#33883)

* fix(agent): fallback immediately on provider content-policy blocks Provider safety-filter refusals (e.g. OpenAI Codex 'flagged for possible cybersecurity risk', OpenAI moderation 'violates our usage policies', Anthropic safety-system rejections, Azure content_filter) are deterministic decisions about a specific prompt. Retrying the same prompt up to api_max_retries times just reproduces the same refusal and burns paid attempts before surfacing the generic 'API failed after 3 retries — <provider message>' to Telegram / cron with no indication that the failure came from the model provider rather than Hermes itself. Classify these as a new FailoverReason.content_policy_blocked (non-retryable, should_fallback=True) and route them through the existing is_client_error path so the loop: - skips the 3x retry backoff - activates a configured fallback model immediately - emits a clear provider-safety message to the user (not the generic 'Non-retryable error (HTTP None)') and surfaces actionable guidance when no fallback is configured (rephrase, narrow context, or set fallback_model in hermes config) - returns a final_response that explicitly tells the user this came from the model provider, so gateway delivery is unambiguous and cron last_status reflects the safety block rather than a vague 'agent reported failure' Patterns are intentionally narrow — verbatim refusal phrasings keyed to specific provider safety pipelines, not generic words like 'policy' or 'violation' that would collide with billing / format / auth errors. Regression guards in test_18028_content_policy_blocked.py verify billing 402s, generic 400s, and OpenRouter account-level provider_policy_blocked remain distinct classifications. Salvaged from #18164 onto current main (file restructure: loop logic moved from run_agent.py to agent/conversation_loop.py, _emit_status → _buffer_status), broadened patterns beyond the original OpenAI Codex cybersecurity case to cover OpenAI moderation, Anthropic safety system, and Azure content_filter; added user-actionable guidance and a clear final_response so cron/gateway surfaces the policy block instead of a generic non-retryable error, and added a regression-guard test module mirroring the is_client_error predicate. Addresses #18028. Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com> * chore: add kchuang1015 to AUTHOR_MAP --------- Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com>
2026-06-05 07:41:39 +00:00 · 2026-05-28 07:28:24 -07:00 · 2026-05-28 07:28:24 -07:00 · 0554ef1aa3
commit 0554ef1aa3
parent a82c88bac0
5 changed files with 334 additions and 6 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -3080,7 +3080,10 @@ def run_conversation(
                if is_client_error:
                    # Try fallback before aborting — a different provider
                    # may not have the same issue (rate limit, auth, etc.)
-                    agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
+                    else:
+                        agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
@ -3093,10 +3096,16 @@ def run_conversation(
                    # Terminal — flush buffered context so the user sees
                    # what was tried before the abort.
                    agent._flush_status_buffer()
-                    agent._emit_status(
-                        f"❌ Non-retryable error (HTTP {status_code}): "
-                        f"{agent._summarize_api_error(api_error)}"
-                    )
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._emit_status(
+                            f"❌ Provider safety filter blocked this request: "
+                            f"{agent._summarize_api_error(api_error)}"
+                        )
+                    else:
+                        agent._emit_status(
+                            f"❌ Non-retryable error (HTTP {status_code}): "
+                            f"{agent._summarize_api_error(api_error)}"
+                        )
                    agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
                    agent._vprint(f"{agent.log_prefix}   🔌 Provider: {_provider}  Model: {_model}", force=True)
                    agent._vprint(f"{agent.log_prefix}   🌐 Endpoint: {_base}", force=True)
@ -3143,6 +3152,28 @@ def run_conversation(
                                agent._vprint(f"{agent.log_prefix}      • Check credits: https://openrouter.ai/settings/credits", force=True)
                    else:
                        agent._vprint(f"{agent.log_prefix}   💡 This type of error won't be fixed by retrying.", force=True)
+                    # Content-policy blocks deserve their own actionable
+                    # guidance — neither "fix your API key" nor "retry won't
+                    # help" tells the user what to actually do. The provider
+                    # has refused this specific prompt, so the recovery is
+                    # either a rephrase or routing to a different model.
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._vprint(
+                            f"{agent.log_prefix}   💡 The provider's safety filter rejected this specific prompt.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      • Try rephrasing the request, narrowing the context, or splitting into smaller steps.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      • Configure a fallback provider so future blocks route automatically:",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}        hermes fallback add   (interactive picker — same as `hermes model`)",
+                            force=True,
+                        )
                    logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
                    # Skip session persistence when the error is likely
                    # context-overflow related (status 400 + large session).
@ -3157,6 +3188,23 @@ def run_conversation(
                        )
                    else:
                        agent._persist_session(messages, conversation_history)
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        _summary = agent._summarize_api_error(api_error)
+                        _policy_response = (
+                            f"⚠️  The model provider's safety filter blocked this request "
+                            f"(not a Hermes/gateway failure).\n\n"
+                            f"Provider message: {_summary}\n\n"
+                            f"Try rephrasing the request, narrowing the context, or "
+                            f"adding a fallback provider with `hermes fallback add`."
+                        )
+                        return {
+                            "final_response": _policy_response,
+                            "messages": messages,
+                            "api_calls": api_call_count,
+                            "completed": False,
+                            "failed": True,
+                            "error": f"content_policy_blocked: {_summary}",
+                        }
                    return {
                        "final_response": None,
                        "messages": messages,