mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(agent): fallback immediately on provider content-policy blocks (#33883)
* fix(agent): fallback immediately on provider content-policy blocks
Provider safety-filter refusals (e.g. OpenAI Codex 'flagged for possible
cybersecurity risk', OpenAI moderation 'violates our usage policies',
Anthropic safety-system rejections, Azure content_filter) are
deterministic decisions about a specific prompt. Retrying the same
prompt up to api_max_retries times just reproduces the same refusal and
burns paid attempts before surfacing the generic 'API failed after 3
retries — <provider message>' to Telegram / cron with no indication that
the failure came from the model provider rather than Hermes itself.
Classify these as a new FailoverReason.content_policy_blocked
(non-retryable, should_fallback=True) and route them through the
existing is_client_error path so the loop:
- skips the 3x retry backoff
- activates a configured fallback model immediately
- emits a clear provider-safety message to the user (not the generic
'Non-retryable error (HTTP None)') and surfaces actionable guidance
when no fallback is configured (rephrase, narrow the context, or set
fallback_model in hermes config — the emitted guidance points to
`hermes fallback add`)
- returns a final_response that explicitly tells the user this came
from the model provider, so gateway delivery is unambiguous and
cron last_status reflects the safety block rather than a vague
'agent reported failure'
Patterns are intentionally narrow — verbatim refusal phrasings keyed to
specific provider safety pipelines, not generic words like 'policy' or
'violation' that would collide with billing / format / auth errors.
Regression guards in test_18028_content_policy_blocked.py verify that
billing 402s, generic 400s, and OpenRouter account-level
provider_policy_blocked errors remain distinct classifications.
Salvaged from #18164 onto current main, accounting for the file
restructure (loop logic moved from run_agent.py to
agent/conversation_loop.py; _emit_status → _buffer_status). Compared
with the original, this version broadens the patterns beyond the OpenAI
Codex cybersecurity case to cover OpenAI moderation, the Anthropic
safety system, and Azure content_filter; adds user-actionable guidance
and a clear final_response so cron/gateway surfaces the policy block
instead of a generic non-retryable error; and adds a regression-guard
test module mirroring the is_client_error predicate.
Addresses #18028.
Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com>
* chore: add kchuang1015 to AUTHOR_MAP
---------
Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com>
This commit is contained in:
parent
a82c88bac0
commit
0554ef1aa3
5 changed files with 334 additions and 6 deletions
|
|
@ -3080,7 +3080,10 @@ def run_conversation(
|
|||
if is_client_error:
|
||||
# Try fallback before aborting — a different provider
|
||||
# may not have the same issue (rate limit, auth, etc.)
|
||||
agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
|
||||
if classified.reason == FailoverReason.content_policy_blocked:
|
||||
agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
|
||||
else:
|
||||
agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
|
||||
if agent._try_activate_fallback():
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
|
|
@ -3093,10 +3096,16 @@ def run_conversation(
|
|||
# Terminal — flush buffered context so the user sees
|
||||
# what was tried before the abort.
|
||||
agent._flush_status_buffer()
|
||||
agent._emit_status(
|
||||
f"❌ Non-retryable error (HTTP {status_code}): "
|
||||
f"{agent._summarize_api_error(api_error)}"
|
||||
)
|
||||
if classified.reason == FailoverReason.content_policy_blocked:
|
||||
agent._emit_status(
|
||||
f"❌ Provider safety filter blocked this request: "
|
||||
f"{agent._summarize_api_error(api_error)}"
|
||||
)
|
||||
else:
|
||||
agent._emit_status(
|
||||
f"❌ Non-retryable error (HTTP {status_code}): "
|
||||
f"{agent._summarize_api_error(api_error)}"
|
||||
)
|
||||
agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
|
||||
agent._vprint(f"{agent.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True)
|
||||
agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True)
|
||||
|
|
@ -3143,6 +3152,28 @@ def run_conversation(
|
|||
agent._vprint(f"{agent.log_prefix} • Check credits: https://openrouter.ai/settings/credits", force=True)
|
||||
else:
|
||||
agent._vprint(f"{agent.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True)
|
||||
# Content-policy blocks deserve their own actionable
|
||||
# guidance — neither "fix your API key" nor "retry won't
|
||||
# help" tells the user what to actually do. The provider
|
||||
# has refused this specific prompt, so the recovery is
|
||||
# either a rephrase or routing to a different model.
|
||||
if classified.reason == FailoverReason.content_policy_blocked:
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix} 💡 The provider's safety filter rejected this specific prompt.",
|
||||
force=True,
|
||||
)
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix} • Try rephrasing the request, narrowing the context, or splitting into smaller steps.",
|
||||
force=True,
|
||||
)
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix} • Configure a fallback provider so future blocks route automatically:",
|
||||
force=True,
|
||||
)
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix} hermes fallback add (interactive picker — same as `hermes model`)",
|
||||
force=True,
|
||||
)
|
||||
logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
|
||||
# Skip session persistence when the error is likely
|
||||
# context-overflow related (status 400 + large session).
|
||||
|
|
@ -3157,6 +3188,23 @@ def run_conversation(
|
|||
)
|
||||
else:
|
||||
agent._persist_session(messages, conversation_history)
|
||||
if classified.reason == FailoverReason.content_policy_blocked:
|
||||
_summary = agent._summarize_api_error(api_error)
|
||||
_policy_response = (
|
||||
f"⚠️ The model provider's safety filter blocked this request "
|
||||
f"(not a Hermes/gateway failure).\n\n"
|
||||
f"Provider message: {_summary}\n\n"
|
||||
f"Try rephrasing the request, narrowing the context, or "
|
||||
f"adding a fallback provider with `hermes fallback add`."
|
||||
)
|
||||
return {
|
||||
"final_response": _policy_response,
|
||||
"messages": messages,
|
||||
"api_calls": api_call_count,
|
||||
"completed": False,
|
||||
"failed": True,
|
||||
"error": f"content_policy_blocked: {_summary}",
|
||||
}
|
||||
return {
|
||||
"final_response": None,
|
||||
"messages": messages,
|
||||
|
|
|
|||
|
|
@ -44,9 +44,10 @@ class FailoverReason(enum.Enum):
|
|||
payload_too_large = "payload_too_large" # 413 — compress payload
|
||||
image_too_large = "image_too_large" # Native image part exceeds provider's per-image limit — shrink and retry
|
||||
|
||||
# Model
|
||||
# Model / provider policy
|
||||
model_not_found = "model_not_found" # 404 or invalid model — fallback to different model
|
||||
provider_policy_blocked = "provider_policy_blocked" # Aggregator (e.g. OpenRouter) blocked the only endpoint due to account data/privacy policy
|
||||
content_policy_blocked = "content_policy_blocked" # Provider safety filter rejected this prompt — deterministic per-request, don't retry unchanged
|
||||
|
||||
# Request format
|
||||
format_error = "format_error" # 400 bad request — abort or strip + retry
|
||||
|
|
@ -289,6 +290,45 @@ _PROVIDER_POLICY_BLOCKED_PATTERNS = [
|
|||
"no endpoints found matching your data policy",
|
||||
]
|
||||
|
||||
# Provider content-policy / safety-filter blocks. Distinct from
|
||||
# ``provider_policy_blocked`` above (which is an OpenRouter *account*-level
|
||||
# data/privacy guardrail) — these are *per-prompt* safety decisions made by
|
||||
# the upstream model provider. They are deterministic for the unchanged
|
||||
# request, so retrying the same prompt three times just reproduces the same
|
||||
# block and burns paid attempts on a refusal. The recovery is to switch to a
|
||||
# configured fallback model/provider immediately, or surface the block to
|
||||
# the user with actionable guidance if no fallback exists.
|
||||
#
|
||||
# Patterns are intentionally narrow — each phrase is a verbatim string from
|
||||
# a specific provider's safety pipeline, not a generic word like "policy" or
|
||||
# "violation" that could collide with billing/auth/format errors:
|
||||
# • OpenAI Codex cybersecurity refusal (gpt-5.5, the case from #18028)
|
||||
# • OpenAI moderation refusal ("violates our usage policies", with
|
||||
# "usage policies" disambiguating from billing's "exceeded ... policy")
|
||||
# • Anthropic safety refusal ("prompt was flagged by ... safety system")
|
||||
# • Azure OpenAI / OpenAI Responses content filter (``content_filter``, ``ResponsibleAIPolicyViolation``)
|
||||
_CONTENT_POLICY_BLOCKED_PATTERNS = [
|
||||
# OpenAI Codex (#18028) — message may arrive without an HTTP status
|
||||
"flagged for possible cybersecurity risk",
|
||||
"trusted access for cyber",
|
||||
# OpenAI moderation — chat completions / responses
|
||||
"violates our usage policies",
|
||||
"violates openai's usage policies",
|
||||
"your request was flagged by",
|
||||
# Anthropic safety system
|
||||
"prompt was flagged by our safety",
|
||||
"responses cannot be generated due to safety",
|
||||
# Generic content-filter wording seen on Azure / OpenAI Responses.
|
||||
# ``content_filter`` (underscore) is the OpenAI-standard error/finish
|
||||
# token surfaced verbatim by their SDKs when a request is blocked.
|
||||
# ``responsibleaipolicyviolation`` is Azure OpenAI's error code.
|
||||
# Deliberately NOT matching the space variant ("content filter") — it
|
||||
# appears in benign config descriptions and tooltip text that providers
|
||||
# echo back; the underscore form is provider-specific enough.
|
||||
"content_filter",
|
||||
"responsibleaipolicyviolation",
|
||||
]
|
||||
|
||||
# Auth patterns (non-status-code signals)
|
||||
_AUTH_PATTERNS = [
|
||||
"invalid api key",
|
||||
|
|
@ -492,6 +532,20 @@ def classify_api_error(
|
|||
|
||||
# ── 1. Provider-specific patterns (highest priority) ────────────
|
||||
|
||||
# Provider content-policy / safety-filter block. The provider has made a
|
||||
# deterministic refusal decision about THIS prompt — retrying unchanged
|
||||
# just reproduces the same refusal and burns paid attempts. Must run
|
||||
# before status-based classification so a 400 safety block isn't
|
||||
# downgraded to a generic ``format_error`` and a status-less block
|
||||
# (OpenAI Codex SDK can raise without one) isn't left in the retryable
|
||||
# ``unknown`` bucket. See issue #18028.
|
||||
if any(p in error_msg for p in _CONTENT_POLICY_BLOCKED_PATTERNS):
|
||||
return _result(
|
||||
FailoverReason.content_policy_blocked,
|
||||
retryable=False,
|
||||
should_fallback=True,
|
||||
)
|
||||
|
||||
# Anthropic thinking block signature invalid (400).
|
||||
# Don't gate on provider — OpenRouter proxies Anthropic errors, so the
|
||||
# provider may be "openrouter" even though the error is Anthropic-specific.
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
|
|||
# Auto-extracted from noreply emails + manual overrides
|
||||
AUTHOR_MAP = {
|
||||
"9592417+adam91holt@users.noreply.github.com": "adam91holt",
|
||||
"kchuang1015@users.noreply.github.com": "kchuang1015",
|
||||
"45688690+fujinice@users.noreply.github.com": "fujinice",
|
||||
"276689385+carltonawong@users.noreply.github.com": "carltonawong",
|
||||
"195255660+EvilHumphrey@users.noreply.github.com": "EvilHumphrey",
|
||||
|
|
|
|||
|
|
@ -59,6 +59,7 @@ class TestFailoverReason:
|
|||
"invalid_encrypted_content",
|
||||
"multimodal_tool_content_unsupported",
|
||||
"provider_policy_blocked",
|
||||
"content_policy_blocked",
|
||||
"thinking_signature", "long_context_tier",
|
||||
"oauth_long_context_beta_forbidden",
|
||||
"llama_cpp_grammar_pattern",
|
||||
|
|
@ -466,6 +467,78 @@ class TestClassifyApiError:
|
|||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.provider_policy_blocked
|
||||
|
||||
# ── Provider content-policy block (per-prompt safety filter) ──
|
||||
#
|
||||
# Distinct from ``provider_policy_blocked`` above — these are upstream
|
||||
# model-provider safety refusals for THIS prompt, not OpenRouter
|
||||
# account-level data policy. Recovery is fallback model, not config fix.
|
||||
# See issue #18028 — OpenAI Codex was burning 3 retries on identical
|
||||
# refusals before users saw "API failed after 3 retries" on Telegram.
|
||||
|
||||
def test_message_only_cyber_content_policy_blocked(self):
|
||||
# OpenAI Codex returns this without an HTTP status. Retrying the
|
||||
# same prompt three times only repeats the same policy decision, so
|
||||
# the classifier must jump straight to fallback / abort instead of
|
||||
# leaving it in the retryable ``unknown`` bucket.
|
||||
e = Exception(
|
||||
"This content was flagged for possible cybersecurity risk. If this "
|
||||
"seems wrong, try rephrasing your request. To get authorized for "
|
||||
"security work, join the Trusted Access for Cyber program."
|
||||
)
|
||||
result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
|
||||
assert result.reason == FailoverReason.content_policy_blocked
|
||||
assert result.retryable is False
|
||||
assert result.should_fallback is True
|
||||
assert result.should_compress is False
|
||||
|
||||
def test_400_cyber_content_policy_blocked(self):
|
||||
# When the SDK does attach a status (e.g. 400), the safety pattern
|
||||
# must still beat the format_error fallthrough.
|
||||
e = MockAPIError(
|
||||
"This content was flagged for possible cybersecurity risk",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
|
||||
assert result.reason == FailoverReason.content_policy_blocked
|
||||
assert result.retryable is False
|
||||
assert result.should_fallback is True
|
||||
|
||||
def test_openai_usage_policy_violation_content_policy_blocked(self):
|
||||
# OpenAI moderation refusal wording from chat completions / responses.
|
||||
e = MockAPIError(
|
||||
"Your request was flagged by the moderation system as potentially "
|
||||
"violating OpenAI's usage policies.",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="openai", model="gpt-4o")
|
||||
assert result.reason == FailoverReason.content_policy_blocked
|
||||
assert result.retryable is False
|
||||
assert result.should_fallback is True
|
||||
|
||||
def test_anthropic_safety_system_content_policy_blocked(self):
|
||||
# Anthropic safety refusal — distinct phrasing from OpenAI.
|
||||
e = Exception(
|
||||
"Your prompt was flagged by our safety system. Please rephrase "
|
||||
"and try again."
|
||||
)
|
||||
result = classify_api_error(e, provider="anthropic", model="claude-3-5-sonnet")
|
||||
assert result.reason == FailoverReason.content_policy_blocked
|
||||
assert result.retryable is False
|
||||
assert result.should_fallback is True
|
||||
|
||||
def test_azure_content_filter_content_policy_blocked(self):
|
||||
# Azure OpenAI returns ``content_filter`` finish reason / error code
|
||||
# and ``ResponsibleAIPolicyViolation`` in error bodies — both narrow
|
||||
# tokens, not the generic English phrase.
|
||||
e = MockAPIError(
|
||||
"The response was filtered: ResponsibleAIPolicyViolation "
|
||||
"(finish_reason=content_filter).",
|
||||
status_code=400,
|
||||
)
|
||||
result = classify_api_error(e, provider="azure", model="gpt-4o")
|
||||
assert result.reason == FailoverReason.content_policy_blocked
|
||||
assert result.retryable is False
|
||||
|
||||
def test_404_model_not_found_still_works(self):
|
||||
# Regression guard: the new policy-block check must not swallow
|
||||
# genuine model_not_found 404s.
|
||||
|
|
|
|||
152
tests/run_agent/test_18028_content_policy_blocked.py
Normal file
152
tests/run_agent/test_18028_content_policy_blocked.py
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
"""Regression guard for #18028: provider content-policy / safety-filter
|
||||
blocks must classify as ``content_policy_blocked``, be non-retryable, and
|
||||
trigger the ``is_client_error`` abort path so the loop jumps straight to a
|
||||
configured fallback or surfaces a clear policy-block message — instead of
|
||||
burning ``api_max_retries`` paid attempts on a deterministic refusal and
|
||||
delivering "API failed after 3 retries" to Telegram/cron with no provider
|
||||
context.
|
||||
|
||||
Real-world symptom from the issue:
|
||||
``API call failed after 3 retries — This content was flagged for
|
||||
possible cybersecurity risk... | provider=openai-codex model=gpt-5.5``
|
||||
repeating across cron jobs and gateway sessions, with the user unable to
|
||||
tell whether the gateway was broken, the model was down, or their prompt
|
||||
was the problem.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
class TestContentPolicyBlockedClassification:
|
||||
"""Verify classify_api_error returns the right shape so downstream
|
||||
recovery (fallback activation, final_response wording) fires correctly.
|
||||
"""
|
||||
|
||||
def test_openai_codex_cybersecurity_no_status(self):
|
||||
"""The reported #18028 case — SDK raises without a status code."""
|
||||
from agent.error_classifier import classify_api_error, FailoverReason
|
||||
|
||||
e = Exception(
|
||||
"This content was flagged for possible cybersecurity risk. "
|
||||
"If this seems wrong, try rephrasing your request. To get "
|
||||
"authorized for security work, join the Trusted Access for "
|
||||
"Cyber program."
|
||||
)
|
||||
result = classify_api_error(e, provider="openai-codex", model="gpt-5.5")
|
||||
# Must NOT fall into the retryable ``unknown`` bucket — that's what
|
||||
# caused the 3x retry burn.
|
||||
assert result.reason == FailoverReason.content_policy_blocked
|
||||
assert result.retryable is False
|
||||
# Recovery is fallback model, not credential rotation or compression.
|
||||
assert result.should_fallback is True
|
||||
assert result.should_compress is False
|
||||
assert result.should_rotate_credential is False
|
||||
|
||||
|
||||
class TestContentPolicyTriggersClientErrorAbort:
|
||||
"""Mirror the ``is_client_error`` predicate in
|
||||
``agent/conversation_loop.py`` and verify
|
||||
``FailoverReason.content_policy_blocked`` resolves to True so the loop
|
||||
aborts (after attempting fallback) instead of falling into the
|
||||
retry-backoff path.
|
||||
"""
|
||||
|
||||
def _mirror_is_client_error(
|
||||
self,
|
||||
*,
|
||||
classified_retryable: bool,
|
||||
classified_reason,
|
||||
classified_should_compress: bool = False,
|
||||
is_local_validation_error: bool = False,
|
||||
is_context_length_error: bool = False,
|
||||
) -> bool:
|
||||
"""Exact shape of conversation_loop.py's is_client_error check.
|
||||
|
||||
Kept in lock-step with the source. If you change one, change both.
|
||||
"""
|
||||
from agent.error_classifier import FailoverReason
|
||||
|
||||
return (
|
||||
is_local_validation_error
|
||||
or (
|
||||
not classified_retryable
|
||||
and not classified_should_compress
|
||||
and classified_reason not in {
|
||||
FailoverReason.rate_limit,
|
||||
FailoverReason.overloaded,
|
||||
FailoverReason.context_overflow,
|
||||
FailoverReason.payload_too_large,
|
||||
FailoverReason.long_context_tier,
|
||||
FailoverReason.thinking_signature,
|
||||
}
|
||||
)
|
||||
) and not is_context_length_error
|
||||
|
||||
def test_content_policy_blocked_triggers_abort(self):
|
||||
"""Safety-filter block must reach is_client_error → fallback/abort."""
|
||||
from agent.error_classifier import FailoverReason
|
||||
|
||||
# What classify_api_error returns for a content-policy block:
|
||||
# reason=content_policy_blocked, retryable=False, should_compress=False
|
||||
assert self._mirror_is_client_error(
|
||||
classified_retryable=False,
|
||||
classified_reason=FailoverReason.content_policy_blocked,
|
||||
), (
|
||||
"FailoverReason.content_policy_blocked must trigger the "
|
||||
"is_client_error path so fallback fires immediately instead of "
|
||||
"burning api_max_retries paid attempts on a deterministic "
|
||||
"safety refusal — see #18028."
|
||||
)
|
||||
|
||||
|
||||
class TestContentPolicyPatternsAreNarrow:
|
||||
"""Defensive guard: the safety-filter patterns must not collide with
|
||||
benign error wording from billing / format / generic 400 errors. If
|
||||
these regress to ``content_policy_blocked``, recovery will route to
|
||||
the wrong code path (fallback model instead of credential rotation).
|
||||
"""
|
||||
|
||||
def test_generic_400_format_error_not_misclassified(self):
|
||||
from agent.error_classifier import classify_api_error, FailoverReason
|
||||
|
||||
class _Err(Exception):
|
||||
def __init__(self, msg, status_code):
|
||||
super().__init__(msg)
|
||||
self.status_code = status_code
|
||||
|
||||
e = _Err("Invalid request: messages must be a non-empty list", status_code=400)
|
||||
result = classify_api_error(e, provider="openai", model="gpt-4o")
|
||||
assert result.reason != FailoverReason.content_policy_blocked
|
||||
|
||||
def test_billing_402_not_misclassified(self):
|
||||
from agent.error_classifier import classify_api_error, FailoverReason
|
||||
|
||||
class _Err(Exception):
|
||||
def __init__(self, msg, status_code):
|
||||
super().__init__(msg)
|
||||
self.status_code = status_code
|
||||
|
||||
e = _Err("Insufficient credits. Top up your balance.", status_code=402)
|
||||
result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
|
||||
assert result.reason == FailoverReason.billing
|
||||
|
||||
def test_openrouter_account_policy_block_stays_distinct(self):
|
||||
"""``provider_policy_blocked`` (OpenRouter account-level data
|
||||
policy) must remain a separate classification from
|
||||
``content_policy_blocked`` (upstream model safety filter) — they
|
||||
have different recovery strategies.
|
||||
"""
|
||||
from agent.error_classifier import classify_api_error, FailoverReason
|
||||
|
||||
class _Err(Exception):
|
||||
def __init__(self, msg, status_code):
|
||||
super().__init__(msg)
|
||||
self.status_code = status_code
|
||||
|
||||
e = _Err(
|
||||
"No endpoints available matching your guardrail restrictions "
|
||||
"and data policy",
|
||||
status_code=404,
|
||||
)
|
||||
result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
|
||||
assert result.reason == FailoverReason.provider_policy_blocked
|
||||
assert result.reason != FailoverReason.content_policy_blocked
|
||||
Loading…
Add table
Add a link
Reference in a new issue