mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-03 07:21:54 +00:00
* fix(agent): fallback immediately on provider content-policy blocks
Provider safety-filter refusals (e.g. OpenAI Codex 'flagged for possible
cybersecurity risk', OpenAI moderation 'violates our usage policies',
Anthropic safety-system rejections, Azure content_filter) are
deterministic decisions about a specific prompt. Retrying the same
prompt up to api_max_retries times just reproduces the same refusal and
burns paid attempts before surfacing the generic 'API failed after 3
retries — <provider message>' to Telegram / cron with no indication that
the failure came from the model provider rather than Hermes itself.
Classify these as a new FailoverReason.content_policy_blocked
(non-retryable, should_fallback=True) and route them through the
existing is_client_error path so the loop:
- skips the 3x retry backoff
- activates a configured fallback model immediately
- emits a clear provider-safety message to the user (not the generic
'Non-retryable error (HTTP None)') and surfaces actionable guidance
when no fallback is configured (rephrase, narrow context, or set
fallback_model in hermes config)
- returns a final_response that explicitly tells the user this came
from the model provider, so gateway delivery is unambiguous and
cron last_status reflects the safety block rather than a vague
'agent reported failure'
Patterns are intentionally narrow — verbatim refusal phrasings keyed to
specific provider safety pipelines, not generic words like 'policy' or
'violation' that would collide with billing / format / auth errors.
Regression guards in test_18028_content_policy_blocked.py verify
billing 402s, generic 400s, and OpenRouter account-level
provider_policy_blocked remain distinct classifications.
Salvaged from #18164 onto current main (file restructure: loop logic
moved from run_agent.py to agent/conversation_loop.py, _emit_status →
_buffer_status), broadened patterns beyond the original OpenAI Codex
cybersecurity case to cover OpenAI moderation, Anthropic safety system,
and Azure content_filter; added user-actionable guidance and a clear
final_response so cron/gateway surfaces the policy block instead of a
generic non-retryable error, and added a regression-guard test module
mirroring the is_client_error predicate.
Addresses #18028.
Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com>
* chore: add kchuang1015 to AUTHOR_MAP
---------
Co-authored-by: Kuan-Chieh Huang <kchuang1015@users.noreply.github.com>
152 lines
6.5 KiB
Python
152 lines
6.5 KiB
Python
"""Regression guard for #18028: provider content-policy / safety-filter
blocks must classify as ``content_policy_blocked``, be non-retryable, and
trigger the ``is_client_error`` abort path so the loop jumps straight to a
configured fallback or surfaces a clear policy-block message — instead of
burning ``api_max_retries`` paid attempts on a deterministic refusal and
delivering "API failed after 3 retries" to Telegram/cron with no provider
context.

Real-world symptom from the issue:
    ``API call failed after 3 retries — This content was flagged for
    possible cybersecurity risk... | provider=openai-codex model=gpt-5.5``
repeating across cron jobs and gateway sessions, with the user unable to
tell whether the gateway was broken, the model was down, or their prompt
was the problem.
"""
|
|
from __future__ import annotations
|
|
|
|
|
|
class TestContentPolicyBlockedClassification:
    """Pin the fields ``classify_api_error`` reports for a safety refusal.

    Downstream recovery (fallback activation, final_response wording) keys
    off these fields, so each one is asserted explicitly.
    """

    def test_openai_codex_cybersecurity_no_status(self):
        """Issue #18028 as reported: the exception carries no status code."""
        from agent.error_classifier import FailoverReason, classify_api_error

        refusal_text = (
            "This content was flagged for possible cybersecurity risk. "
            "If this seems wrong, try rephrasing your request. To get "
            "authorized for security work, join the Trusted Access for "
            "Cyber program."
        )
        classified = classify_api_error(
            Exception(refusal_text),
            provider="openai-codex",
            model="gpt-5.5",
        )

        # The retryable ``unknown`` bucket is what produced the 3x retry
        # burn, so the reason and the retry flag are both checked.
        assert classified.reason == FailoverReason.content_policy_blocked
        assert classified.retryable is False

        # A fallback model is the expected recovery; compression and
        # credential rotation must stay off.
        assert classified.should_fallback is True
        assert classified.should_compress is False
        assert classified.should_rotate_credential is False
|
|
|
|
|
|
class TestContentPolicyTriggersClientErrorAbort:
    """Check that a content-policy block takes the client-error abort path.

    The ``is_client_error`` predicate from ``agent/conversation_loop.py``
    is mirrored here, and ``FailoverReason.content_policy_blocked`` must
    make it evaluate to True so the loop aborts (after attempting
    fallback) rather than entering the retry-backoff path.
    """

    def _mirror_is_client_error(
        self,
        *,
        classified_retryable: bool,
        classified_reason,
        classified_should_compress: bool = False,
        is_local_validation_error: bool = False,
        is_context_length_error: bool = False,
    ) -> bool:
        """Re-state conversation_loop.py's is_client_error check.

        This copy has to move in lock-step with the source: a change to
        either one must be applied to the other.
        """
        from agent.error_classifier import FailoverReason

        if is_local_validation_error:
            aborts = True
        elif classified_retryable or classified_should_compress:
            aborts = False
        else:
            # Reasons listed here are excluded from the client-error path
            # even when the classification is non-retryable.
            exempt_reasons = {
                FailoverReason.rate_limit,
                FailoverReason.overloaded,
                FailoverReason.context_overflow,
                FailoverReason.payload_too_large,
                FailoverReason.long_context_tier,
                FailoverReason.thinking_signature,
            }
            aborts = classified_reason not in exempt_reasons

        # A context-length error always vetoes the client-error verdict.
        return aborts and not is_context_length_error

    def test_content_policy_blocked_triggers_abort(self):
        """A safety-filter block has to reach is_client_error → fallback/abort."""
        from agent.error_classifier import FailoverReason

        # Inputs match what classify_api_error yields for a content-policy
        # block: reason=content_policy_blocked, retryable=False,
        # should_compress=False (the default here).
        takes_abort_path = self._mirror_is_client_error(
            classified_retryable=False,
            classified_reason=FailoverReason.content_policy_blocked,
        )

        assert takes_abort_path, (
            "FailoverReason.content_policy_blocked must trigger the "
            "is_client_error path so fallback fires immediately instead of "
            "burning api_max_retries paid attempts on a deterministic "
            "safety refusal — see #18028."
        )
|
|
|
|
|
|
class TestContentPolicyPatternsAreNarrow:
    """Defensive guard: the safety-filter patterns must not collide with
    benign error wording from billing / format / generic 400 errors. If
    these regress to ``content_policy_blocked``, recovery will route to
    the wrong code path (fallback model instead of credential rotation).
    """

    class _Err(Exception):
        """Minimal exception stub carrying a ``status_code`` attribute.

        Shared by every test in this class instead of being redefined in
        each method. The class name is deliberately kept as ``_Err`` so the
        type name observed by the classifier is unchanged.
        """

        def __init__(self, msg, status_code):
            super().__init__(msg)
            self.status_code = status_code

    def test_generic_400_format_error_not_misclassified(self):
        """A plain 400 request-format error is not a content-policy block."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e = self._Err(
            "Invalid request: messages must be a non-empty list",
            status_code=400,
        )
        result = classify_api_error(e, provider="openai", model="gpt-4o")
        assert result.reason != FailoverReason.content_policy_blocked

    def test_billing_402_not_misclassified(self):
        """A 402 insufficient-credits error still classifies as billing."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e = self._Err("Insufficient credits. Top up your balance.", status_code=402)
        result = classify_api_error(
            e, provider="openrouter", model="anthropic/claude-opus"
        )
        assert result.reason == FailoverReason.billing

    def test_openrouter_account_policy_block_stays_distinct(self):
        """``provider_policy_blocked`` (OpenRouter account-level data
        policy) must remain a separate classification from
        ``content_policy_blocked`` (upstream model safety filter) — they
        have different recovery strategies.
        """
        from agent.error_classifier import classify_api_error, FailoverReason

        e = self._Err(
            "No endpoints available matching your guardrail restrictions "
            "and data policy",
            status_code=404,
        )
        result = classify_api_error(
            e, provider="openrouter", model="anthropic/claude-opus"
        )
        assert result.reason == FailoverReason.provider_policy_blocked
        assert result.reason != FailoverReason.content_policy_blocked
|