# hermes-agent/tests/run_agent/test_18028_content_policy_blocked.py

"""Regression guard for #18028: provider content-policy / safety-filter
blocks must classify as ``content_policy_blocked``, be non-retryable, and
trigger the ``is_client_error`` abort path so the loop jumps straight to a
configured fallback or surfaces a clear policy-block message — instead of
burning ``api_max_retries`` paid attempts on a deterministic refusal and
delivering "API call failed after 3 retries" to Telegram/cron with no
provider context.

Real-world symptom from the issue:
    ``API call failed after 3 retries — This content was flagged for
    possible cybersecurity risk... | provider=openai-codex model=gpt-5.5``
repeating across cron jobs and gateway sessions, with the user unable to
tell whether the gateway was broken, the model was down, or their prompt
was the problem.
"""
from __future__ import annotations


class TestContentPolicyBlockedClassification:
    """Pin the classification produced for a provider safety-filter block.

    Downstream recovery (fallback activation, ``final_response`` wording)
    depends on the shape of the result, so each field is asserted
    explicitly.
    """

    def test_openai_codex_cybersecurity_no_status(self):
        """Reproduce #18028: a plain exception that carries no status code."""
        from agent.error_classifier import FailoverReason, classify_api_error

        refusal_text = (
            "This content was flagged for possible cybersecurity risk. "
            "If this seems wrong, try rephrasing your request. To get "
            "authorized for security work, join the Trusted Access for "
            "Cyber program."
        )
        classified = classify_api_error(
            Exception(refusal_text),
            provider="openai-codex",
            model="gpt-5.5",
        )

        # Landing in the retryable ``unknown`` bucket is what caused the
        # 3x retry burn, so both the reason and retryability are pinned.
        assert classified.reason == FailoverReason.content_policy_blocked
        assert classified.retryable is False

        # The expected recovery is a fallback model; credential rotation
        # and context compression must stay off.
        assert classified.should_fallback is True
        assert classified.should_compress is False
        assert classified.should_rotate_credential is False


class TestContentPolicyTriggersClientErrorAbort:
    """Check that a content-policy block satisfies ``is_client_error``.

    The predicate from ``agent/conversation_loop.py`` is mirrored locally.
    ``FailoverReason.content_policy_blocked`` must make it evaluate to True
    so the loop aborts (after attempting fallback) rather than entering the
    retry-backoff path.
    """

    def _mirror_is_client_error(
        self,
        *,
        classified_retryable: bool,
        classified_reason,
        classified_should_compress: bool = False,
        is_local_validation_error: bool = False,
        is_context_length_error: bool = False,
    ) -> bool:
        """Local copy of conversation_loop.py's ``is_client_error`` check.

        This must stay in lock-step with the source: any change to one
        has to be applied to the other. For that reason the boolean
        expression below is intentionally left in its original layout so
        the two can be compared side by side.

        All arguments are keyword-only. Returns True when the inputs
        describe a client error, i.e. a local validation error or a
        non-retryable, non-compressible failure whose reason is not in the
        exempt set, and the error is not a context-length error.
        """
        from agent.error_classifier import FailoverReason

        return (
            is_local_validation_error
            or (
                not classified_retryable
                and not classified_should_compress
                # Reasons listed here are excluded from the client-error
                # verdict even when the failure is non-retryable.
                and classified_reason not in {
                    FailoverReason.rate_limit,
                    FailoverReason.overloaded,
                    FailoverReason.context_overflow,
                    FailoverReason.payload_too_large,
                    FailoverReason.long_context_tier,
                    FailoverReason.thinking_signature,
                }
            )
        ) and not is_context_length_error

    def test_content_policy_blocked_triggers_abort(self):
        """A safety-filter block has to satisfy the mirrored predicate."""
        from agent.error_classifier import FailoverReason

        # Inputs match what classify_api_error reports for a content-policy
        # block: reason=content_policy_blocked, retryable=False, and
        # should_compress=False (the helper's default).
        takes_abort_path = self._mirror_is_client_error(
            classified_retryable=False,
            classified_reason=FailoverReason.content_policy_blocked,
        )

        assert takes_abort_path, (
            "FailoverReason.content_policy_blocked must trigger the "
            "is_client_error path so fallback fires immediately instead of "
            "burning api_max_retries paid attempts on a deterministic "
            "safety refusal — see #18028."
        )


class TestContentPolicyPatternsAreNarrow:
    """Defensive guard: the safety-filter patterns must not collide with
    benign error wording from billing / format / generic 400 errors. If
    these regress to ``content_policy_blocked``, recovery will route to
    the wrong code path (fallback model instead of credential rotation).
    """

    class _Err(Exception):
        """Test double: an exception carrying a ``status_code`` attribute.

        Defined once on the test class instead of being re-declared inside
        every test method. The class name is kept as ``_Err`` so that
        ``type(e).__name__`` is the same as before the consolidation.
        """

        def __init__(self, msg, status_code):
            super().__init__(msg)
            self.status_code = status_code

    def test_generic_400_format_error_not_misclassified(self):
        """A plain 400 request-format error is not a content-policy block."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e = self._Err(
            "Invalid request: messages must be a non-empty list",
            status_code=400,
        )
        result = classify_api_error(e, provider="openai", model="gpt-4o")
        assert result.reason != FailoverReason.content_policy_blocked

    def test_billing_402_not_misclassified(self):
        """A 402 insufficient-credits error must classify as ``billing``."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e = self._Err("Insufficient credits. Top up your balance.", status_code=402)
        result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
        assert result.reason == FailoverReason.billing

    def test_openrouter_account_policy_block_stays_distinct(self):
        """``provider_policy_blocked`` (OpenRouter account-level data
        policy) must remain a separate classification from
        ``content_policy_blocked`` (upstream model safety filter) — they
        have different recovery strategies.
        """
        from agent.error_classifier import classify_api_error, FailoverReason

        e = self._Err(
            "No endpoints available matching your guardrail restrictions "
            "and data policy",
            status_code=404,
        )
        result = classify_api_error(e, provider="openrouter", model="anthropic/claude-opus")
        assert result.reason == FailoverReason.provider_policy_blocked
        # Implied by the equality above; kept so the intent of this guard
        # (the two policy reasons never merge) is stated explicitly.
        assert result.reason != FailoverReason.content_policy_blocked