# hermes-agent/tests/run_agent/test_background_review_cache_parity.py

"""Tests that the background review fork inherits the parent's cached system prompt.

Regression coverage for issue #25322 (and PR #17276's first root cause): the
background review's outbound HTTP request must carry the same system bytes as
the parent's so Anthropic/OpenRouter's exact-prefix cache key matches.

Without this, every review rebuilds the system prompt from scratch — fresh
``_hermes_now()`` timestamp, fresh ``session_id``, and a different skills
prompt under the (former) narrow toolset — and the prefix-cache miss costs
roughly the full uncached system-prompt cost per nudge (~26% end-to-end on
Sonnet 4.5 per the contributor's measurement).
"""

from unittest.mock import patch


def _make_agent_stub(agent_cls):
    """Create a minimal AIAgent-like object with just enough state for _spawn_background_review."""
    agent = object.__new__(agent_cls)
    agent.model = "test-model"
    agent.platform = "test"
    agent.provider = "openai"
    agent.session_id = "sess-123"
    agent.quiet_mode = True
    agent._memory_store = None
    agent._memory_enabled = True
    agent._user_profile_enabled = False
    agent._memory_nudge_interval = 5
    agent._skill_nudge_interval = 5
    agent.background_review_callback = None
    agent.status_callback = None
    agent._cached_system_prompt = (
        "PARENT-SYSTEM-PROMPT-BYTES — must be inherited verbatim "
        "for prefix-cache parity"
    )
    import datetime as _dt
    agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
    agent._MEMORY_REVIEW_PROMPT = "review memory"
    agent._SKILL_REVIEW_PROMPT = "review skills"
    agent._COMBINED_REVIEW_PROMPT = "review both"
    # Parent's toolset configuration — must be propagated to the review
    # fork so ``tools[]`` matches byte-for-byte. Without these set on the
    # stub, ``getattr(agent, ..., None)`` would return None on both sides
    # and the test wouldn't catch a regression where the fork is built
    # without the kwargs at all.
    agent.enabled_toolsets = ["memory", "skills", "terminal"]
    agent.disabled_toolsets = ["spotify", "feishu_doc"]
    return agent


class _SyncThread:
    """Drop-in replacement for threading.Thread that runs the target inline."""

    def __init__(self, *, target=None, daemon=None, name=None):
        self._target = target

    def start(self):
        if self._target:
            self._target()


class _ReviewAgentRecorder:
    """Stand-in for the review-fork AIAgent that records the prompt assignment."""

    def __init__(self, *args, **kwargs):
        self._cached_system_prompt = None
        self._memory_write_origin = None
        self._memory_write_context = None
        self._memory_store = None
        self._memory_enabled = None
        self._user_profile_enabled = None
        self._memory_nudge_interval = None
        self._skill_nudge_interval = None
        self.suppress_status_output = None

    def run_conversation(self, *args, **kwargs):
        raise RuntimeError("stop after recording state — don't actually call the API")

    def shutdown_memory_provider(self):
        pass

    def close(self):
        pass


def test_review_fork_inherits_parent_cached_system_prompt():
    """The review fork's _cached_system_prompt must equal the parent's byte-for-byte.

    Anthropic's prefix cache keys on exact bytes; any divergence (timestamp
    minute tick, fresh session_id, narrower skills_prompt) shifts the key
    and forces a full re-cache. Inheriting the parent's cached prompt is
    the cheap, mechanical fix.

    The test swaps ``run_agent.AIAgent`` for ``_ReviewAgentRecorder`` and
    ``threading.Thread`` for ``_SyncThread``, then spies on the recorder's
    ``__setattr__`` to see which prompt ``_spawn_background_review`` writes
    onto the fork.
    """
    import run_agent

    agent = _make_agent_stub(run_agent.AIAgent)
    parent_prompt = agent._cached_system_prompt

    captured = {}
    orig_setattr = _ReviewAgentRecorder.__setattr__

    def _spy_setattr(self, name, value):
        # The recorder's own __init__ seeds ``_cached_system_prompt`` with
        # None while this spy is already installed. Only a write onto an
        # attribute that already exists on the instance is a post-construction
        # assignment, which is the write this test is about. Skipping the seed
        # lets the "never assigned" assertion below actually fire on a
        # regression instead of reporting a misleading "Got None".
        if name == "_cached_system_prompt" and name in vars(self):
            captured["written_prompt"] = value
        orig_setattr(self, name, value)

    with patch.object(run_agent, "AIAgent", _ReviewAgentRecorder), \
         patch("threading.Thread", _SyncThread), \
         patch.object(_ReviewAgentRecorder, "__setattr__", _spy_setattr):
        agent._spawn_background_review(
            messages_snapshot=[],
            review_memory=True,
            review_skills=False,
        )

    assert "written_prompt" in captured, (
        "_spawn_background_review never assigned _cached_system_prompt on the review agent"
    )
    assert captured["written_prompt"] == parent_prompt, (
        f"Review fork's _cached_system_prompt diverged from parent's. "
        f"Got {captured['written_prompt']!r}, expected {parent_prompt!r}. "
        "This breaks Anthropic/OpenRouter prefix-cache parity (#25322)."
    )


def test_review_fork_pins_session_start_and_session_id():
    """The fork must run with the parent's ``session_start`` and ``session_id``.

    This backs up the cached-system-prompt inheritance: even though the
    inherited ``_cached_system_prompt`` short-circuits the normal rebuild
    path, pinning both values to the parent's guarantees byte-identical
    output from any code path that re-renders parts of the system prompt
    (compression, plugin hooks).
    """
    import run_agent

    parent = _make_agent_stub(run_agent.AIAgent)
    captured = {}

    class _Recorder:
        """Fake fork that snapshots its session identity when the conversation starts."""

        def __init__(self, *args, **kwargs):
            for attr_name in (
                "_cached_system_prompt",
                "_memory_write_origin",
                "_memory_write_context",
                "_memory_store",
                "_memory_enabled",
                "_user_profile_enabled",
                "_memory_nudge_interval",
                "_skill_nudge_interval",
                "suppress_status_output",
                "session_start",
                "session_id",
            ):
                setattr(self, attr_name, None)

        def run_conversation(self, *args, **kwargs):
            # Snapshot whatever the spawning code put on the fork, then bail
            # out before anything else runs.
            captured.update(
                session_start=self.session_start,
                session_id=self.session_id,
            )
            raise RuntimeError("stop after recording")

        def shutdown_memory_provider(self):
            """No-op."""

        def close(self):
            """No-op."""

    with patch.object(run_agent, "AIAgent", _Recorder):
        with patch("threading.Thread", _SyncThread):
            parent._spawn_background_review(
                messages_snapshot=[],
                review_memory=True,
                review_skills=False,
            )

    seen_start = captured.get("session_start")
    seen_id = captured.get("session_id")

    assert seen_start == parent.session_start, (
        "Review fork did not inherit parent's session_start — "
        "system-prompt rebuild paths would diverge."
    )
    assert seen_id == parent.session_id, (
        "Review fork did not inherit parent's session_id — "
        "system-prompt rebuild paths would diverge."
    )


def test_review_fork_inherits_parent_toolset_config():
    """The fork must be constructed with the parent's ``enabled_toolsets`` /
    ``disabled_toolsets`` so the outbound request body's ``tools[]`` field
    matches byte-for-byte.

    When the kwargs are missing, ``enabled_toolsets=None`` defaults to "all
    registered tools" and the fork sends every tool descriptor (e.g. Spotify,
    Feishu, video) even when the parent disabled them via
    ``hermes tools disable``. Anthropic's prompt cache keys on the byte-exact
    ``tools[]`` array, so divergence here forks the cache lineage and forces
    a full prefix rewrite per nudge (~100-200 K cache-write tokens for long
    conversations).

    Same byte-stability invariant as
    ``test_review_fork_inherits_parent_cached_system_prompt``, applied to the
    ``tools[]`` slot of the request body rather than the ``system`` slot.
    """
    import run_agent

    parent = _make_agent_stub(run_agent.AIAgent)
    captured = {}

    class _Recorder:
        """Fake fork that records the toolset kwargs it was constructed with."""

        def __init__(self, *args, **kwargs):
            for toolset_key in ("enabled_toolsets", "disabled_toolsets"):
                captured[toolset_key] = kwargs.get(toolset_key)
            # Attributes the surrounding code touches after construction.
            for attr_name in (
                "_cached_system_prompt",
                "_memory_write_origin",
                "_memory_write_context",
                "_memory_store",
                "_memory_enabled",
                "_user_profile_enabled",
                "_memory_nudge_interval",
                "_skill_nudge_interval",
                "suppress_status_output",
                "session_start",
                "session_id",
            ):
                setattr(self, attr_name, None)

        def run_conversation(self, *args, **kwargs):
            raise RuntimeError("stop after recording — don't actually call the API")

        def shutdown_memory_provider(self):
            """No-op."""

        def close(self):
            """No-op."""

    with patch.object(run_agent, "AIAgent", _Recorder):
        with patch("threading.Thread", _SyncThread):
            parent._spawn_background_review(
                messages_snapshot=[],
                review_memory=True,
                review_skills=False,
            )

    assert captured.get("enabled_toolsets") == parent.enabled_toolsets, (
        f"Review fork did not receive parent's enabled_toolsets. "
        f"Got {captured.get('enabled_toolsets')!r}, expected {parent.enabled_toolsets!r}. "
        "This causes ``tools[]`` to diverge between main turns and review nudges, "
        "breaking Anthropic prompt-cache parity."
    )
    assert captured.get("disabled_toolsets") == parent.disabled_toolsets, (
        f"Review fork did not receive parent's disabled_toolsets. "
        f"Got {captured.get('disabled_toolsets')!r}, expected {parent.disabled_toolsets!r}. "
        "This causes ``tools[]`` to diverge between main turns and review nudges, "
        "breaking Anthropic prompt-cache parity."
    )