hermes-agent/tests/gateway/test_transcript_offset.py

"""Tests for transcript history offset fix.

Regression tests for a bug where the gateway transcript lost 1 message
per turn from turn 2 onwards.  The raw transcript history includes
``session_meta`` entries that are filtered out before being passed to
the agent.  The agent returns messages built from this filtered history
plus new messages from the current turn.

The old code used ``len(history)`` (raw count, includes session_meta)
to slice ``agent_messages``, which caused the slice to skip valid new
messages.  The fix adds ``history_offset`` (the filtered history length)
to ``_run_agent``'s return dict and uses it for the slice.
"""

import pytest

from gateway.run import _preserve_queued_followup_history_offset


# ---------------------------------------------------------------------------
# Helpers - replicate the filtering logic from _run_agent
# ---------------------------------------------------------------------------

def _filter_history(history: list) -> list:
    """Replicate the agent_history filtering from GatewayRunner._run_agent.

    Strips session_meta and system messages, exactly as the real code does.
    """
    agent_history = []
    for msg in history:
        role = msg.get("role")
        if not role:
            continue
        if role in ("session_meta",):
            continue
        if role == "system":
            continue

        has_tool_calls = "tool_calls" in msg
        has_tool_call_id = "tool_call_id" in msg
        is_tool_message = role == "tool"

        if has_tool_calls or has_tool_call_id or is_tool_message:
            clean_msg = {k: v for k, v in msg.items() if k != "timestamp"}
            agent_history.append(clean_msg)
        else:
            content = msg.get("content")
            if content:
                agent_history.append({"role": role, "content": content})
    return agent_history


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------

class TestTranscriptHistoryOffset:
    """Verify the transcript extraction uses the filtered history length."""

    def test_session_meta_causes_offset_mismatch(self):
        """Turn 2: session_meta makes len(history) > len(agent_history).

        - history (raw): 1 session_meta + 2 conversation = 3 entries
        - agent_history (filtered): 2 entries
        - Agent returns 2 old + 2 new = 4 messages
        - OLD: agent_messages[3:] = 1 message (lost the user message)
        - FIX: agent_messages[2:] = 2 messages (correct)
        """
        history = [
            {"role": "session_meta", "tools": [], "model": "gpt-4",
             "platform": "telegram", "timestamp": "t0"},
            {"role": "user", "content": "Hello", "timestamp": "t1"},
            {"role": "assistant", "content": "Hi there!", "timestamp": "t1"},
        ]

        agent_history = _filter_history(history)
        assert len(agent_history) == 2  # session_meta stripped

        # Agent returns: filtered history (2) + new turn (2)
        agent_messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "What is Python?"},
            {"role": "assistant", "content": "A programming language."},
        ]

        # OLD behavior: len(history) = 3, skips too many
        old_offset = len(history)
        old_new = (agent_messages[old_offset:]
                   if len(agent_messages) > old_offset
                   else agent_messages)
        assert len(old_new) == 1  # BUG: lost the user message

        # FIXED behavior: history_offset = 2
        history_offset = len(agent_history)
        fixed_new = (agent_messages[history_offset:]
                     if len(agent_messages) > history_offset
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "What is Python?"
        assert fixed_new[1]["content"] == "A programming language."

    def test_no_session_meta_same_result(self):
        """First turn has no session_meta, so both approaches agree."""
        history = []
        agent_history = _filter_history(history)
        assert len(agent_history) == 0

        agent_messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
        ]

        old_new = (agent_messages[len(history):]
                   if len(agent_messages) > len(history)
                   else agent_messages)
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])

        assert old_new == fixed_new
        assert len(fixed_new) == 2

    def test_multiple_session_meta_larger_drift(self):
        """Two session_meta entries double the offset error.

        This can happen when the session spans tool definition changes
        or model switches that each write a new session_meta record.
        """
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "user", "content": "msg1", "timestamp": "t1"},
            {"role": "assistant", "content": "reply1", "timestamp": "t1"},
            {"role": "session_meta", "tools": ["new_tool"], "timestamp": "t2"},
            {"role": "user", "content": "msg2", "timestamp": "t3"},
            {"role": "assistant", "content": "reply2", "timestamp": "t3"},
        ]

        agent_history = _filter_history(history)
        assert len(agent_history) == 4
        assert len(history) == 6  # 2 extra session_meta entries

        # Agent returns 4 old + 2 new = 6 total
        agent_messages = [
            {"role": "user", "content": "msg1"},
            {"role": "assistant", "content": "reply1"},
            {"role": "user", "content": "msg2"},
            {"role": "assistant", "content": "reply2"},
            {"role": "user", "content": "msg3"},
            {"role": "assistant", "content": "reply3"},
        ]

        # OLD: len(history) == len(agent_messages) == 6 -> else branch
        old_offset = len(history)
        old_new = (agent_messages[old_offset:]
                   if len(agent_messages) > old_offset
                   else agent_messages)
        # BUG: treats ALL messages as new (duplicates entire history)
        assert old_new == agent_messages

        # FIXED: history_offset = 4
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "msg3"
        assert fixed_new[1]["content"] == "reply3"

    def test_system_messages_also_filtered(self):
        """system messages in history are also stripped from agent_history."""
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hi", "timestamp": "t1"},
            {"role": "assistant", "content": "Hello!", "timestamp": "t1"},
        ]

        agent_history = _filter_history(history)
        assert len(agent_history) == 2  # only user + assistant

        agent_messages = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
            {"role": "user", "content": "New question"},
            {"role": "assistant", "content": "New answer"},
        ]

        # OLD: len(history) = 4, skips everything
        old_offset = len(history)
        old_new = (agent_messages[old_offset:]
                   if len(agent_messages) > old_offset
                   else agent_messages)
        assert old_new == agent_messages  # BUG: all treated as new

        # FIXED
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "New question"

    def test_else_branch_returns_empty_list(self):
        """When agent has fewer messages than offset, return [] not all.

        The old code had ``else agent_messages`` which would treat the
        entire message list as new when the agent compressed or dropped
        messages.  The fix changes this to ``else []``, falling through
        to the simple user/assistant fallback path.
        """
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "user", "content": "Hello", "timestamp": "t1"},
            {"role": "assistant", "content": "Hi!", "timestamp": "t1"},
        ]

        # Agent compressed and returned fewer messages than history
        agent_messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
        ]

        history_offset = len(_filter_history(history))  # 2
        new_messages = (agent_messages[history_offset:]
                        if len(agent_messages) > history_offset
                        else [])
        # 2 == 2, so no new messages - falls to fallback
        assert new_messages == []

    def test_tool_call_messages_preserved_in_filter(self):
        """Tool call messages pass through the filter, keeping offset correct."""
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "user", "content": "Search for cats", "timestamp": "t1"},
            {"role": "assistant", "content": None, "timestamp": "t1",
             "tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
            {"role": "tool", "tool_call_id": "tc1",
             "content": "Results about cats", "timestamp": "t1"},
            {"role": "assistant", "content": "Here are results.",
             "timestamp": "t1"},
        ]

        agent_history = _filter_history(history)
        # session_meta filtered, but tool_calls/tool messages kept
        assert len(agent_history) == 4
        assert len(history) == 5  # 1 session_meta extra

        agent_messages = [
            {"role": "user", "content": "Search for cats"},
            {"role": "assistant", "content": None,
             "tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
            {"role": "tool", "tool_call_id": "tc1", "content": "Results about cats"},
            {"role": "assistant", "content": "Here are results."},
            {"role": "user", "content": "Now search for dogs"},
            {"role": "assistant", "content": "Dog results here."},
        ]

        # OLD: len(history) = 5, agent_messages[5:] = 1 message (lost user msg)
        old_new = (agent_messages[len(history):]
                   if len(agent_messages) > len(history)
                   else agent_messages)
        assert len(old_new) == 1  # BUG

        # FIXED
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "Now search for dogs"
        assert fixed_new[1]["content"] == "Dog results here."

    def test_recursive_queued_followup_keeps_outer_history_offset(self):
        """Queued drain persistence must include every turn in the chain.

        ``_run_agent()`` recurses when a follow-up arrived while the current turn
        was running. The recursive call naturally returns a later
        ``history_offset`` because it received the previous turn as part of its
        input history. If the outer caller persists transcript rows using that
        later offset, it only sees the *last* queued turn as new and drops the
        earlier queued turn from the transcript.
        """
        history_before_chain = [
            {"role": "user", "content": "Earlier question"},
            {"role": "assistant", "content": "Earlier answer"},
        ]
        first_followup_turn = [
            {"role": "user", "content": "First follow-up question"},
            {"role": "assistant", "content": "First follow-up answer"},
        ]
        second_followup_turn = [
            {"role": "user", "content": "Second follow-up question"},
            {"role": "assistant", "content": "Second follow-up answer"},
        ]

        current_result = {
            "history_offset": len(history_before_chain),
            "messages": history_before_chain + first_followup_turn,
        }
        followup_result = {
            "history_offset": len(history_before_chain + first_followup_turn),
            "messages": (
                history_before_chain
                + first_followup_turn
                + second_followup_turn
            ),
        }

        merged = _preserve_queued_followup_history_offset(
            current_result,
            followup_result,
        )
        assert merged["history_offset"] == len(history_before_chain)

        persisted = merged["messages"][merged["history_offset"]:]
        assert persisted == first_followup_turn + second_followup_turn

    def test_recursive_queued_followup_preserves_smaller_existing_offset(self):
        """Do not widen the slice if the nested result is already conservative."""
        current_result = {"history_offset": 4}
        followup_result = {"history_offset": 3, "messages": []}

        merged = _preserve_queued_followup_history_offset(
            current_result,
            followup_result,
        )

        assert merged["history_offset"] == 3