mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Replace tenant-specific example text in the transcript offset regression with generic follow-up turns so the upstream test documents the bug without customer-specific wording.
326 lines
13 KiB
Python
326 lines
13 KiB
Python
"""Tests for transcript history offset fix.
|
|
|
|
Regression tests for a bug where the gateway transcript lost 1 message
|
|
per turn from turn 2 onwards. The raw transcript history includes
|
|
``session_meta`` entries that are filtered out before being passed to
|
|
the agent. The agent returns messages built from this filtered history
|
|
plus new messages from the current turn.
|
|
|
|
The old code used ``len(history)`` (raw count, includes session_meta)
|
|
to slice ``agent_messages``, which caused the slice to skip valid new
|
|
messages. The fix adds ``history_offset`` (the filtered history length)
|
|
to ``_run_agent``'s return dict and uses it for the slice.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from gateway.run import _preserve_queued_followup_history_offset
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers - replicate the filtering logic from _run_agent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _filter_history(history: list) -> list:
|
|
"""Replicate the agent_history filtering from GatewayRunner._run_agent.
|
|
|
|
Strips session_meta and system messages, exactly as the real code does.
|
|
"""
|
|
agent_history = []
|
|
for msg in history:
|
|
role = msg.get("role")
|
|
if not role:
|
|
continue
|
|
if role in ("session_meta",):
|
|
continue
|
|
if role == "system":
|
|
continue
|
|
|
|
has_tool_calls = "tool_calls" in msg
|
|
has_tool_call_id = "tool_call_id" in msg
|
|
is_tool_message = role == "tool"
|
|
|
|
if has_tool_calls or has_tool_call_id or is_tool_message:
|
|
clean_msg = {k: v for k, v in msg.items() if k != "timestamp"}
|
|
agent_history.append(clean_msg)
|
|
else:
|
|
content = msg.get("content")
|
|
if content:
|
|
agent_history.append({"role": role, "content": content})
|
|
return agent_history
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestTranscriptHistoryOffset:
|
|
"""Verify the transcript extraction uses the filtered history length."""
|
|
|
|
def test_session_meta_causes_offset_mismatch(self):
|
|
"""Turn 2: session_meta makes len(history) > len(agent_history).
|
|
|
|
- history (raw): 1 session_meta + 2 conversation = 3 entries
|
|
- agent_history (filtered): 2 entries
|
|
- Agent returns 2 old + 2 new = 4 messages
|
|
- OLD: agent_messages[3:] = 1 message (lost the user message)
|
|
- FIX: agent_messages[2:] = 2 messages (correct)
|
|
"""
|
|
history = [
|
|
{"role": "session_meta", "tools": [], "model": "gpt-4",
|
|
"platform": "telegram", "timestamp": "t0"},
|
|
{"role": "user", "content": "Hello", "timestamp": "t1"},
|
|
{"role": "assistant", "content": "Hi there!", "timestamp": "t1"},
|
|
]
|
|
|
|
agent_history = _filter_history(history)
|
|
assert len(agent_history) == 2 # session_meta stripped
|
|
|
|
# Agent returns: filtered history (2) + new turn (2)
|
|
agent_messages = [
|
|
{"role": "user", "content": "Hello"},
|
|
{"role": "assistant", "content": "Hi there!"},
|
|
{"role": "user", "content": "What is Python?"},
|
|
{"role": "assistant", "content": "A programming language."},
|
|
]
|
|
|
|
# OLD behavior: len(history) = 3, skips too many
|
|
old_offset = len(history)
|
|
old_new = (agent_messages[old_offset:]
|
|
if len(agent_messages) > old_offset
|
|
else agent_messages)
|
|
assert len(old_new) == 1 # BUG: lost the user message
|
|
|
|
# FIXED behavior: history_offset = 2
|
|
history_offset = len(agent_history)
|
|
fixed_new = (agent_messages[history_offset:]
|
|
if len(agent_messages) > history_offset
|
|
else [])
|
|
assert len(fixed_new) == 2
|
|
assert fixed_new[0]["content"] == "What is Python?"
|
|
assert fixed_new[1]["content"] == "A programming language."
|
|
|
|
def test_no_session_meta_same_result(self):
|
|
"""First turn has no session_meta, so both approaches agree."""
|
|
history = []
|
|
agent_history = _filter_history(history)
|
|
assert len(agent_history) == 0
|
|
|
|
agent_messages = [
|
|
{"role": "user", "content": "Hello"},
|
|
{"role": "assistant", "content": "Hi!"},
|
|
]
|
|
|
|
old_new = (agent_messages[len(history):]
|
|
if len(agent_messages) > len(history)
|
|
else agent_messages)
|
|
fixed_new = (agent_messages[len(agent_history):]
|
|
if len(agent_messages) > len(agent_history)
|
|
else [])
|
|
|
|
assert old_new == fixed_new
|
|
assert len(fixed_new) == 2
|
|
|
|
def test_multiple_session_meta_larger_drift(self):
|
|
"""Two session_meta entries double the offset error.
|
|
|
|
This can happen when the session spans tool definition changes
|
|
or model switches that each write a new session_meta record.
|
|
"""
|
|
history = [
|
|
{"role": "session_meta", "tools": [], "timestamp": "t0"},
|
|
{"role": "user", "content": "msg1", "timestamp": "t1"},
|
|
{"role": "assistant", "content": "reply1", "timestamp": "t1"},
|
|
{"role": "session_meta", "tools": ["new_tool"], "timestamp": "t2"},
|
|
{"role": "user", "content": "msg2", "timestamp": "t3"},
|
|
{"role": "assistant", "content": "reply2", "timestamp": "t3"},
|
|
]
|
|
|
|
agent_history = _filter_history(history)
|
|
assert len(agent_history) == 4
|
|
assert len(history) == 6 # 2 extra session_meta entries
|
|
|
|
# Agent returns 4 old + 2 new = 6 total
|
|
agent_messages = [
|
|
{"role": "user", "content": "msg1"},
|
|
{"role": "assistant", "content": "reply1"},
|
|
{"role": "user", "content": "msg2"},
|
|
{"role": "assistant", "content": "reply2"},
|
|
{"role": "user", "content": "msg3"},
|
|
{"role": "assistant", "content": "reply3"},
|
|
]
|
|
|
|
# OLD: len(history) == len(agent_messages) == 6 -> else branch
|
|
old_offset = len(history)
|
|
old_new = (agent_messages[old_offset:]
|
|
if len(agent_messages) > old_offset
|
|
else agent_messages)
|
|
# BUG: treats ALL messages as new (duplicates entire history)
|
|
assert old_new == agent_messages
|
|
|
|
# FIXED: history_offset = 4
|
|
fixed_new = (agent_messages[len(agent_history):]
|
|
if len(agent_messages) > len(agent_history)
|
|
else [])
|
|
assert len(fixed_new) == 2
|
|
assert fixed_new[0]["content"] == "msg3"
|
|
assert fixed_new[1]["content"] == "reply3"
|
|
|
|
def test_system_messages_also_filtered(self):
|
|
"""system messages in history are also stripped from agent_history."""
|
|
history = [
|
|
{"role": "session_meta", "tools": [], "timestamp": "t0"},
|
|
{"role": "system", "content": "You are a helpful assistant."},
|
|
{"role": "user", "content": "Hi", "timestamp": "t1"},
|
|
{"role": "assistant", "content": "Hello!", "timestamp": "t1"},
|
|
]
|
|
|
|
agent_history = _filter_history(history)
|
|
assert len(agent_history) == 2 # only user + assistant
|
|
|
|
agent_messages = [
|
|
{"role": "user", "content": "Hi"},
|
|
{"role": "assistant", "content": "Hello!"},
|
|
{"role": "user", "content": "New question"},
|
|
{"role": "assistant", "content": "New answer"},
|
|
]
|
|
|
|
# OLD: len(history) = 4, skips everything
|
|
old_offset = len(history)
|
|
old_new = (agent_messages[old_offset:]
|
|
if len(agent_messages) > old_offset
|
|
else agent_messages)
|
|
assert old_new == agent_messages # BUG: all treated as new
|
|
|
|
# FIXED
|
|
fixed_new = (agent_messages[len(agent_history):]
|
|
if len(agent_messages) > len(agent_history)
|
|
else [])
|
|
assert len(fixed_new) == 2
|
|
assert fixed_new[0]["content"] == "New question"
|
|
|
|
def test_else_branch_returns_empty_list(self):
|
|
"""When agent has fewer messages than offset, return [] not all.
|
|
|
|
The old code had ``else agent_messages`` which would treat the
|
|
entire message list as new when the agent compressed or dropped
|
|
messages. The fix changes this to ``else []``, falling through
|
|
to the simple user/assistant fallback path.
|
|
"""
|
|
history = [
|
|
{"role": "session_meta", "tools": [], "timestamp": "t0"},
|
|
{"role": "user", "content": "Hello", "timestamp": "t1"},
|
|
{"role": "assistant", "content": "Hi!", "timestamp": "t1"},
|
|
]
|
|
|
|
# Agent compressed and returned fewer messages than history
|
|
agent_messages = [
|
|
{"role": "user", "content": "Hello"},
|
|
{"role": "assistant", "content": "Hi!"},
|
|
]
|
|
|
|
history_offset = len(_filter_history(history)) # 2
|
|
new_messages = (agent_messages[history_offset:]
|
|
if len(agent_messages) > history_offset
|
|
else [])
|
|
# 2 == 2, so no new messages - falls to fallback
|
|
assert new_messages == []
|
|
|
|
def test_tool_call_messages_preserved_in_filter(self):
|
|
"""Tool call messages pass through the filter, keeping offset correct."""
|
|
history = [
|
|
{"role": "session_meta", "tools": [], "timestamp": "t0"},
|
|
{"role": "user", "content": "Search for cats", "timestamp": "t1"},
|
|
{"role": "assistant", "content": None, "timestamp": "t1",
|
|
"tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
|
|
{"role": "tool", "tool_call_id": "tc1",
|
|
"content": "Results about cats", "timestamp": "t1"},
|
|
{"role": "assistant", "content": "Here are results.",
|
|
"timestamp": "t1"},
|
|
]
|
|
|
|
agent_history = _filter_history(history)
|
|
# session_meta filtered, but tool_calls/tool messages kept
|
|
assert len(agent_history) == 4
|
|
assert len(history) == 5 # 1 session_meta extra
|
|
|
|
agent_messages = [
|
|
{"role": "user", "content": "Search for cats"},
|
|
{"role": "assistant", "content": None,
|
|
"tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
|
|
{"role": "tool", "tool_call_id": "tc1", "content": "Results about cats"},
|
|
{"role": "assistant", "content": "Here are results."},
|
|
{"role": "user", "content": "Now search for dogs"},
|
|
{"role": "assistant", "content": "Dog results here."},
|
|
]
|
|
|
|
# OLD: len(history) = 5, agent_messages[5:] = 1 message (lost user msg)
|
|
old_new = (agent_messages[len(history):]
|
|
if len(agent_messages) > len(history)
|
|
else agent_messages)
|
|
assert len(old_new) == 1 # BUG
|
|
|
|
# FIXED
|
|
fixed_new = (agent_messages[len(agent_history):]
|
|
if len(agent_messages) > len(agent_history)
|
|
else [])
|
|
assert len(fixed_new) == 2
|
|
assert fixed_new[0]["content"] == "Now search for dogs"
|
|
assert fixed_new[1]["content"] == "Dog results here."
|
|
|
|
def test_recursive_queued_followup_keeps_outer_history_offset(self):
|
|
"""Queued drain persistence must include every turn in the chain.
|
|
|
|
``_run_agent()`` recurses when a follow-up arrived while the current turn
|
|
was running. The recursive call naturally returns a later
|
|
``history_offset`` because it received the previous turn as part of its
|
|
input history. If the outer caller persists transcript rows using that
|
|
later offset, it only sees the *last* queued turn as new and drops the
|
|
earlier queued turn from the transcript.
|
|
"""
|
|
history_before_chain = [
|
|
{"role": "user", "content": "Earlier question"},
|
|
{"role": "assistant", "content": "Earlier answer"},
|
|
]
|
|
first_followup_turn = [
|
|
{"role": "user", "content": "First follow-up question"},
|
|
{"role": "assistant", "content": "First follow-up answer"},
|
|
]
|
|
second_followup_turn = [
|
|
{"role": "user", "content": "Second follow-up question"},
|
|
{"role": "assistant", "content": "Second follow-up answer"},
|
|
]
|
|
|
|
current_result = {
|
|
"history_offset": len(history_before_chain),
|
|
"messages": history_before_chain + first_followup_turn,
|
|
}
|
|
followup_result = {
|
|
"history_offset": len(history_before_chain + first_followup_turn),
|
|
"messages": (
|
|
history_before_chain
|
|
+ first_followup_turn
|
|
+ second_followup_turn
|
|
),
|
|
}
|
|
|
|
merged = _preserve_queued_followup_history_offset(
|
|
current_result,
|
|
followup_result,
|
|
)
|
|
assert merged["history_offset"] == len(history_before_chain)
|
|
|
|
persisted = merged["messages"][merged["history_offset"]:]
|
|
assert persisted == first_followup_turn + second_followup_turn
|
|
|
|
def test_recursive_queued_followup_preserves_smaller_existing_offset(self):
|
|
"""Do not widen the slice if the nested result is already conservative."""
|
|
current_result = {"history_offset": 4}
|
|
followup_result = {"history_offset": 3, "messages": []}
|
|
|
|
merged = _preserve_queued_followup_history_offset(
|
|
current_result,
|
|
followup_result,
|
|
)
|
|
|
|
assert merged["history_offset"] == 3
|