mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-27 01:11:40 +00:00
Merge branch 'main' into pr-635
This commit is contained in:
commit
c5a9d1ef9d
5 changed files with 344 additions and 2 deletions
159
tests/gateway/test_session_hygiene.py
Normal file
159
tests/gateway/test_session_hygiene.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
"""Tests for gateway session hygiene — auto-compression of large sessions.
|
||||
|
||||
Verifies that the gateway detects pathologically large transcripts and
|
||||
triggers auto-compression before running the agent. (#628)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock, AsyncMock
|
||||
from agent.model_metadata import estimate_messages_tokens_rough
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_history(n_messages: int, content_size: int = 100) -> list:
|
||||
"""Build a fake transcript with n_messages user/assistant pairs."""
|
||||
history = []
|
||||
content = "x" * content_size
|
||||
for i in range(n_messages):
|
||||
role = "user" if i % 2 == 0 else "assistant"
|
||||
history.append({"role": role, "content": content, "timestamp": f"t{i}"})
|
||||
return history
|
||||
|
||||
|
||||
def _make_large_history_tokens(target_tokens: int) -> list:
|
||||
"""Build a history that estimates to roughly target_tokens tokens."""
|
||||
# estimate_messages_tokens_rough counts total chars in str(msg) // 4
|
||||
# Each msg dict has ~60 chars of overhead + content chars
|
||||
# So for N tokens we need roughly N * 4 total chars across all messages
|
||||
target_chars = target_tokens * 4
|
||||
# Each message as a dict string is roughly len(content) + 60 chars
|
||||
msg_overhead = 60
|
||||
# Use 50 messages with appropriately sized content
|
||||
n_msgs = 50
|
||||
content_size = max(10, (target_chars // n_msgs) - msg_overhead)
|
||||
return _make_history(n_msgs, content_size=content_size)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detection threshold tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSessionHygieneThresholds:
|
||||
"""Test that the threshold logic correctly identifies large sessions."""
|
||||
|
||||
def test_small_session_below_thresholds(self):
|
||||
"""A 10-message session should not trigger compression."""
|
||||
history = _make_history(10)
|
||||
msg_count = len(history)
|
||||
approx_tokens = estimate_messages_tokens_rough(history)
|
||||
|
||||
compress_token_threshold = 100_000
|
||||
compress_msg_threshold = 200
|
||||
|
||||
needs_compress = (
|
||||
approx_tokens >= compress_token_threshold
|
||||
or msg_count >= compress_msg_threshold
|
||||
)
|
||||
assert not needs_compress
|
||||
|
||||
def test_large_message_count_triggers(self):
|
||||
"""200+ messages should trigger compression even if tokens are low."""
|
||||
history = _make_history(250, content_size=10)
|
||||
msg_count = len(history)
|
||||
|
||||
compress_msg_threshold = 200
|
||||
needs_compress = msg_count >= compress_msg_threshold
|
||||
assert needs_compress
|
||||
|
||||
def test_large_token_count_triggers(self):
|
||||
"""High token count should trigger compression even if message count is low."""
|
||||
# 50 messages with huge content to exceed 100K tokens
|
||||
history = _make_history(50, content_size=10_000)
|
||||
approx_tokens = estimate_messages_tokens_rough(history)
|
||||
|
||||
compress_token_threshold = 100_000
|
||||
needs_compress = approx_tokens >= compress_token_threshold
|
||||
assert needs_compress
|
||||
|
||||
def test_under_both_thresholds_no_trigger(self):
|
||||
"""Session under both thresholds should not trigger."""
|
||||
history = _make_history(100, content_size=100)
|
||||
msg_count = len(history)
|
||||
approx_tokens = estimate_messages_tokens_rough(history)
|
||||
|
||||
compress_token_threshold = 100_000
|
||||
compress_msg_threshold = 200
|
||||
|
||||
needs_compress = (
|
||||
approx_tokens >= compress_token_threshold
|
||||
or msg_count >= compress_msg_threshold
|
||||
)
|
||||
assert not needs_compress
|
||||
|
||||
def test_custom_thresholds(self):
|
||||
"""Custom thresholds from config should be respected."""
|
||||
history = _make_history(60, content_size=100)
|
||||
msg_count = len(history)
|
||||
|
||||
# Custom lower threshold
|
||||
compress_msg_threshold = 50
|
||||
needs_compress = msg_count >= compress_msg_threshold
|
||||
assert needs_compress
|
||||
|
||||
# Custom higher threshold
|
||||
compress_msg_threshold = 100
|
||||
needs_compress = msg_count >= compress_msg_threshold
|
||||
assert not needs_compress
|
||||
|
||||
def test_minimum_message_guard(self):
|
||||
"""Sessions with fewer than 4 messages should never trigger."""
|
||||
history = _make_history(3, content_size=100_000)
|
||||
# Even with enormous content, < 4 messages should be skipped
|
||||
# (the gateway code checks `len(history) >= 4` before evaluating)
|
||||
assert len(history) < 4
|
||||
|
||||
|
||||
class TestSessionHygieneWarnThreshold:
|
||||
"""Test the post-compression warning threshold."""
|
||||
|
||||
def test_warn_when_still_large(self):
|
||||
"""If compressed result is still above warn_tokens, should warn."""
|
||||
# Simulate post-compression tokens
|
||||
warn_threshold = 200_000
|
||||
post_compress_tokens = 250_000
|
||||
assert post_compress_tokens >= warn_threshold
|
||||
|
||||
def test_no_warn_when_under(self):
|
||||
"""If compressed result is under warn_tokens, no warning."""
|
||||
warn_threshold = 200_000
|
||||
post_compress_tokens = 150_000
|
||||
assert post_compress_tokens < warn_threshold
|
||||
|
||||
|
||||
class TestTokenEstimation:
|
||||
"""Verify rough token estimation works as expected for hygiene checks."""
|
||||
|
||||
def test_empty_history(self):
|
||||
assert estimate_messages_tokens_rough([]) == 0
|
||||
|
||||
def test_proportional_to_content(self):
|
||||
small = _make_history(10, content_size=100)
|
||||
large = _make_history(10, content_size=10_000)
|
||||
assert estimate_messages_tokens_rough(large) > estimate_messages_tokens_rough(small)
|
||||
|
||||
def test_proportional_to_count(self):
|
||||
few = _make_history(10, content_size=1000)
|
||||
many = _make_history(100, content_size=1000)
|
||||
assert estimate_messages_tokens_rough(many) > estimate_messages_tokens_rough(few)
|
||||
|
||||
def test_pathological_session_detected(self):
|
||||
"""The reported pathological case: 648 messages, ~299K tokens."""
|
||||
# Simulate a 648-message session averaging ~460 tokens per message
|
||||
history = _make_history(648, content_size=1800)
|
||||
tokens = estimate_messages_tokens_rough(history)
|
||||
# Should be well above the 100K default threshold
|
||||
assert tokens > 100_000
|
||||
assert len(history) > 200
|
||||
Loading…
Add table
Add a link
Reference in a new issue