hermes-agent/tests/agent/test_compressor_tool_call_budget.py

"""Regression tests for tool_call envelope accounting in the compression
tail-protection budget walks (issue #28053).

The budget walks used to estimate an assistant message's tokens from
content + ``function.arguments`` only, dropping each ``tool_call``'s ``id``,
``type`` and ``function.name`` (plus JSON structure). For assistant turns
that fan out into parallel tool calls this undercounted by 2-15x, so the
protected tail overshot ``tail_token_budget`` and compression became
ineffective. The fix routes all three walks through
``_estimate_msg_budget_tokens``, which counts the full envelope.
"""

import pytest
from unittest.mock import patch

from agent.context_compressor import (
    ContextCompressor,
    _CHARS_PER_TOKEN,
    _estimate_msg_budget_tokens,
)


def _assistant_with_tool_calls(n_calls: int, *, args: str = '{"path":"a"}') -> dict:
    """An assistant turn fanning into ``n_calls`` parallel tool calls with
    realistic id/name overhead but a small arguments string."""
    return {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "id": f"call_{i:02d}_{'a' * 24}",  # ~32 chars, UUID-ish id
                "type": "function",
                "function": {"name": "read_file", "arguments": args},
            }
            for i in range(n_calls)
        ],
    }


def _args_only_estimate(msg: dict) -> int:
    """Reproduce the OLD (buggy) arguments-only walk for comparison."""
    content = msg.get("content") or ""
    tokens = len(content) // _CHARS_PER_TOKEN + 10
    for tc in msg.get("tool_calls") or []:
        if isinstance(tc, dict):
            tokens += len(tc.get("function", {}).get("arguments", "")) // _CHARS_PER_TOKEN
    return tokens


class TestToolCallEnvelopeEstimate:
    def test_envelope_counted_not_just_arguments(self):
        msg = _assistant_with_tool_calls(4)
        new = _estimate_msg_budget_tokens(msg)
        old = _args_only_estimate(msg)
        # id/type/name + JSON structure dwarf the tiny arguments string.
        assert new > old * 3, (new, old)
        # The estimate covers the full serialized tool_call envelope.
        envelope = sum(len(str(tc)) for tc in msg["tool_calls"]) // _CHARS_PER_TOKEN
        assert new >= envelope

    def test_scales_with_number_of_parallel_calls(self):
        one = _estimate_msg_budget_tokens(_assistant_with_tool_calls(1))
        five = _estimate_msg_budget_tokens(_assistant_with_tool_calls(5))
        assert five > one * 3

    def test_no_tool_calls_matches_content_estimate(self):
        msg = {"role": "user", "content": "x" * 400}
        # Plain message: content//4 + 10 overhead, behavior unchanged.
        assert _estimate_msg_budget_tokens(msg) == 400 // _CHARS_PER_TOKEN + 10

    def test_non_dict_tool_calls_do_not_crash(self):
        msg = {"role": "assistant", "content": "hi", "tool_calls": ["weird", None]}
        # Non-dict entries are ignored (as before) without raising.
        assert _estimate_msg_budget_tokens(msg) == len("hi") // _CHARS_PER_TOKEN + 10


@pytest.fixture()
def compressor():
    with patch("agent.context_compressor.get_model_context_length", return_value=100000):
        return ContextCompressor(
            model="test/model",
            threshold_percent=0.85,
            protect_first_n=2,
            protect_last_n=2,
            quiet_mode=True,
        )


class TestTailCutAccountsForToolCalls:
    def test_tail_cut_stops_on_tool_call_heavy_tail(self, compressor):
        # 20 assistant turns, each fanning into 5 short-arg tool calls.
        heavy = [_assistant_with_tool_calls(5) for _ in range(20)]
        messages = [{"role": "user", "content": "start"}] + heavy

        per_msg = _estimate_msg_budget_tokens(messages[-1])
        assert per_msg > 30  # sanity: a heavy turn is non-trivial once the envelope counts

        # Budget sized so ~6 heavy turns fit under the 1.5x soft ceiling.
        token_budget = int(per_msg * 6 / 1.5)
        cut = compressor._find_tail_cut_by_tokens(messages, head_end=1, token_budget=token_budget)
        protected = len(messages) - cut

        # With the envelope counted, the walk stops well short of protecting all
        # 20 turns. The old arguments-only estimate (~25 tokens/turn) never
        # reaches the ceiling and would protect the entire transcript.
        assert protected < len(heavy)
        assert 3 <= protected <= 12