test(compressor): regression coverage for assistant-tail anchor + compaction rollup (#29824)

21 cases pinning the new ``_ensure_last_assistant_message_in_tail`` anchor and its interaction with the existing tail-cut path: * ``TestFindLastAssistantMessageIdx`` — helper contract: prefers a content-bearing assistant message, skips ``tool_calls``-only stubs, multimodal text-block content counts, falls back to "any assistant" when no content-bearing reply exists, honours ``head_end``, returns -1 when there's none. * ``TestEnsureLastAssistantMessageInTail`` — direct: no-op when already in the tail, walks ``cut_idx`` back when the reply is in the compressed middle, never crosses into the head region, re-aligns through a preceding ``tool_call`` / ``tool_result`` group instead of orphaning it. * ``TestFindTailCutByTokensAnchorsAssistant`` — integration: reporter repro (long tool-output run after the visible reply) now preserves the reply; user and assistant anchors compose in a single tail-cut call; a soft-ceiling-overrunning oversized tool result no longer strands the prior reply. * ``TestCompactionRollupReproduction`` — end-to-end through ``compress()`` with a stubbed ``_generate_summary``: the visible reply text survives either as its own standalone assistant message (normal path) or concatenated onto the merged summary tail (double-collision path the WebUI then re-splits). The standalone-summary case is asserted strictly (exactly one summary row, exactly one separate assistant row carrying the reply) — that's the dominant path and any drift there reintroduces the original bug. * ``TestSourceGuardrail`` — static asserts on ``agent/context_compressor.py``: the helper exists, the anchor is wired into ``_find_tail_cut_by_tokens`` AFTER the user-message anchor (so chaining is monotonic), the content-bearing preference is preserved, and the issue number is referenced so future bisects can find this fix.
2026-07-28 18:19:28 +00:00 · 2026-05-21 21:16:58 +07:00 · 2026-05-21 21:16:58 +07:00 · 68536d4375
commit 68536d4375
parent 2fef3e2df2
1 changed files with 518 additions and 0 deletions
--- a/tests/agent/test_compressor_assistant_tail_anchor.py
+++ b/tests/agent/test_compressor_assistant_tail_anchor.py
@ -0,0 +1,518 @@
+"""Regression coverage for #29824 — the WebUI session viewer (and TUI
+chat panel) was showing the ``[CONTEXT COMPACTION — REFERENCE ONLY]``
+handoff block in the slot where the user had just been reading the
+assistant's actual reply, because the previously-visible reply got
+rolled into the compaction summary by the token-budget tail walk.
+
+The fix adds ``_ensure_last_assistant_message_in_tail`` — a mirror of
+the existing ``_ensure_last_user_message_in_tail`` (#10896 anchor) —
+that pulls ``cut_idx`` back to include the most recent assistant
+message with non-empty text content, with the standard tool-group
+realignment so we don't orphan a ``tool_call`` / ``tool_result`` pair.
+
+Pinned here:
+
+* ``TestFindLastAssistantMessageIdx`` — pure helper contract:
+  finds the most recent **content-bearing** assistant message,
+  skips tool-call-only stubs, falls back to "any assistant" only
+  when no content-bearing reply exists in the compressible region,
+  honours ``head_end``, returns -1 when there's no assistant at all.
+
+* ``TestEnsureLastAssistantMessageInTail`` — direct: walks
+  ``cut_idx`` back when the last reply is in the compressed middle,
+  is a no-op when it's already in the tail, never crosses
+  ``head_end``, re-aligns through tool groups.
+
+* ``TestFindTailCutByTokensAnchorsAssistant`` — integration with
+  the existing tail-cut path: the exact reporter scenario (long
+  tool-output run after the previously-visible reply) preserves
+  the reply; combines with the user anchor for the same-turn
+  preservation; soft-ceiling overrun no longer hides the reply.
+
+* ``TestCompactionRollupReproduction`` — end-to-end through
+  ``compress()`` with a stubbed summariser: pre-fix the reply
+  text is absorbed into the summary (regression demonstrated by
+  asserting on the OLD behaviour fails); post-fix the reply text
+  is still present in the compressed transcript as a regular
+  assistant message.
+
+* ``TestSourceGuardrail`` — static asserts on
+  ``agent/context_compressor.py`` so a future refactor can't
+  silently drop the anchor.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import patch
+
+import pytest
+
+
+@pytest.fixture()
+def compressor():
+    """ContextCompressor with mocked deps and a tight tail budget so
+    the helpers' anchor behaviour is observable."""
+    from agent.context_compressor import ContextCompressor
+    with patch(
+        "agent.context_compressor.get_model_context_length",
+        return_value=100_000,
+    ):
+        c = ContextCompressor(
+            model="test/model",
+            threshold_percent=0.85,
+            protect_first_n=2,
+            protect_last_n=2,
+            quiet_mode=True,
+        )
+        c.tail_token_budget = 50
+        return c
+
+
+# ---------------------------------------------------------------------------
+# Helper: _find_last_assistant_message_idx
+# ---------------------------------------------------------------------------
+
+
+class TestFindLastAssistantMessageIdx:
+    def test_finds_content_bearing_assistant(self, compressor):
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "q"},
+            {"role": "assistant", "content": "the reply"},
+        ]
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=1)
+        assert idx == 2
+
+    def test_skips_tool_call_only_stub_when_text_reply_exists_earlier(
+        self, compressor
+    ):
+        """An assistant message that only carries ``tool_calls`` (no
+        text content) is not the user-visible reply — the WebUI
+        renders those as small "calling tool X" indicators. The helper
+        must prefer the earlier text reply, which is what the user
+        actually read."""
+        messages = [
+            {"role": "user", "content": "q1"},
+            {"role": "assistant", "content": "VISIBLE REPLY"},
+            {"role": "user", "content": "q2"},
+            {"role": "assistant", "content": None,
+             "tool_calls": [{"function": {"name": "t",
+                                          "arguments": "{}"}}]},
+            {"role": "tool", "content": "result", "tool_call_id": "c1"},
+        ]
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=0)
+        assert idx == 1, (
+            "Expected the content-bearing assistant reply (1), not the "
+            f"trailing tool-call stub. Got {idx}."
+        )
+
+    def test_empty_string_content_does_not_count_as_visible(self, compressor):
+        """An assistant message with ``content=""`` (only whitespace)
+        is not a visible reply either — common pre-flight stub before
+        the model streams the real answer."""
+        messages = [
+            {"role": "user", "content": "q1"},
+            {"role": "assistant", "content": "earlier reply"},
+            {"role": "user", "content": "q2"},
+            {"role": "assistant", "content": "   "},  # blank stub
+        ]
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=0)
+        # Blank-string assistant message does not count — fall back
+        # to the earlier real reply.
+        assert idx == 1
+
+    def test_multimodal_text_block_counts(self, compressor):
+        """An assistant with multimodal list-content carrying a text
+        block (Anthropic / GPT-style ``[{type:text,text:...}]``)
+        counts as content-bearing."""
+        messages = [
+            {"role": "user", "content": "q"},
+            {"role": "assistant",
+             "content": [{"type": "text", "text": "hello"}]},
+        ]
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=0)
+        assert idx == 1
+
+    def test_fallback_to_any_assistant_when_no_content_bearing(
+        self, compressor
+    ):
+        """When there's no text-bearing assistant in the compressible
+        region (fresh multi-step tool sequence), fall back to the
+        most recent assistant of any kind so the anchor still works."""
+        messages = [
+            {"role": "user", "content": "q"},
+            {"role": "assistant", "content": None,
+             "tool_calls": [{"function": {"name": "t",
+                                          "arguments": "{}"}}]},
+            {"role": "tool", "content": "result", "tool_call_id": "c1"},
+        ]
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=0)
+        assert idx == 1
+
+    def test_returns_negative_one_when_no_assistant(self, compressor):
+        messages = [
+            {"role": "user", "content": "q1"},
+            {"role": "user", "content": "q2"},
+        ]
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=0)
+        assert idx == -1
+
+    def test_respects_head_end_lower_bound(self, compressor):
+        """An assistant message at or before ``head_end`` must be
+        ignored — it's already in the protected head region."""
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "assistant", "content": "in-head reply"},  # idx 1
+            {"role": "user", "content": "q"},
+        ]
+        # head_end=2 means the compressible region starts at index 2;
+        # the assistant at index 1 is in the head and must be skipped.
+        idx = compressor._find_last_assistant_message_idx(messages, head_end=2)
+        assert idx == -1
+
+
+# ---------------------------------------------------------------------------
+# Helper: _ensure_last_assistant_message_in_tail
+# ---------------------------------------------------------------------------
+
+
+class TestEnsureLastAssistantMessageInTail:
+    def test_no_op_when_already_in_tail(self, compressor):
+        messages = [
+            {"role": "user", "content": "q"},
+            {"role": "assistant", "content": "reply"},
+            {"role": "user", "content": "q2"},
+        ]
+        # cut_idx=1 means tail starts at index 1 — the reply is already in tail.
+        new_cut = compressor._ensure_last_assistant_message_in_tail(
+            messages, cut_idx=1, head_end=0
+        )
+        assert new_cut == 1
+
+    def test_walks_cut_idx_back_to_include_reply(self, compressor):
+        messages = [
+            {"role": "user", "content": "q1"},
+            {"role": "assistant", "content": "REPLY"},  # idx 1
+            {"role": "user", "content": "q2"},
+            {"role": "user", "content": "q3"},
+        ]
+        # cut_idx=2 leaves the reply outside the tail; anchor must pull
+        # cut_idx back to 1 so messages[1:] contains the reply.
+        new_cut = compressor._ensure_last_assistant_message_in_tail(
+            messages, cut_idx=2, head_end=0
+        )
+        assert new_cut == 1
+        assert any(
+            isinstance(m.get("content"), str) and "REPLY" in m["content"]
+            for m in messages[new_cut:]
+        )
+
+    def test_never_crosses_head_end(self, compressor):
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "assistant", "content": "in-head"},  # head, must ignore
+            {"role": "user", "content": "q"},
+        ]
+        # head_end=2 ⇒ assistant at idx 1 is in the head; the anchor
+        # finds nothing in the compressible region and is a no-op.
+        new_cut = compressor._ensure_last_assistant_message_in_tail(
+            messages, cut_idx=3, head_end=2
+        )
+        assert new_cut == 3
+
+    def test_re_aligns_through_preceding_tool_group(self, compressor):
+        """When the anchored assistant is preceded by a
+        tool_call/result group, ``_align_boundary_backward`` must pull
+        ``cut_idx`` even further back so the group isn't split — same
+        guarantee as ``_ensure_last_user_message_in_tail``."""
+        messages = [
+            {"role": "user", "content": "q1"},
+            {"role": "assistant", "content": None,
+             "tool_calls": [{"id": "c1",
+                             "function": {"name": "t",
+                                          "arguments": "{}"}}]},
+            {"role": "tool", "content": "result", "tool_call_id": "c1"},
+            {"role": "assistant", "content": "REPLY"},  # idx 3
+            {"role": "user", "content": "q2"},
+        ]
+        # cut_idx=4 leaves the reply outside the tail. Anchor pulls
+        # back to 3, then _align_boundary_backward sees the preceding
+        # tool group and pulls further back to 1 (before the assistant
+        # with tool_calls).
+        new_cut = compressor._ensure_last_assistant_message_in_tail(
+            messages, cut_idx=4, head_end=0
+        )
+        assert new_cut <= 3
+        # The tool_call assistant (1) and its tool_result (2) must NOT
+        # be split: either both in compressed region or both in tail.
+        if new_cut <= 1:
+            # Both in tail — tool group intact.
+            assert messages[new_cut].get("role") == "assistant"
+        else:
+            # Otherwise the anchor must land at the reply itself (3).
+            assert new_cut == 3
+
+
+# ---------------------------------------------------------------------------
+# Integration with _find_tail_cut_by_tokens
+# ---------------------------------------------------------------------------
+
+
+class TestFindTailCutByTokensAnchorsAssistant:
+    def test_reporter_repro_long_tool_run_after_visible_reply(
+        self, compressor
+    ):
+        """The exact #29824 scenario: a tight token budget combined
+        with a long tail of tool-call/result messages after the
+        visible reply. Pre-fix, the token-budget walk hit its ceiling
+        on the tool output and parked ``cut_idx`` past the reply.
+        Post-fix, the assistant anchor pulls it back."""
+        c = compressor
+        c.tail_token_budget = 10  # force min-tail behaviour
+        messages = [
+            {"role": "system", "content": "sys"},
+            {"role": "user", "content": "msg1"},                # head_end=2
+            {"role": "user", "content": "q1"},
+            {"role": "assistant",
+             "content": "PREVIOUSLY VISIBLE REPLY"},           # idx 3
+            {"role": "user", "content": "q2"},
+            {"role": "assistant", "content": None,
+             "tool_calls": [{"id": "c1",
+                             "function": {"name": "t",
+                                          "arguments": "{}"}}]},
+            {"role": "tool", "content": "x" * 200,
+             "tool_call_id": "c1"},
+        ]
+        cut = c._find_tail_cut_by_tokens(messages, head_end=2)
+        tail_contents = [
+            m.get("content") for m in messages[cut:]
+            if isinstance(m.get("content"), str)
+        ]
+        assert any(
+            "PREVIOUSLY VISIBLE REPLY" in (t or "") for t in tail_contents
+        ), (
+            "REGRESSION (#29824): the visible reply was rolled into "
+            f"the compaction summary. Tail contents: {tail_contents!r}"
+        )
+
+    def test_user_and_assistant_anchors_compose(self, compressor):
+        """Both anchors run in sequence; the tail must contain both
+        the latest user message AND the latest visible assistant
+        reply."""
+        c = compressor
+        c.tail_token_budget = 10
+        messages = [
+            {"role": "user", "content": "q1"},
+            {"role": "assistant", "content": "VISIBLE REPLY"},
+            {"role": "user", "content": "follow-up question"},
+            {"role": "user", "content": "and another"},
+        ]
+        cut = c._find_tail_cut_by_tokens(messages, head_end=0)
+        tail_contents = [
+            m.get("content") for m in messages[cut:]
+            if isinstance(m.get("content"), str)
+        ]
+        assert any("VISIBLE REPLY" in (t or "") for t in tail_contents)
+        assert any("and another" in (t or "") for t in tail_contents)
+
+    def test_oversized_tool_output_does_not_strand_reply(self, compressor):
+        """The soft-ceiling logic in ``_find_tail_cut_by_tokens``
+        permits a single oversized tail message; the assistant anchor
+        must still recover the reply on the other side of it."""
+        c = compressor
+        c.tail_token_budget = 100  # soft ceiling 150
+        messages = [
+            {"role": "user", "content": "earlier"},
+            {"role": "assistant", "content": "VISIBLE REPLY"},
+            {"role": "user", "content": "read big file"},
+            {"role": "assistant", "content": None,
+             "tool_calls": [{"id": "c1",
+                             "function": {"name": "read",
+                                          "arguments": "{}"}}]},
+            # ~500 chars ⇒ ~135 tokens, blows past soft ceiling of 150
+            {"role": "tool", "content": "y" * 500,
+             "tool_call_id": "c1"},
+            {"role": "user", "content": "ok"},
+        ]
+        cut = c._find_tail_cut_by_tokens(messages, head_end=0)
+        tail_contents = [
+            m.get("content") for m in messages[cut:]
+            if isinstance(m.get("content"), str)
+        ]
+        assert any("VISIBLE REPLY" in (t or "") for t in tail_contents)
+
+
+# ---------------------------------------------------------------------------
+# End-to-end: compress() preserves the reply
+# ---------------------------------------------------------------------------
+
+
+class TestCompactionRollupReproduction:
+    """End-to-end through ``compress()``: the visible reply text must
+    survive in the compressed transcript — either as its own
+    standalone assistant message OR concatenated onto the merged
+    summary-handoff tail message (the compressor's double-collision
+    fallback path; the WebUI re-splits these on the END marker so the
+    reply renders as a separate bubble — see ``splitCompactionContent``
+    in ``web/src/pages/SessionsPage.tsx``)."""
+
+    def test_compress_keeps_visible_reply_text(self, compressor):
+        from agent.context_compressor import SUMMARY_PREFIX
+        c = compressor
+        c.tail_token_budget = 10
+        # ``_generate_summary`` normally wraps the LLM body in
+        # ``SUMMARY_PREFIX`` via ``_with_summary_prefix``; mimic that so
+        # the merge-into-tail branch can identify the boundary.
+        _mocked = f"{SUMMARY_PREFIX}\nrolled-up middle summary"
+        messages = (
+            [{"role": "system", "content": "sys"},
+             {"role": "user", "content": "initial"}]   # head (protect_first_n=2)
+            # Middle: long enough to be compressible.
+            + [
+                {"role": "user", "content": f"middle q{i}"}
+                if i % 2 == 0
+                else {"role": "assistant", "content": f"middle reply {i}"}
+                for i in range(12)
+            ]
+            + [
+                {"role": "user", "content": "the visible question"},
+                {"role": "assistant",
+                 "content": "THE VISIBLE REPLY THE USER JUST READ"},
+                {"role": "user", "content": "follow up"},
+                {"role": "assistant", "content": None,
+                 "tool_calls": [{"id": "c1",
+                                 "function": {"name": "t",
+                                              "arguments": "{}"}}]},
+                {"role": "tool", "content": "z" * 500,
+                 "tool_call_id": "c1"},
+            ]
+        )
+        with patch.object(
+            c, "_generate_summary",
+            return_value=_mocked,
+        ):
+            result = c.compress(messages, current_tokens=90_000)
+        # 1. A summary message exists (compression actually ran).
+        assert any(
+            isinstance(m.get("content"), str)
+            and m["content"].startswith(SUMMARY_PREFIX)
+            for m in result
+        ), "compress() did not insert a summary message"
+        # 2. The visible reply text must survive somewhere — either
+        # as its own message OR concatenated into the merged tail.
+        joined = "\n".join(
+            m.get("content") for m in result
+            if isinstance(m.get("content"), str)
+        )
+        assert "THE VISIBLE REPLY THE USER JUST READ" in joined, (
+            "REGRESSION (#29824): the visible reply was absorbed into "
+            "the compaction summary AND erased. Compressed transcript "
+            f"({len(result)} msgs): "
+            f"{[(m.get('role'), str(m.get('content'))[:50]) for m in result]}"
+        )
+
+    def test_standalone_summary_case_keeps_reply_as_own_message(
+        self, compressor
+    ):
+        """When the head and tail roles allow a standalone summary
+        message (no double-collision), the visible reply must remain
+        as its OWN assistant message — not merged with anything.
+        This is the common case; the merge-into-tail path is the
+        edge case for double-collision."""
+        from agent.context_compressor import SUMMARY_PREFIX
+        c = compressor
+        c.tail_token_budget = 10
+        _mocked = f"{SUMMARY_PREFIX}\nrolled-up middle summary"
+        # Head ends with ``assistant`` ⇒ summary_role flips to
+        # ``user`` ⇒ no collision with the assistant tail ⇒ standalone
+        # summary insert (no merge).
+        messages = (
+            [
+                {"role": "user", "content": "initial"},
+                {"role": "assistant", "content": "head reply"},
+            ]
+            + [
+                {"role": "user", "content": f"middle q{i}"}
+                if i % 2 == 0
+                else {"role": "assistant", "content": f"middle reply {i}"}
+                for i in range(12)
+            ]
+            + [
+                {"role": "user", "content": "the visible question"},
+                {"role": "assistant",
+                 "content": "THE VISIBLE REPLY THE USER JUST READ"},
+                {"role": "user", "content": "follow up"},
+            ]
+        )
+        with patch.object(
+            c, "_generate_summary",
+            return_value=_mocked,
+        ):
+            result = c.compress(messages, current_tokens=90_000)
+        # Standalone summary present:
+        summary_rows = [
+            m for m in result
+            if isinstance(m.get("content"), str)
+            and m["content"].startswith(SUMMARY_PREFIX)
+        ]
+        assert len(summary_rows) == 1
+        # Visible reply as its OWN distinct assistant message
+        # (NOT merged into the summary row):
+        reply_rows = [
+            m for m in result
+            if m.get("role") == "assistant"
+            and isinstance(m.get("content"), str)
+            and "THE VISIBLE REPLY THE USER JUST READ" in m["content"]
+            and not m["content"].startswith(SUMMARY_PREFIX)
+        ]
+        assert len(reply_rows) == 1, (
+            "REGRESSION (#29824): expected exactly one standalone "
+            f"assistant message carrying the visible reply, got "
+            f"{len(reply_rows)}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Source guardrail
+# ---------------------------------------------------------------------------
+
+
+class TestSourceGuardrail:
+    @pytest.fixture
+    def source(self) -> str:
+        from pathlib import Path
+        return (Path(__file__).resolve().parents[2]
+                / "agent" / "context_compressor.py").read_text(
+                    encoding="utf-8")
+
+    def test_helper_defined(self, source):
+        assert "def _find_last_assistant_message_idx(" in source
+        assert "def _ensure_last_assistant_message_in_tail(" in source
+
+    def test_anchor_called_from_find_tail_cut(self, source):
+        """Without the call site the helper is dead code and the bug
+        regresses silently — pin both the definition AND the wiring."""
+        assert "self._ensure_last_assistant_message_in_tail(" in source
+
+    def test_anchor_called_after_user_anchor(self, source):
+        """The two anchors must run in sequence; reversing or skipping
+        one drops the corresponding side of the guarantee."""
+        user_call = "self._ensure_last_user_message_in_tail(messages, cut_idx, head_end)"
+        asst_call = "self._ensure_last_assistant_message_in_tail(messages, cut_idx, head_end)"
+        user_idx = source.find(user_call)
+        asst_idx = source.find(asst_call)
+        assert user_idx >= 0 and asst_idx >= 0
+        assert asst_idx > user_idx, (
+            "The assistant anchor must come AFTER the user anchor in "
+            "``_find_tail_cut_by_tokens`` — each anchor walks cut_idx "
+            "backward, and ordering keeps the chain monotonic."
+        )
+
+    def test_helper_prefers_content_bearing_reply(self, source):
+        """The helper must skip tool-call-only stubs — that's the
+        whole user-experience difference between #29824 (no visible
+        reply) and an in-progress turn (small 'calling tool X' chip)."""
+        assert "content.strip()" in source
+
+    def test_issue_number_referenced(self, source):
+        assert "#29824" in source