fix(agent): count tokens, not just rows, as preflight compression progress

Rebased onto the god-file Phase 1 refactor — preflight compression has moved from agent/conversation_loop.py to agent/turn_context.py (the refactor itself made no semantic change; the bug below was carried over verbatim). The preflight compression loop in ``turn_context.py`` treats ``len(messages) >= _orig_len`` as proof that a compression pass made no progress. That conflates two different conditions: a true no-op (transcript materially unchanged) and effective token compression that summarises message contents but keeps the same number of rows. The second case is misread as "Cannot compress further" — the session then surfaces ``Context length exceeded`` and auto-resets even when the post-compression estimate is far below the model context window. Observed example from #39548: a Telegram session on GPT-5.5 with a 1M-token context window dropped from ~288k → ~183k tokens (a 36% reduction) while preserving all 220 messages. The loop treated that as exhaustion and the gateway auto-reset the session. Fix: add ``_compression_made_progress(orig_len, new_len, orig_tokens, new_tokens)`` and call it after the post-pass ``estimate_request_tokens_rough`` (which is moved up to run *before* the progress check instead of after it). Either a row-count reduction OR a token-count reduction now counts as progress; only when neither shrinks do we break out as "stuck". Fixes #39548
2026-06-23 10:42:00 +00:00 · 2026-06-09 23:12:50 +08:00 · 2026-06-09 23:12:50 +08:00 · b08ee8ad04
commit b08ee8ad04
parent 33efff0d8c
2 changed files with 97 additions and 7 deletions
--- a/agent/turn_context.py
+++ b/agent/turn_context.py
@@ -34,6 +34,23 @@ from agent.model_metadata import estimate_request_tokens_rough
 logger = logging.getLogger(__name__)


+def _compression_made_progress(
+    orig_len: int, new_len: int, orig_tokens: int, new_tokens: int
+) -> bool:
+    """Return ``True`` if a compression pass materially reduced the request.
+
+    Compression can succeed by summarising message contents — reducing the
+    estimated request token count — without reducing the message row
+    count.  Treating row count as the sole progress signal false-positives
+    on size-only wins and surfaces a misleading "Cannot compress further"
+    failure even when post-compression tokens are well below the model
+    context window.  See issue #39548 for an observed case: 220 → 220
+    messages, ~288k → ~183k tokens on a 1M-context model still triggered
+    auto-reset.
+    """
+    return new_len < orig_len or new_tokens < orig_tokens
+
+
 @dataclass
 class TurnContext:
    """Values produced by the turn prologue and consumed by the turn loop."""
@@ -313,23 +330,30 @@ def build_turn_context(
            )
            for _pass in range(3):
                _orig_len = len(messages)
+                _orig_tokens = _preflight_tokens
                messages, active_system_prompt = agent._compress_context(
                    messages, system_message, approx_tokens=_preflight_tokens,
                    task_id=effective_task_id,
                )
-                if len(messages) >= _orig_len:
-                    break  # Cannot compress further
+                # Re-estimate now so size-only compression (same row count,
+                # lower token count — e.g. summarising tool outputs) is
+                # recognised as progress instead of being misread as
+                # "Cannot compress further". Fixes #39548.
+                _preflight_tokens = estimate_request_tokens_rough(
+                    messages,
+                    system_prompt=active_system_prompt or "",
+                    tools=agent.tools or None,
+                )
+                if not _compression_made_progress(
+                    _orig_len, len(messages), _orig_tokens, _preflight_tokens
+                ):
+                    break  # Cannot compress further: neither rows nor tokens moved
                conversation_history = None
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
                agent._last_content_with_tools = None
                agent._last_content_tools_all_housekeeping = False
                agent._mute_post_response = False
-                _preflight_tokens = estimate_request_tokens_rough(
-                    messages,
-                    system_prompt=active_system_prompt or "",
-                    tools=agent.tools or None,
-                )
                if not _compressor.should_compress(_preflight_tokens):
                    break

--- a/tests/agent/test_compression_progress.py
+++ b/tests/agent/test_compression_progress.py
@@ -0,0 +1,66 @@
+"""Regression: detect compression progress by tokens, not just rows.
+
+Issue #39548: preflight compression in the turn prologue was checking
+``len(messages) >= _orig_len`` to decide "Cannot compress further". This
+false-positives when a pass summarises message contents — reducing the
+estimated request token count without removing any rows — and surfaces a
+spurious ``Context length exceeded`` failure followed by an auto-reset of
+an otherwise healthy session.
+
+These tests pin the contract of ``_compression_made_progress``: either a
+row-count reduction OR a token-count reduction counts as progress.
+"""
+
+from __future__ import annotations
+
+from agent.turn_context import _compression_made_progress
+
+
+class TestCompressionMadeProgress:
+    def test_rows_reduced_counts_as_progress(self):
+        """Removing message rows is the obvious progress signal."""
+        assert _compression_made_progress(
+            orig_len=10, new_len=5, orig_tokens=1000, new_tokens=1000
+        ) is True
+
+    def test_tokens_reduced_without_row_change_counts_as_progress(self):
+        """Issue #39548: 220 → 220 rows, 288k → 183k tokens IS progress."""
+        assert _compression_made_progress(
+            orig_len=220, new_len=220, orig_tokens=288_028, new_tokens=183_180
+        ) is True
+
+    def test_both_reduced_counts_as_progress(self):
+        """Common case: summarising drops some rows and shrinks the rest."""
+        assert _compression_made_progress(
+            orig_len=220, new_len=180, orig_tokens=288_028, new_tokens=150_000
+        ) is True
+
+    def test_neither_moved_means_no_progress(self):
+        """The genuine "stuck" case — same rows, same tokens, give up."""
+        assert _compression_made_progress(
+            orig_len=10, new_len=10, orig_tokens=1000, new_tokens=1000
+        ) is False
+
+    def test_rows_grew_and_tokens_grew_means_no_progress(self):
+        """Pathological: the pass made the request larger — definitely stuck."""
+        assert _compression_made_progress(
+            orig_len=10, new_len=12, orig_tokens=1000, new_tokens=1200
+        ) is False
+
+    def test_rows_grew_but_tokens_dropped_is_progress(self):
+        """Edge: summary rows may expand the row count while shrinking tokens.
+
+        Token reduction alone is sufficient to keep the loop going.
+        """
+        assert _compression_made_progress(
+            orig_len=10, new_len=11, orig_tokens=1000, new_tokens=600
+        ) is True
+
+    def test_tokens_grew_but_rows_dropped_is_progress(self):
+        """Edge: row reduction alone is sufficient even if tokens nominally
+        creep up (e.g. summary verbosity).  Row-count reduction is a hard
+        signal that the transcript actually shrank.
+        """
+        assert _compression_made_progress(
+            orig_len=10, new_len=5, orig_tokens=1000, new_tokens=1100
+        ) is True