fix(agent): count tokens, not just rows, as preflight compression progress

Rebased onto god-file Phase 1 refactor — preflight compression has moved
from agent/conversation_loop.py to agent/turn_context.py (no semantic
change in the refactor itself; the bug below was carried over verbatim).

The preflight compression loop in ``turn_context.py`` uses
``len(messages) >= _orig_len`` to decide whether a compression pass has
made progress. That conflates two different conditions: a true no-op
(transcript materially unchanged) and effective token compression that
summarises message contents but keeps the same number of rows. The
second case is misread as "Cannot compress further" — the session then
surfaces ``Context length exceeded`` and auto-resets even when the
post-compression estimate is far below the model context window.

Observed example from #39548: a Telegram session on GPT-5.5 with a 1M
context dropped from ~288k → ~183k tokens (a 36% reduction) while
preserving 220 messages. The loop treats that as exhaustion and the
gateway auto-resets the session.

Fix
---
Add ``_compression_made_progress(orig_len, new_len, orig_tokens, new_tokens)``
and call it after the post-pass ``estimate_request_tokens_rough`` (which
is moved up to run *before* the progress check instead of after it).
Either a row-count reduction OR a token-count reduction now counts as
progress; only when neither moves do we break out as "stuck".

Fixes #39548
This commit is contained in:
JackJin 2026-06-09 23:12:50 +08:00 committed by kshitijk4poor
parent 33efff0d8c
commit b08ee8ad04
2 changed files with 97 additions and 7 deletions

View file

@ -34,6 +34,23 @@ from agent.model_metadata import estimate_request_tokens_rough
logger = logging.getLogger(__name__)
def _compression_made_progress(
    orig_len: int, new_len: int, orig_tokens: int, new_tokens: int
) -> bool:
    """Return ``True`` if a compression pass materially reduced the request.

    Compression can succeed by summarising message contents — reducing the
    estimated request token count — without reducing the message row
    count. Treating row count as the sole progress signal misreads such
    size-only wins as "no progress" and surfaces a misleading
    "Cannot compress further" failure even when post-compression tokens
    are well below the model context window. See issue #39548 for an
    observed case: 220 → 220 messages, ~288k → ~183k tokens on a
    1M-context model still triggered auto-reset.

    Args:
        orig_len: Number of messages before the compression pass.
        new_len: Number of messages after the compression pass.
        orig_tokens: Estimated request tokens before the pass.
        new_tokens: Estimated request tokens after the pass.

    Returns:
        ``True`` when the row count strictly decreased or the token
        estimate strictly decreased; ``False`` when neither did.
    """
    # A strict drop in row count is sufficient on its own, even if the
    # token estimate did not go down.
    if new_len < orig_len:
        return True
    # Otherwise only a strict drop in the token estimate counts; equal or
    # larger values on both axes mean the pass did not shrink the request.
    return new_tokens < orig_tokens
@dataclass
class TurnContext:
"""Values produced by the turn prologue and consumed by the turn loop."""
@ -313,23 +330,30 @@ def build_turn_context(
)
for _pass in range(3):
_orig_len = len(messages)
_orig_tokens = _preflight_tokens
messages, active_system_prompt = agent._compress_context(
messages, system_message, approx_tokens=_preflight_tokens,
task_id=effective_task_id,
)
if len(messages) >= _orig_len:
break # Cannot compress further
# Re-estimate now so size-only compression (same row count,
# lower token count — e.g. summarising tool outputs) is
# recognised as progress instead of being misread as
# "Cannot compress further". Fixes #39548.
_preflight_tokens = estimate_request_tokens_rough(
messages,
system_prompt=active_system_prompt or "",
tools=agent.tools or None,
)
if not _compression_made_progress(
_orig_len, len(messages), _orig_tokens, _preflight_tokens
):
break # Cannot compress further: neither rows nor tokens moved
conversation_history = None
agent._empty_content_retries = 0
agent._thinking_prefill_retries = 0
agent._last_content_with_tools = None
agent._last_content_tools_all_housekeeping = False
agent._mute_post_response = False
_preflight_tokens = estimate_request_tokens_rough(
messages,
system_prompt=active_system_prompt or "",
tools=agent.tools or None,
)
if not _compressor.should_compress(_preflight_tokens):
break

View file

@ -0,0 +1,66 @@
"""Regression: detect compression progress by tokens, not just rows.
Issue #39548: preflight compression in the turn prologue was checking
``len(messages) >= _orig_len`` to decide "Cannot compress further". This
false-positives when a pass summarises message contents — reducing the
estimated request token count without removing any rows — and surfaces a
spurious ``Context length exceeded`` failure followed by an auto-reset of
an otherwise healthy session.
These tests pin the contract of ``_compression_made_progress``: either a
row-count reduction OR a token-count reduction counts as progress.
"""
from __future__ import annotations
from agent.turn_context import _compression_made_progress
class TestCompressionMadeProgress:
    """Pin the contract of ``_compression_made_progress``.

    Each case calls the helper with keyword arguments and checks the exact
    boolean result: a strict reduction in rows OR in tokens is progress;
    anything else is not.
    """

    def test_rows_reduced_counts_as_progress(self):
        """Removing message rows is the obvious progress signal."""
        assert _compression_made_progress(
            orig_len=10, new_len=5, orig_tokens=1000, new_tokens=1000
        ) is True

    def test_tokens_reduced_without_row_change_counts_as_progress(self):
        """Issue #39548: 220 → 220 rows, 288k → 183k tokens IS progress."""
        assert _compression_made_progress(
            orig_len=220, new_len=220, orig_tokens=288_028, new_tokens=183_180
        ) is True

    def test_both_reduced_counts_as_progress(self):
        """Common case: summarising drops some rows and shrinks the rest."""
        assert _compression_made_progress(
            orig_len=220, new_len=180, orig_tokens=288_028, new_tokens=150_000
        ) is True

    def test_neither_moved_means_no_progress(self):
        """The genuine "stuck" case — same rows, same tokens, give up."""
        assert _compression_made_progress(
            orig_len=10, new_len=10, orig_tokens=1000, new_tokens=1000
        ) is False

    def test_rows_grew_and_tokens_grew_means_no_progress(self):
        """Pathological: the pass made the request larger — definitely stuck."""
        assert _compression_made_progress(
            orig_len=10, new_len=12, orig_tokens=1000, new_tokens=1200
        ) is False

    def test_rows_grew_but_tokens_dropped_is_progress(self):
        """Edge: summary rows may expand the row count while shrinking tokens.

        Token reduction alone is sufficient to keep the loop going.
        """
        assert _compression_made_progress(
            orig_len=10, new_len=11, orig_tokens=1000, new_tokens=600
        ) is True

    def test_tokens_grew_but_rows_dropped_is_progress(self):
        """Edge: row reduction alone is sufficient even if tokens nominally
        creep up (e.g. summary verbosity).

        Row-count reduction is a hard signal that the transcript actually
        shrank.
        """
        assert _compression_made_progress(
            orig_len=10, new_len=5, orig_tokens=1000, new_tokens=1100
        ) is True