diff --git a/agent/turn_context.py b/agent/turn_context.py index 0bbdf73764e..df34c6edfcb 100644 --- a/agent/turn_context.py +++ b/agent/turn_context.py @@ -34,6 +34,23 @@ from agent.model_metadata import estimate_request_tokens_rough logger = logging.getLogger(__name__) +def _compression_made_progress( + orig_len: int, new_len: int, orig_tokens: int, new_tokens: int +) -> bool: + """Return ``True`` if a compression pass materially reduced the request. + + Compression can succeed by summarising message contents — reducing the + estimated request token count — without reducing the message row + count. Treating row count as the sole progress signal misfires + on size-only wins and surfaces a misleading "Cannot compress further" + failure even when post-compression tokens are well below the model + context window. See issue #39548 for an observed case: 220 → 220 + messages, ~288k → ~183k tokens on a 1M-context model still triggered + auto-reset. + """ + return new_len < orig_len or new_tokens < orig_tokens + + @dataclass class TurnContext: """Values produced by the turn prologue and consumed by the turn loop.""" @@ -313,23 +330,30 @@ def build_turn_context( ) for _pass in range(3): _orig_len = len(messages) + _orig_tokens = _preflight_tokens messages, active_system_prompt = agent._compress_context( messages, system_message, approx_tokens=_preflight_tokens, task_id=effective_task_id, ) - if len(messages) >= _orig_len: - break # Cannot compress further + # Re-estimate now so size-only compression (same row count, + # lower token count — e.g. summarising tool outputs) is + # recognised as progress instead of being misread as + # "Cannot compress further". Fixes #39548. 
+ _preflight_tokens = estimate_request_tokens_rough( + messages, + system_prompt=active_system_prompt or "", + tools=agent.tools or None, + ) + if not _compression_made_progress( + _orig_len, len(messages), _orig_tokens, _preflight_tokens + ): + break # Cannot compress further: neither rows nor tokens moved conversation_history = None agent._empty_content_retries = 0 agent._thinking_prefill_retries = 0 agent._last_content_with_tools = None agent._last_content_tools_all_housekeeping = False agent._mute_post_response = False - _preflight_tokens = estimate_request_tokens_rough( - messages, - system_prompt=active_system_prompt or "", - tools=agent.tools or None, - ) if not _compressor.should_compress(_preflight_tokens): break diff --git a/tests/agent/test_compression_progress.py b/tests/agent/test_compression_progress.py new file mode 100644 index 00000000000..05e64b37a52 --- /dev/null +++ b/tests/agent/test_compression_progress.py @@ -0,0 +1,66 @@ +"""Regression: detect compression progress by tokens, not just rows. + +Issue #39548: preflight compression in the turn prologue was checking +``len(messages) >= _orig_len`` to decide "Cannot compress further". This +misfires when a pass summarises message contents — reducing the +estimated request token count without removing any rows — and surfaces a +spurious ``Context length exceeded`` failure followed by an auto-reset of +an otherwise healthy session. + +These tests pin the contract of ``_compression_made_progress``: either a +row-count reduction OR a token-count reduction counts as progress. 
+""" + +from __future__ import annotations + +from agent.turn_context import _compression_made_progress + + +class TestCompressionMadeProgress: + def test_rows_reduced_counts_as_progress(self): + """Removing message rows is the obvious progress signal.""" + assert _compression_made_progress( + orig_len=10, new_len=5, orig_tokens=1000, new_tokens=1000 + ) is True + + def test_tokens_reduced_without_row_change_counts_as_progress(self): + """Issue #39548: 220 → 220 rows, 288k → 183k tokens IS progress.""" + assert _compression_made_progress( + orig_len=220, new_len=220, orig_tokens=288_028, new_tokens=183_180 + ) is True + + def test_both_reduced_counts_as_progress(self): + """Common case: summarising drops some rows and shrinks the rest.""" + assert _compression_made_progress( + orig_len=220, new_len=180, orig_tokens=288_028, new_tokens=150_000 + ) is True + + def test_neither_moved_means_no_progress(self): + """The genuine "stuck" case — same rows, same tokens, give up.""" + assert _compression_made_progress( + orig_len=10, new_len=10, orig_tokens=1000, new_tokens=1000 + ) is False + + def test_rows_grew_and_tokens_grew_means_no_progress(self): + """Pathological: the pass made the request larger — definitely stuck.""" + assert _compression_made_progress( + orig_len=10, new_len=12, orig_tokens=1000, new_tokens=1200 + ) is False + + def test_rows_grew_but_tokens_dropped_is_progress(self): + """Edge: summary rows may expand the row count while shrinking tokens. + + Token reduction alone is sufficient to keep the loop going. + """ + assert _compression_made_progress( + orig_len=10, new_len=11, orig_tokens=1000, new_tokens=600 + ) is True + + def test_tokens_grew_but_rows_dropped_is_progress(self): + """Edge: row reduction alone is sufficient even if tokens nominally + creep up (e.g. summary verbosity). Row-count reduction is a hard + signal that the transcript actually shrank. 
+ """ + assert _compression_made_progress( + orig_len=10, new_len=5, orig_tokens=1000, new_tokens=1100 + ) is True