diff --git a/agent/agent_init.py b/agent/agent_init.py index ffefcee5eb7..e7f2ed9eac3 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1575,6 +1575,7 @@ def init_agent( provider=agent.provider, api_mode=agent.api_mode, abort_on_summary_failure=compression_abort_on_summary_failure, + max_tokens=agent.max_tokens, ) agent.compression_enabled = compression_enabled agent.compression_in_place = compression_in_place diff --git a/agent/context_compressor.py b/agent/context_compressor.py index f1c6fca6f6e..5f9dcfa2e0d 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -667,6 +667,7 @@ class ContextCompressor(ContextEngine): api_key: Any = "", provider: str = "", api_mode: str = "", + max_tokens: int | None = None, ) -> None: """Update model info after a model switch or fallback activation.""" self.model = model @@ -675,8 +676,13 @@ class ContextCompressor(ContextEngine): self.provider = provider self.api_mode = api_mode self.context_length = context_length + # max_tokens=None here means "caller didn't specify" → keep the existing + # output reservation. A switch that genuinely changes the output budget + # passes the new value explicitly. (#43547) + if max_tokens is not None: + self.max_tokens = self._coerce_max_tokens(max_tokens) self.threshold_tokens = self._compute_threshold_tokens( - context_length, self.threshold_percent + context_length, self.threshold_percent, self.max_tokens, ) # Recalculate token budgets for the new context length so the # compressor stays calibrated after a model switch (e.g. 200K → 32K). @@ -716,11 +722,30 @@ class ContextCompressor(ContextEngine): _MIN_CTX_TRIGGER_RATIO = 0.85 @staticmethod - def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int: + def _coerce_max_tokens(value: Any) -> int | None: + """Normalize a max_tokens value to a positive int or None. + + Only a positive integer is a real output reservation. None (provider + default), non-numeric values, or <= 0 all mean "no reservation" — this + keeps the threshold arithmetic safe from non-int inputs (e.g. a test + MagicMock reaching ContextCompressor via a mocked parent agent). + """ + if value is None: + return None + try: + ivalue = int(value) + except (TypeError, ValueError): + return None + return ivalue if ivalue > 0 else None + + @staticmethod + def _compute_threshold_tokens( + context_length: int, threshold_percent: float, max_tokens: int | None = None, + ) -> int: """Compute the compaction trigger threshold in tokens. - The base value is ``context_length * threshold_percent``, floored at - ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress + The base value is ``effective_input_budget * threshold_percent``, floored + at ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress prematurely at 50%. BUT that floor degenerates at small windows: for a model whose ``context_length`` is at/below the minimum (e.g. a 64K local model), ``max(0.5*64000, 64000) == 64000`` makes the threshold @@ -731,15 +756,28 @@ class ContextCompressor(ContextEngine): ``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window — high enough that a small model uses most of its context before compacting, but below 100% so compaction fires before the provider rejects the request. + + The provider reserves ``max_tokens`` of output space out of the same + window, so the usable INPUT budget is ``context_length - max_tokens``. + With a large ``max_tokens`` (e.g. 65536 on a custom provider) the input + budget is materially smaller than the raw window, and a threshold based + on the full window lets the session hit a provider 400 before compaction + fires (#43547). The percentage and the degenerate-window check below both + operate on the effective input budget. ``max_tokens=None`` (provider + default) conservatively assumes no reservation (full window). """ - pct_value = int(context_length * threshold_percent) + effective_window = context_length - (max_tokens or 0) + if effective_window <= 0: + effective_window = context_length + pct_value = int(effective_window * threshold_percent) floored = max(pct_value, MINIMUM_CONTEXT_LENGTH) - # If flooring pushed the threshold to/over the window it can never be - # reached. Trigger at 85% of the window so a minimum-context model - # rides most of its budget before compacting instead of wasting half. - if context_length > 0 and floored >= context_length: - return max(1, min(int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO), - context_length - 1)) + # If flooring pushed the threshold to/over the effective window it can + # never be reached. Trigger at 85% of the effective input budget so a + # minimum-context model rides most of its budget before compacting + # instead of wasting half. + if effective_window > 0 and floored >= effective_window: + return max(1, min(int(effective_window * ContextCompressor._MIN_CTX_TRIGGER_RATIO), + effective_window - 1)) return floored def __init__( @@ -757,6 +795,7 @@ class ContextCompressor(ContextEngine): provider: str = "", api_mode: str = "", abort_on_summary_failure: bool = False, + max_tokens: int | None = None, ): self.model = model self.base_url = base_url @@ -768,6 +807,13 @@ class ContextCompressor(ContextEngine): self.protect_last_n = protect_last_n self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80)) self.quiet_mode = quiet_mode + # Output-token reservation: the provider carves max_tokens out of the + # context window, so the usable input budget is context_length - + # max_tokens. None = provider default => assume no reservation. (#43547) + # Coerce defensively: only a positive int is a real reservation; any + # other value (None, non-numeric, <=0) means "no reservation" so the + # threshold arithmetic never sees a non-int (e.g. a test MagicMock). + self.max_tokens = self._coerce_max_tokens(max_tokens) # When True, summary-generation failure aborts compression entirely # (returns messages unchanged, sets _last_compress_aborted=True). # When False (default = historical behavior), insert a @@ -786,7 +832,7 @@ class ContextCompressor(ContextEngine): # guards the degenerate case where the floor would equal/exceed the # window (small models), so auto-compression can still fire (#14690). self.threshold_tokens = self._compute_threshold_tokens( - self.context_length, threshold_percent + self.context_length, threshold_percent, self.max_tokens, ) self.compression_count = 0 diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 79e89b457bd..cdbf66469c6 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -264,6 +264,59 @@ class TestCompress: assert c.should_compress(55000) is True assert c.should_compress(40000) is False + def test_max_tokens_reservation_lowers_threshold(self): + """#43547: the provider reserves max_tokens out of the window, so the + threshold must be based on (context_length - max_tokens), not the full + window. A 200K model reserving 65536 output tokens has a ~134K input + budget; at 50% that's ~67K, NOT 100K.""" + # No reservation (provider default) → full-window behavior, unchanged. + assert ContextCompressor._compute_threshold_tokens(200000, 0.50) == 100000 + assert ContextCompressor._compute_threshold_tokens(200000, 0.50, None) == 100000 + # 65536 reserved → effective input budget 134464; 50% = 67232. + assert ContextCompressor._compute_threshold_tokens(200000, 0.50, 65536) == 67232 + + def test_max_tokens_reservation_with_small_window_floors(self): + """With a large reservation on a smaller window the effective budget + can drop near/below the minimum floor — the degenerate-window guard + then triggers at 85% of the EFFECTIVE budget, never the raw window.""" + # 128K window, 65536 reserved → effective 62464 (< MINIMUM 64000). + # Floor (64000) >= effective window (62464) → 85% of effective. + t = ContextCompressor._compute_threshold_tokens(128000, 0.50, 65536) + assert t == int(62464 * 0.85) # 53094 + assert t < 62464 + + def test_max_tokens_exceeding_window_falls_back_to_full(self): + """Pathological: max_tokens >= context_length would make the effective + budget <= 0; fall back to the full window rather than produce a + non-positive threshold.""" + t = ContextCompressor._compute_threshold_tokens(64000, 0.50, 70000) + # effective_window <= 0 → fall back to full context (64000) → 85% guard. + assert t == 54400 # 85% of 64000, same as no-reservation small-ctx case + assert t > 0 + + def test_max_tokens_coercion_treats_non_int_as_no_reservation(self): + """A non-int / non-positive max_tokens must coerce safely so the + threshold arithmetic never raises. Guards the path where a mocked + parent agent forwards a MagicMock max_tokens into a child + ContextCompressor (regression for the delegate-test TypeError: + '<=' not supported between MagicMock and int).""" + from unittest.mock import MagicMock + assert ContextCompressor._coerce_max_tokens(None) is None + assert ContextCompressor._coerce_max_tokens(0) is None + assert ContextCompressor._coerce_max_tokens(-5) is None + assert ContextCompressor._coerce_max_tokens("nope") is None + assert ContextCompressor._coerce_max_tokens(65536) == 65536 + # The actual regression: building a compressor with a MagicMock + # max_tokens must NOT raise (the unmocked code did `ctx - MagicMock` + # then `MagicMock <= 0`). int(MagicMock()) returns 1, so coercion + # yields a harmless positive int rather than crashing — the threshold + # is computed cleanly with a 1-token reservation. + with patch("agent.context_compressor.get_model_context_length", return_value=200000): + c = ContextCompressor(model="m", quiet_mode=True, max_tokens=MagicMock()) + assert isinstance(c.max_tokens, int) + assert isinstance(c.threshold_tokens, int) + assert c.threshold_tokens > 0 # no crash, sane value + def test_compression_increments_count(self, compressor): msgs = self._make_messages(10) # Default config (abort_on_summary_failure=False) — fallback path