Merge pull request #50781 from NousResearch/salvage/output-token-reservation-threshold

fix(compress): reserve output tokens in the compaction threshold (#23767, #43547)
This commit is contained in:
kshitij 2026-06-22 17:33:09 +05:30 committed by GitHub
commit a904ff1724
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 112 additions and 12 deletions

View file

@ -1575,6 +1575,7 @@ def init_agent(
provider=agent.provider,
api_mode=agent.api_mode,
abort_on_summary_failure=compression_abort_on_summary_failure,
max_tokens=agent.max_tokens,
)
agent.compression_enabled = compression_enabled
agent.compression_in_place = compression_in_place

View file

@ -667,6 +667,7 @@ class ContextCompressor(ContextEngine):
api_key: Any = "",
provider: str = "",
api_mode: str = "",
max_tokens: int | None = None,
) -> None:
"""Update model info after a model switch or fallback activation."""
self.model = model
@ -675,8 +676,13 @@ class ContextCompressor(ContextEngine):
self.provider = provider
self.api_mode = api_mode
self.context_length = context_length
# max_tokens=None here means "caller didn't specify" → keep the existing
# output reservation. A switch that genuinely changes the output budget
# passes the new value explicitly. (#43547)
if max_tokens is not None:
self.max_tokens = self._coerce_max_tokens(max_tokens)
self.threshold_tokens = self._compute_threshold_tokens(
context_length, self.threshold_percent
context_length, self.threshold_percent, self.max_tokens,
)
# Recalculate token budgets for the new context length so the
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
@ -716,11 +722,30 @@ class ContextCompressor(ContextEngine):
_MIN_CTX_TRIGGER_RATIO = 0.85
@staticmethod
def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
def _coerce_max_tokens(value: Any) -> int | None:
"""Normalize a max_tokens value to a positive int or None.
Only a positive integer is a real output reservation. None (provider
default), non-numeric values, or <= 0 all mean "no reservation" this
keeps the threshold arithmetic safe from non-int inputs (e.g. a test
MagicMock reaching ContextCompressor via a mocked parent agent).
"""
if value is None:
return None
try:
ivalue = int(value)
except (TypeError, ValueError):
return None
return ivalue if ivalue > 0 else None
@staticmethod
def _compute_threshold_tokens(
context_length: int, threshold_percent: float, max_tokens: int | None = None,
) -> int:
"""Compute the compaction trigger threshold in tokens.
The base value is ``context_length * threshold_percent``, floored at
``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
The base value is ``effective_input_budget * threshold_percent``, floored
at ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
prematurely at 50%. BUT that floor degenerates at small windows: for a
model whose ``context_length`` is at/below the minimum (e.g. a 64K
local model), ``max(0.5*64000, 64000) == 64000`` makes the threshold
@ -731,15 +756,28 @@ class ContextCompressor(ContextEngine):
``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window high enough that a
small model uses most of its context before compacting, but below
100% so compaction fires before the provider rejects the request.
The provider reserves ``max_tokens`` of output space out of the same
window, so the usable INPUT budget is ``context_length - max_tokens``.
With a large ``max_tokens`` (e.g. 65536 on a custom provider) the input
budget is materially smaller than the raw window, and a threshold based
on the full window lets the session hit a provider 400 before compaction
fires (#43547). The percentage and the degenerate-window check below both
operate on the effective input budget. ``max_tokens=None`` (provider
default) conservatively assumes no reservation (full window).
"""
pct_value = int(context_length * threshold_percent)
effective_window = context_length - (max_tokens or 0)
if effective_window <= 0:
effective_window = context_length
pct_value = int(effective_window * threshold_percent)
floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
# If flooring pushed the threshold to/over the window it can never be
# reached. Trigger at 85% of the window so a minimum-context model
# rides most of its budget before compacting instead of wasting half.
if context_length > 0 and floored >= context_length:
return max(1, min(int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
context_length - 1))
# If flooring pushed the threshold to/over the effective window it can
# never be reached. Trigger at 85% of the effective input budget so a
# minimum-context model rides most of its budget before compacting
# instead of wasting half.
if effective_window > 0 and floored >= effective_window:
return max(1, min(int(effective_window * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
effective_window - 1))
return floored
def __init__(
@ -757,6 +795,7 @@ class ContextCompressor(ContextEngine):
provider: str = "",
api_mode: str = "",
abort_on_summary_failure: bool = False,
max_tokens: int | None = None,
):
self.model = model
self.base_url = base_url
@ -768,6 +807,13 @@ class ContextCompressor(ContextEngine):
self.protect_last_n = protect_last_n
self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
self.quiet_mode = quiet_mode
# Output-token reservation: the provider carves max_tokens out of the
# context window, so the usable input budget is context_length -
# max_tokens. None = provider default => assume no reservation. (#43547)
# Coerce defensively: only a positive int is a real reservation; any
# other value (None, non-numeric, <=0) means "no reservation" so the
# threshold arithmetic never sees a non-int (e.g. a test MagicMock).
self.max_tokens = self._coerce_max_tokens(max_tokens)
# When True, summary-generation failure aborts compression entirely
# (returns messages unchanged, sets _last_compress_aborted=True).
# When False (default = historical behavior), insert a
@ -786,7 +832,7 @@ class ContextCompressor(ContextEngine):
# guards the degenerate case where the floor would equal/exceed the
# window (small models), so auto-compression can still fire (#14690).
self.threshold_tokens = self._compute_threshold_tokens(
self.context_length, threshold_percent
self.context_length, threshold_percent, self.max_tokens,
)
self.compression_count = 0

View file

@ -264,6 +264,59 @@ class TestCompress:
assert c.should_compress(55000) is True
assert c.should_compress(40000) is False
def test_max_tokens_reservation_lowers_threshold(self):
"""#43547: the provider reserves max_tokens out of the window, so the
threshold must be based on (context_length - max_tokens), not the full
window. A 200K model reserving 65536 output tokens has a ~134K input
budget; at 50% that's ~67K, NOT 100K."""
# No reservation (provider default) → full-window behavior, unchanged.
assert ContextCompressor._compute_threshold_tokens(200000, 0.50) == 100000
assert ContextCompressor._compute_threshold_tokens(200000, 0.50, None) == 100000
# 65536 reserved → effective input budget 134464; 50% = 67232.
assert ContextCompressor._compute_threshold_tokens(200000, 0.50, 65536) == 67232
def test_max_tokens_reservation_with_small_window_floors(self):
"""With a large reservation on a smaller window the effective budget
can drop near/below the minimum floor the degenerate-window guard
then triggers at 85% of the EFFECTIVE budget, never the raw window."""
# 128K window, 65536 reserved → effective 62464 (< MINIMUM 64000).
# Floor (64000) >= effective window (62464) → 85% of effective.
t = ContextCompressor._compute_threshold_tokens(128000, 0.50, 65536)
assert t == int(62464 * 0.85) # 53094
assert t < 62464
def test_max_tokens_exceeding_window_falls_back_to_full(self):
"""Pathological: max_tokens >= context_length would make the effective
budget <= 0; fall back to the full window rather than produce a
non-positive threshold."""
t = ContextCompressor._compute_threshold_tokens(64000, 0.50, 70000)
# effective_window <= 0 → fall back to full context (64000) → 85% guard.
assert t == 54400 # 85% of 64000, same as no-reservation small-ctx case
assert t > 0
def test_max_tokens_coercion_treats_non_int_as_no_reservation(self):
"""A non-int / non-positive max_tokens must coerce safely so the
threshold arithmetic never raises. Guards the path where a mocked
parent agent forwards a MagicMock max_tokens into a child
ContextCompressor (regression for the delegate-test TypeError:
'<=' not supported between MagicMock and int)."""
from unittest.mock import MagicMock
assert ContextCompressor._coerce_max_tokens(None) is None
assert ContextCompressor._coerce_max_tokens(0) is None
assert ContextCompressor._coerce_max_tokens(-5) is None
assert ContextCompressor._coerce_max_tokens("nope") is None
assert ContextCompressor._coerce_max_tokens(65536) == 65536
# The actual regression: building a compressor with a MagicMock
# max_tokens must NOT raise (the unmocked code did `ctx - MagicMock`
# then `MagicMock <= 0`). int(MagicMock()) returns 1, so coercion
# yields a harmless positive int rather than crashing — the threshold
# is computed cleanly with a 1-token reservation.
with patch("agent.context_compressor.get_model_context_length", return_value=200000):
c = ContextCompressor(model="m", quiet_mode=True, max_tokens=MagicMock())
assert isinstance(c.max_tokens, int)
assert isinstance(c.threshold_tokens, int)
assert c.threshold_tokens > 0 # no crash, sane value
def test_compression_increments_count(self, compressor):
msgs = self._make_messages(10)
# Default config (abort_on_summary_failure=False) — fallback path