Merge pull request #50781 from NousResearch/salvage/output-token-reservation-threshold

fix(compress): reserve output tokens in the compaction threshold (#23767, #43547)
2026-06-23 10:42:00 +00:00 · 2026-06-22 17:33:09 +05:30 · 2026-06-22 17:33:09 +05:30 · a904ff1724
commit a904ff1724
parent 75a70d98f3 623b21bf24
3 changed files with 112 additions and 12 deletions
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@ -1575,6 +1575,7 @@ def init_agent(
            provider=agent.provider,
            api_mode=agent.api_mode,
            abort_on_summary_failure=compression_abort_on_summary_failure,
+            max_tokens=agent.max_tokens,
        )
    agent.compression_enabled = compression_enabled
    agent.compression_in_place = compression_in_place
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -667,6 +667,7 @@ class ContextCompressor(ContextEngine):
        api_key: Any = "",
        provider: str = "",
        api_mode: str = "",
+        max_tokens: int | None = None,
    ) -> None:
        """Update model info after a model switch or fallback activation."""
        self.model = model
@ -675,8 +676,13 @@ class ContextCompressor(ContextEngine):
        self.provider = provider
        self.api_mode = api_mode
        self.context_length = context_length
+        # max_tokens=None here means "caller didn't specify" → keep the existing
+        # output reservation. A switch that genuinely changes the output budget
+        # passes the new value explicitly. (#43547)
+        if max_tokens is not None:
+            self.max_tokens = self._coerce_max_tokens(max_tokens)
        self.threshold_tokens = self._compute_threshold_tokens(
-            context_length, self.threshold_percent
+            context_length, self.threshold_percent, self.max_tokens,
        )
        # Recalculate token budgets for the new context length so the
        # compressor stays calibrated after a model switch (e.g. 200K → 32K).
@ -716,11 +722,30 @@ class ContextCompressor(ContextEngine):
    _MIN_CTX_TRIGGER_RATIO = 0.85

    @staticmethod
-    def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
+    def _coerce_max_tokens(value: Any) -> int | None:
+        """Normalize a max_tokens value to a positive int or None.
+
+        Only a positive integer is a real output reservation. None (provider
+        default), non-numeric values, or <= 0 all mean "no reservation" — this
+        keeps the threshold arithmetic safe from non-int inputs (e.g. a test
+        MagicMock reaching ContextCompressor via a mocked parent agent).
+        """
+        if value is None:
+            return None
+        try:
+            ivalue = int(value)
+        except (TypeError, ValueError):
+            return None
+        return ivalue if ivalue > 0 else None
+
+    @staticmethod
+    def _compute_threshold_tokens(
+        context_length: int, threshold_percent: float, max_tokens: int | None = None,
+    ) -> int:
        """Compute the compaction trigger threshold in tokens.

-        The base value is ``context_length * threshold_percent``, floored at
-        ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
+        The base value is ``effective_input_budget * threshold_percent``, floored
+        at ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
        prematurely at 50%. BUT that floor degenerates at small windows: for a
        model whose ``context_length`` is at/below the minimum (e.g. a 64K
        local model), ``max(0.5*64000, 64000) == 64000`` makes the threshold
@ -731,15 +756,28 @@ class ContextCompressor(ContextEngine):
        ``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window — high enough that a
        small model uses most of its context before compacting, but below
        100% so compaction fires before the provider rejects the request.
+
+        The provider reserves ``max_tokens`` of output space out of the same
+        window, so the usable INPUT budget is ``context_length - max_tokens``.
+        With a large ``max_tokens`` (e.g. 65536 on a custom provider) the input
+        budget is materially smaller than the raw window, and a threshold based
+        on the full window lets the session hit a provider 400 before compaction
+        fires (#43547). The percentage and the degenerate-window check below both
+        operate on the effective input budget. ``max_tokens=None`` (provider
+        default) conservatively assumes no reservation (full window).
        """
-        pct_value = int(context_length * threshold_percent)
+        effective_window = context_length - (max_tokens or 0)
+        if effective_window <= 0:
+            effective_window = context_length
+        pct_value = int(effective_window * threshold_percent)
        floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
-        # If flooring pushed the threshold to/over the window it can never be
-        # reached. Trigger at 85% of the window so a minimum-context model
-        # rides most of its budget before compacting instead of wasting half.
-        if context_length > 0 and floored >= context_length:
-            return max(1, min(int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
-                              context_length - 1))
+        # If flooring pushed the threshold to/over the effective window it can
+        # never be reached. Trigger at 85% of the effective input budget so a
+        # minimum-context model rides most of its budget before compacting
+        # instead of wasting half.
+        if effective_window > 0 and floored >= effective_window:
+            return max(1, min(int(effective_window * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
+                              effective_window - 1))
        return floored

    def __init__(
@ -757,6 +795,7 @@ class ContextCompressor(ContextEngine):
        provider: str = "",
        api_mode: str = "",
        abort_on_summary_failure: bool = False,
+        max_tokens: int | None = None,
    ):
        self.model = model
        self.base_url = base_url
@ -768,6 +807,13 @@ class ContextCompressor(ContextEngine):
        self.protect_last_n = protect_last_n
        self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
        self.quiet_mode = quiet_mode
+        # Output-token reservation: the provider carves max_tokens out of the
+        # context window, so the usable input budget is context_length -
+        # max_tokens. None = provider default => assume no reservation. (#43547)
+        # Coerce defensively: only a positive int is a real reservation; any
+        # other value (None, non-numeric, <=0) means "no reservation" so the
+        # threshold arithmetic never sees a non-int (e.g. a test MagicMock).
+        self.max_tokens = self._coerce_max_tokens(max_tokens)
        # When True, summary-generation failure aborts compression entirely
        # (returns messages unchanged, sets _last_compress_aborted=True).
        # When False (default = historical behavior), insert a
@ -786,7 +832,7 @@ class ContextCompressor(ContextEngine):
        # guards the degenerate case where the floor would equal/exceed the
        # window (small models), so auto-compression can still fire (#14690).
        self.threshold_tokens = self._compute_threshold_tokens(
-            self.context_length, threshold_percent
+            self.context_length, threshold_percent, self.max_tokens,
        )
        self.compression_count = 0

--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@ -264,6 +264,59 @@ class TestCompress:
        assert c.should_compress(55000) is True
        assert c.should_compress(40000) is False

+    def test_max_tokens_reservation_lowers_threshold(self):
+        """#43547: the provider reserves max_tokens out of the window, so the
+        threshold must be based on (context_length - max_tokens), not the full
+        window. A 200K model reserving 65536 output tokens has a ~134K input
+        budget; at 50% that's ~67K, NOT 100K."""
+        # No reservation (provider default) → full-window behavior, unchanged.
+        assert ContextCompressor._compute_threshold_tokens(200000, 0.50) == 100000
+        assert ContextCompressor._compute_threshold_tokens(200000, 0.50, None) == 100000
+        # 65536 reserved → effective input budget 134464; 50% = 67232.
+        assert ContextCompressor._compute_threshold_tokens(200000, 0.50, 65536) == 67232
+
+    def test_max_tokens_reservation_with_small_window_floors(self):
+        """With a large reservation on a smaller window the effective budget
+        can drop near/below the minimum floor — the degenerate-window guard
+        then triggers at 85% of the EFFECTIVE budget, never the raw window."""
+        # 128K window, 65536 reserved → effective 62464 (< MINIMUM 64000).
+        # Floor (64000) >= effective window (62464) → 85% of effective.
+        t = ContextCompressor._compute_threshold_tokens(128000, 0.50, 65536)
+        assert t == int(62464 * 0.85)  # 53094
+        assert t < 62464
+
+    def test_max_tokens_exceeding_window_falls_back_to_full(self):
+        """Pathological: max_tokens >= context_length would make the effective
+        budget <= 0; fall back to the full window rather than produce a
+        non-positive threshold."""
+        t = ContextCompressor._compute_threshold_tokens(64000, 0.50, 70000)
+        # effective_window <= 0 → fall back to full context (64000) → 85% guard.
+        assert t == 54400  # 85% of 64000, same as no-reservation small-ctx case
+        assert t > 0
+
+    def test_max_tokens_coercion_treats_non_int_as_no_reservation(self):
+        """A non-int / non-positive max_tokens must coerce safely so the
+        threshold arithmetic never raises. Guards the path where a mocked
+        parent agent forwards a MagicMock max_tokens into a child
+        ContextCompressor (regression for the delegate-test TypeError:
+        '<=' not supported between MagicMock and int)."""
+        from unittest.mock import MagicMock
+        assert ContextCompressor._coerce_max_tokens(None) is None
+        assert ContextCompressor._coerce_max_tokens(0) is None
+        assert ContextCompressor._coerce_max_tokens(-5) is None
+        assert ContextCompressor._coerce_max_tokens("nope") is None
+        assert ContextCompressor._coerce_max_tokens(65536) == 65536
+        # The actual regression: building a compressor with a MagicMock
+        # max_tokens must NOT raise (the unmocked code did `ctx - MagicMock`
+        # then `MagicMock <= 0`). int(MagicMock()) returns 1, so coercion
+        # yields a harmless positive int rather than crashing — the threshold
+        # is computed cleanly with a 1-token reservation.
+        with patch("agent.context_compressor.get_model_context_length", return_value=200000):
+            c = ContextCompressor(model="m", quiet_mode=True, max_tokens=MagicMock())
+        assert isinstance(c.max_tokens, int)
+        assert isinstance(c.threshold_tokens, int)
+        assert c.threshold_tokens > 0  # no crash, sane value
+
    def test_compression_increments_count(self, compressor):
        msgs = self._make_messages(10)
        # Default config (abort_on_summary_failure=False) — fallback path