mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
Merge pull request #50781 from NousResearch/salvage/output-token-reservation-threshold
fix(compress): reserve output tokens in the compaction threshold (#23767, #43547)
This commit is contained in:
commit
a904ff1724
3 changed files with 112 additions and 12 deletions
|
|
@ -1575,6 +1575,7 @@ def init_agent(
|
|||
provider=agent.provider,
|
||||
api_mode=agent.api_mode,
|
||||
abort_on_summary_failure=compression_abort_on_summary_failure,
|
||||
max_tokens=agent.max_tokens,
|
||||
)
|
||||
agent.compression_enabled = compression_enabled
|
||||
agent.compression_in_place = compression_in_place
|
||||
|
|
|
|||
|
|
@ -667,6 +667,7 @@ class ContextCompressor(ContextEngine):
|
|||
api_key: Any = "",
|
||||
provider: str = "",
|
||||
api_mode: str = "",
|
||||
max_tokens: int | None = None,
|
||||
) -> None:
|
||||
"""Update model info after a model switch or fallback activation."""
|
||||
self.model = model
|
||||
|
|
@ -675,8 +676,13 @@ class ContextCompressor(ContextEngine):
|
|||
self.provider = provider
|
||||
self.api_mode = api_mode
|
||||
self.context_length = context_length
|
||||
# max_tokens=None here means "caller didn't specify" → keep the existing
|
||||
# output reservation. A switch that genuinely changes the output budget
|
||||
# passes the new value explicitly. (#43547)
|
||||
if max_tokens is not None:
|
||||
self.max_tokens = self._coerce_max_tokens(max_tokens)
|
||||
self.threshold_tokens = self._compute_threshold_tokens(
|
||||
context_length, self.threshold_percent
|
||||
context_length, self.threshold_percent, self.max_tokens,
|
||||
)
|
||||
# Recalculate token budgets for the new context length so the
|
||||
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
|
||||
|
|
@ -716,11 +722,30 @@ class ContextCompressor(ContextEngine):
|
|||
_MIN_CTX_TRIGGER_RATIO = 0.85
|
||||
|
||||
@staticmethod
|
||||
def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
|
||||
def _coerce_max_tokens(value: Any) -> int | None:
|
||||
"""Normalize a max_tokens value to a positive int or None.
|
||||
|
||||
Only a positive integer is a real output reservation. None (provider
|
||||
default), non-numeric values, or <= 0 all mean "no reservation" — this
|
||||
keeps the threshold arithmetic safe from non-int inputs (e.g. a test
|
||||
MagicMock reaching ContextCompressor via a mocked parent agent).
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
ivalue = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
return ivalue if ivalue > 0 else None
|
||||
|
||||
@staticmethod
|
||||
def _compute_threshold_tokens(
|
||||
context_length: int, threshold_percent: float, max_tokens: int | None = None,
|
||||
) -> int:
|
||||
"""Compute the compaction trigger threshold in tokens.
|
||||
|
||||
The base value is ``context_length * threshold_percent``, floored at
|
||||
``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
|
||||
The base value is ``effective_input_budget * threshold_percent``, floored
|
||||
at ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
|
||||
prematurely at 50%. BUT that floor degenerates at small windows: for a
|
||||
model whose ``context_length`` is at/below the minimum (e.g. a 64K
|
||||
local model), ``max(0.5*64000, 64000) == 64000`` makes the threshold
|
||||
|
|
@ -731,15 +756,28 @@ class ContextCompressor(ContextEngine):
|
|||
``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window — high enough that a
|
||||
small model uses most of its context before compacting, but below
|
||||
100% so compaction fires before the provider rejects the request.
|
||||
|
||||
The provider reserves ``max_tokens`` of output space out of the same
|
||||
window, so the usable INPUT budget is ``context_length - max_tokens``.
|
||||
With a large ``max_tokens`` (e.g. 65536 on a custom provider) the input
|
||||
budget is materially smaller than the raw window, and a threshold based
|
||||
on the full window lets the session hit a provider 400 before compaction
|
||||
fires (#43547). The percentage and the degenerate-window check below both
|
||||
operate on the effective input budget. ``max_tokens=None`` (provider
|
||||
default) conservatively assumes no reservation (full window).
|
||||
"""
|
||||
pct_value = int(context_length * threshold_percent)
|
||||
effective_window = context_length - (max_tokens or 0)
|
||||
if effective_window <= 0:
|
||||
effective_window = context_length
|
||||
pct_value = int(effective_window * threshold_percent)
|
||||
floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
|
||||
# If flooring pushed the threshold to/over the window it can never be
|
||||
# reached. Trigger at 85% of the window so a minimum-context model
|
||||
# rides most of its budget before compacting instead of wasting half.
|
||||
if context_length > 0 and floored >= context_length:
|
||||
return max(1, min(int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
|
||||
context_length - 1))
|
||||
# If flooring pushed the threshold to/over the effective window it can
|
||||
# never be reached. Trigger at 85% of the effective input budget so a
|
||||
# minimum-context model rides most of its budget before compacting
|
||||
# instead of wasting half.
|
||||
if effective_window > 0 and floored >= effective_window:
|
||||
return max(1, min(int(effective_window * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
|
||||
effective_window - 1))
|
||||
return floored
|
||||
|
||||
def __init__(
|
||||
|
|
@ -757,6 +795,7 @@ class ContextCompressor(ContextEngine):
|
|||
provider: str = "",
|
||||
api_mode: str = "",
|
||||
abort_on_summary_failure: bool = False,
|
||||
max_tokens: int | None = None,
|
||||
):
|
||||
self.model = model
|
||||
self.base_url = base_url
|
||||
|
|
@ -768,6 +807,13 @@ class ContextCompressor(ContextEngine):
|
|||
self.protect_last_n = protect_last_n
|
||||
self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
|
||||
self.quiet_mode = quiet_mode
|
||||
# Output-token reservation: the provider carves max_tokens out of the
|
||||
# context window, so the usable input budget is context_length -
|
||||
# max_tokens. None = provider default => assume no reservation. (#43547)
|
||||
# Coerce defensively: only a positive int is a real reservation; any
|
||||
# other value (None, non-numeric, <=0) means "no reservation" so the
|
||||
# threshold arithmetic never sees a non-int (e.g. a test MagicMock).
|
||||
self.max_tokens = self._coerce_max_tokens(max_tokens)
|
||||
# When True, summary-generation failure aborts compression entirely
|
||||
# (returns messages unchanged, sets _last_compress_aborted=True).
|
||||
# When False (default = historical behavior), insert a
|
||||
|
|
@ -786,7 +832,7 @@ class ContextCompressor(ContextEngine):
|
|||
# guards the degenerate case where the floor would equal/exceed the
|
||||
# window (small models), so auto-compression can still fire (#14690).
|
||||
self.threshold_tokens = self._compute_threshold_tokens(
|
||||
self.context_length, threshold_percent
|
||||
self.context_length, threshold_percent, self.max_tokens,
|
||||
)
|
||||
self.compression_count = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -264,6 +264,59 @@ class TestCompress:
|
|||
assert c.should_compress(55000) is True
|
||||
assert c.should_compress(40000) is False
|
||||
|
||||
def test_max_tokens_reservation_lowers_threshold(self):
|
||||
"""#43547: the provider reserves max_tokens out of the window, so the
|
||||
threshold must be based on (context_length - max_tokens), not the full
|
||||
window. A 200K model reserving 65536 output tokens has a ~134K input
|
||||
budget; at 50% that's ~67K, NOT 100K."""
|
||||
# No reservation (provider default) → full-window behavior, unchanged.
|
||||
assert ContextCompressor._compute_threshold_tokens(200000, 0.50) == 100000
|
||||
assert ContextCompressor._compute_threshold_tokens(200000, 0.50, None) == 100000
|
||||
# 65536 reserved → effective input budget 134464; 50% = 67232.
|
||||
assert ContextCompressor._compute_threshold_tokens(200000, 0.50, 65536) == 67232
|
||||
|
||||
def test_max_tokens_reservation_with_small_window_floors(self):
|
||||
"""With a large reservation on a smaller window the effective budget
|
||||
can drop near/below the minimum floor — the degenerate-window guard
|
||||
then triggers at 85% of the EFFECTIVE budget, never the raw window."""
|
||||
# 128K window, 65536 reserved → effective 62464 (< MINIMUM 64000).
|
||||
# Floor (64000) >= effective window (62464) → 85% of effective.
|
||||
t = ContextCompressor._compute_threshold_tokens(128000, 0.50, 65536)
|
||||
assert t == int(62464 * 0.85) # 53094
|
||||
assert t < 62464
|
||||
|
||||
def test_max_tokens_exceeding_window_falls_back_to_full(self):
|
||||
"""Pathological: max_tokens >= context_length would make the effective
|
||||
budget <= 0; fall back to the full window rather than produce a
|
||||
non-positive threshold."""
|
||||
t = ContextCompressor._compute_threshold_tokens(64000, 0.50, 70000)
|
||||
# effective_window <= 0 → fall back to full context (64000) → 85% guard.
|
||||
assert t == 54400 # 85% of 64000, same as no-reservation small-ctx case
|
||||
assert t > 0
|
||||
|
||||
def test_max_tokens_coercion_treats_non_int_as_no_reservation(self):
|
||||
"""A non-int / non-positive max_tokens must coerce safely so the
|
||||
threshold arithmetic never raises. Guards the path where a mocked
|
||||
parent agent forwards a MagicMock max_tokens into a child
|
||||
ContextCompressor (regression for the delegate-test TypeError:
|
||||
'<=' not supported between MagicMock and int)."""
|
||||
from unittest.mock import MagicMock
|
||||
assert ContextCompressor._coerce_max_tokens(None) is None
|
||||
assert ContextCompressor._coerce_max_tokens(0) is None
|
||||
assert ContextCompressor._coerce_max_tokens(-5) is None
|
||||
assert ContextCompressor._coerce_max_tokens("nope") is None
|
||||
assert ContextCompressor._coerce_max_tokens(65536) == 65536
|
||||
# The actual regression: building a compressor with a MagicMock
|
||||
# max_tokens must NOT raise (the unmocked code did `ctx - MagicMock`
|
||||
# then `MagicMock <= 0`). int(MagicMock()) returns 1, so coercion
|
||||
# yields a harmless positive int rather than crashing — the threshold
|
||||
# is computed cleanly with a 1-token reservation.
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200000):
|
||||
c = ContextCompressor(model="m", quiet_mode=True, max_tokens=MagicMock())
|
||||
assert isinstance(c.max_tokens, int)
|
||||
assert isinstance(c.threshold_tokens, int)
|
||||
assert c.threshold_tokens > 0 # no crash, sane value
|
||||
|
||||
def test_compression_increments_count(self, compressor):
|
||||
msgs = self._make_messages(10)
|
||||
# Default config (abort_on_summary_failure=False) — fallback path
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue