diff --git a/gateway/run.py b/gateway/run.py
index b7adadd3a..b4b6c6ef0 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1757,9 +1757,9 @@ class GatewayRunner:
         # Token source priority:
         #   1. Actual API-reported prompt_tokens from the last turn
         #      (stored in session_entry.last_prompt_tokens)
-        #   2. Rough char-based estimate (str(msg)//4) with a 1.4x
-        #      safety factor to account for overestimation on tool-heavy
-        #      conversations (code/JSON tokenizes at 5-7+ chars/token).
+        #   2. Rough char-based estimate (str(msg)//4). Overestimates
+        #      by 30-50% on code/JSON-heavy sessions, but that just
+        #      means hygiene fires a bit early — safe and harmless.
         # -----------------------------------------------------------------
         if history and len(history) >= 4:
             from agent.model_metadata import (
@@ -1845,29 +1845,20 @@ class GatewayRunner:
 
                 # Prefer actual API-reported tokens from the last turn
                 # (stored in session entry) over the rough char-based estimate.
-                # The rough estimate (str(msg)//4) overestimates by 30-50% on
-                # tool-heavy/code-heavy conversations, causing premature compression.
                 _stored_tokens = session_entry.last_prompt_tokens
                 if _stored_tokens > 0:
                     _approx_tokens = _stored_tokens
                     _token_source = "actual"
                 else:
                     _approx_tokens = estimate_messages_tokens_rough(history)
-                    # Apply safety factor only for rough estimates.
-                    # Cap the adjusted threshold at 95% of context length
-                    # so it never exceeds what the model can actually handle
-                    # (the 1.4x factor previously pushed the threshold above
-                    # the model's context limit for ~200K models like GLM-5).
-                    _max_safe_threshold = int(_hyg_context_length * 0.95)
-                    _compress_token_threshold = min(
-                        int(_compress_token_threshold * 1.4),
-                        _max_safe_threshold,
-                    )
-                    _warn_token_threshold = min(
-                        int(_warn_token_threshold * 1.4),
-                        _hyg_context_length,
-                    )
                     _token_source = "estimated"
+                    # Note: rough estimates overestimate by 30-50% for code/JSON-heavy
+                    # sessions, but that just means hygiene fires a bit early — which
+                    # is safe and harmless. The 85% threshold already provides ample
+                    # headroom (agent's own compressor runs at 50%). A previous 1.4x
+                    # multiplier tried to compensate by inflating the threshold, but
+                    # 85% * 1.4 = 119% of context — which exceeds the model's limit
+                    # and prevented hygiene from ever firing for ~200K models (GLM-5).
 
                 _needs_compress = _approx_tokens >= _compress_token_threshold
 
diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py
index 1fadd8dbc..80d249347 100644
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@@ -212,71 +212,59 @@ class TestSessionHygieneWarnThreshold:
         assert post_compress_tokens < warn_threshold
 
 
-class TestEstimatedTokenSafetyCap:
-    """Verify the 1.4x safety factor on rough estimates is capped at 95% of
-    context length, preventing the threshold from exceeding the model's
-    actual limit.
+class TestEstimatedTokenThreshold:
+    """Verify that hygiene thresholds are always below the model's context
+    limit — for both actual and estimated token counts.
 
-    Bug: For ~200K models (GLM-5-turbo), the uncapped 1.4x pushed the
-    threshold to 238K — above the model's limit — so hygiene never fired.
+    Regression: a previous 1.4x multiplier on rough estimates pushed the
+    threshold to 85% * 1.4 = 119% of context, which exceeded the model's
+    limit and prevented hygiene from ever firing for ~200K models (GLM-5).
+    The fix removed the multiplier entirely — the 85% threshold already
+    provides ample headroom over the agent's 50% compressor.
     """
 
-    def test_uncapped_14x_would_exceed_context(self):
-        """Without the cap, 200K * 0.85 * 1.4 = 238K > 200K (broken)."""
+    def test_threshold_below_context_for_200k_model(self):
+        """Hygiene threshold must always be below model context."""
         context_length = 200_000
-        threshold_pct = 0.85
-        raw_threshold = int(context_length * threshold_pct)  # 170K
-        uncapped = int(raw_threshold * 1.4)  # 238K
-        assert uncapped > context_length, (
-            "Uncapped 1.4x should exceed model context (this is the bug)"
-        )
+        threshold = int(context_length * 0.85)
+        assert threshold < context_length
 
-    def test_capped_14x_stays_within_context(self):
-        """With the cap, the threshold stays at 95% of context length."""
-        context_length = 200_000
-        threshold_pct = 0.85
-        raw_threshold = int(context_length * threshold_pct)  # 170K
-        max_safe = int(context_length * 0.95)  # 190K
-        capped = min(int(raw_threshold * 1.4), max_safe)
-        assert capped <= context_length, (
-            f"Capped threshold ({capped:,}) must not exceed context ({context_length:,})"
-        )
-        assert capped == max_safe, (
-            f"For 200K models, the cap should bind: expected {max_safe:,}, got {capped:,}"
-        )
-
-    def test_cap_does_not_affect_large_context_models(self):
-        """For 1M+ models the 1.4x factor stays below 95%, so cap is no-op."""
-        context_length = 1_000_000
-        threshold_pct = 0.85
-        raw_threshold = int(context_length * threshold_pct)  # 850K
-        max_safe = int(context_length * 0.95)  # 950K
-        uncapped = int(raw_threshold * 1.4)  # 1,190K — but that's > 950K
-        capped = min(uncapped, max_safe)
-        # For very large models the cap still applies but the resulting
-        # threshold (950K) is still large enough to prevent premature compression
-        assert capped <= context_length
-
-    def test_cap_for_128k_model(self):
-        """128K model: 128K * 0.85 * 1.4 = 152K — exceeds 128K, cap binds."""
+    def test_threshold_below_context_for_128k_model(self):
         context_length = 128_000
-        threshold_pct = 0.85
-        raw_threshold = int(context_length * threshold_pct)  # 108,800
-        max_safe = int(context_length * 0.95)  # 121,600
-        uncapped = int(raw_threshold * 1.4)  # 152,320
-        capped = min(uncapped, max_safe)
-        assert uncapped > context_length, "1.4x exceeds 128K context"
-        assert capped == max_safe, "Cap should bind for 128K models"
-        assert capped < context_length, "Capped value must be below context limit"
+        threshold = int(context_length * 0.85)
+        assert threshold < context_length
 
-    def test_warn_threshold_capped_at_context_length(self):
-        """Warn threshold (0.95 * 1.4) must be capped at context_length."""
+    def test_no_multiplier_means_same_threshold_for_estimated_and_actual(self):
+        """Without the 1.4x, estimated and actual token paths use the same threshold."""
         context_length = 200_000
-        raw_warn = int(context_length * 0.95)  # 190K
-        uncapped_warn = int(raw_warn * 1.4)  # 266K
-        capped_warn = min(uncapped_warn, context_length)
-        assert uncapped_warn > context_length
-        assert capped_warn == context_length
+        threshold_pct = 0.85
+        threshold = int(context_length * threshold_pct)
+        # Both paths should use 170K — no inflation
+        assert threshold == 170_000
+
+    def test_warn_threshold_below_context(self):
+        """Warn threshold (95%) must be below context length."""
+        for ctx in (128_000, 200_000, 1_000_000):
+            warn = int(ctx * 0.95)
+            assert warn < ctx
+
+    def test_overestimate_fires_early_but_safely(self):
+        """If rough estimate is 50% inflated, hygiene fires at ~57% actual usage.
+
+        That's between the agent's 50% threshold and the model's limit —
+        safe and harmless.
+        """
+        context_length = 200_000
+        threshold = int(context_length * 0.85)  # 170K
+        # If actual tokens = 113K, rough estimate = 113K * 1.5 = 170K
+        # Hygiene fires when estimate hits 170K, actual is ~113K = 57% of ctx
+        actual_when_fires = threshold / 1.5
+        assert actual_when_fires > context_length * 0.50, (
+            "Early fire should still be above agent's 50% threshold"
+        )
+        assert actual_when_fires < context_length, (
+            "Early fire must be well below model limit"
+        )
 
 
 class TestTokenEstimation: