diff --git a/gateway/run.py b/gateway/run.py
index a1a734759..b7adadd3a 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1778,6 +1778,10 @@ class GatewayRunner:
             _hyg_model = "anthropic/claude-sonnet-4.6"
             _hyg_threshold_pct = 0.85
             _hyg_compression_enabled = True
+            _hyg_config_context_length = None
+            _hyg_provider = None
+            _hyg_base_url = None
+            _hyg_api_key = None
             try:
                 _hyg_cfg_path = _hermes_home / "config.yaml"
                 if _hyg_cfg_path.exists():
@@ -1791,6 +1795,17 @@ class GatewayRunner:
                         _hyg_model = _model_cfg
                     elif isinstance(_model_cfg, dict):
                         _hyg_model = _model_cfg.get("default", _hyg_model)
+                        # Read explicit context_length override from model config
+                        # (mirrors the override handling in run_agent.py)
+                        _raw_ctx = _model_cfg.get("context_length")
+                        if _raw_ctx is not None:
+                            try:
+                                _hyg_config_context_length = int(_raw_ctx)
+                            except (TypeError, ValueError):
+                                pass
+                        # Read provider for accurate context detection
+                        _hyg_provider = _model_cfg.get("provider") or None
+                        _hyg_base_url = _model_cfg.get("base_url") or None
 
                     # Read compression settings — only use enabled flag.
                     # The threshold is intentionally separate from the agent's
@@ -1800,11 +1815,29 @@ class GatewayRunner:
                         _hyg_compression_enabled = str(
                             _comp_cfg.get("enabled", True)
                         ).lower() in ("true", "1", "yes")
+
+                    # Resolve provider/base_url from runtime if not in config.
+                    # NOTE(review): api_key is only set on this path; with both
+                    # provider and base_url in config it stays None — confirm.
+                    if not _hyg_provider or not _hyg_base_url:
+                        try:
+                            _hyg_runtime = _resolve_runtime_agent_kwargs()
+                            _hyg_provider = _hyg_provider or _hyg_runtime.get("provider")
+                            _hyg_base_url = _hyg_base_url or _hyg_runtime.get("base_url")
+                            _hyg_api_key = _hyg_runtime.get("api_key")
+                        except Exception:
+                            pass
             except Exception:
                 pass
 
             if _hyg_compression_enabled:
-                _hyg_context_length = get_model_context_length(_hyg_model)
+                _hyg_context_length = get_model_context_length(
+                    _hyg_model,
+                    base_url=_hyg_base_url or "",
+                    api_key=_hyg_api_key or "",
+                    config_context_length=_hyg_config_context_length,
+                    provider=_hyg_provider or "",
+                )
                 _compress_token_threshold = int(
                     _hyg_context_length * _hyg_threshold_pct
                 )
@@ -1822,11 +1855,20 @@ class GatewayRunner:
                     _token_source = "actual"
                 else:
                     _approx_tokens = estimate_messages_tokens_rough(history)
-                    # Apply safety factor only for rough estimates
-                    _compress_token_threshold = int(
-                        _compress_token_threshold * 1.4
+                    # Apply safety factor only for rough estimates.
+                    # Cap the compress threshold at 95% of context length and
+                    # the warn threshold at 100% so neither exceeds what the
+                    # model can actually handle (uncapped, the 1.4x factor put
+                    # both above the context limit for ~200K models like GLM-5).
+                    _max_safe_threshold = int(_hyg_context_length * 0.95)
+                    _compress_token_threshold = min(
+                        int(_compress_token_threshold * 1.4),
+                        _max_safe_threshold,
+                    )
+                    _warn_token_threshold = min(
+                        int(_warn_token_threshold * 1.4),
+                        _hyg_context_length,
                     )
-                    _warn_token_threshold = int(_warn_token_threshold * 1.4)
                     _token_source = "estimated"
 
                 _needs_compress = _approx_tokens >= _compress_token_threshold
diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py
index 7e75b906d..1fadd8dbc 100644
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@@ -212,6 +212,73 @@ class TestSessionHygieneWarnThreshold:
         assert post_compress_tokens < warn_threshold
 
 
+class TestEstimatedTokenSafetyCap:
+    """Verify the 1.4x safety factor on rough estimates is capped at 95% of
+    context length, preventing the threshold from exceeding the model's
+    actual limit.
+
+    Bug: For ~200K models (GLM-5-turbo), the uncapped 1.4x pushed the
+    threshold to 238K — above the model's limit — so hygiene never fired.
+    """
+
+    def test_uncapped_14x_would_exceed_context(self):
+        """Without the cap, 200K * 0.85 * 1.4 = 238K > 200K (broken)."""
+        context_length = 200_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 170K
+        uncapped = int(raw_threshold * 1.4)  # 238K
+        assert uncapped > context_length, (
+            "Uncapped 1.4x should exceed model context (this is the bug)"
+        )
+
+    def test_capped_14x_stays_within_context(self):
+        """With the cap, the threshold stays at 95% of context length."""
+        context_length = 200_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 170K
+        max_safe = int(context_length * 0.95)  # 190K
+        capped = min(int(raw_threshold * 1.4), max_safe)
+        assert capped <= context_length, (
+            f"Capped threshold ({capped:,}) must not exceed context ({context_length:,})"
+        )
+        assert capped == max_safe, (
+            f"For 200K models, the cap should bind: expected {max_safe:,}, got {capped:,}"
+        )
+
+    def test_cap_binds_for_large_context_models(self):
+        """1M model: 0.85 * 1.4 = 1.19x context > 95% cap, so the cap binds."""
+        context_length = 1_000_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 850K
+        max_safe = int(context_length * 0.95)  # 950K
+        uncapped = int(raw_threshold * 1.4)  # 1,190K — above the 950K cap
+        capped = min(uncapped, max_safe)
+        # The cap scales with context length, so it binds for every model
+        # size whenever threshold_pct * 1.4 exceeds 0.95 (here 1.19).
+        assert capped == max_safe < context_length
+
+    def test_cap_for_128k_model(self):
+        """128K model: 128K * 0.85 * 1.4 = 152K — exceeds 128K, cap binds."""
+        context_length = 128_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 108,800
+        max_safe = int(context_length * 0.95)  # 121,600
+        uncapped = int(raw_threshold * 1.4)  # 152,320
+        capped = min(uncapped, max_safe)
+        assert uncapped > context_length, "1.4x exceeds 128K context"
+        assert capped == max_safe, "Cap should bind for 128K models"
+        assert capped < context_length, "Capped value must be below context limit"
+
+    def test_warn_threshold_capped_at_context_length(self):
+        """Warn threshold (0.95 * 1.4) must be capped at context_length."""
+        context_length = 200_000
+        raw_warn = int(context_length * 0.95)  # 190K
+        uncapped_warn = int(raw_warn * 1.4)  # 266K
+        capped_warn = min(uncapped_warn, context_length)
+        assert uncapped_warn > context_length
+        assert capped_warn == context_length
+
+
 class TestTokenEstimation:
     """Verify rough token estimation works as expected for hygiene checks."""
 