diff --git a/gateway/run.py b/gateway/run.py
index a1a734759..b7adadd3a 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1778,6 +1778,10 @@ class GatewayRunner:
             _hyg_model = "anthropic/claude-sonnet-4.6"
             _hyg_threshold_pct = 0.85
             _hyg_compression_enabled = True
+            _hyg_config_context_length = None
+            _hyg_provider = None
+            _hyg_base_url = None
+            _hyg_api_key = None
             try:
                 _hyg_cfg_path = _hermes_home / "config.yaml"
                 if _hyg_cfg_path.exists():
@@ -1791,6 +1795,17 @@ class GatewayRunner:
                         _hyg_model = _model_cfg
                     elif isinstance(_model_cfg, dict):
                         _hyg_model = _model_cfg.get("default", _hyg_model)
+                        # Read explicit context_length override from model config
+                        # (same as run_agent.py lines 995-1005)
+                        _raw_ctx = _model_cfg.get("context_length")
+                        if _raw_ctx is not None:
+                            try:
+                                _hyg_config_context_length = int(_raw_ctx)
+                            except (TypeError, ValueError):
+                                pass
+                        # Read provider for accurate context detection
+                        _hyg_provider = _model_cfg.get("provider") or None
+                        _hyg_base_url = _model_cfg.get("base_url") or None
 
                     # Read compression settings — only use enabled flag.
                     # The threshold is intentionally separate from the agent's
@@ -1800,11 +1815,27 @@ class GatewayRunner:
                         _hyg_compression_enabled = str(
                             _comp_cfg.get("enabled", True)
                         ).lower() in ("true", "1", "yes")
+
+                # Resolve provider/base_url from runtime if not in config
+                if not _hyg_provider or not _hyg_base_url:
+                    try:
+                        _hyg_runtime = _resolve_runtime_agent_kwargs()
+                        _hyg_provider = _hyg_provider or _hyg_runtime.get("provider")
+                        _hyg_base_url = _hyg_base_url or _hyg_runtime.get("base_url")
+                        _hyg_api_key = _hyg_runtime.get("api_key")
+                    except Exception:
+                        pass
             except Exception:
                 pass
 
             if _hyg_compression_enabled:
-                _hyg_context_length = get_model_context_length(_hyg_model)
+                _hyg_context_length = get_model_context_length(
+                    _hyg_model,
+                    base_url=_hyg_base_url or "",
+                    api_key=_hyg_api_key or "",
+                    config_context_length=_hyg_config_context_length,
+                    provider=_hyg_provider or "",
+                )
                 _compress_token_threshold = int(
                     _hyg_context_length * _hyg_threshold_pct
                 )
@@ -1822,11 +1853,20 @@ class GatewayRunner:
                     _token_source = "actual"
                 else:
                     _approx_tokens = estimate_messages_tokens_rough(history)
-                    # Apply safety factor only for rough estimates
-                    _compress_token_threshold = int(
-                        _compress_token_threshold * 1.4
+                    # Apply safety factor only for rough estimates.
+                    # Cap the adjusted threshold at 95% of context length
+                    # so it never exceeds what the model can actually handle
+                    # (the 1.4x factor previously pushed the threshold above
+                    # the model's context limit for ~200K models like GLM-5).
+                    _max_safe_threshold = int(_hyg_context_length * 0.95)
+                    _compress_token_threshold = min(
+                        int(_compress_token_threshold * 1.4),
+                        _max_safe_threshold,
+                    )
+                    _warn_token_threshold = min(
+                        int(_warn_token_threshold * 1.4),
+                        _hyg_context_length,
                     )
-                    _warn_token_threshold = int(_warn_token_threshold * 1.4)
                     _token_source = "estimated"
 
                 _needs_compress = _approx_tokens >= _compress_token_threshold
diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py
index 7e75b906d..1fadd8dbc 100644
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@@ -212,6 +212,73 @@ class TestSessionHygieneWarnThreshold:
         assert post_compress_tokens < warn_threshold
 
 
+class TestEstimatedTokenSafetyCap:
+    """Verify the 1.4x safety factor on rough estimates is capped at 95% of
+    context length, preventing the threshold from exceeding the model's
+    actual limit.
+
+    Bug: For ~200K models (GLM-5-turbo), the uncapped 1.4x pushed the
+    threshold to 238K — above the model's limit — so hygiene never fired.
+    """
+
+    def test_uncapped_14x_would_exceed_context(self):
+        """Without the cap, 200K * 0.85 * 1.4 = 238K > 200K (broken)."""
+        context_length = 200_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 170K
+        uncapped = int(raw_threshold * 1.4)  # 238K
+        assert uncapped > context_length, (
+            "Uncapped 1.4x should exceed model context (this is the bug)"
+        )
+
+    def test_capped_14x_stays_within_context(self):
+        """With the cap, the threshold stays at 95% of context length."""
+        context_length = 200_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 170K
+        max_safe = int(context_length * 0.95)  # 190K
+        capped = min(int(raw_threshold * 1.4), max_safe)
+        assert capped <= context_length, (
+            f"Capped threshold ({capped:,}) must not exceed context ({context_length:,})"
+        )
+        assert capped == max_safe, (
+            f"For 200K models, the cap should bind: expected {max_safe:,}, got {capped:,}"
+        )
+
+    def test_cap_does_not_affect_large_context_models(self):
+        """1M model: cap binds too, but never drops below the raw threshold."""
+        context_length = 1_000_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 850K
+        max_safe = int(context_length * 0.95)  # 950K
+        uncapped = int(raw_threshold * 1.4)  # 1,190K — 0.85 * 1.4 > 0.95
+        capped = min(uncapped, max_safe)
+        # The ratio is size-independent, so the cap binds here as well; the
+        # result (950K) still sits above the raw 850K, so nothing fires early.
+        assert raw_threshold < capped == max_safe < context_length
+
+    def test_cap_for_128k_model(self):
+        """128K model: 128K * 0.85 * 1.4 = 152K — exceeds 128K, cap binds."""
+        context_length = 128_000
+        threshold_pct = 0.85
+        raw_threshold = int(context_length * threshold_pct)  # 108,800
+        max_safe = int(context_length * 0.95)  # 121,600
+        uncapped = int(raw_threshold * 1.4)  # 152,320
+        capped = min(uncapped, max_safe)
+        assert uncapped > context_length, "1.4x exceeds 128K context"
+        assert capped == max_safe, "Cap should bind for 128K models"
+        assert capped < context_length, "Capped value must be below context limit"
+
+    def test_warn_threshold_capped_at_context_length(self):
+        """Warn threshold (0.95 * 1.4) must be capped at context_length."""
+        context_length = 200_000
+        raw_warn = int(context_length * 0.95)  # 190K
+        uncapped_warn = int(raw_warn * 1.4)  # 266K
+        capped_warn = min(uncapped_warn, context_length)
+        assert uncapped_warn > context_length
+        assert capped_warn == context_length
+
+
 class TestTokenEstimation:
     """Verify rough token estimation works as expected for hygiene checks."""