diff --git a/gateway/run.py b/gateway/run.py index 1ab4556343..225f82fa1f 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2361,7 +2361,18 @@ class GatewayRunner: # 85% * 1.4 = 119% of context — which exceeds the model's limit # and prevented hygiene from ever firing for ~200K models (GLM-5). - _needs_compress = _approx_tokens >= _compress_token_threshold + # Hard safety valve: force compression if message count is + # extreme, regardless of token estimates. This breaks the + # death spiral where API disconnects prevent token data + # collection, which prevents compression, which causes more + # disconnects. 400 messages is well above normal sessions + # but catches runaway growth before it becomes unrecoverable. + # (#2153) + _HARD_MSG_LIMIT = 400 + _needs_compress = ( + _approx_tokens >= _compress_token_threshold + or _msg_count >= _HARD_MSG_LIMIT + ) if _needs_compress: logger.info( diff --git a/run_agent.py b/run_agent.py index 7690359251..e18932d36f 100644 --- a/run_agent.py +++ b/run_agent.py @@ -7540,7 +7540,33 @@ class AIAgent: f"treating as probable context overflow.", force=True, ) - + + # Server disconnects on large sessions are often caused by + # the request exceeding the provider's context/payload limit + # without a proper HTTP error response. Treat these as + # context-length errors to trigger compression rather than + # burning through retries that will all fail the same way. + # This breaks the death spiral: disconnect → no token data + # → no compression → bigger session → more disconnects. + # (#2153) + if not is_context_length_error and not status_code: + _is_server_disconnect = ( + 'server disconnected' in error_msg + or 'peer closed connection' in error_msg + or error_type in ('ReadError', 'RemoteProtocolError', 'ServerDisconnectedError') + ) + if _is_server_disconnect: + ctx_len = getattr(getattr(self, 'context_compressor', None), 'context_length', 200000) + _is_large = approx_tokens > ctx_len * 0.6 or len(api_messages) > 200 + if _is_large: + is_context_length_error = True + self._vprint( + f"{self.log_prefix}⚠️ Server disconnected with large session " + f"(~{approx_tokens:,} tokens, {len(api_messages)} msgs) — " + f"treating as context-length error, attempting compression.", + force=True, + ) + if is_context_length_error: compressor = self.context_compressor old_ctx = compressor.context_length @@ -8175,11 +8201,20 @@ class AIAgent: # threshold (default 50%) leaves ample headroom; if tool # results push past it, the next API call will report the # real total and trigger compression then. + # + # If last_prompt_tokens is 0 (stale after API disconnect + # or provider returned no usage data), fall back to rough + # estimate to avoid missing compression. Without this, + # a session can grow unbounded after disconnects because + # should_compress(0) never fires. (#2153) _compressor = self.context_compressor - _real_tokens = ( - _compressor.last_prompt_tokens - + _compressor.last_completion_tokens - ) + if _compressor.last_prompt_tokens > 0: + _real_tokens = ( + _compressor.last_prompt_tokens + + _compressor.last_completion_tokens + ) + else: + _real_tokens = estimate_messages_tokens_rough(messages) # ── Context pressure warnings (user-facing only) ────────── # Notify the user (NOT the LLM) as context approaches the