mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Three fixes for long-running gateway sessions that enter a death spiral: API disconnects prevent token-data collection, which prevents compression, which in turn causes more disconnects. Layer 1 — Stale token counter fallback (run_agent.py in-loop): When last_prompt_tokens is 0 (stale after an API disconnect, or the provider returned no usage data), fall back to estimate_messages_tokens_rough() instead of passing 0 to should_compress(), which would never fire. Layer 2 — Server disconnect heuristic (run_agent.py error handler): When a ReadError/RemoteProtocolError hits a large session (>60% of context or >200 messages), treat it as a context-length error and trigger compression rather than burning through retries that all fail the same way. Layer 3 — Hard message count limit (gateway/run.py hygiene): Force compression when a session exceeds 400 messages, regardless of token estimates. This catches runaway growth even when all token-based checks fail due to missing API data. Based on the analysis from PR #2157 by ygd58 — the gateway threshold direction fix (1.4x multiplier) was already resolved on main.
This commit is contained in:
parent
8fd9fafc84
commit
f374ae4c61
2 changed files with 52 additions and 6 deletions
|
|
@ -2361,7 +2361,18 @@ class GatewayRunner:
|
|||
# 85% * 1.4 = 119% of context — which exceeds the model's limit
|
||||
# and prevented hygiene from ever firing for ~200K models (GLM-5).
|
||||
|
||||
_needs_compress = _approx_tokens >= _compress_token_threshold
|
||||
# Hard safety valve: force compression if message count is
|
||||
# extreme, regardless of token estimates. This breaks the
|
||||
# death spiral where API disconnects prevent token data
|
||||
# collection, which prevents compression, which causes more
|
||||
# disconnects. 400 messages is well above normal sessions
|
||||
# but catches runaway growth before it becomes unrecoverable.
|
||||
# (#2153)
|
||||
_HARD_MSG_LIMIT = 400
|
||||
_needs_compress = (
|
||||
_approx_tokens >= _compress_token_threshold
|
||||
or _msg_count >= _HARD_MSG_LIMIT
|
||||
)
|
||||
|
||||
if _needs_compress:
|
||||
logger.info(
|
||||
|
|
|
|||
45
run_agent.py
45
run_agent.py
|
|
@ -7540,7 +7540,33 @@ class AIAgent:
|
|||
f"treating as probable context overflow.",
|
||||
force=True,
|
||||
)
|
||||
|
||||
|
||||
# Server disconnects on large sessions are often caused by
|
||||
# the request exceeding the provider's context/payload limit
|
||||
# without a proper HTTP error response. Treat these as
|
||||
# context-length errors to trigger compression rather than
|
||||
# burning through retries that will all fail the same way.
|
||||
# This breaks the death spiral: disconnect → no token data
|
||||
# → no compression → bigger session → more disconnects.
|
||||
# (#2153)
|
||||
if not is_context_length_error and not status_code:
|
||||
_is_server_disconnect = (
|
||||
'server disconnected' in error_msg
|
||||
or 'peer closed connection' in error_msg
|
||||
or error_type in ('ReadError', 'RemoteProtocolError', 'ServerDisconnectedError')
|
||||
)
|
||||
if _is_server_disconnect:
|
||||
ctx_len = getattr(getattr(self, 'context_compressor', None), 'context_length', 200000)
|
||||
_is_large = approx_tokens > ctx_len * 0.6 or len(api_messages) > 200
|
||||
if _is_large:
|
||||
is_context_length_error = True
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Server disconnected with large session "
|
||||
f"(~{approx_tokens:,} tokens, {len(api_messages)} msgs) — "
|
||||
f"treating as context-length error, attempting compression.",
|
||||
force=True,
|
||||
)
|
||||
|
||||
if is_context_length_error:
|
||||
compressor = self.context_compressor
|
||||
old_ctx = compressor.context_length
|
||||
|
|
@ -8175,11 +8201,20 @@ class AIAgent:
|
|||
# threshold (default 50%) leaves ample headroom; if tool
|
||||
# results push past it, the next API call will report the
|
||||
# real total and trigger compression then.
|
||||
#
|
||||
# If last_prompt_tokens is 0 (stale after API disconnect
|
||||
# or provider returned no usage data), fall back to rough
|
||||
# estimate to avoid missing compression. Without this,
|
||||
# a session can grow unbounded after disconnects because
|
||||
# should_compress(0) never fires. (#2153)
|
||||
_compressor = self.context_compressor
|
||||
_real_tokens = (
|
||||
_compressor.last_prompt_tokens
|
||||
+ _compressor.last_completion_tokens
|
||||
)
|
||||
if _compressor.last_prompt_tokens > 0:
|
||||
_real_tokens = (
|
||||
_compressor.last_prompt_tokens
|
||||
+ _compressor.last_completion_tokens
|
||||
)
|
||||
else:
|
||||
_real_tokens = estimate_messages_tokens_rough(messages)
|
||||
|
||||
# ── Context pressure warnings (user-facing only) ──────────
|
||||
# Notify the user (NOT the LLM) as context approaches the
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue