diff --git a/gateway/run.py b/gateway/run.py index e5df08d82d3..5220606a520 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9019,7 +9019,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew _hyg_model = "anthropic/claude-sonnet-4.6" _hyg_threshold_pct = 0.85 _hyg_compression_enabled = True - _hyg_hard_msg_limit = 400 + _hyg_hard_msg_limit = 5000 _hyg_config_context_length = None _hyg_provider = None _hyg_base_url = None @@ -9141,8 +9141,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew # extreme, regardless of token estimates. This breaks the # death spiral where API disconnects prevent token data # collection, which prevents compression, which causes more - # disconnects. 400 messages is well above normal sessions - # but catches runaway growth before it becomes unrecoverable. + # disconnects. 5000 messages is far above any normal session + # but catches truly runaway growth before it becomes + # unrecoverable. Set well clear of legitimate large-context + # (1M+) sessions doing thousands of short turns — those + # compress on the token threshold, not this count-based floor. # Threshold is configurable via # compression.hygiene_hard_message_limit. 
# (#2153) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index c44bf8de6c0..27c56974b4a 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1259,7 +1259,7 @@ DEFAULT_CONFIG = { "threshold": 0.50, # compress when context usage exceeds this ratio "target_ratio": 0.20, # fraction of threshold to preserve as recent tail "protect_last_n": 20, # minimum recent messages to keep uncompressed - "hygiene_hard_message_limit": 400, # gateway session-hygiene force-compress threshold by message count + "hygiene_hard_message_limit": 5000, # gateway session-hygiene force-compress threshold by message count "protect_first_n": 3, # non-system head messages always preserved # verbatim, in ADDITION to the system prompt # (which is always implicitly protected). Set to diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py index fee815d2203..e4bb9092db0 100644 --- a/tests/gateway/test_session_hygiene.py +++ b/tests/gateway/test_session_hygiene.py @@ -741,7 +741,7 @@ async def test_session_hygiene_informs_user_when_aux_model_fails_but_recovers(mo async def test_session_hygiene_honors_configurable_hard_message_limit( monkeypatch, tmp_path ): - """compression.hygiene_hard_message_limit overrides the 400-message default. + """compression.hygiene_hard_message_limit overrides the default. Regression for user-reported fix: a gateway session with a small transcript (12 messages) should not hit hygiene compression by default, @@ -799,7 +799,7 @@ async def test_session_hygiene_honors_configurable_hard_message_limit( platform=Platform.TELEGRAM, chat_type="private", ) - # 12 messages: below 400 default → no compression without override, + # 12 messages: below default → no compression without override, # but above the configured limit of 10 → should compress. 
runner.session_store.load_transcript.return_value = _make_history(12, content_size=40) runner.session_store.has_any_sessions.return_value = True @@ -860,7 +860,7 @@ async def test_session_hygiene_default_hard_message_limit_does_not_fire_at_12_me monkeypatch, tmp_path ): """Sanity check for the companion test above: without config override, - 12 messages must NOT trigger the 400-message hard limit. If this test + 12 messages must NOT trigger the default hard limit. If this test passes without changes, the override test's finding is meaningful.""" fake_dotenv = types.ModuleType("dotenv") fake_dotenv.load_dotenv = lambda *args, **kwargs: None @@ -883,7 +883,7 @@ async def test_session_hygiene_default_hard_message_limit_does_not_fire_at_12_me fake_run_agent.AIAgent = FakeCompressAgent monkeypatch.setitem(sys.modules, "run_agent", fake_run_agent) - # No config.yaml — use defaults (hard_limit=400) + # No config.yaml — use defaults (hard_limit=5000) gateway_run = importlib.import_module("gateway.run") GatewayRunner = gateway_run.GatewayRunner @@ -947,7 +947,7 @@ async def test_session_hygiene_default_hard_message_limit_does_not_fire_at_12_me result = await runner._handle_message(event) assert result == "ok" - # No compression agent instantiated — 12 messages well under 400 default. + # No compression agent instantiated — 12 messages well under 5000 default. 
assert FakeCompressAgent.last_instance is None, ( - "Compression should NOT fire at 12 messages with default hard_limit=400" + "Compression should NOT fire at 12 messages with default hard_limit=5000" ) diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index c9ce105cdc1..0f9db9876c1 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -730,7 +730,7 @@ compression: target_ratio: 0.20 # Fraction of threshold to preserve as recent tail protect_last_n: 20 # Min recent messages to keep uncompressed protect_first_n: 3 # Non-system head messages pinned across compactions (0 = pin nothing) - hygiene_hard_message_limit: 400 # Gateway safety valve — see below + hygiene_hard_message_limit: 5000 # Gateway safety valve — see below # The summarization model/provider is configured under auxiliary: auxiliary: @@ -744,7 +744,7 @@ auxiliary: Older configs with `compression.summary_model`, `compression.summary_provider`, and `compression.summary_base_url` are automatically migrated to `auxiliary.compression.*` on first load (config version 17). No manual action needed. ::: -`hygiene_hard_message_limit` is a gateway-only **pre-compression safety valve**. Runaway sessions with thousands of messages can hit model context limits before the normal percent-of-context threshold fires; when message count crosses this ceiling, Hermes forces compression regardless of token usage. Default `400` — raise it for platforms where very long sessions are normal, lower it to force more aggressive compression. Editing this value on a running gateway takes effect on the next message (see below). +`hygiene_hard_message_limit` is a gateway-only **pre-compression safety valve**. 
It exists to break a death spiral: when API calls keep disconnecting on an oversized session, the gateway never receives token-usage data, so the token-based threshold can't fire, so the transcript keeps growing and disconnects get worse. This count-based backstop triggers on message count alone (which is always known, regardless of API failures) to force compression and recover the session. Default `5000` — far above any normal session, including large-context (1M+) models doing thousands of short turns, which compress on the token threshold long before this. Raise it further for unusual platforms, or lower it to force more aggressive compression. Editing this value on a running gateway takes effect on the next message (see below). `protect_first_n` controls how many **non-system** head messages are pinned across every compaction. Default `3` — the opening user/assistant exchange survives every summarizer pass so the original goal stays visible. On long-running rolling-compaction sessions where the opening turn is no longer relevant, set `protect_first_n: 0` to pin nothing but the system prompt + summary + tail. The system prompt itself is always preserved regardless of this setting. 
diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md index 519e742d710..1dbdab3befc 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md @@ -555,7 +555,7 @@ compression: threshold: 0.50 # 在上下文限制的此百分比时压缩 target_ratio: 0.20 # 保留为最近尾部的阈值分数 protect_last_n: 20 # 保持未压缩的最少最近消息数 - hygiene_hard_message_limit: 400 # Gateway 安全阀 —— 见下文 + hygiene_hard_message_limit: 5000 # Gateway 安全阀 —— 见下文 # 摘要模型/provider 在 auxiliary: 下配置: auxiliary: @@ -569,7 +569,7 @@ auxiliary: 带有 `compression.summary_model`、`compression.summary_provider` 和 `compression.summary_base_url` 的旧版配置在首次加载时自动迁移到 `auxiliary.compression.*`(配置版本 17)。无需手动操作。 ::: -`hygiene_hard_message_limit` 是仅限 gateway 的**预压缩安全阀**。拥有数千条消息的失控会话可能在正常的上下文百分比阈值触发之前就达到模型上下文限制;当消息数超过此上限时,Hermes 强制压缩,无论 token 使用情况如何。默认 `400` —— 对于非常长的会话正常的平台,请调高;要强制更积极的压缩,请降低。在运行中的 gateway 上编辑此值将在下一条消息时生效(见下文)。 +`hygiene_hard_message_limit` 是仅限 gateway 的**预压缩安全阀**。它的存在是为了打破一个死循环:当超大会话的 API 调用持续断开时,gateway 永远收不到 token 使用数据,基于 token 的阈值因此无法触发,于是 transcript 持续增长、断开愈发严重。这个基于消息数的下限仅凭消息数量触发(无论 API 是否失败,消息数始终已知),强制压缩以恢复会话。默认 `5000` —— 远高于任何正常会话,包括做数千次短轮次的大上下文(1M+)模型,它们早就在 token 阈值处压缩了。对于异常平台可调得更高;要强制更积极的压缩则调低。在运行中的 gateway 上编辑此值将在下一条消息时生效(见下文)。 :::tip Gateway 热重载压缩和上下文长度 从最近的版本开始,在运行中的 gateway 上编辑 `config.yaml` 中的 `model.context_length` 或任何 `compression.*` 键将在下一条消息时生效 —— 无需 gateway 重启、`/reset` 或会话轮换。缓存的 agent 签名包含这些键,因此 gateway 在检测到更改时会透明地重建 agent。API 密钥和工具/技能配置仍需要通常的重载路径。