From 7dac75f2ae0773b18e8088b678355c59dd164aa0 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 30 Mar 2026 13:18:21 -0700 Subject: [PATCH] fix: prevent context pressure warning spam after compression (#4012) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add /yolo slash command to toggle dangerous command approvals Adds a /yolo command that toggles HERMES_YOLO_MODE at runtime, skipping all dangerous command approval prompts for the current session. Works in both CLI and gateway (Telegram, Discord, etc.). - /yolo -> ON: all commands auto-approved, no confirmation prompts - /yolo -> OFF: normal approval flow restored The --yolo CLI flag already existed for launch-time opt-in. This adds the ability to toggle mid-session without restarting. Session-scoped — resets when the process ends. Uses the existing HERMES_YOLO_MODE env var that check_all_command_guards() already respects. * fix: prevent context pressure warning spam (agent loop + gateway rate-limit) Two complementary fixes for repeated context pressure warnings spamming gateway users (Telegram, Discord, etc.): 1. Agent-level loop fix (run_agent.py): After compression, only reset _context_pressure_warned if the post-compression estimate is actually below the 85% warning level. Previously the flag was unconditionally reset, causing the warning to re-fire every loop iteration when compression couldn't reduce below 85% of the threshold (e.g. very low threshold like 15%, or system prompt alone exceeds the warning level). 2. Gateway-level rate-limit (gateway/run.py, salvaged from PR #3786): Per-chat_id cooldown of 1 hour on compression warning messages. Both warning paths ('still large after compression' and 'compression failed') are gated. Defense-in-depth — even if the agent-level fix has edge cases, users won't see more than one warning per hour. Co-authored-by: dlkakbs --------- Co-authored-by: dlkakbs --- gateway/run.py | 19 ++++++++++-- run_agent.py | 17 +++++++---- tests/gateway/test_session_hygiene.py | 43 +++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index de077ede8..c85ed27b8 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -476,6 +476,13 @@ class GatewayRunner: self._honcho_managers: Dict[str, Any] = {} self._honcho_configs: Dict[str, Any] = {} + # Rate-limit compression warning messages sent to users. + # Keyed by chat_id — value is the timestamp of the last warning sent. + # Prevents the warning from firing on every message when a session + # remains above the threshold after compression. + self._compression_warn_sent: Dict[str, float] = {} + self._compression_warn_cooldown: int = 3600 # seconds (1 hour) + # Ensure tirith security scanner is available (downloads if needed) try: from tools.tirith_security import ensure_installed @@ -2400,13 +2407,18 @@ class GatewayRunner: pass # Still too large after compression — warn user + # Rate-limited to once per cooldown period per + # chat to avoid spamming on every message. if _new_tokens >= _warn_token_threshold: logger.warning( "Session hygiene: still ~%s tokens after " "compression — suggesting /reset", f"{_new_tokens:,}", ) - if _hyg_adapter: + _now = time.time() + _last_warn = self._compression_warn_sent.get(source.chat_id, 0) + if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown: + self._compression_warn_sent[source.chat_id] = _now try: await _hyg_adapter.send( source.chat_id, @@ -2428,7 +2440,10 @@ class GatewayRunner: if _approx_tokens >= _warn_token_threshold: _hyg_adapter = self.adapters.get(source.platform) _hyg_meta = {"thread_id": source.thread_id} if source.thread_id else None - if _hyg_adapter: + _now = time.time() + _last_warn = self._compression_warn_sent.get(source.chat_id, 0) + if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown: + self._compression_warn_sent[source.chat_id] = _now try: await _hyg_adapter.send( source.chat_id, diff --git a/run_agent.py b/run_agent.py index 13eba7fe7..794c9f67a 100644 --- a/run_agent.py +++ b/run_agent.py @@ -5221,11 +5221,8 @@ class AIAgent: except Exception as e: logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e) - # Reset context pressure warning and token estimate — usage drops - # after compaction. Without this, the stale last_prompt_tokens from - # the previous API call causes the pressure calculation to stay at - # >1000% and spam warnings / re-trigger compression in a loop. - self._context_pressure_warned = False + # Update token estimate after compaction so pressure calculations + # use the post-compression count, not the stale pre-compression one. _compressed_est = ( estimate_tokens_rough(new_system_prompt) + estimate_messages_tokens_rough(compressed) @@ -5233,6 +5230,16 @@ class AIAgent: self.context_compressor.last_prompt_tokens = _compressed_est self.context_compressor.last_completion_tokens = 0 + # Only reset the pressure warning if compression actually brought + # us below the warning level (85% of threshold). When compression + # can't reduce enough (e.g. threshold is very low, or system prompt + # alone exceeds the warning level), keep the flag set to prevent + # spamming the user with repeated warnings every loop iteration. + if self.context_compressor.threshold_tokens > 0: + _post_progress = _compressed_est / self.context_compressor.threshold_tokens + if _post_progress < 0.85: + self._context_pressure_warned = False + return compressed, new_system_prompt def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None: diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py index b8ff8f8a8..843c0d416 100644 --- a/tests/gateway/test_session_hygiene.py +++ b/tests/gateway/test_session_hygiene.py @@ -212,6 +212,49 @@ class TestSessionHygieneWarnThreshold: assert post_compress_tokens < warn_threshold +class TestCompressionWarnRateLimit: + """Compression warning messages must be rate-limited per chat_id.""" + + def _make_runner(self): + from unittest.mock import MagicMock, patch + with patch("gateway.run.load_gateway_config"), \ + patch("gateway.run.SessionStore"), \ + patch("gateway.run.DeliveryRouter"): + from gateway.run import GatewayRunner + runner = GatewayRunner.__new__(GatewayRunner) + runner._compression_warn_sent = {} + runner._compression_warn_cooldown = 3600 + return runner + + def test_first_warn_is_sent(self): + runner = self._make_runner() + now = 1_000_000.0 + last = runner._compression_warn_sent.get("chat:1", 0) + assert now - last >= runner._compression_warn_cooldown + + def test_second_warn_suppressed_within_cooldown(self): + runner = self._make_runner() + now = 1_000_000.0 + runner._compression_warn_sent["chat:1"] = now - 60 # 1 minute ago + last = runner._compression_warn_sent.get("chat:1", 0) + assert now - last < runner._compression_warn_cooldown + + def test_warn_allowed_after_cooldown(self): + runner = self._make_runner() + now = 1_000_000.0 + runner._compression_warn_sent["chat:1"] = now - 3601 # just past cooldown + last = runner._compression_warn_sent.get("chat:1", 0) + assert now - last >= runner._compression_warn_cooldown + + def test_rate_limit_is_per_chat(self): + """Rate-limiting one chat must not suppress warnings for another.""" + runner = self._make_runner() + now = 1_000_000.0 + runner._compression_warn_sent["chat:1"] = now - 60 # suppressed + last_other = runner._compression_warn_sent.get("chat:2", 0) + assert now - last_other >= runner._compression_warn_cooldown + + class TestEstimatedTokenThreshold: """Verify that hygiene thresholds are always below the model's context limit — for both actual and estimated token counts.