fix: prevent context pressure warning spam after compression (#4012)

* feat: add /yolo slash command to toggle dangerous command approvals

Adds a /yolo command that toggles HERMES_YOLO_MODE at runtime, skipping
all dangerous command approval prompts for the current session. Works in
both CLI and gateway (Telegram, Discord, etc.).

- /yolo -> ON: all commands auto-approved, no confirmation prompts
- /yolo -> OFF: normal approval flow restored

The --yolo CLI flag already existed for launch-time opt-in. This adds
the ability to toggle mid-session without restarting.

Session-scoped — resets when the process ends. Uses the existing
HERMES_YOLO_MODE env var that check_all_command_guards() already
respects.

* fix: prevent context pressure warning spam (agent loop + gateway rate-limit)

Two complementary fixes for repeated context pressure warnings spamming
gateway users (Telegram, Discord, etc.):

1. Agent-level loop fix (run_agent.py):
   After compression, only reset _context_pressure_warned if the
   post-compression estimate is actually below the 85% warning level.
   Previously the flag was unconditionally reset, causing the warning
   to re-fire every loop iteration when compression couldn't reduce
   below 85% of the threshold (e.g. very low threshold like 15%,
   or system prompt alone exceeds the warning level).

2. Gateway-level rate-limit (gateway/run.py, salvaged from PR #3786):
   Per-chat_id cooldown of 1 hour on compression warning messages.
   Both warning paths ('still large after compression' and 'compression
   failed') are gated. Defense-in-depth — even if the agent-level fix
   has edge cases, users won't see more than one warning per hour.

Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com>

---------

Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com>
This commit is contained in:
Teknium 2026-03-30 13:18:21 -07:00 committed by GitHub
parent ed9af6e589
commit 7dac75f2ae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 72 additions and 7 deletions

View file

@@ -476,6 +476,13 @@ class GatewayRunner:
self._honcho_managers: Dict[str, Any] = {}
self._honcho_configs: Dict[str, Any] = {}
# Rate-limit compression warning messages sent to users.
# Keyed by chat_id — value is the timestamp of the last warning sent.
# Prevents the warning from firing on every message when a session
# remains above the threshold after compression.
self._compression_warn_sent: Dict[str, float] = {}
self._compression_warn_cooldown: int = 3600 # seconds (1 hour)
# Ensure tirith security scanner is available (downloads if needed)
try:
from tools.tirith_security import ensure_installed
@@ -2400,13 +2407,18 @@ class GatewayRunner:
pass
# Still too large after compression — warn user
# Rate-limited to once per cooldown period per
# chat to avoid spamming on every message.
if _new_tokens >= _warn_token_threshold:
logger.warning(
"Session hygiene: still ~%s tokens after "
"compression — suggesting /reset",
f"{_new_tokens:,}",
)
if _hyg_adapter:
_now = time.time()
_last_warn = self._compression_warn_sent.get(source.chat_id, 0)
if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
self._compression_warn_sent[source.chat_id] = _now
try:
await _hyg_adapter.send(
source.chat_id,
@@ -2428,7 +2440,10 @@ class GatewayRunner:
if _approx_tokens >= _warn_token_threshold:
_hyg_adapter = self.adapters.get(source.platform)
_hyg_meta = {"thread_id": source.thread_id} if source.thread_id else None
if _hyg_adapter:
_now = time.time()
_last_warn = self._compression_warn_sent.get(source.chat_id, 0)
if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
self._compression_warn_sent[source.chat_id] = _now
try:
await _hyg_adapter.send(
source.chat_id,

View file

@@ -5221,11 +5221,8 @@ class AIAgent:
except Exception as e:
logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
# Reset context pressure warning and token estimate — usage drops
# after compaction. Without this, the stale last_prompt_tokens from
# the previous API call causes the pressure calculation to stay at
# >1000% and spam warnings / re-trigger compression in a loop.
self._context_pressure_warned = False
# Update token estimate after compaction so pressure calculations
# use the post-compression count, not the stale pre-compression one.
_compressed_est = (
estimate_tokens_rough(new_system_prompt)
+ estimate_messages_tokens_rough(compressed)
@@ -5233,6 +5230,16 @@ class AIAgent:
self.context_compressor.last_prompt_tokens = _compressed_est
self.context_compressor.last_completion_tokens = 0
# Only reset the pressure warning if compression actually brought
# us below the warning level (85% of threshold). When compression
# can't reduce enough (e.g. threshold is very low, or system prompt
# alone exceeds the warning level), keep the flag set to prevent
# spamming the user with repeated warnings every loop iteration.
if self.context_compressor.threshold_tokens > 0:
_post_progress = _compressed_est / self.context_compressor.threshold_tokens
if _post_progress < 0.85:
self._context_pressure_warned = False
return compressed, new_system_prompt
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:

View file

@@ -212,6 +212,49 @@ class TestSessionHygieneWarnThreshold:
assert post_compress_tokens < warn_threshold
class TestCompressionWarnRateLimit:
    """Compression warning messages must be rate-limited per chat_id."""

    def _make_runner(self):
        """Build a bare GatewayRunner (bypassing __init__) with fresh rate-limit state."""
        from unittest.mock import MagicMock, patch

        with patch("gateway.run.load_gateway_config"), \
                patch("gateway.run.SessionStore"), \
                patch("gateway.run.DeliveryRouter"):
            from gateway.run import GatewayRunner
            runner = GatewayRunner.__new__(GatewayRunner)
            # Fresh per-runner state: no chat has been warned yet,
            # cooldown matches the production default of one hour.
            runner._compression_warn_sent = {}
            runner._compression_warn_cooldown = 3600
            return runner

    def test_first_warn_is_sent(self):
        """A chat with no recorded warning is always past the cooldown."""
        runner = self._make_runner()
        now = 1_000_000.0
        elapsed = now - runner._compression_warn_sent.get("chat:1", 0)
        assert elapsed >= runner._compression_warn_cooldown

    def test_second_warn_suppressed_within_cooldown(self):
        """A warning sent a minute ago must suppress the next one."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 60  # 1 minute ago
        elapsed = now - runner._compression_warn_sent.get("chat:1", 0)
        assert elapsed < runner._compression_warn_cooldown

    def test_warn_allowed_after_cooldown(self):
        """Once the cooldown has fully elapsed, warnings fire again."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 3601  # just past cooldown
        elapsed = now - runner._compression_warn_sent.get("chat:1", 0)
        assert elapsed >= runner._compression_warn_cooldown

    def test_rate_limit_is_per_chat(self):
        """Rate-limiting one chat must not suppress warnings for another."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 60  # suppressed
        elapsed_other = now - runner._compression_warn_sent.get("chat:2", 0)
        assert elapsed_other >= runner._compression_warn_cooldown
class TestEstimatedTokenThreshold:
"""Verify that hygiene thresholds are always below the model's context
limit for both actual and estimated token counts.