From 7dac75f2ae0773b18e8088b678355c59dd164aa0 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 30 Mar 2026 13:18:21 -0700
Subject: [PATCH] fix: prevent context pressure warning spam after compression
 (#4012)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: add /yolo slash command to toggle dangerous command approvals

Adds a /yolo command that toggles HERMES_YOLO_MODE at runtime, skipping
all dangerous command approval prompts for the current session. Works in
both CLI and gateway (Telegram, Discord, etc.).

- /yolo -> ON: all commands auto-approved, no confirmation prompts
- /yolo -> OFF: normal approval flow restored

The --yolo CLI flag already existed for launch-time opt-in. This adds
the ability to toggle mid-session without restarting.

Session-scoped — resets when the process ends. Uses the existing
HERMES_YOLO_MODE env var that check_all_command_guards() already
respects.

* fix: prevent context pressure warning spam (agent loop + gateway rate-limit)

Two complementary fixes for repeated context pressure warnings spamming
gateway users (Telegram, Discord, etc.):

1. Agent-level loop fix (run_agent.py):
   After compression, only reset _context_pressure_warned if the
   post-compression estimate is actually below the 85% warning level.
   Previously the flag was unconditionally reset, causing the warning
   to re-fire every loop iteration when compression couldn't reduce
   below 85% of the threshold (e.g. very low threshold like 15%,
   or system prompt alone exceeds the warning level).

2. Gateway-level rate-limit (gateway/run.py, salvaged from PR #3786):
   Per-chat_id cooldown of 1 hour on compression warning messages.
   Both warning paths ('still large after compression' and 'compression
   failed') are gated. Defense-in-depth — even if the agent-level fix
   has edge cases, users won't see more than one warning per hour.

Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com>

---------

Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com>
---
 gateway/run.py                        | 19 ++++++++++--
 run_agent.py                          | 17 +++++++----
 tests/gateway/test_session_hygiene.py | 43 +++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index de077ede8..c85ed27b8 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -476,6 +476,13 @@ class GatewayRunner:
         self._honcho_managers: Dict[str, Any] = {}
         self._honcho_configs: Dict[str, Any] = {}
 
+        # Rate-limit compression warning messages sent to users.
+        # Keyed by chat_id — value is the timestamp of the last warning sent.
+        # Prevents the warning from firing on every message when a session
+        # remains above the threshold after compression.
+        self._compression_warn_sent: Dict[str, float] = {}
+        self._compression_warn_cooldown: int = 3600  # seconds (1 hour)
+
         # Ensure tirith security scanner is available (downloads if needed)
         try:
             from tools.tirith_security import ensure_installed
@@ -2400,13 +2407,18 @@ class GatewayRunner:
                                         pass
 
                                 # Still too large after compression — warn user
+                                # Rate-limited to once per cooldown period per
+                                # chat to avoid spamming on every message.
                                 if _new_tokens >= _warn_token_threshold:
                                     logger.warning(
                                         "Session hygiene: still ~%s tokens after "
                                         "compression — suggesting /reset",
                                         f"{_new_tokens:,}",
                                     )
-                                    if _hyg_adapter:
+                                    _now = time.time()
+                                    _last_warn = self._compression_warn_sent.get(source.chat_id, 0)
+                                    if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
+                                        self._compression_warn_sent[source.chat_id] = _now
                                         try:
                                             await _hyg_adapter.send(
                                                 source.chat_id,
@@ -2428,7 +2440,10 @@ class GatewayRunner:
                         if _approx_tokens >= _warn_token_threshold:
                             _hyg_adapter = self.adapters.get(source.platform)
                             _hyg_meta = {"thread_id": source.thread_id} if source.thread_id else None
-                            if _hyg_adapter:
+                            _now = time.time()
+                            _last_warn = self._compression_warn_sent.get(source.chat_id, 0)
+                            if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
+                                self._compression_warn_sent[source.chat_id] = _now
                                 try:
                                     await _hyg_adapter.send(
                                         source.chat_id,
diff --git a/run_agent.py b/run_agent.py
index 13eba7fe7..794c9f67a 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -5221,11 +5221,8 @@ class AIAgent:
             except Exception as e:
                 logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
 
-        # Reset context pressure warning and token estimate — usage drops
-        # after compaction.  Without this, the stale last_prompt_tokens from
-        # the previous API call causes the pressure calculation to stay at
-        # >1000% and spam warnings / re-trigger compression in a loop.
-        self._context_pressure_warned = False
+        # Update token estimate after compaction so pressure calculations
+        # use the post-compression count, not the stale pre-compression one.
         _compressed_est = (
             estimate_tokens_rough(new_system_prompt)
             + estimate_messages_tokens_rough(compressed)
@@ -5233,6 +5230,16 @@ class AIAgent:
         self.context_compressor.last_prompt_tokens = _compressed_est
         self.context_compressor.last_completion_tokens = 0
 
+        # Only reset the pressure warning if compression actually brought
+        # us below the warning level (85% of threshold).  When compression
+        # can't reduce enough (e.g. threshold is very low, or system prompt
+        # alone exceeds the warning level), keep the flag set to prevent
+        # spamming the user with repeated warnings every loop iteration.
+        if self.context_compressor.threshold_tokens > 0:
+            _post_progress = _compressed_est / self.context_compressor.threshold_tokens
+            if _post_progress < 0.85:
+                self._context_pressure_warned = False
+
         return compressed, new_system_prompt
 
     def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py
index b8ff8f8a8..843c0d416 100644
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@@ -212,6 +212,49 @@ class TestSessionHygieneWarnThreshold:
         assert post_compress_tokens < warn_threshold
 
 
+class TestCompressionWarnRateLimit:
+    """Compression warning messages must be rate-limited per chat_id."""
+
+    def _make_runner(self):
+        from unittest.mock import MagicMock, patch
+        with patch("gateway.run.load_gateway_config"), \
+             patch("gateway.run.SessionStore"), \
+             patch("gateway.run.DeliveryRouter"):
+            from gateway.run import GatewayRunner
+            runner = GatewayRunner.__new__(GatewayRunner)
+            runner._compression_warn_sent = {}
+            runner._compression_warn_cooldown = 3600
+            return runner
+
+    def test_first_warn_is_sent(self):
+        runner = self._make_runner()
+        now = 1_000_000.0
+        last = runner._compression_warn_sent.get("chat:1", 0)
+        assert now - last >= runner._compression_warn_cooldown
+
+    def test_second_warn_suppressed_within_cooldown(self):
+        runner = self._make_runner()
+        now = 1_000_000.0
+        runner._compression_warn_sent["chat:1"] = now - 60  # 1 minute ago
+        last = runner._compression_warn_sent.get("chat:1", 0)
+        assert now - last < runner._compression_warn_cooldown
+
+    def test_warn_allowed_after_cooldown(self):
+        runner = self._make_runner()
+        now = 1_000_000.0
+        runner._compression_warn_sent["chat:1"] = now - 3601  # just past cooldown
+        last = runner._compression_warn_sent.get("chat:1", 0)
+        assert now - last >= runner._compression_warn_cooldown
+
+    def test_rate_limit_is_per_chat(self):
+        """Rate-limiting one chat must not suppress warnings for another."""
+        runner = self._make_runner()
+        now = 1_000_000.0
+        runner._compression_warn_sent["chat:1"] = now - 60  # suppressed
+        last_other = runner._compression_warn_sent.get("chat:2", 0)
+        assert now - last_other >= runner._compression_warn_cooldown
+
+
 class TestEstimatedTokenThreshold:
     """Verify that hygiene thresholds are always below the model's context
     limit — for both actual and estimated token counts.