mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: prevent context pressure warning spam after compression (#4012)
* feat: add /yolo slash command to toggle dangerous command approvals Adds a /yolo command that toggles HERMES_YOLO_MODE at runtime, skipping all dangerous command approval prompts for the current session. Works in both CLI and gateway (Telegram, Discord, etc.). - /yolo -> ON: all commands auto-approved, no confirmation prompts - /yolo -> OFF: normal approval flow restored The --yolo CLI flag already existed for launch-time opt-in. This adds the ability to toggle mid-session without restarting. Session-scoped — resets when the process ends. Uses the existing HERMES_YOLO_MODE env var that check_all_command_guards() already respects. * fix: prevent context pressure warning spam (agent loop + gateway rate-limit) Two complementary fixes for repeated context pressure warnings spamming gateway users (Telegram, Discord, etc.): 1. Agent-level loop fix (run_agent.py): After compression, only reset _context_pressure_warned if the post-compression estimate is actually below the 85% warning level. Previously the flag was unconditionally reset, causing the warning to re-fire every loop iteration when compression couldn't reduce below 85% of the threshold (e.g. very low threshold like 15%, or system prompt alone exceeds the warning level). 2. Gateway-level rate-limit (gateway/run.py, salvaged from PR #3786): Per-chat_id cooldown of 1 hour on compression warning messages. Both warning paths ('still large after compression' and 'compression failed') are gated. Defense-in-depth — even if the agent-level fix has edge cases, users won't see more than one warning per hour. Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com> --------- Co-authored-by: dlkakbs <dlkakbs@users.noreply.github.com>
This commit is contained in:
parent
ed9af6e589
commit
7dac75f2ae
3 changed files with 72 additions and 7 deletions
|
|
@ -476,6 +476,13 @@ class GatewayRunner:
|
|||
self._honcho_managers: Dict[str, Any] = {}
|
||||
self._honcho_configs: Dict[str, Any] = {}
|
||||
|
||||
# Rate-limit compression warning messages sent to users.
|
||||
# Keyed by chat_id — value is the timestamp of the last warning sent.
|
||||
# Prevents the warning from firing on every message when a session
|
||||
# remains above the threshold after compression.
|
||||
self._compression_warn_sent: Dict[str, float] = {}
|
||||
self._compression_warn_cooldown: int = 3600 # seconds (1 hour)
|
||||
|
||||
# Ensure tirith security scanner is available (downloads if needed)
|
||||
try:
|
||||
from tools.tirith_security import ensure_installed
|
||||
|
|
@ -2400,13 +2407,18 @@ class GatewayRunner:
|
|||
pass
|
||||
|
||||
# Still too large after compression — warn user
|
||||
# Rate-limited to once per cooldown period per
|
||||
# chat to avoid spamming on every message.
|
||||
if _new_tokens >= _warn_token_threshold:
|
||||
logger.warning(
|
||||
"Session hygiene: still ~%s tokens after "
|
||||
"compression — suggesting /reset",
|
||||
f"{_new_tokens:,}",
|
||||
)
|
||||
if _hyg_adapter:
|
||||
_now = time.time()
|
||||
_last_warn = self._compression_warn_sent.get(source.chat_id, 0)
|
||||
if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
|
||||
self._compression_warn_sent[source.chat_id] = _now
|
||||
try:
|
||||
await _hyg_adapter.send(
|
||||
source.chat_id,
|
||||
|
|
@ -2428,7 +2440,10 @@ class GatewayRunner:
|
|||
if _approx_tokens >= _warn_token_threshold:
|
||||
_hyg_adapter = self.adapters.get(source.platform)
|
||||
_hyg_meta = {"thread_id": source.thread_id} if source.thread_id else None
|
||||
if _hyg_adapter:
|
||||
_now = time.time()
|
||||
_last_warn = self._compression_warn_sent.get(source.chat_id, 0)
|
||||
if _hyg_adapter and _now - _last_warn >= self._compression_warn_cooldown:
|
||||
self._compression_warn_sent[source.chat_id] = _now
|
||||
try:
|
||||
await _hyg_adapter.send(
|
||||
source.chat_id,
|
||||
|
|
|
|||
17
run_agent.py
17
run_agent.py
|
|
@ -5221,11 +5221,8 @@ class AIAgent:
|
|||
except Exception as e:
|
||||
logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
|
||||
|
||||
# Reset context pressure warning and token estimate — usage drops
|
||||
# after compaction. Without this, the stale last_prompt_tokens from
|
||||
# the previous API call causes the pressure calculation to stay at
|
||||
# >1000% and spam warnings / re-trigger compression in a loop.
|
||||
self._context_pressure_warned = False
|
||||
# Update token estimate after compaction so pressure calculations
|
||||
# use the post-compression count, not the stale pre-compression one.
|
||||
_compressed_est = (
|
||||
estimate_tokens_rough(new_system_prompt)
|
||||
+ estimate_messages_tokens_rough(compressed)
|
||||
|
|
@ -5233,6 +5230,16 @@ class AIAgent:
|
|||
self.context_compressor.last_prompt_tokens = _compressed_est
|
||||
self.context_compressor.last_completion_tokens = 0
|
||||
|
||||
# Only reset the pressure warning if compression actually brought
|
||||
# us below the warning level (85% of threshold). When compression
|
||||
# can't reduce enough (e.g. threshold is very low, or system prompt
|
||||
# alone exceeds the warning level), keep the flag set to prevent
|
||||
# spamming the user with repeated warnings every loop iteration.
|
||||
if self.context_compressor.threshold_tokens > 0:
|
||||
_post_progress = _compressed_est / self.context_compressor.threshold_tokens
|
||||
if _post_progress < 0.85:
|
||||
self._context_pressure_warned = False
|
||||
|
||||
return compressed, new_system_prompt
|
||||
|
||||
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
|
||||
|
|
|
|||
|
|
@ -212,6 +212,49 @@ class TestSessionHygieneWarnThreshold:
|
|||
assert post_compress_tokens < warn_threshold
|
||||
|
||||
|
||||
class TestCompressionWarnRateLimit:
    """Compression warning messages must be rate-limited per chat_id."""

    def _make_runner(self):
        """Build a bare GatewayRunner carrying only the rate-limit state.

        ``GatewayRunner.__new__`` bypasses ``__init__`` entirely, so only the
        two attributes the rate-limit logic reads are populated. The patches
        guard the config/session/router symbols in ``gateway.run`` while it
        is imported.
        """
        # Fixed: ``MagicMock`` was imported here but never used.
        from unittest.mock import patch

        with patch("gateway.run.load_gateway_config"), \
                patch("gateway.run.SessionStore"), \
                patch("gateway.run.DeliveryRouter"):
            from gateway.run import GatewayRunner
            runner = GatewayRunner.__new__(GatewayRunner)
            runner._compression_warn_sent = {}
            runner._compression_warn_cooldown = 3600
            return runner

    def test_first_warn_is_sent(self):
        """A chat with no recorded warning is past the cooldown, so it warns."""
        runner = self._make_runner()
        now = 1_000_000.0
        last = runner._compression_warn_sent.get("chat:1", 0)
        assert now - last >= runner._compression_warn_cooldown

    def test_second_warn_suppressed_within_cooldown(self):
        """A warning sent one minute ago must suppress the next one."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 60  # 1 minute ago
        last = runner._compression_warn_sent.get("chat:1", 0)
        assert now - last < runner._compression_warn_cooldown

    def test_warn_allowed_after_cooldown(self):
        """Once the cooldown has fully elapsed, the warning may fire again."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 3601  # just past cooldown
        last = runner._compression_warn_sent.get("chat:1", 0)
        assert now - last >= runner._compression_warn_cooldown

    def test_rate_limit_is_per_chat(self):
        """Rate-limiting one chat must not suppress warnings for another."""
        runner = self._make_runner()
        now = 1_000_000.0
        runner._compression_warn_sent["chat:1"] = now - 60  # suppressed
        last_other = runner._compression_warn_sent.get("chat:2", 0)
        assert now - last_other >= runner._compression_warn_cooldown
||||
|
||||
class TestEstimatedTokenThreshold:
|
||||
"""Verify that hygiene thresholds are always below the model's context
|
||||
limit — for both actual and estimated token counts.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue