mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix: don't evict cached agent on failed runs — prevents MCP restart loop (#7539)
* fix: circuit breaker stops CPU-burning restart loops on persistent errors
When a gateway session hits a non-retryable error (e.g. invalid model
ID → HTTP 400), the agent fails and returns. But if the session keeps
receiving messages (or something periodically recreates agents), each
attempt spawns a new AIAgent — reinitializing MCP server connections,
burning CPU — only to hit the same 400 error again. On a 4-core server,
this pegs an entire core per stuck session and accumulates 300+ minutes
of CPU time over hours.
Fix: add a per-session consecutive failure counter in the gateway runner.
- Track consecutive non-retryable failures per session key
- After 3 consecutive failures (_MAX_CONSECUTIVE_FAILURES), block
further agent creation for that session and notify the user:
'⚠️ This session has failed N times in a row with a non-retryable
error. Use /reset to start a new session.'
- Evict the cached agent when the circuit breaker engages to prevent
stale state from accumulating
- Reset the counter on successful agent runs
- Clear the counter on /reset and /new so users can recover
- Uses getattr() pattern so bare GatewayRunner instances (common in
tests using object.__new__) don't crash
Tests:
- 8 new tests in test_circuit_breaker.py covering counter behavior,
threshold, reset, session isolation, and bare-runner safety
Addresses #7130.
* Revert "fix: circuit breaker stops CPU-burning restart loops on persistent errors"
This reverts commit d848ea7109.
* fix: don't evict cached agent on failed runs — prevents MCP restart loop
When a run fails (e.g. invalid model ID → 400) and fallback activated,
the gateway was evicting the cached agent to 'retry primary next time.'
But evicting a failed agent forces a full AIAgent recreation on the next
message — reinitializing MCP server connections, spawning stdio
processes — only to hit the same 400 again. This created a CPU-burning
loop (91%+ for hours, #7130).
The fix: add `and not _run_failed` to the fallback-eviction check.
Failed runs keep the cached agent. The next message reuses it (no MCP
reinit), hits the same error, and returns it to the user quickly. The user
can then use /reset or /model to fix their config.
Successful fallback runs still evict as before so the next message
retries the primary model.
Addresses #7130.
This commit is contained in:
parent
1ffd92cc94
commit
241032455c
2 changed files with 54 additions and 3 deletions
|
|
@ -7574,12 +7574,19 @@ class GatewayRunner:
|
|||
# Track fallback model state: if the agent switched to a
|
||||
# fallback model during this run, persist it so /model shows
|
||||
# the actually-active model instead of the config default.
|
||||
# Skip eviction when the run failed — evicting a failed agent
|
||||
# forces MCP reinit on the next message for no benefit (the
|
||||
# same error will recur). This was the root cause of #7130:
|
||||
# a bad model ID triggered fallback → eviction → recreation →
|
||||
# MCP reinit → same 400 → loop, burning 91% CPU for hours.
|
||||
_agent = agent_holder[0]
|
||||
if _agent is not None and hasattr(_agent, 'model'):
|
||||
_result_for_fb = result_holder[0]
|
||||
_run_failed = _result_for_fb.get("failed") if _result_for_fb else False
|
||||
if _agent is not None and hasattr(_agent, 'model') and not _run_failed:
|
||||
_cfg_model = _resolve_gateway_model()
|
||||
if _agent.model != _cfg_model and not self._is_intentional_model_switch(session_key, _agent.model):
|
||||
# Fallback activated — evict cached agent so the next
|
||||
# message starts fresh and retries the primary model.
|
||||
# Fallback activated on a successful run — evict cached
|
||||
# agent so the next message retries the primary model.
|
||||
self._evict_cached_agent(session_key)
|
||||
|
||||
# Check if we were interrupted OR have a queued message (/queue).
|
||||
|
|
|
|||
44
tests/gateway/test_fallback_eviction.py
Normal file
44
tests/gateway/test_fallback_eviction.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
"""Tests for fallback-eviction gating on failed runs (#7130).
|
||||
|
||||
When a run fails, the gateway must NOT evict the cached agent — doing so
|
||||
forces MCP reinit on the next message, creating a CPU-burning restart loop.
|
||||
Eviction should only happen on successful runs where fallback activated.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
||||
|
||||
|
||||
class TestFallbackEvictionGating:
    """The fallback-eviction code path should skip eviction on failed runs.

    These tests mirror the guard used by the gateway's fallback-eviction
    block (``not _run_failed`` combined with the "agent model differs from
    the configured model and the switch was not intentional" check).  They
    pin the decision logic only; they do not drive a real ``GatewayRunner``.
    """

    @staticmethod
    def _run_failed(result):
        """Return the failure flag the same way the eviction guard computes it.

        A ``None``/empty result yields ``False``; a dict without a
        ``"failed"`` key yields ``None`` (falsy).
        """
        return result.get("failed") if result else False

    @classmethod
    def _should_evict(cls, result, agent_model, cfg_model, intentional_switch=False):
        """Model the eviction decision for one finished run.

        Args:
            result: The run's result dict, or ``None``.
            agent_model: Model the cached agent ended the run on, or ``None``
                when there is no cached agent to inspect.
            cfg_model: Model the gateway is configured to use.
            intentional_switch: ``True`` when the model difference was
                requested by the user rather than caused by fallback.

        Returns:
            ``True`` only when the run did not fail and an unintentional
            fallback left the agent on a model other than ``cfg_model``.
        """
        if agent_model is None or cls._run_failed(result):
            return False
        return agent_model != cfg_model and not intentional_switch

    def test_failed_run_does_not_evict_cached_agent(self):
        """When result has failed=True, the cached agent should NOT be evicted."""
        result = {"failed": True, "final_response": None, "error": "400 invalid model"}
        assert self._run_failed(result) is True, "Failed run should be detected"
        # Even with fallback active, a failed run must keep the cached agent.
        assert not self._should_evict(result, "fallback-model", "primary-model"), (
            "Failed run must not evict the cached agent"
        )

    def test_successful_run_allows_eviction(self):
        """When result is successful, fallback eviction should proceed."""
        result = {"completed": True, "final_response": "Hello!", "failed": False}
        assert self._run_failed(result) is False, "Successful run should not be flagged"
        assert self._should_evict(result, "fallback-model", "primary-model"), (
            "Successful fallback run should evict the cached agent"
        )
        # No fallback (agent still on the configured model) -> nothing to evict.
        assert not self._should_evict(result, "primary-model", "primary-model")
        # A user-requested model switch is not a fallback -> keep the agent.
        assert not self._should_evict(
            result, "fallback-model", "primary-model", intentional_switch=True
        )

    def test_none_result_treated_as_not_failed(self):
        """When result is None (edge case), treat as not-failed."""
        result = None
        assert self._run_failed(result) is False
        assert self._should_evict(result, "fallback-model", "primary-model")

    def test_missing_failed_key_treated_as_not_failed(self):
        """When result dict doesn't have 'failed' key, treat as not-failed."""
        result = {"completed": True, "final_response": "Hello!"}
        assert not self._run_failed(result), "Missing 'failed' key should be falsy"
        assert self._should_evict(result, "fallback-model", "primary-model")
|
||||
Loading…
Add table
Add a link
Reference in a new issue