mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
* fix: circuit breaker stops CPU-burning restart loops on persistent errors
When a gateway session hits a non-retryable error (e.g. invalid model
ID → HTTP 400), the agent fails and returns. But if the session keeps
receiving messages (or something periodically recreates agents), each
attempt spawns a new AIAgent — reinitializing MCP server connections,
burning CPU — only to hit the same 400 error again. On a 4-core server,
this pegs an entire core per stuck session and accumulates 300+ minutes
of CPU time over hours.
Fix: add a per-session consecutive failure counter in the gateway runner.
- Track consecutive non-retryable failures per session key
- After 3 consecutive failures (_MAX_CONSECUTIVE_FAILURES), block
further agent creation for that session and notify the user:
'⚠️ This session has failed N times in a row with a non-retryable
error. Use /reset to start a new session.'
- Evict the cached agent when the circuit breaker engages to prevent
stale state from accumulating
- Reset the counter on successful agent runs
- Clear the counter on /reset and /new so users can recover
- Uses getattr() pattern so bare GatewayRunner instances (common in
tests using object.__new__) don't crash
Tests:
- 8 new tests in test_circuit_breaker.py covering counter behavior,
threshold, reset, session isolation, and bare-runner safety
Addresses #7130.
* Revert "fix: circuit breaker stops CPU-burning restart loops on persistent errors"
This reverts commit d848ea7109.
* fix: don't evict cached agent on failed runs — prevents MCP restart loop
When a run fails (e.g. invalid model ID → 400) and fallback activated,
the gateway was evicting the cached agent to 'retry primary next time.'
But evicting a failed agent forces a full AIAgent recreation on the next
message — reinitializing MCP server connections, spawning stdio
processes — only to hit the same 400 again. This created a CPU-burning
loop (91%+ for hours, #7130).
The fix: add `and not _run_failed` to the fallback-eviction check.
Failed runs keep the cached agent. The next message reuses it (no MCP
reinit), hits the same error, returns it to the user quickly. The user
can /reset or /model to fix their config.
Successful fallback runs still evict as before so the next message
retries the primary model.
Addresses #7130.
44 lines
2 KiB
Python
44 lines
2 KiB
Python
"""Tests for fallback-eviction gating on failed runs (#7130).
|
|
|
|
When a run fails, the gateway must NOT evict the cached agent — doing so
|
|
forces MCP reinit on the next message, creating a CPU-burning restart loop.
|
|
Eviction should only happen on successful runs where fallback activated.
|
|
"""
|
|
|
|
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

# NOTE(review): MagicMock, patch and pytest are not referenced by the tests
# visible here; they are kept because other parts of the file may use them.

# Prepend the directory three levels above this file to sys.path so that
# project packages can be imported (presumably the repository root — confirm
# against the actual location of this test file).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
|
|
|
|
|
class TestFallbackEvictionGating:
    """The fallback-eviction code path should skip eviction on failed runs.

    These tests do not drive the gateway itself. They reproduce the
    ``_run_failed`` expression and the ``and not _run_failed`` guard that the
    fix for #7130 describes, and pin the outcome for each result shape.
    """

    @staticmethod
    def _failed_flag(result):
        """Return the failure flag for *result*.

        A falsy *result* (``None`` or an empty dict) yields ``False``;
        otherwise the value stored under ``"failed"`` is returned, which is
        ``None`` when the key is absent.
        """
        return result.get("failed") if result else False

    @classmethod
    def _eviction_allowed(cls, result, fallback_activated=True):
        """Return whether the cached agent would be evicted for *result*.

        Models the guard "fallback activated and not _run_failed".
        NOTE(review): the gateway's real variable names are not visible from
        this file — confirm this mirrors the production condition.
        """
        return bool(fallback_activated and not cls._failed_flag(result))

    def test_failed_run_does_not_evict_cached_agent(self):
        """When result has failed=True, the cached agent should NOT be evicted."""
        result = {"failed": True, "final_response": None, "error": "400 invalid model"}
        _run_failed = self._failed_flag(result)
        assert _run_failed is True, "Failed run should be detected"
        # The guard itself: a failed run must leave the cached agent alone.
        assert not self._eviction_allowed(result), "Failed run must not evict"

    def test_successful_run_allows_eviction(self):
        """When result is successful, fallback eviction should proceed."""
        result = {"completed": True, "final_response": "Hello!", "failed": False}
        _run_failed = self._failed_flag(result)
        assert _run_failed is False, "Successful run should not be flagged"
        assert self._eviction_allowed(result), "Successful fallback run should evict"
        # Without fallback activation there is nothing to evict for.
        assert not self._eviction_allowed(result, fallback_activated=False)

    def test_none_result_treated_as_not_failed(self):
        """When result is None (edge case), treat as not-failed."""
        result = None
        _run_failed = self._failed_flag(result)
        assert _run_failed is False
        assert self._eviction_allowed(result)

    def test_missing_failed_key_treated_as_not_failed(self):
        """When result dict doesn't have 'failed' key, treat as not-failed."""
        result = {"completed": True, "final_response": "Hello!"}
        _run_failed = self._failed_flag(result)
        # ``dict.get`` returns None here, so only falsiness is asserted.
        assert not _run_failed, "Missing 'failed' key should be falsy"
        assert self._eviction_allowed(result)
|