mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
When a session gets stuck (hung terminal, runaway tool loop) and the
user restarts the gateway, the same session history loads and puts the
agent right back in the stuck state. The user is trapped in a loop:
restart → stuck → restart → stuck.
Fix: track restart-failure counts per session using a simple JSON file
(.restart_failure_counts). On each shutdown with active agents, the
counter increments for those sessions. On startup, if any session has
been active across 3+ consecutive restarts, it's auto-suspended —
giving the user a clean slate on their next message.
The counter resets to 0 when a session completes a turn successfully
(response delivered), so normal sessions that happen to be active
during planned restarts (/restart, hermes update) won't accumulate
false counts.
Implementation:
- _increment_restart_failure_counts(): called during stop() when
  agents are active. Writes {session_key: count} to the JSON file.
  Sessions that are NOT active at shutdown are dropped from the file
  (their loop is broken).
- _suspend_stuck_loop_sessions(): called on startup. Reads the file,
  suspends every session whose count has reached the threshold (3),
  then clears the file.
- _clear_restart_failure_count(): called after successful response
delivery. Removes the session from the counter file.
No SessionEntry schema changes. No database migration. Pure file-based
tracking that naturally cleans up.
Test plan:
- 9 new stuck-loop tests (increment, accumulate, threshold, clear,
suspend, file cleanup, edge cases)
- All 28 gateway lifecycle tests pass (restart drain + auto-continue
+ stuck loop)
116 lines
4.5 KiB
Python
116 lines
4.5 KiB
Python
"""Tests for stuck-session loop detection (#7536).
|
|
|
|
When a session is active across 3+ consecutive gateway restarts (the agent
|
|
gets stuck, gateway restarts, same session gets stuck again), the session
|
|
is auto-suspended on startup so the user gets a clean slate.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
from tests.gateway.restart_test_helpers import make_restart_runner
|
|
|
|
|
|
@pytest.fixture
def runner_with_home(tmp_path, monkeypatch):
    """Provide a restart runner whose HERMES_HOME is a writable temp directory.

    ``gateway.run._hermes_home`` is patched to pytest's ``tmp_path`` before the
    runner is built, and monkeypatch restores the original value when the test
    finishes.

    Returns:
        A ``(runner, home)`` tuple, where ``home`` is the ``tmp_path``
        directory that was patched in.
    """
    monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
    # make_restart_runner() also returns an adapter; these tests never use it,
    # so it is bound to an underscore-prefixed name to mark it as discarded.
    runner, _adapter = make_restart_runner()
    return runner, tmp_path
|
|
|
|
|
|
class TestStuckLoopDetection:
    """Tests for the restart-failure counter file and startup auto-suspend.

    Every test receives the ``runner_with_home`` fixture, a ``(runner, home)``
    pair; the counter file is looked up at ``home / runner._STUCK_LOOP_FILE``.
    """

    @staticmethod
    def _install_unsuspended_entry(runner, session_key):
        """Register a mock, not-yet-suspended entry as the store's only session.

        Returns the mock so the caller can inspect its ``suspended`` flag.
        """
        entry = MagicMock()
        entry.suspended = False
        runner.session_store._entries = {session_key: entry}
        return entry

    def test_increment_creates_file(self, runner_with_home):
        """The first increment writes the file with a count of 1 per session."""
        runner, home = runner_with_home
        runner._increment_restart_failure_counts({"session:a", "session:b"})

        counter_file = home / runner._STUCK_LOOP_FILE
        assert counter_file.exists()

        recorded = json.loads(counter_file.read_text())
        assert recorded["session:a"] == 1
        assert recorded["session:b"] == 1

    def test_increment_accumulates(self, runner_with_home):
        """Three increments for the same session yield a count of 3."""
        runner, home = runner_with_home
        for _ in range(3):
            runner._increment_restart_failure_counts({"session:a"})

        counter_file = home / runner._STUCK_LOOP_FILE
        recorded = json.loads(counter_file.read_text())
        assert recorded["session:a"] == 3

    def test_increment_drops_inactive_sessions(self, runner_with_home):
        """A session missing from the next increment is removed from the file."""
        runner, home = runner_with_home
        runner._increment_restart_failure_counts({"session:a", "session:b"})
        # Second shutdown: session:b is no longer among the active sessions.
        runner._increment_restart_failure_counts({"session:a"})

        counter_file = home / runner._STUCK_LOOP_FILE
        recorded = json.loads(counter_file.read_text())
        assert "session:a" in recorded
        assert "session:b" not in recorded

    def test_suspend_at_threshold(self, runner_with_home):
        """A session counted on 3 restarts is suspended on startup."""
        runner, home = runner_with_home
        # Simulate 3 restarts with session:a active each time.
        for _ in range(3):
            runner._increment_restart_failure_counts({"session:a"})

        entry = self._install_unsuspended_entry(runner, "session:a")
        runner.session_store._save = MagicMock()

        suspended_count = runner._suspend_stuck_loop_sessions()
        assert suspended_count == 1
        assert entry.suspended is True

    def test_no_suspend_below_threshold(self, runner_with_home):
        """A session counted on only 2 restarts is left untouched."""
        runner, home = runner_with_home
        # Only 2 restarts: below the threshold of 3.
        for _ in range(2):
            runner._increment_restart_failure_counts({"session:a"})

        entry = self._install_unsuspended_entry(runner, "session:a")

        suspended_count = runner._suspend_stuck_loop_sessions()
        assert suspended_count == 0
        assert entry.suspended is False

    def test_clear_on_success(self, runner_with_home):
        """Clearing one session removes only that session from the file."""
        runner, home = runner_with_home
        runner._increment_restart_failure_counts({"session:a", "session:b"})
        runner._clear_restart_failure_count("session:a")

        counter_file = home / runner._STUCK_LOOP_FILE
        recorded = json.loads(counter_file.read_text())
        assert "session:a" not in recorded
        assert "session:b" in recorded

    def test_clear_removes_file_when_empty(self, runner_with_home):
        """Clearing the last tracked session deletes the counter file."""
        runner, home = runner_with_home
        runner._increment_restart_failure_counts({"session:a"})
        runner._clear_restart_failure_count("session:a")

        counter_file = home / runner._STUCK_LOOP_FILE
        assert not counter_file.exists()

    def test_suspend_clears_file(self, runner_with_home):
        """Running the startup suspension pass removes the counter file."""
        runner, home = runner_with_home
        for _ in range(3):
            runner._increment_restart_failure_counts({"session:a"})

        self._install_unsuspended_entry(runner, "session:a")
        runner.session_store._save = MagicMock()

        runner._suspend_stuck_loop_sessions()

        counter_file = home / runner._STUCK_LOOP_FILE
        assert not counter_file.exists()

    def test_no_file_no_crash(self, runner_with_home):
        """Both suspend and clear tolerate a missing counter file."""
        runner, home = runner_with_home
        # No file exists: should return 0 and not crash.
        assert runner._suspend_stuck_loop_sessions() == 0
        # Clear on a nonexistent file: should not crash.
        runner._clear_restart_failure_count("nonexistent")
|