diff --git a/gateway/run.py b/gateway/run.py
index a83fa2eed..d137d73c3 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1475,6 +1475,132 @@ class GatewayRunner:
         except Exception:
             pass
 
+    _STUCK_LOOP_THRESHOLD = 3  # restarts while active before auto-suspend
+    _STUCK_LOOP_FILE = ".restart_failure_counts"
+
+    def _load_restart_failure_counts(self) -> dict:
+        """Return the persisted restart-failure counters, or {} if unusable.
+
+        Best-effort by design: a missing, unreadable or corrupt file yields
+        an empty mapping.  Values that are not integers are dropped so the
+        callers can compare and increment without type checks.
+        """
+        import json
+
+        path = _hermes_home / self._STUCK_LOOP_FILE
+        try:
+            raw = json.loads(path.read_text(encoding="utf-8"))
+        except FileNotFoundError:
+            return {}
+        except Exception as e:
+            logger.debug("Ignoring unreadable %s: %s", path, e)
+            return {}
+        if not isinstance(raw, dict):
+            logger.debug("Ignoring malformed %s (not a JSON object)", path)
+            return {}
+        # bool is a subclass of int; a JSON ``true`` is not a valid counter.
+        return {
+            key: value
+            for key, value in raw.items()
+            if isinstance(value, int) and not isinstance(value, bool)
+        }
+
+    def _store_restart_failure_counts(self, counts: dict) -> None:
+        """Persist *counts*; remove the file when there is nothing to track.
+
+        Failures are logged and swallowed: losing a counter only delays
+        stuck-loop detection, it must never break shutdown or a turn.
+        """
+        import json
+
+        path = _hermes_home / self._STUCK_LOOP_FILE
+        try:
+            if counts:
+                path.write_text(json.dumps(counts), encoding="utf-8")
+            else:
+                path.unlink(missing_ok=True)
+        except Exception as e:
+            logger.debug("Could not update %s: %s", path, e)
+
+    def _increment_restart_failure_counts(self, active_session_keys: set) -> None:
+        """Increment restart-failure counters for sessions active at shutdown.
+
+        Persists to a JSON file so counters survive across restarts.
+        Sessions NOT in active_session_keys are dropped from the file: they
+        were idle at this shutdown, so their streak of consecutive restarts
+        is broken.
+
+        Note: the shutdown path only calls this when at least one agent is
+        active, so a shutdown with no active sessions leaves the file as is.
+        """
+        previous = self._load_restart_failure_counts()
+        # Rebuild from the active set rather than updating in place — that
+        # is what discards the sessions that were not running.
+        self._store_restart_failure_counts(
+            {key: previous.get(key, 0) + 1 for key in active_session_keys}
+        )
+
+    def _suspend_stuck_loop_sessions(self) -> int:
+        """Suspend sessions that have been active across too many restarts.
+
+        Returns the number of sessions suspended.  Called on gateway startup
+        AFTER suspend_recently_active() to catch the stuck-loop pattern:
+        session loads → agent gets stuck → gateway restarts → repeat.
+
+        Only counters that reached the threshold are reset here.  Counters
+        still below it are kept, otherwise every startup would wipe the
+        streak and no session could ever accumulate enough restarts.
+        """
+        counts = self._load_restart_failure_counts()
+        stuck = {
+            key: restarts
+            for key, restarts in counts.items()
+            if restarts >= self._STUCK_LOOP_THRESHOLD
+        }
+        if not stuck:
+            return 0
+
+        suspended = 0
+        for session_key, restarts in stuck.items():
+            try:
+                entry = self.session_store._entries.get(session_key)
+                if entry and not entry.suspended:
+                    entry.suspended = True
+                    suspended += 1
+                    logger.warning(
+                        "Auto-suspended stuck session %s (active across %d "
+                        "consecutive restarts — likely a stuck loop)",
+                        session_key[:30], restarts,
+                    )
+            except Exception as e:
+                logger.debug(
+                    "Could not suspend stuck session %s: %s", session_key[:30], e
+                )
+
+        if suspended:
+            try:
+                self.session_store._save()
+            except Exception as e:
+                logger.warning("Could not persist auto-suspended sessions: %s", e)
+
+        # Stuck sessions start from zero again, whether or not an entry was
+        # found for them; everything else keeps its running count.
+        self._store_restart_failure_counts(
+            {key: n for key, n in counts.items() if key not in stuck}
+        )
+        return suspended
+
+    def _clear_restart_failure_count(self, session_key: str) -> None:
+        """Clear the restart-failure counter for a session that completed OK.
+
+        Called after a successful agent turn to signal the loop is broken.
+        The file is only rewritten when the session actually had a counter.
+        """
+        counts = self._load_restart_failure_counts()
+        if session_key in counts:
+            del counts[session_key]
+            self._store_restart_failure_counts(counts)
+
     async def _launch_detached_restart_command(self) -> None:
         import shutil
         import subprocess
@@ -1618,6 +1744,17 @@ class GatewayRunner:
         except Exception as e:
             logger.warning("Session suspension on startup failed: %s", e)
 
+        # Stuck-loop detection (#7536): if a session has been active across
+        # 3+ consecutive restarts, it's probably stuck in a loop (the same
+        # history keeps causing the agent to hang).  Auto-suspend it so the
+        # user gets a clean slate on the next message.
+        try:
+            stuck = self._suspend_stuck_loop_sessions()
+            if stuck:
+                logger.warning("Auto-suspended %d stuck-loop session(s)", stuck)
+        except Exception as e:
+            logger.debug("Stuck-loop detection failed: %s", e)
+
         connected_count = 0
         enabled_platform_count = 0
         startup_nonretryable_errors: list[str] = []
@@ -2169,6 +2306,14 @@ class GatewayRunner:
                         "active sessions."
                     )
 
+            # Track sessions that were active at shutdown for stuck-loop
+            # detection (#7536).  On each restart, the counter increments
+            # for sessions that were running.  If a session hits the
+            # threshold (3 consecutive restarts while active), the next
+            # startup auto-suspends it — breaking the loop.
+            if active_agents:
+                self._increment_restart_failure_counts(set(active_agents.keys()))
+
             if self._restart_requested and self._restart_via_service:
                 self._exit_code = GATEWAY_SERVICE_RESTART_EXIT_CODE
                 self._exit_reason = self._exit_reason or "Gateway restart requested"
@@ -3667,6 +3812,12 @@ class GatewayRunner:
                 _response_time, _api_calls, _resp_len,
             )
 
+            # Successful turn — clear any stuck-loop counter for this session.
+            # This ensures the counter only accumulates across CONSECUTIVE
+            # restarts where the session was active (never completed).
+            if session_key:
+                self._clear_restart_failure_count(session_key)
+
             # Surface error details when the agent failed silently (final_response=None)
             if not response and agent_result.get("failed"):
                 error_detail = agent_result.get("error", "unknown error")
diff --git a/tests/gateway/test_stuck_loop.py b/tests/gateway/test_stuck_loop.py
new file mode 100644
index 000000000..a26f29a2b
--- /dev/null
+++ b/tests/gateway/test_stuck_loop.py
@@ -0,0 +1,138 @@
+"""Tests for stuck-session loop detection (#7536).
+
+When a session is active across 3+ consecutive gateway restarts (the agent
+gets stuck, gateway restarts, same session gets stuck again), the session
+is auto-suspended on startup so the user gets a clean slate.
+"""
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+from tests.gateway.restart_test_helpers import make_restart_runner
+
+
+@pytest.fixture
+def runner_with_home(tmp_path, monkeypatch):
+    """Create a runner with a writable HERMES_HOME."""
+    monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
+    runner, _adapter = make_restart_runner()
+    return runner, tmp_path
+
+
+class TestStuckLoopDetection:
+
+    def test_increment_creates_file(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a", "session:b"})
+        path = home / runner._STUCK_LOOP_FILE
+        assert path.exists()
+        counts = json.loads(path.read_text())
+        assert counts["session:a"] == 1
+        assert counts["session:b"] == 1
+
+    def test_increment_accumulates(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+        counts = json.loads((home / runner._STUCK_LOOP_FILE).read_text())
+        assert counts["session:a"] == 3
+
+    def test_increment_drops_inactive_sessions(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a", "session:b"})
+        runner._increment_restart_failure_counts({"session:a"})  # b not active
+        counts = json.loads((home / runner._STUCK_LOOP_FILE).read_text())
+        assert "session:a" in counts
+        assert "session:b" not in counts
+
+    def test_suspend_at_threshold(self, runner_with_home):
+        runner, home = runner_with_home
+        # Simulate 3 restarts with session:a active each time
+        for _ in range(3):
+            runner._increment_restart_failure_counts({"session:a"})
+
+        # Create a mock session entry
+        mock_entry = MagicMock()
+        mock_entry.suspended = False
+        runner.session_store._entries = {"session:a": mock_entry}
+        runner.session_store._save = MagicMock()
+
+        suspended = runner._suspend_stuck_loop_sessions()
+        assert suspended == 1
+        assert mock_entry.suspended is True
+
+    def test_no_suspend_below_threshold(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+        # Only 2 restarts — below threshold of 3
+
+        mock_entry = MagicMock()
+        mock_entry.suspended = False
+        runner.session_store._entries = {"session:a": mock_entry}
+
+        suspended = runner._suspend_stuck_loop_sessions()
+        assert suspended == 0
+        assert mock_entry.suspended is False
+
+    def test_clear_on_success(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a", "session:b"})
+        runner._clear_restart_failure_count("session:a")
+
+        path = home / runner._STUCK_LOOP_FILE
+        counts = json.loads(path.read_text())
+        assert "session:a" not in counts
+        assert "session:b" in counts
+
+    def test_clear_removes_file_when_empty(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._clear_restart_failure_count("session:a")
+        assert not (home / runner._STUCK_LOOP_FILE).exists()
+
+    def test_suspend_clears_file(self, runner_with_home):
+        runner, home = runner_with_home
+        for _ in range(3):
+            runner._increment_restart_failure_counts({"session:a"})
+
+        mock_entry = MagicMock()
+        mock_entry.suspended = False
+        runner.session_store._entries = {"session:a": mock_entry}
+        runner.session_store._save = MagicMock()
+
+        runner._suspend_stuck_loop_sessions()
+        assert not (home / runner._STUCK_LOOP_FILE).exists()
+
+    def test_no_file_no_crash(self, runner_with_home):
+        runner, home = runner_with_home
+        # No file exists — should return 0 and not crash
+        assert runner._suspend_stuck_loop_sessions() == 0
+        # Clear on nonexistent file — should not crash
+        runner._clear_restart_failure_count("nonexistent")
+
+    def test_below_threshold_counters_survive_startup(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+        runner.session_store._entries = {}
+
+        # A startup check that suspends nothing must not reset the streak,
+        # otherwise no session could ever reach the threshold.
+        assert runner._suspend_stuck_loop_sessions() == 0
+        counts = json.loads((home / runner._STUCK_LOOP_FILE).read_text())
+        assert counts == {"session:a": 2}
+
+    def test_malformed_file_is_ignored(self, runner_with_home):
+        runner, home = runner_with_home
+        path = home / runner._STUCK_LOOP_FILE
+        path.write_text("[1, 2, 3]")  # valid JSON, but not a mapping
+
+        assert runner._suspend_stuck_loop_sessions() == 0
+        runner._clear_restart_failure_count("session:a")
+        runner._increment_restart_failure_counts({"session:a"})
+        assert json.loads(path.read_text()) == {"session:a": 1}