fix: break stuck session resume loops after repeated restarts (#7536)

When a session gets stuck (hung terminal, runaway tool loop) and the user restarts the gateway, the same session history loads and puts the agent right back in the stuck state. The user is trapped in a loop: restart → stuck → restart → stuck.

Fix: track restart-failure counts per session using a simple JSON file (.restart_failure_counts). On each shutdown with active agents, the counter increments for those sessions. On startup, if any session has been active across 3+ consecutive restarts, it's auto-suspended — giving the user a clean slate on their next message.

The counter resets to 0 when a session completes a turn successfully (response delivered), so normal sessions that happen to be active during planned restarts (/restart, hermes update) won't accumulate false counts.

Implementation:
- _increment_restart_failure_counts(): called during stop() when agents are active. Writes {session_key: count} to the JSON file. Sessions NOT active are dropped (loop broken).
- _suspend_stuck_loop_sessions(): called on startup. Reads the file, suspends sessions at threshold (3), clears the file.
- _clear_restart_failure_count(): called after successful response delivery. Removes the session from the counter file.

No SessionEntry schema changes. No database migration. Pure file-based tracking that naturally cleans up.

Test plan:
- 9 new stuck-loop tests (increment, accumulate, threshold, clear, suspend, file cleanup, edge cases)
- All 28 gateway lifecycle tests pass (restart drain + auto-continue + stuck loop)
2026-04-25 00:51:20 +00:00 · 2026-04-14 17:03:47 -07:00 · 2026-04-14 17:03:47 -07:00 · 6c89306437
commit 6c89306437
parent 847d7cbea5
2 changed files with 241 additions and 0 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -1475,6 +1475,106 @@ class GatewayRunner:
            except Exception:
                pass

+    # Counter value at or above which _suspend_stuck_loop_sessions() marks a
+    # session as suspended.
+    _STUCK_LOOP_THRESHOLD = 3  # restarts while active before auto-suspend
+    # File name of the JSON counter store; it is resolved against
+    # _hermes_home by the stuck-loop helper methods.
+    _STUCK_LOOP_FILE = ".restart_failure_counts"
+
+    def _increment_restart_failure_counts(self, active_session_keys: set) -> None:
+        """Increment restart-failure counters for sessions active at shutdown.
+
+        The counters live in a JSON object (``{session_key: count}``) stored
+        at ``_hermes_home / _STUCK_LOOP_FILE`` so they survive restarts.  The
+        file is rewritten to contain ONLY the sessions in
+        ``active_session_keys``: a session that is not active at this
+        shutdown has broken its streak, so its counter is dropped.
+
+        Best-effort: a missing, unreadable or malformed file is treated as
+        empty and a failed write is logged rather than raised, so this never
+        blocks shutdown.
+
+        Args:
+            active_session_keys: Keys of the sessions whose agents were still
+                running when the gateway began shutting down.
+        """
+        import json
+
+        path = _hermes_home / self._STUCK_LOOP_FILE
+        try:
+            previous = json.loads(path.read_text())
+        except Exception:
+            # Missing, unreadable or corrupt file: start counting from zero.
+            previous = {}
+        if not isinstance(previous, dict):
+            # Valid JSON that is not an object (e.g. ``[]`` or ``null``) would
+            # otherwise raise on ``.get`` and escape into the shutdown path.
+            previous = {}
+
+        new_counts = {}
+        for key in active_session_keys:
+            prior = previous.get(key, 0)
+            # Treat hand-edited / corrupt values as "no history" instead of
+            # raising TypeError on the addition below.
+            if isinstance(prior, bool) or not isinstance(prior, int):
+                prior = 0
+            new_counts[key] = prior + 1
+
+        try:
+            path.write_text(json.dumps(new_counts))
+        except Exception as exc:
+            logger.debug("Could not persist restart-failure counts: %s", exc)
+
+    def _suspend_stuck_loop_sessions(self) -> int:
+        """Suspend sessions that have been active across too many restarts.
+
+        Called on gateway startup AFTER suspend_recently_active() to catch
+        the stuck-loop pattern: session loads → agent gets stuck → gateway
+        restarts → repeat.
+
+        Reads the ``{session_key: count}`` JSON file written at shutdown and
+        suspends every session whose count reached ``_STUCK_LOOP_THRESHOLD``.
+        Entries that reached the threshold are then removed from the file so
+        their counters start fresh.  Entries BELOW the threshold are kept:
+        startup always runs between two shutdowns, so wiping them here would
+        cap every counter at 1 and the threshold could never be reached.
+        Those entries are dropped later by
+        ``_increment_restart_failure_counts`` (session not active at a
+        shutdown) or ``_clear_restart_failure_count`` (successful turn).
+
+        Best-effort: a missing, unreadable or malformed file yields 0 and
+        individual failures are logged instead of raised.
+
+        Returns:
+            The number of sessions newly marked as suspended.
+        """
+        import json
+
+        path = _hermes_home / self._STUCK_LOOP_FILE
+        try:
+            counts = json.loads(path.read_text())
+        except Exception:
+            # No counter file (the common case) or unreadable/corrupt content.
+            return 0
+        if not isinstance(counts, dict):
+            # Valid JSON but not an object — nothing usable to act on.
+            return 0
+
+        threshold = self._STUCK_LOOP_THRESHOLD
+        remaining = {}
+        suspended = 0
+
+        for session_key, count in counts.items():
+            if isinstance(count, bool) or not isinstance(count, int):
+                # Malformed value: drop it rather than fail the comparison.
+                continue
+            if count < threshold:
+                # Streak still in progress — keep it for the next shutdown.
+                remaining[session_key] = count
+                continue
+            try:
+                entry = self.session_store._entries.get(session_key)
+                if entry and not entry.suspended:
+                    entry.suspended = True
+                    suspended += 1
+                    logger.warning(
+                        "Auto-suspended stuck session %s (active across %d "
+                        "consecutive restarts — likely a stuck loop)",
+                        session_key[:30], count,
+                    )
+            except Exception as exc:
+                logger.debug(
+                    "Could not suspend stuck session %s: %s",
+                    session_key[:30], exc,
+                )
+
+        if suspended:
+            try:
+                self.session_store._save()
+            except Exception as exc:
+                logger.warning("Failed to persist stuck-loop suspensions: %s", exc)
+
+        # Forget only the counters that were acted on (or were malformed).
+        try:
+            if not remaining:
+                path.unlink(missing_ok=True)
+            elif remaining != counts:
+                path.write_text(json.dumps(remaining))
+        except Exception as exc:
+            logger.debug("Could not update restart-failure counts: %s", exc)
+
+        return suspended
+
+    def _clear_restart_failure_count(self, session_key: str) -> None:
+        """Forget the restart-failure counter of a session that finished a turn.
+
+        Invoked after a successful agent turn: the session is evidently not
+        stuck, so its entry is removed from the JSON counter file.  The file
+        itself is deleted once no entries are left.  Any error while reading
+        or updating the file is ignored (best-effort bookkeeping).
+        """
+        import json
+
+        counter_file = _hermes_home / self._STUCK_LOOP_FILE
+        if not counter_file.exists():
+            return
+        try:
+            remaining = json.loads(counter_file.read_text())
+            if session_key not in remaining:
+                # Nothing recorded for this session; leave the file untouched.
+                return
+            del remaining[session_key]
+            if not remaining:
+                counter_file.unlink(missing_ok=True)
+            else:
+                counter_file.write_text(json.dumps(remaining))
+        except Exception:
+            pass
+
    async def _launch_detached_restart_command(self) -> None:
        import shutil
        import subprocess
@ -1618,6 +1718,17 @@ class GatewayRunner:
            except Exception as e:
                logger.warning("Session suspension on startup failed: %s", e)

+        # Stuck-loop detection (#7536): if a session has been active across
+        # 3+ consecutive restarts, it's probably stuck in a loop (the same
+        # history keeps causing the agent to hang).  Auto-suspend it so the
+        # user gets a clean slate on the next message.
+        try:
+            stuck = self._suspend_stuck_loop_sessions()
+            if stuck:
+                logger.warning("Auto-suspended %d stuck-loop session(s)", stuck)
+        except Exception as e:
+            logger.debug("Stuck-loop detection failed: %s", e)
+
        connected_count = 0
        enabled_platform_count = 0
        startup_nonretryable_errors: list[str] = []
@ -2169,6 +2280,14 @@ class GatewayRunner:
                    "active sessions."
                )

+            # Track sessions that were active at shutdown for stuck-loop
+            # detection (#7536).  On each restart, the counter increments
+            # for sessions that were running.  If a session hits the
+            # threshold (3 consecutive restarts while active), the next
+            # startup auto-suspends it — breaking the loop.
+            if active_agents:
+                self._increment_restart_failure_counts(set(active_agents.keys()))
+
            if self._restart_requested and self._restart_via_service:
                self._exit_code = GATEWAY_SERVICE_RESTART_EXIT_CODE
                self._exit_reason = self._exit_reason or "Gateway restart requested"
@ -3667,6 +3786,12 @@ class GatewayRunner:
                _response_time, _api_calls, _resp_len,
            )

+            # Successful turn — clear any stuck-loop counter for this session.
+            # This ensures the counter only accumulates across CONSECUTIVE
+            # restarts where the session was active (never completed).
+            if session_key:
+                self._clear_restart_failure_count(session_key)
+
            # Surface error details when the agent failed silently (final_response=None)
            if not response and agent_result.get("failed"):
                error_detail = agent_result.get("error", "unknown error")
--- a/tests/gateway/test_stuck_loop.py
+++ b/tests/gateway/test_stuck_loop.py
@ -0,0 +1,116 @@
+"""Tests for stuck-session loop detection (#7536).
+
+When a session is active across 3+ consecutive gateway restarts (the agent
+gets stuck, gateway restarts, same session gets stuck again), the session
+is auto-suspended on startup so the user gets a clean slate.
+"""
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+
+from tests.gateway.restart_test_helpers import make_restart_runner
+
+
+@pytest.fixture
+def runner_with_home(tmp_path, monkeypatch):
+    """Provide ``(runner, home)`` where ``home`` is a writable HERMES_HOME.
+
+    ``gateway.run._hermes_home`` is redirected to pytest's ``tmp_path`` before
+    the runner is built, so the counter file lands in a throwaway directory.
+    """
+    monkeypatch.setattr("gateway.run._hermes_home", tmp_path)
+    runner, _adapter = make_restart_runner()
+    return runner, tmp_path
+
+
+class TestStuckLoopDetection:
+    """Exercise the restart-failure counter file and the auto-suspend step."""
+
+    @staticmethod
+    def _counter_path(runner, home):
+        """Return the location of the counter file inside ``home``."""
+        return home / runner._STUCK_LOOP_FILE
+
+    @classmethod
+    def _load_counts(cls, runner, home):
+        """Parse and return the counter file's JSON content."""
+        return json.loads(cls._counter_path(runner, home).read_text())
+
+    @staticmethod
+    def _unsuspended_entry():
+        """Build a stand-in session entry that is not suspended yet."""
+        entry = MagicMock()
+        entry.suspended = False
+        return entry
+
+    def test_increment_creates_file(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a", "session:b"})
+        counter_file = self._counter_path(runner, home)
+        assert counter_file.exists()
+        stored = json.loads(counter_file.read_text())
+        assert stored["session:a"] == 1
+        assert stored["session:b"] == 1
+
+    def test_increment_accumulates(self, runner_with_home):
+        runner, home = runner_with_home
+        # Three shutdowns in a row with the same session still active.
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+        stored = self._load_counts(runner, home)
+        assert stored["session:a"] == 3
+
+    def test_increment_drops_inactive_sessions(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a", "session:b"})
+        # Second shutdown: only "session:a" is still active.
+        runner._increment_restart_failure_counts({"session:a"})
+        stored = self._load_counts(runner, home)
+        assert "session:a" in stored
+        assert "session:b" not in stored
+
+    def test_suspend_at_threshold(self, runner_with_home):
+        runner, home = runner_with_home
+        # Reach the threshold: session:a active at three shutdowns.
+        for _attempt in range(3):
+            runner._increment_restart_failure_counts({"session:a"})
+
+        entry = self._unsuspended_entry()
+        runner.session_store._entries = {"session:a": entry}
+        runner.session_store._save = MagicMock()
+
+        assert runner._suspend_stuck_loop_sessions() == 1
+        assert entry.suspended is True
+
+    def test_no_suspend_below_threshold(self, runner_with_home):
+        runner, home = runner_with_home
+        # Two shutdowns only — one short of the threshold of 3.
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._increment_restart_failure_counts({"session:a"})
+
+        entry = self._unsuspended_entry()
+        runner.session_store._entries = {"session:a": entry}
+
+        assert runner._suspend_stuck_loop_sessions() == 0
+        assert entry.suspended is False
+
+    def test_clear_on_success(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a", "session:b"})
+        runner._clear_restart_failure_count("session:a")
+
+        stored = self._load_counts(runner, home)
+        assert "session:a" not in stored
+        assert "session:b" in stored
+
+    def test_clear_removes_file_when_empty(self, runner_with_home):
+        runner, home = runner_with_home
+        runner._increment_restart_failure_counts({"session:a"})
+        runner._clear_restart_failure_count("session:a")
+        assert not self._counter_path(runner, home).exists()
+
+    def test_suspend_clears_file(self, runner_with_home):
+        runner, home = runner_with_home
+        for _attempt in range(3):
+            runner._increment_restart_failure_counts({"session:a"})
+
+        entry = self._unsuspended_entry()
+        runner.session_store._entries = {"session:a": entry}
+        runner.session_store._save = MagicMock()
+
+        runner._suspend_stuck_loop_sessions()
+        assert not self._counter_path(runner, home).exists()
+
+    def test_no_file_no_crash(self, runner_with_home):
+        runner, home = runner_with_home
+        # With no counter file present, startup detection reports nothing...
+        assert runner._suspend_stuck_loop_sessions() == 0
+        # ...and clearing an unknown session is a harmless no-op.
+        runner._clear_restart_failure_count("nonexistent")