fix(gateway): pass session_key (not session_id) to active-process check during prune

SessionStore.prune_old_entries was calling self._has_active_processes_fn(entry.session_id) but the callback wired up in gateway/run.py is process_registry.has_active_for_session, which compares against session_key, not session_id. Every other caller in session.py (_is_session_expired, _should_reset) already passes session_key, so prune was the only outlier — and because session_id and session_key live in different namespaces, the guard never fired. Result in production: sessions with live background processes (queued cron output, detached agents, long-running Bash) were pruned out of _entries despite the docstring promising they'd be preserved. When the process finished and tried to deliver output, the session_key to session_id mapping was gone and the work was effectively orphaned. Also update the existing test_prune_skips_entries_with_active_processes, which was checking the wrong interface (its mock callback took session_id so it agreed with the buggy implementation). The test now uses a session_key-based mock, matching the production callback's real contract, and a new regression guard test pins the behaviour. Swallowed exceptions inside the prune loop now log at debug level instead of silently disappearing.
2026-04-25 00:51:20 +00:00 · 2026-04-18 13:18:36 +01:00 · 2026-04-18 13:18:36 +01:00 · 6b408e131c
commit 6b408e131c
parent eba7c869bb
2 changed files with 42 additions and 7 deletions
--- a/gateway/session.py
+++ b/gateway/session.py
@ -926,12 +926,18 @@ class SessionStore:
                    continue
                # Never prune sessions with an active background process
                # attached — the user may still be waiting on output.
+                # The callback is keyed by session_key (see process_registry.
+                # has_active_for_session); passing session_id here used to
+                # never match, so active sessions got pruned anyway.
                if self._has_active_processes_fn is not None:
                    try:
-                        if self._has_active_processes_fn(entry.session_id):
+                        if self._has_active_processes_fn(entry.session_key):
                            continue
-                    except Exception:
-                        pass
+                    except Exception as exc:
+                        logger.debug(
+                            "has_active_processes_fn raised during prune for %s: %s",
+                            entry.session_key, exc,
+                        )
                if entry.updated_at < cutoff:
                    removed_keys.append(key)
            for key in removed_keys:
--- a/tests/gateway/test_session_store_prune.py
+++ b/tests/gateway/test_session_store_prune.py
@ -117,11 +117,20 @@ class TestPruneBasics:
        assert "idle" not in store._entries

    def test_prune_skips_entries_with_active_processes(self, tmp_path):
-        """Sessions with active bg processes aren't pruned even if old."""
-        active_session_ids = {"sid_active"}
+        """Sessions with active bg processes aren't pruned even if old.

-        def _has_active(session_id: str) -> bool:
-            return session_id in active_session_ids
+        The callback is keyed by session_key — matching what
+        process_registry.has_active_for_session() actually consumes in
+        gateway/run.py.  Prior to the fix this test passed the callback a
+        session_id, which silently matched an implementation bug where
+        prune_old_entries was also passing session_id; real-world usage
+        (via process_registry) takes a session_key and never matched, so
+        active sessions were still being pruned.
+        """
+        active_session_keys = {"active"}
+
+        def _has_active(session_key: str) -> bool:
+            return session_key in active_session_keys

        store = _make_store(tmp_path, has_active_processes_fn=_has_active)
        store._entries["active"] = _entry(
@ -137,6 +146,26 @@ class TestPruneBasics:
        assert "active" in store._entries
        assert "idle" not in store._entries

+    def test_prune_active_check_uses_session_key_not_session_id(self, tmp_path):
+        """Regression guard: a callback that only recognises session_ids must
+        NOT protect entries during prune.  This pins the fix so a future
+        refactor can't silently revert to passing session_id again.
+        """
+        def _recognises_only_ids(identifier: str) -> bool:
+            return identifier.startswith("sid_")
+
+        store = _make_store(tmp_path, has_active_processes_fn=_recognises_only_ids)
+        store._entries["active"] = _entry(
+            "active", age_days=1000, session_id="sid_active"
+        )
+
+        removed = store.prune_old_entries(max_age_days=90)
+
+        # Entry is pruned because the callback receives "active" (session_key),
+        # not "sid_active" (session_id), so _recognises_only_ids returns False.
+        assert removed == 1
+        assert "active" not in store._entries
+
    def test_prune_does_not_write_disk_when_no_removals(self, tmp_path):
        """If nothing is evictable, _save() should NOT be called."""
        store = _make_store(tmp_path)