From 6b408e131c48d90d44b5f1a33c9da26b70cdc7a1 Mon Sep 17 00:00:00 2001
From: haileymarshall <haileymarshall005@gmail.com>
Date: Sat, 18 Apr 2026 13:18:36 +0100
Subject: [PATCH] fix(gateway): pass session_key (not session_id) to
 active-process check during prune
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SessionStore.prune_old_entries was calling
self._has_active_processes_fn(entry.session_id) but the callback wired
up in gateway/run.py is process_registry.has_active_for_session, which
compares against session_key, not session_id. Every other caller in
session.py (_is_session_expired, _should_reset) already passes
session_key, so prune was the only outlier — and because session_id and
session_key live in different namespaces, the guard never fired.

Result in production: sessions with live background processes (queued
cron output, detached agents, long-running Bash) were pruned out of
_entries despite the docstring promising they'd be preserved. When the
process finished and tried to deliver output, the session_key to
session_id mapping was gone and the work was effectively orphaned.

Also update the existing test_prune_skips_entries_with_active_processes,
which was checking the wrong interface (its mock callback took session_id
so it agreed with the buggy implementation). The test now uses a
session_key-based mock, matching the production callback's real contract,
and a new regression guard test pins the behaviour.

Swallowed exceptions inside the prune loop now log at debug level instead
of silently disappearing.
---
 gateway/session.py                        | 12 ++++++--
 tests/gateway/test_session_store_prune.py | 37 ++++++++++++++++++++---
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/gateway/session.py b/gateway/session.py
index 8b31c2b0a..81278e852 100644
--- a/gateway/session.py
+++ b/gateway/session.py
@@ -926,12 +926,18 @@ class SessionStore:
                     continue
                 # Never prune sessions with an active background process
                 # attached — the user may still be waiting on output.
+                # The callback is keyed by session_key (see process_registry.
+                # has_active_for_session); passing session_id here used to
+                # never match, so active sessions got pruned anyway.
                 if self._has_active_processes_fn is not None:
                     try:
-                        if self._has_active_processes_fn(entry.session_id):
+                        if self._has_active_processes_fn(entry.session_key):
                             continue
-                    except Exception:
-                        pass
+                    except Exception as exc:
+                        logger.debug(
+                            "has_active_processes_fn raised during prune for %s: %s",
+                            entry.session_key, exc,
+                        )
                 if entry.updated_at < cutoff:
                     removed_keys.append(key)
             for key in removed_keys:
diff --git a/tests/gateway/test_session_store_prune.py b/tests/gateway/test_session_store_prune.py
index 9b1dca297..34fa21e25 100644
--- a/tests/gateway/test_session_store_prune.py
+++ b/tests/gateway/test_session_store_prune.py
@@ -117,11 +117,20 @@ class TestPruneBasics:
         assert "idle" not in store._entries
 
     def test_prune_skips_entries_with_active_processes(self, tmp_path):
-        """Sessions with active bg processes aren't pruned even if old."""
-        active_session_ids = {"sid_active"}
+        """Sessions with active bg processes aren't pruned even if old.
 
-        def _has_active(session_id: str) -> bool:
-            return session_id in active_session_ids
+        The callback is keyed by session_key — matching what
+        process_registry.has_active_for_session() actually consumes in
+        gateway/run.py.  Prior to the fix this test passed the callback a
+        session_id, which silently matched an implementation bug where
+        prune_old_entries was also passing session_id; real-world usage
+        (via process_registry) takes a session_key and never matched, so
+        active sessions were still being pruned.
+        """
+        active_session_keys = {"active"}
+
+        def _has_active(session_key: str) -> bool:
+            return session_key in active_session_keys
 
         store = _make_store(tmp_path, has_active_processes_fn=_has_active)
         store._entries["active"] = _entry(
@@ -137,6 +146,26 @@ class TestPruneBasics:
         assert "active" in store._entries
         assert "idle" not in store._entries
 
+    def test_prune_active_check_uses_session_key_not_session_id(self, tmp_path):
+        """Regression guard: a callback that only recognises session_ids must
+        NOT protect entries during prune.  This pins the fix so a future
+        refactor can't silently revert to passing session_id again.
+        """
+        def _recognises_only_ids(identifier: str) -> bool:
+            return identifier.startswith("sid_")
+
+        store = _make_store(tmp_path, has_active_processes_fn=_recognises_only_ids)
+        store._entries["active"] = _entry(
+            "active", age_days=1000, session_id="sid_active"
+        )
+
+        removed = store.prune_old_entries(max_age_days=90)
+
+        # Entry is pruned because the callback receives "active" (session_key),
+        # not "sid_active" (session_id), so _recognises_only_ids returns False.
+        assert removed == 1
+        assert "active" not in store._entries
+
     def test_prune_does_not_write_disk_when_no_removals(self, tmp_path):
         """If nothing is evictable, _save() should NOT be called."""
         store = _make_store(tmp_path)