fix(gateway): pass session_key (not session_id) to active-process check during prune

SessionStore.prune_old_entries was calling
self._has_active_processes_fn(entry.session_id) but the callback wired
up in gateway/run.py is process_registry.has_active_for_session, which
compares against session_key, not session_id. Every other caller in
session.py (_is_session_expired, _should_reset) already passes
session_key, so prune was the only outlier — and because session_id and
session_key live in different namespaces, the guard never fired.

Result in production: sessions with live background processes (queued
cron output, detached agents, long-running Bash) were pruned out of
_entries despite the docstring promising they'd be preserved. When the
process finished and tried to deliver output, the session_key to
session_id mapping was gone and the work was effectively orphaned.

Also update the existing test_prune_skips_entries_with_active_processes,
which was checking the wrong interface (its mock callback took session_id
so it agreed with the buggy implementation). The test now uses a
session_key-based mock, matching the production callback's real contract,
and a new regression guard test pins the behaviour.

Swallowed exceptions inside the prune loop now log at debug level instead
of silently disappearing.
This commit is contained in:
haileymarshall 2026-04-18 13:18:36 +01:00 committed by Teknium
parent eba7c869bb
commit 6b408e131c
2 changed files with 42 additions and 7 deletions

View file

@ -926,12 +926,18 @@ class SessionStore:
continue
# Never prune sessions with an active background process
# attached — the user may still be waiting on output.
# The callback is keyed by session_key (see process_registry.
# has_active_for_session); passing session_id here used to
# never match, so active sessions got pruned anyway.
if self._has_active_processes_fn is not None:
try:
if self._has_active_processes_fn(entry.session_id):
if self._has_active_processes_fn(entry.session_key):
continue
except Exception:
pass
except Exception as exc:
logger.debug(
"has_active_processes_fn raised during prune for %s: %s",
entry.session_key, exc,
)
if entry.updated_at < cutoff:
removed_keys.append(key)
for key in removed_keys:

View file

@ -117,11 +117,20 @@ class TestPruneBasics:
assert "idle" not in store._entries
def test_prune_skips_entries_with_active_processes(self, tmp_path):
"""Sessions with active bg processes aren't pruned even if old."""
active_session_ids = {"sid_active"}
"""Sessions with active bg processes aren't pruned even if old.
def _has_active(session_id: str) -> bool:
return session_id in active_session_ids
The callback is keyed by session_key matching what
process_registry.has_active_for_session() actually consumes in
gateway/run.py. Prior to the fix this test passed the callback a
session_id, which silently matched an implementation bug where
prune_old_entries was also passing session_id; real-world usage
(via process_registry) takes a session_key and never matched, so
active sessions were still being pruned.
"""
active_session_keys = {"active"}
def _has_active(session_key: str) -> bool:
return session_key in active_session_keys
store = _make_store(tmp_path, has_active_processes_fn=_has_active)
store._entries["active"] = _entry(
@ -137,6 +146,26 @@ class TestPruneBasics:
assert "active" in store._entries
assert "idle" not in store._entries
def test_prune_active_check_uses_session_key_not_session_id(self, tmp_path):
"""Regression guard: a callback that only recognises session_ids must
NOT protect entries during prune. This pins the fix so a future
refactor can't silently revert to passing session_id again.
"""
def _recognises_only_ids(identifier: str) -> bool:
return identifier.startswith("sid_")
store = _make_store(tmp_path, has_active_processes_fn=_recognises_only_ids)
store._entries["active"] = _entry(
"active", age_days=1000, session_id="sid_active"
)
removed = store.prune_old_entries(max_age_days=90)
# Entry is pruned because the callback receives "active" (session_key),
# not "sid_active" (session_id), so _recognises_only_ids returns False.
assert removed == 1
assert "active" not in store._entries
def test_prune_does_not_write_disk_when_no_removals(self, tmp_path):
"""If nothing is evictable, _save() should NOT be called."""
store = _make_store(tmp_path)