fix(gateway): fail closed when session.delete can't enumerate active sessions

If a concurrent RPC mutates _sessions while session.delete is iterating
it (e.g. a parallel session.create on the thread pool), the bare except
swallowed the RuntimeError and let the delete proceed against a row
that may still be live.  Snapshot via list(_sessions.values()) and
return an error when even that raises, instead of treating "couldn't
check" as "no active sessions."
This commit is contained in:
Brooklyn Nicholson 2026-04-29 19:26:13 -05:00 committed by Teknium
parent 49fcad8cf8
commit 8dcab19d02
2 changed files with 35 additions and 4 deletions

View file

@ -2835,6 +2835,32 @@ def test_session_delete_refuses_active_session(monkeypatch):
assert called == [], "delete_session must not be called for active sessions"
def test_session_delete_fails_closed_when_active_snapshot_raises(monkeypatch):
"""Concurrent ``_sessions`` mutation from another RPC thread can raise
``RuntimeError: dictionary changed size during iteration``. When the
handler can't enumerate active sessions safely it must refuse the
delete (fail closed) rather than fall through and allow it."""
class _DB:
def delete_session(self, *a, **kw):
raise AssertionError("delete must not run when active snapshot fails")
class _ExplodingDict:
def values(self):
raise RuntimeError("dictionary changed size during iteration")
monkeypatch.setattr(server, "_get_db", lambda: _DB())
monkeypatch.setattr(server, "_sessions", _ExplodingDict())
resp = server.handle_request(
{"id": "1", "method": "session.delete", "params": {"session_id": "x"}}
)
assert "error" in resp
assert resp["error"]["code"] == 5036
assert "enumerate active sessions" in resp["error"]["message"]
def test_session_delete_returns_4007_when_missing(monkeypatch):
class _DB:
def delete_session(self, sid, sessions_dir=None):

View file

@ -2112,11 +2112,16 @@ def _(rid, params: dict) -> dict:
return _db_unavailable_error(rid, code=5036)
# Block deletion of any session currently bound to a live TUI session
# in this process. The picker hides the active session anyway, but a
# racing caller could still target it.
# racing caller could still target it. Snapshot via ``list(...)``
# because ``_sessions`` is mutated by concurrent RPCs on the thread
# pool — iterating the dict directly can raise ``RuntimeError:
# dictionary changed size during iteration``. If even the snapshot
# raises, fail closed (refuse the delete) rather than fail open.
try:
active = {s.get("session_key") for s in _sessions.values() if s.get("session_key")}
except Exception:
active = set()
snapshot = list(_sessions.values())
except Exception as e:
return _err(rid, 5036, f"could not enumerate active sessions: {e}")
active = {s.get("session_key") for s in snapshot if s.get("session_key")}
if target in active:
return _err(rid, 4023, "cannot delete an active session")
sessions_dir = get_hermes_home() / "sessions"