test(tui): isolate session-create no-race test from shard-sibling leakage (#47230)

test_session_create_no_race_keeps_worker_alive flaked on CI shard 3 with
'build thread unregistered its own notify despite no race' while passing
20/20 in isolation locally. Root cause: daemon build threads from sibling
session.create tests in the same shard process mutate the shared
server._sessions dict under _sessions_lock and can replace/pop entries
mid-run, flipping this build thread's 'replaced' check (server.py:1011) to
True and triggering a spurious unregister_gateway_notify.

The fix is test-only: snapshot and clear server._sessions before the
request so the test sees only its own session, then restore the sibling
sessions in a finally block. Also assert that agent_ready.wait() actually
returned True (a timeout was previously ignored silently) and raise the
timeout from 2s to 10s for loaded CI runners.
This commit is contained in:
Teknium 2026-06-16 05:56:50 -07:00 committed by GitHub
parent 1ac76a9472
commit 2483200963
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -4901,33 +4901,45 @@ def test_session_create_no_race_keeps_worker_alive(monkeypatch):
)
monkeypatch.setattr(_approval, "load_permanent_allowlist", lambda: None)
resp = server.handle_request(
{
"id": "1",
"method": "session.create",
"params": {"cols": 80},
}
)
sid = resp["result"]["session_id"]
# Isolate from sibling-test leakage: daemon build threads from prior
# session.create tests in the same shard process mutate the shared
# ``server._sessions`` dict under ``_sessions_lock`` and can replace/pop
# entries mid-run, which would flip this build thread's ``replaced`` check
# to True and trigger a spurious unregister. Snapshot, clear, and restore
# so this test sees only its own session regardless of shard composition.
_saved_sessions = dict(server._sessions)
server._sessions.clear()
# Wait for the build to finish (ready event inside session dict).
session = server._sessions[sid]
session["agent_ready"].wait(timeout=2.0)
try:
resp = server.handle_request(
{
"id": "1",
"method": "session.create",
"params": {"cols": 80},
}
)
sid = resp["result"]["session_id"]
# Build finished without a close race — nothing should have been
# cleaned up by the orphan check.
assert (
closed_workers == []
), f"build thread closed its own worker despite no race: {closed_workers}"
assert (
unregistered_keys == []
), f"build thread unregistered its own notify despite no race: {unregistered_keys}"
# Wait for the build to finish (ready event inside session dict).
session = server._sessions[sid]
built = session["agent_ready"].wait(timeout=10.0)
assert built, "agent build did not complete within timeout"
# Session should have the live worker installed.
assert session.get("slash_worker") is not None
# Build finished without a close race — nothing should have been
# cleaned up by the orphan check.
assert (
closed_workers == []
), f"build thread closed its own worker despite no race: {closed_workers}"
assert (
unregistered_keys == []
), f"build thread unregistered its own notify despite no race: {unregistered_keys}"
# Cleanup
server._sessions.pop(sid, None)
# Session should have the live worker installed.
assert session.get("slash_worker") is not None
finally:
# Cleanup + restore sibling sessions we snapshotted.
server._sessions.clear()
server._sessions.update(_saved_sessions)
def test_get_db_degrades_cleanly_when_sessiondb_init_fails(monkeypatch):