refactor(gateway): drop JSONL fallback in load_transcript

state.db is canonical. The 'use whichever source is longer' branch was
defensive code for the pre-DB migration; it has not fired on any real DB
(verified on a session corpus with 27 JSONL files / 950 sessions —
zero cases where the JSONL transcript was longer than the DB).

Test changes:
- TestLoadTranscriptCorruptLines: deleted (tested dead JSONL code path)
- TestLoadTranscriptPreferLongerSource: deleted (tested removed fallback)
- Replaced with TestLoadTranscriptDBOnly (DB-only reads)
- TestSessionStoreRewriteTranscript: fixture now creates DB session
- test_gateway_retry_replaces_last_user_turn: fixture uses real DB
This commit is contained in:
yoniebans 2026-05-20 09:20:09 +02:00 committed by Teknium
parent 1d27be0ff3
commit 024a8e3ee9
4 changed files with 35 additions and 229 deletions

View file

@@ -1312,58 +1312,19 @@ class SessionStore:
f.write(json.dumps(msg, ensure_ascii=False) + "\n")
def load_transcript(self, session_id: str) -> List[Dict[str, Any]]:
"""Load all messages from a session's transcript."""
db_messages = []
# Try SQLite first
if self._db:
try:
db_messages = self._db.get_messages_as_conversation(session_id)
except Exception as e:
logger.debug("Could not load messages from DB: %s", e)
"""Load all messages from a session's transcript.
# Load legacy JSONL transcript (may contain more history than SQLite
# for sessions created before the DB layer was introduced).
transcript_path = self.get_transcript_path(session_id)
jsonl_messages = []
if transcript_path.exists():
try:
with open(transcript_path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
try:
jsonl_messages.append(json.loads(line))
except json.JSONDecodeError:
logger.warning(
"Skipping corrupt line in transcript %s: %s",
session_id, line[:120],
)
except OSError as e:
# JSONL is the legacy compatibility store. If it becomes
# unreadable, keep gateway recovery working by falling back to
# SQLite rows loaded above (or [] when no DB exists).
logger.debug("Failed to read JSONL transcript for %s: %s", session_id, e)
# Prefer whichever source has more messages.
#
# Background: when a session pre-dates SQLite storage (or when the DB
# layer was added while a long-lived session was already active), the
# first post-migration turn writes only the *new* messages to SQLite
# (because _flush_messages_to_session_db skips messages already in
# conversation_history, assuming they're persisted). On the *next*
# turn load_transcript returns those few SQLite rows and ignores the
# full JSONL history — the model sees a context of 1-4 messages instead
# of hundreds. Using the longer source prevents this silent truncation.
if len(jsonl_messages) > len(db_messages):
if db_messages:
logger.debug(
"Session %s: JSONL has %d messages vs SQLite %d"
"using JSONL (legacy session not yet fully migrated)",
session_id, len(jsonl_messages), len(db_messages),
)
return jsonl_messages
return db_messages
state.db is the canonical store. The legacy JSONL fallback was removed
in spec 002 — pre-DB sessions on existing disks have already been
migrated (their DB row holds the full message history).
"""
if not self._db:
return []
try:
return self._db.get_messages_as_conversation(session_id)
except Exception as e:
logger.debug("Could not load messages from DB: %s", e)
return []
def build_session_context(