refactor(yuanbao): migrate recall to load_transcript()

Yuanbao's recall feature was reading the gateway JSONL directly to look up
messages by platform message_id, which state.db does not preserve. Recall now
uses load_transcript(), which returns the messages stored in state.db.

Recall branch A1 (message_id match) now falls through to A2 (content match)
or B (system note) for all sessions — a documented degradation. Follow-up
issue: add platform_message_id column to state.db messages to restore
exact-id matching.
This commit is contained in:
yoniebans 2026-05-20 09:21:17 +02:00 committed by Teknium
parent 024a8e3ee9
commit 971cfaa38c
3 changed files with 37 additions and 12 deletions

View file

@ -1410,19 +1410,19 @@ class RecallGuardMiddleware(InboundMiddleware):
logger.warning("[%s] Recall: failed to resolve session: %s", adapter.name, exc)
return
# Read JSONL directly — SQLite doesn't preserve message_id field.
transcript: list = []
# Load transcript from canonical store (state.db).
#
# Branch A1 below tries to match the recalled message by its platform
# `message_id`. state.db does NOT preserve `message_id` (only its own
# autoincrement primary key), so A1 will not match for any message
# persisted post-DB-canonical (i.e. all messages going forward). Recall
# falls through to A2 (content match) or B (system redaction note), both
# of which work DB-only.
#
# TODO: add a `platform_message_id` column to state.db messages to restore
# exact-id matching. Tracked separately.
try:
path = store.get_transcript_path(sid)
if path.exists():
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
try:
transcript.append(json.loads(line))
except json.JSONDecodeError:
pass
transcript = store.load_transcript(sid)
except Exception as exc:
logger.warning("[%s] Recall: failed to load transcript: %s", adapter.name, exc)
return

View file

View file

@ -0,0 +1,25 @@
"""Yuanbao recall: branch A2 (content-match) works without JSONL message_id."""
from gateway.session import SessionStore
from gateway.config import GatewayConfig
def test_recall_falls_through_to_content_match_without_message_id(tmp_path):
    """Content matching (branch A2) still finds the message when no entry has a message_id."""
    session_id = "test-yuanbao-recall"
    store = SessionStore(sessions_dir=tmp_path, config=GatewayConfig())
    store._db.create_session(session_id=session_id, source="yuanbao:group:G")

    # Seed one user turn (the message to be recalled) and one assistant reply.
    seeded_entries = (
        {"role": "user", "content": "sensitive content", "timestamp": 1.0},
        {"role": "assistant", "content": "ack", "timestamp": 2.0},
    )
    for entry in seeded_entries:
        store.append_to_transcript(session_id, entry)

    # After the migration, load_transcript() is expected to return DB-backed
    # entries, none of which carries a message_id key.
    messages = store.load_transcript(session_id)
    assert not any("message_id" in entry for entry in messages), \
        "DB-only history should not carry message_id"

    # Branch A2: locate the first user entry whose content matches exactly.
    target = None
    for entry in messages:
        if entry.get("role") == "user" and entry.get("content") == "sensitive content":
            target = entry
            break
    assert target is not None
    # A caller would then redact:
    #   target["content"] = REDACTED; store.rewrite_transcript(sid, history)