refactor(yuanbao): migrate recall to load_transcript()

Yuanbao's recall feature read the gateway JSONL directly so that it could look up messages by platform message_id, a field that state.db does not preserve. Recall now uses load_transcript(), which returns messages from the DB. Because those messages carry no message_id, recall branch A1 (message_id match) can no longer match and falls through to A2 (content match) or B (system note) for all sessions — a documented degradation. Follow-up issue: add a platform_message_id column to state.db messages to restore exact-id matching.
2026-07-14 14:12:44 +00:00 · 2026-05-20 09:21:17 +02:00 · 2026-05-20 09:21:17 +02:00 · 971cfaa38c
commit 971cfaa38c
parent 024a8e3ee9
3 changed files with 37 additions and 12 deletions
--- a/gateway/platforms/yuanbao.py
+++ b/gateway/platforms/yuanbao.py
@@ -1410,19 +1410,19 @@ class RecallGuardMiddleware(InboundMiddleware):
            logger.warning("[%s] Recall: failed to resolve session: %s", adapter.name, exc)
            return

-        # Read JSONL directly — SQLite doesn't preserve message_id field.
-        transcript: list = []
+        # Load transcript from canonical store (state.db).
+        #
+        # Branch A1 below tries to match the recalled message by its platform
+        # `message_id`. state.db does NOT preserve `message_id` (only its own
+        # autoincrement primary key), so A1 will not match for any message
+        # persisted post-DB-canonical (i.e. all messages going forward). Recall
+        # falls through to A2 (content match) or B (system redaction note), both
+        # of which work DB-only.
+        #
+        # TODO: add a `platform_message_id` column to state.db messages to restore
+        # exact-id matching. Tracked separately.
        try:
-            path = store.get_transcript_path(sid)
-            if path.exists():
-                with open(path, "r", encoding="utf-8") as f:
-                    for line in f:
-                        line = line.strip()
-                        if line:
-                            try:
-                                transcript.append(json.loads(line))
-                            except json.JSONDecodeError:
-                                pass
+            transcript = store.load_transcript(sid)
        except Exception as exc:
            logger.warning("[%s] Recall: failed to load transcript: %s", adapter.name, exc)
            return
--- a/tests/gateway/platforms/__init__.py
+++ b/tests/gateway/platforms/__init__.py
--- a/tests/gateway/platforms/test_yuanbao_recall_db_only.py
+++ b/tests/gateway/platforms/test_yuanbao_recall_db_only.py
@@ -0,0 +1,25 @@
+"""Yuanbao recall: branch A2 (content-match) works without JSONL message_id."""
+from gateway.session import SessionStore
+from gateway.config import GatewayConfig
+
+
+def test_recall_falls_through_to_content_match_without_message_id(tmp_path):
+    """When transcript has no message_id field, A2 content-match still works."""
+    config = GatewayConfig()
+    store = SessionStore(sessions_dir=tmp_path, config=config)
+
+    sid = "test-yuanbao-recall"
+    store._db.create_session(session_id=sid, source="yuanbao:group:G")
+    store.append_to_transcript(sid, {"role": "user", "content": "sensitive content", "timestamp": 1.0})
+    store.append_to_transcript(sid, {"role": "assistant", "content": "ack", "timestamp": 2.0})
+
+    # The post-PR state: load_transcript returns DB-only, no message_id field.
+    history = store.load_transcript(sid)
+    assert all("message_id" not in msg for msg in history), \
+        "DB-only history should not carry message_id"
+
+    # Branch A2: content match should still find the message
+    target = next((m for m in history
+                   if m.get("role") == "user" and m.get("content") == "sensitive content"), None)
+    assert target is not None
+    # Caller would then redact: target["content"] = REDACTED; store.rewrite_transcript(sid, history)