From 971cfaa38c6dc048be508cb4707a5f25c6087dfa Mon Sep 17 00:00:00 2001 From: yoniebans Date: Wed, 20 May 2026 09:21:17 +0200 Subject: [PATCH] refactor(yuanbao): migrate recall to load_transcript() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yuanbao's recall feature was reading the gateway JSONL directly to look up messages by platform message_id, which state.db does not preserve. Migrated to use load_transcript() which returns DB messages. Recall branch A1 (message_id match) now falls through to A2 (content match) or B (system note) for all sessions — a documented degradation. Follow-up issue: add platform_message_id column to state.db messages to restore exact-id matching. --- gateway/platforms/yuanbao.py | 24 +++++++++--------- tests/gateway/platforms/__init__.py | 0 .../platforms/test_yuanbao_recall_db_only.py | 25 +++++++++++++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 tests/gateway/platforms/__init__.py create mode 100644 tests/gateway/platforms/test_yuanbao_recall_db_only.py diff --git a/gateway/platforms/yuanbao.py b/gateway/platforms/yuanbao.py index 7015e0c848c..aed6717bd36 100644 --- a/gateway/platforms/yuanbao.py +++ b/gateway/platforms/yuanbao.py @@ -1410,19 +1410,19 @@ class RecallGuardMiddleware(InboundMiddleware): logger.warning("[%s] Recall: failed to resolve session: %s", adapter.name, exc) return - # Read JSONL directly — SQLite doesn't preserve message_id field. - transcript: list = [] + # Load transcript from canonical store (state.db). + # + # Branch A1 below tries to match the recalled message by its platform + # `message_id`. state.db does NOT preserve `message_id` (only its own + # autoincrement primary key), so A1 will not match for any message + # persisted post-DB-canonical (i.e. all messages going forward). Recall + # falls through to A2 (content match) or B (system redaction note), both + # of which work DB-only. + # + # TODO: add a `platform_message_id` column to state.db messages to restore + # exact-id matching. Tracked separately. try: - path = store.get_transcript_path(sid) - if path.exists(): - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if line: - try: - transcript.append(json.loads(line)) - except json.JSONDecodeError: - pass + transcript = store.load_transcript(sid) except Exception as exc: logger.warning("[%s] Recall: failed to load transcript: %s", adapter.name, exc) return diff --git a/tests/gateway/platforms/__init__.py b/tests/gateway/platforms/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/gateway/platforms/test_yuanbao_recall_db_only.py b/tests/gateway/platforms/test_yuanbao_recall_db_only.py new file mode 100644 index 00000000000..6186df6787a --- /dev/null +++ b/tests/gateway/platforms/test_yuanbao_recall_db_only.py @@ -0,0 +1,25 @@ +"""Yuanbao recall: branch A2 (content-match) works without JSONL message_id.""" +from gateway.session import SessionStore +from gateway.config import GatewayConfig + + +def test_recall_falls_through_to_content_match_without_message_id(tmp_path): + """When transcript has no message_id field, A2 content-match still works.""" + config = GatewayConfig() + store = SessionStore(sessions_dir=tmp_path, config=config) + + sid = "test-yuanbao-recall" + store._db.create_session(session_id=sid, source="yuanbao:group:G") + store.append_to_transcript(sid, {"role": "user", "content": "sensitive content", "timestamp": 1.0}) + store.append_to_transcript(sid, {"role": "assistant", "content": "ack", "timestamp": 2.0}) + + # The post-PR state: load_transcript returns DB-only, no message_id field. + history = store.load_transcript(sid) + assert all("message_id" not in msg for msg in history), \ + "DB-only history should not carry message_id" + + # Branch A2: content match should still find the message + target = next((m for m in history + if m.get("role") == "user" and m.get("content") == "sensitive content"), None) + assert target is not None + # Caller would then redact: target["content"] = REDACTED; store.rewrite_transcript(sid, history)