feat(state.db): persist platform_message_id; restore yuanbao exact-id recall

PR #29211 dropped JSONL gateway transcripts and noted that the platform's own `message_id` field (used by Yuanbao's recall guard to redact a message by exact platform id) was no longer preserved — falling back to content-match. That fallback works for the common case but redacts the wrong row when two messages share text (or fails to match when content is post-processed). Restore exact-id matching by giving state.db a column for it: - New `platform_message_id TEXT` column on the messages table (SCHEMA_VERSION bump 11 → 12; column added via declarative reconciler on existing DBs, no version-gated migration block needed) - Partial index `idx_messages_platform_msg_id` on (session_id, platform_message_id) to keep recall's point-lookup cheap even on large sessions - `append_message()` and `replace_messages()` accept the new value: the gateway-facing `append_to_transcript` in `gateway/session.py` forwards either `message["platform_message_id"]` or the legacy `message["message_id"]` key (yuanbao's existing convention) - `get_messages_as_conversation()` surfaces the column back on the message dict as `message_id` so platform code reads the same shape it used to read from JSONL - Yuanbao `_patch_transcript`: restore branch A1 (exact id match) ahead of A2 (content match) ahead of B (system-note). Both branches log which one fired so operators can tell from gateway.log whether recall hit the canonical path or had to fall back. Tests: - New low-level round-trip tests in `test_hermes_state.py` for both `append_message` and `replace_messages` paths - The PR's `test_yuanbao_recall_db_only.py` was rewritten to assert the new contract: branch A1 (id match) works against DB-only transcripts, and branch A2 (content match) still recovers rows that were observed without a platform id (e.g. agent-processed @bot messages where run.py doesn't carry msg_id through)
2026-06-05 07:41:39 +00:00 · 2026-05-20 12:55:01 -07:00 · 2026-05-20 12:55:01 -07:00 · 31a0100104
commit 31a0100104
parent 0cc1a1d2d9
5 changed files with 185 additions and 38 deletions
--- a/hermes_state.py
+++ b/hermes_state.py
@ -33,7 +33,7 @@ T = TypeVar("T")

 DEFAULT_DB_PATH = get_hermes_home() / "state.db"

-SCHEMA_VERSION = 11
+SCHEMA_VERSION = 12

 # ---------------------------------------------------------------------------
 # WAL-compatibility fallback
@ -236,7 +236,8 @@ CREATE TABLE IF NOT EXISTS messages (
    reasoning_content TEXT,
    reasoning_details TEXT,
    codex_reasoning_items TEXT,
-    codex_message_items TEXT
+    codex_message_items TEXT,
+    platform_message_id TEXT
 );

 CREATE TABLE IF NOT EXISTS state_meta (
@ -571,6 +572,19 @@ class SessionDB:
        # column gets created here.
        self._reconcile_columns(cursor)

+        # Indexes that reference reconciler-added columns must be created
+        # AFTER _reconcile_columns runs — declaring them in SCHEMA_SQL
+        # makes the initial executescript fail on legacy DBs (the index's
+        # WHERE clause references a column that doesn't exist yet).
+        try:
+            cursor.execute(
+                "CREATE INDEX IF NOT EXISTS idx_messages_platform_msg_id "
+                "ON messages(session_id, platform_message_id) "
+                "WHERE platform_message_id IS NOT NULL"
+            )
+        except sqlite3.OperationalError as exc:
+            logger.debug("idx_messages_platform_msg_id create skipped: %s", exc)
+
        # ── Schema version bookkeeping ─────────────────────────────────
        # Bump to current so future data migrations (if any) can gate on
        # version.  No version-gated column additions remain.
@ -1445,12 +1459,19 @@ class SessionDB:
        reasoning_details: Any = None,
        codex_reasoning_items: Any = None,
        codex_message_items: Any = None,
+        platform_message_id: str = None,
    ) -> int:
        """
        Append a message to a session. Returns the message row ID.

        Also increments the session's message_count (and tool_call_count
        if role is 'tool' or tool_calls is present).
+
+        ``platform_message_id`` is the external messaging platform's own
+        message ID (e.g. Telegram update_id, Yuanbao msg_id).  It is
+        independent of the SQLite autoincrement primary key and is used by
+        platform-specific flows like yuanbao's recall guard to redact a
+        message by its platform-side identifier.
        """
        # Serialize structured fields to JSON before entering the write txn
        reasoning_details_json = (
@ -1480,8 +1501,8 @@ class SessionDB:
                """INSERT INTO messages (session_id, role, content, tool_call_id,
                   tool_calls, tool_name, timestamp, token_count, finish_reason,
                   reasoning, reasoning_content, reasoning_details, codex_reasoning_items,
-                   codex_message_items)
-                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                   codex_message_items, platform_message_id)
+                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    session_id,
                    role,
@ -1497,6 +1518,7 @@ class SessionDB:
                    reasoning_details_json,
                    codex_items_json,
                    codex_message_items_json,
+                    platform_message_id,
                ),
            )
            msg_id = cursor.lastrowid
@ -1558,13 +1580,18 @@ class SessionDB:
                    json.dumps(codex_message_items) if codex_message_items else None
                )
                tool_calls_json = json.dumps(tool_calls) if tool_calls else None
+                # Accept either `platform_message_id` (new explicit name) or
+                # `message_id` (yuanbao's existing convention on message dicts).
+                platform_msg_id = (
+                    msg.get("platform_message_id") or msg.get("message_id")
+                )

                conn.execute(
                    """INSERT INTO messages (session_id, role, content, tool_call_id,
                       tool_calls, tool_name, timestamp, token_count, finish_reason,
                       reasoning, reasoning_content, reasoning_details, codex_reasoning_items,
-                       codex_message_items)
-                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                       codex_message_items, platform_message_id)
+                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (
                        session_id,
                        role,
@ -1580,6 +1607,7 @@ class SessionDB:
                        reasoning_details_json,
                        codex_items_json,
                        codex_message_items_json,
+                        platform_msg_id,
                    ),
                )
                total_messages += 1
@ -1897,7 +1925,7 @@ class SessionDB:
            rows = self._conn.execute(
                "SELECT role, content, tool_call_id, tool_calls, tool_name, "
                "finish_reason, reasoning, reasoning_content, reasoning_details, "
-                "codex_reasoning_items, codex_message_items "
+                "codex_reasoning_items, codex_message_items, platform_message_id "
                f"FROM messages WHERE session_id IN ({placeholders}) ORDER BY id",
                tuple(session_ids),
            ).fetchall()
@ -1918,6 +1946,13 @@ class SessionDB:
                except (json.JSONDecodeError, TypeError):
                    logger.warning("Failed to deserialize tool_calls in conversation replay, falling back to []")
                    msg["tool_calls"] = []
+            # Surface the platform-side message id (e.g. yuanbao msg_id,
+            # telegram update_id) so platform-specific flows like recall
+            # can match by external identifier instead of having to fall
+            # back to content-match heuristics.  Exposed as ``message_id``
+            # for backward compatibility with the JSONL transcript shape.
+            if row["platform_message_id"]:
+                msg["message_id"] = row["platform_message_id"]
            # Restore reasoning fields on assistant messages so providers
            # that replay reasoning (OpenRouter, OpenAI, Nous) receive
            # coherent multi-turn reasoning context.