mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-04 07:31:58 +00:00
feat(state.db): persist platform_message_id; restore yuanbao exact-id recall
PR #29211 dropped JSONL gateway transcripts and noted that the platform's own `message_id` field (used by Yuanbao's recall guard to redact a message by exact platform id) was no longer preserved — falling back to content-match. That fallback works for the common case but redacts the wrong row when two messages share text (or fails to match when content is post-processed).

Restore exact-id matching by giving state.db a column for it:

- New `platform_message_id TEXT` column on the messages table (SCHEMA_VERSION bump 11 → 12; column added via declarative reconciler on existing DBs, no version-gated migration block needed)
- Partial index `idx_messages_platform_msg_id` on (session_id, platform_message_id) to keep recall's point-lookup cheap even on large sessions
- `append_message()` and `replace_messages()` accept the new value: the gateway-facing `append_to_transcript` in `gateway/session.py` forwards either `message["platform_message_id"]` or the legacy `message["message_id"]` key (yuanbao's existing convention)
- `get_messages_as_conversation()` surfaces the column back on the message dict as `message_id` so platform code reads the same shape it used to read from JSONL
- Yuanbao `_patch_transcript`: restore branch A1 (exact id match) ahead of A2 (content match) ahead of B (system-note). Both branches log which one fired so operators can tell from gateway.log whether recall hit the canonical path or had to fall back.

Tests:

- New low-level round-trip tests in `test_hermes_state.py` for both `append_message` and `replace_messages` paths
- The PR's `test_yuanbao_recall_db_only.py` was rewritten to assert the new contract: branch A1 (id match) works against DB-only transcripts, and branch A2 (content match) still recovers rows that were observed without a platform id (e.g. agent-processed @bot messages where run.py doesn't carry msg_id through)
This commit is contained in:
parent
0cc1a1d2d9
commit
31a0100104
5 changed files with 185 additions and 38 deletions
|
|
@ -1410,33 +1410,43 @@ class RecallGuardMiddleware(InboundMiddleware):
|
|||
logger.warning("[%s] Recall: failed to resolve session: %s", adapter.name, exc)
|
||||
return
|
||||
|
||||
# Load transcript from canonical store (state.db). See Branch A below
|
||||
# for why we can no longer match by platform `message_id`.
|
||||
# Load transcript from canonical store (state.db). Since PR #29278
|
||||
# added a ``platform_message_id`` column to the messages table and
|
||||
# ``append_to_transcript`` wires the incoming dict's ``message_id``
|
||||
# into it, ``load_transcript`` returns rows with ``message_id`` set
|
||||
# for any message that was observed with one — Branch A1 (exact id
|
||||
# match) is the canonical path again.
|
||||
try:
|
||||
transcript = store.load_transcript(sid)
|
||||
except Exception as exc:
|
||||
logger.warning("[%s] Recall: failed to load transcript: %s", adapter.name, exc)
|
||||
return
|
||||
|
||||
# Branch A: content-match redaction. state.db does NOT preserve the
|
||||
# platform `message_id` (only its own autoincrement primary key), so we
|
||||
# cannot redact by exact id. Match by content instead. Most yuanbao
|
||||
# recalls carry the recalled text via `recalled_content`, which is
|
||||
# sufficient for any non-duplicate message.
|
||||
#
|
||||
# TODO: add a `platform_message_id` column to state.db messages to
|
||||
# restore exact-id matching. Tracked separately.
|
||||
# Branch A1: exact platform message_id match. Authoritative when the
|
||||
# row was persisted with a platform_message_id (observed group
|
||||
# messages and any inbound message whose adapter carried a msg_id).
|
||||
target = None
|
||||
if recalled_content:
|
||||
branch_label = ""
|
||||
for entry in transcript:
|
||||
if entry.get("message_id") == recalled_id:
|
||||
target = entry
|
||||
branch_label = "branch A1: id match"
|
||||
break
|
||||
# Branch A2: content-match fallback for messages that lack an exact
|
||||
# platform id on the row — e.g. agent-processed @bot messages
|
||||
# (run.py doesn't carry msg_id through) or older rows persisted
|
||||
# before the platform_message_id column existed.
|
||||
if target is None and recalled_content:
|
||||
for entry in transcript:
|
||||
if entry.get("role") == "user" and entry.get("content") == recalled_content:
|
||||
target = entry
|
||||
branch_label = "branch A2: content match"
|
||||
break
|
||||
if target is not None:
|
||||
target["content"] = cls._REDACTED
|
||||
try:
|
||||
store.rewrite_transcript(sid, transcript)
|
||||
logger.info("[%s] Recall: redacted msg_id=%s (branch A: content match)", adapter.name, recalled_id)
|
||||
logger.info("[%s] Recall: redacted msg_id=%s (%s)", adapter.name, recalled_id, branch_label)
|
||||
except Exception as exc:
|
||||
logger.warning("[%s] Recall: rewrite_transcript failed: %s", adapter.name, exc)
|
||||
return
|
||||
|
|
|
|||
|
|
@ -1271,6 +1271,12 @@ class SessionStore:
|
|||
reasoning_details=message.get("reasoning_details") if message.get("role") == "assistant" else None,
|
||||
codex_reasoning_items=message.get("codex_reasoning_items") if message.get("role") == "assistant" else None,
|
||||
codex_message_items=message.get("codex_message_items") if message.get("role") == "assistant" else None,
|
||||
# Platform-side message id (yuanbao msg_id, telegram update_id, …).
|
||||
# Accept either explicit ``platform_message_id`` or the legacy
|
||||
# ``message_id`` key the JSONL transcript used.
|
||||
platform_message_id=(
|
||||
message.get("platform_message_id") or message.get("message_id")
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Session DB operation failed: %s", e)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue