mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-05 07:41:39 +00:00
feat(state.db): persist platform_message_id; restore yuanbao exact-id recall
PR #29211 dropped JSONL gateway transcripts and noted that the platform's own `message_id` field (used by Yuanbao's recall guard to redact a message by exact platform id) was no longer preserved — falling back to content-match. That fallback works for the common case but redacts the wrong row when two messages share text (or fails to match when content is post-processed).

Restore exact-id matching by giving state.db a column for it:

- New `platform_message_id TEXT` column on the messages table (SCHEMA_VERSION bump 11 → 12; column added via declarative reconciler on existing DBs, no version-gated migration block needed)
- Partial index `idx_messages_platform_msg_id` on (session_id, platform_message_id) to keep recall's point-lookup cheap even on large sessions
- `append_message()` and `replace_messages()` accept the new value: the gateway-facing `append_to_transcript` in `gateway/session.py` forwards either `message["platform_message_id"]` or the legacy `message["message_id"]` key (yuanbao's existing convention)
- `get_messages_as_conversation()` surfaces the column back on the message dict as `message_id` so platform code reads the same shape it used to read from JSONL
- Yuanbao `_patch_transcript`: restore branch A1 (exact id match) ahead of A2 (content match) ahead of B (system-note). Both branches log which one fired so operators can tell from gateway.log whether recall hit the canonical path or had to fall back.

Tests:

- New low-level round-trip tests in `test_hermes_state.py` for both `append_message` and `replace_messages` paths
- The PR's `test_yuanbao_recall_db_only.py` was rewritten to assert the new contract: branch A1 (id match) works against DB-only transcripts, and branch A2 (content match) still recovers rows that were observed without a platform id (e.g. agent-processed @bot messages where run.py doesn't carry msg_id through)
This commit is contained in:
parent
0cc1a1d2d9
commit
31a0100104
5 changed files with 185 additions and 38 deletions
|
|
@ -33,7 +33,7 @@ T = TypeVar("T")
|
|||
|
||||
DEFAULT_DB_PATH = get_hermes_home() / "state.db"
|
||||
|
||||
SCHEMA_VERSION = 11
|
||||
SCHEMA_VERSION = 12
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# WAL-compatibility fallback
|
||||
|
|
@ -236,7 +236,8 @@ CREATE TABLE IF NOT EXISTS messages (
|
|||
reasoning_content TEXT,
|
||||
reasoning_details TEXT,
|
||||
codex_reasoning_items TEXT,
|
||||
codex_message_items TEXT
|
||||
codex_message_items TEXT,
|
||||
platform_message_id TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS state_meta (
|
||||
|
|
@ -571,6 +572,19 @@ class SessionDB:
|
|||
# column gets created here.
|
||||
self._reconcile_columns(cursor)
|
||||
|
||||
# Indexes that reference reconciler-added columns must be created
|
||||
# AFTER _reconcile_columns runs — declaring them in SCHEMA_SQL
|
||||
# makes the initial executescript fail on legacy DBs (the index's
|
||||
# WHERE clause references a column that doesn't exist yet).
|
||||
try:
|
||||
cursor.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_messages_platform_msg_id "
|
||||
"ON messages(session_id, platform_message_id) "
|
||||
"WHERE platform_message_id IS NOT NULL"
|
||||
)
|
||||
except sqlite3.OperationalError as exc:
|
||||
logger.debug("idx_messages_platform_msg_id create skipped: %s", exc)
|
||||
|
||||
# ── Schema version bookkeeping ─────────────────────────────────
|
||||
# Bump to current so future data migrations (if any) can gate on
|
||||
# version. No version-gated column additions remain.
|
||||
|
|
@ -1445,12 +1459,19 @@ class SessionDB:
|
|||
reasoning_details: Any = None,
|
||||
codex_reasoning_items: Any = None,
|
||||
codex_message_items: Any = None,
|
||||
platform_message_id: str = None,
|
||||
) -> int:
|
||||
"""
|
||||
Append a message to a session. Returns the message row ID.
|
||||
|
||||
Also increments the session's message_count (and tool_call_count
|
||||
if role is 'tool' or tool_calls is present).
|
||||
|
||||
``platform_message_id`` is the external messaging platform's own
|
||||
message ID (e.g. Telegram update_id, Yuanbao msg_id). It is
|
||||
independent of the SQLite autoincrement primary key and is used by
|
||||
platform-specific flows like yuanbao's recall guard to redact a
|
||||
message by its platform-side identifier.
|
||||
"""
|
||||
# Serialize structured fields to JSON before entering the write txn
|
||||
reasoning_details_json = (
|
||||
|
|
@ -1480,8 +1501,8 @@ class SessionDB:
|
|||
"""INSERT INTO messages (session_id, role, content, tool_call_id,
|
||||
tool_calls, tool_name, timestamp, token_count, finish_reason,
|
||||
reasoning, reasoning_content, reasoning_details, codex_reasoning_items,
|
||||
codex_message_items)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
codex_message_items, platform_message_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(
|
||||
session_id,
|
||||
role,
|
||||
|
|
@ -1497,6 +1518,7 @@ class SessionDB:
|
|||
reasoning_details_json,
|
||||
codex_items_json,
|
||||
codex_message_items_json,
|
||||
platform_message_id,
|
||||
),
|
||||
)
|
||||
msg_id = cursor.lastrowid
|
||||
|
|
@ -1558,13 +1580,18 @@ class SessionDB:
|
|||
json.dumps(codex_message_items) if codex_message_items else None
|
||||
)
|
||||
tool_calls_json = json.dumps(tool_calls) if tool_calls else None
|
||||
# Accept either `platform_message_id` (new explicit name) or
|
||||
# `message_id` (yuanbao's existing convention on message dicts).
|
||||
platform_msg_id = (
|
||||
msg.get("platform_message_id") or msg.get("message_id")
|
||||
)
|
||||
|
||||
conn.execute(
|
||||
"""INSERT INTO messages (session_id, role, content, tool_call_id,
|
||||
tool_calls, tool_name, timestamp, token_count, finish_reason,
|
||||
reasoning, reasoning_content, reasoning_details, codex_reasoning_items,
|
||||
codex_message_items)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
codex_message_items, platform_message_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(
|
||||
session_id,
|
||||
role,
|
||||
|
|
@ -1580,6 +1607,7 @@ class SessionDB:
|
|||
reasoning_details_json,
|
||||
codex_items_json,
|
||||
codex_message_items_json,
|
||||
platform_msg_id,
|
||||
),
|
||||
)
|
||||
total_messages += 1
|
||||
|
|
@ -1897,7 +1925,7 @@ class SessionDB:
|
|||
rows = self._conn.execute(
|
||||
"SELECT role, content, tool_call_id, tool_calls, tool_name, "
|
||||
"finish_reason, reasoning, reasoning_content, reasoning_details, "
|
||||
"codex_reasoning_items, codex_message_items "
|
||||
"codex_reasoning_items, codex_message_items, platform_message_id "
|
||||
f"FROM messages WHERE session_id IN ({placeholders}) ORDER BY id",
|
||||
tuple(session_ids),
|
||||
).fetchall()
|
||||
|
|
@ -1918,6 +1946,13 @@ class SessionDB:
|
|||
except (json.JSONDecodeError, TypeError):
|
||||
logger.warning("Failed to deserialize tool_calls in conversation replay, falling back to []")
|
||||
msg["tool_calls"] = []
|
||||
# Surface the platform-side message id (e.g. yuanbao msg_id,
|
||||
# telegram update_id) so platform-specific flows like recall
|
||||
# can match by external identifier instead of having to fall
|
||||
# back to content-match heuristics. Exposed as ``message_id``
|
||||
# for backward compatibility with the JSONL transcript shape.
|
||||
if row["platform_message_id"]:
|
||||
msg["message_id"] = row["platform_message_id"]
|
||||
# Restore reasoning fields on assistant messages so providers
|
||||
# that replay reasoning (OpenRouter, OpenAI, Nous) receive
|
||||
# coherent multi-turn reasoning context.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue