fix(gateway): keep Telegram topic bindings aligned with compression children (#34409)

Telegram DM topic bindings persist (chat_id, thread_id) -> session_id in
SQLite so reopening a topic resumes the right Hermes session. When
compression rotated session_entry.session_id mid-turn, the binding row
stayed pointed at the pre-compression parent. On the next inbound
message in that topic the gateway reloaded the oversized parent
transcript, retriggering preflight compression — sometimes in a loop.

Two-pronged fix:

1. `_sync_telegram_topic_binding(source, entry, *, reason)` helper
   called immediately after each of the three session_id rotation sites
   in _handle_message_with_agent (hygiene compression, agent-result
   compression rotation, /compress command). Keeps future bindings
   fresh.

2. Read-path self-heal: when resolving an existing topic binding, walk
   SessionDB.get_compression_tip() forward and switch_session to the
   descendant instead of the stored parent. Rewrites the binding row to
   the tip so subsequent messages skip the walk. Heals existing stale
   state on the next user message without requiring a gateway restart.

Skipped from competing PRs as not load-bearing for the bug:
- advance_session_after_compression SessionStore primitive (#26204/
  #28870/#33416) — preserves end_reason='compression' for analytics,
  which is nice to have but doesn't affect routing correctness.
- Cached-agent eviction on session_id mismatch — _compress_context()
  already mutates tmp_agent.session_id on the cached object so the
  in-memory agent self-corrects.
- Startup repair pass (#33416) — redundant once the read path heals on
  the next message; a one-line CLI follow-up can address bindings for
  topics that users never reopen.

Closes #20470, #29712, #33414. Acknowledges work in #23195
(@litvinovvo), #26204 (@bizyumov), #28870 (@donrhmexe), #29713
(@hehehe0803), #29945 (@eugeneb1ack), #33416 (@bizyumov).
This commit is contained in:
Teknium 2026-05-28 23:25:52 -07:00 committed by GitHub
parent ec7736f8a7
commit db96fc60d0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 150 additions and 0 deletions

View file

@ -2303,6 +2303,32 @@ class GatewayRunner:
session_id=session_entry.session_id,
)
def _sync_telegram_topic_binding(
    self,
    source: SessionSource,
    session_entry,
    *,
    reason: str,
) -> None:
    """Re-record the Telegram topic binding for ``session_entry``.

    A Telegram topic lane keeps a (chat_id, thread_id) -> session_id row so
    that reopening the topic in a fresh process resumes the matching Hermes
    session. If compression swaps ``session_entry.session_id`` mid-turn and
    the row is not refreshed, the next inbound message in that topic loads
    the oversized parent transcript rather than the compressed child, which
    retriggers preflight compression -- sometimes in a loop
    (#20470, #29712, #33414).

    Does nothing when ``source`` is not a Telegram topic lane. The refresh
    is best-effort: any exception raised while recording the binding is
    logged at debug level (tagged with ``reason``) and not propagated.

    Args:
        source: Origin of the message whose topic binding should be synced.
        session_entry: Session entry handed to the binding recorder.
        reason: Short label for the call site; only used in the failure log.
    """
    # The lane check stays outside the try block on purpose: only failures
    # of the binding write itself are swallowed.
    if self._is_telegram_topic_lane(source):
        try:
            self._record_telegram_topic_binding(source, session_entry)
        except Exception:
            logger.debug(
                "telegram topic binding refresh failed (%s)",
                reason,
                exc_info=True,
            )
def _recover_telegram_topic_thread_id(
self,
source: SessionSource,
@ -8279,6 +8305,28 @@ class GatewayRunner:
binding = None
if binding:
bound_session_id = str(binding.get("session_id") or "")
# Heal bindings that point at a pre-compression parent: walk
# the compression-continuation chain forward to its tip so the
# next message resumes the compressed child instead of
# reloading the oversized parent transcript (#20470/#29712/
# #33414). Returns the input unchanged when the session isn't
# a compression parent, so this is cheap and safe.
if bound_session_id and self._session_db is not None:
try:
canonical_session_id = self._session_db.get_compression_tip(
bound_session_id,
)
except Exception:
logger.debug(
"compression-tip lookup failed for %s",
bound_session_id, exc_info=True,
)
canonical_session_id = bound_session_id
if (
canonical_session_id
and canonical_session_id != bound_session_id
):
bound_session_id = canonical_session_id
if bound_session_id and bound_session_id != session_entry.session_id:
# Route the override through SessionStore so the session_key
# → session_id mapping is persisted to disk and the previous
@ -8288,6 +8336,15 @@ class GatewayRunner:
switched = self.session_store.switch_session(session_key, bound_session_id)
if switched is not None:
session_entry = switched
# If the stored binding pointed at a parent, rewrite it to the
# canonical descendant now that we've followed the chain.
if (
bound_session_id
and bound_session_id != str(binding.get("session_id") or "")
):
self._sync_telegram_topic_binding(
source, session_entry, reason="compression-tip-walk",
)
else:
try:
self._record_telegram_topic_binding(source, session_entry)
@ -8664,6 +8721,10 @@ class GatewayRunner:
if _hyg_new_sid != session_entry.session_id:
session_entry.session_id = _hyg_new_sid
self.session_store._save()
self._sync_telegram_topic_binding(
source, session_entry,
reason="hygiene-compression",
)
self.session_store.rewrite_transcript(
session_entry.session_id, _compressed
@ -8929,6 +8990,9 @@ class GatewayRunner:
if agent_result.get("session_id") and agent_result["session_id"] != session_entry.session_id:
session_entry.session_id = agent_result["session_id"]
self.session_store._save()
self._sync_telegram_topic_binding(
source, session_entry, reason="agent-result-compression",
)
# Prepend reasoning/thinking if display is enabled (per-platform)
try:
@ -12373,6 +12437,9 @@ class GatewayRunner:
if new_session_id != session_entry.session_id:
session_entry.session_id = new_session_id
self.session_store._save()
self._sync_telegram_topic_binding(
source, session_entry, reason="compress-command",
)
self.session_store.rewrite_transcript(new_session_id, compressed)
# Reset stored token count — transcript changed, old value is stale