fix(compression): preserve goal, platform, and session indexing across rotation

Three state-loss bugs at the compression rotation boundary, fixed together
because they all live in the same ~80-line rotation block:

- #33618: a persistent /goal did not follow the rotation. load_goal does a
  flat per-session lookup with no lineage walk, so a goal silently died when
  compression minted a fresh child id. Added migrate_goal_to_session() and
  call it after the child session is created (move-not-copy: the parent row
  is archived as cleared so exactly one active goal row exists).

- #33906/#33907: if the child create_session raised (FK constraint,
  contended write), the outer handler only warned and let the agent continue
  on the NEW id — which has no row in state.db — producing an orphan session.
  Now the rotation rolls agent.session_id back to the still-indexed parent
  (reopening it) instead of stranding the conversation on a phantom id.

- #27633: the compaction-boundary on_session_start notification omitted the
  platform kwarg, so context-engine plugins saw source=unknown for every
  message after the boundary. Forward platform (matching the initial
  session-start call in agent_init.py).

Co-authored-by: denisqq <21260182+denisqq@users.noreply.github.com>
Co-authored-by: zccyman <16263913+zccyman@users.noreply.github.com>
Co-authored-by: liuhao1024 <sunsky.lau@gmail.com>
This commit is contained in:
teknium1 2026-06-20 12:28:37 -07:00 committed by Teknium
parent b4b512c507
commit 7ace96ba40
4 changed files with 277 additions and 8 deletions

View file

@ -592,14 +592,62 @@ def compress_context(
except Exception:
pass
agent._session_db_created = False
agent._session_db.create_session(
session_id=agent.session_id,
source=agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
model=agent.model,
model_config=agent._session_init_model_config,
parent_session_id=old_session_id,
)
try:
agent._session_db.create_session(
session_id=agent.session_id,
source=agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
model=agent.model,
model_config=agent._session_init_model_config,
parent_session_id=old_session_id,
)
except Exception as _cs_err:
# The child row could not be created (e.g. FK constraint,
# contended write). Previously the outer handler simply
# warned and let the agent continue on the NEW id — which
# has no row in state.db, producing an orphan: the parent
# is ended, the child is never indexed, and every
# subsequent message is attributed to a session that
# doesn't exist (#33906/#33907). Roll the live id back to
# the parent so the conversation stays attached to a real,
# indexed session instead of a phantom.
logger.warning(
"Compression child session create failed (%s) — "
"rolling back to parent session %s to avoid an orphan.",
_cs_err, old_session_id,
)
agent.session_id = old_session_id
try:
from gateway.session_context import set_current_session_id
set_current_session_id(agent.session_id)
except Exception:
os.environ["HERMES_SESSION_ID"] = agent.session_id
try:
from hermes_logging import set_session_context
set_session_context(agent.session_id)
except Exception:
pass
# Re-open the parent: it was ended above, but we're
# continuing on it, so it must not stay closed.
try:
agent._session_db.reopen_session(old_session_id)
except Exception:
pass
old_session_id = None # no rotation happened
# The parent row already exists in state.db, so mark the
# session as created — _ensure_db_session would otherwise
# retry a (harmless INSERT OR IGNORE) create next turn.
agent._session_db_created = True
raise
agent._session_db_created = True
# Carry a persistent /goal onto the continuation session.
# Compression mints a fresh child id; load_goal does a flat
# per-session lookup with no parent walk, so without this an
# active goal silently dies at the boundary (#33618).
try:
from hermes_cli.goals import migrate_goal_to_session
migrate_goal_to_session(old_session_id, agent.session_id, reason="compression")
except Exception as _goal_err:
logger.debug("Could not migrate goal on compression: %s", _goal_err)
# Auto-number the title for the continuation session
if old_title:
try:
@ -615,7 +663,18 @@ def compress_context(
agent._session_db.update_system_prompt(agent.session_id, new_system_prompt)
agent._last_flushed_db_idx = 0
except Exception as e:
logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
# If the rotation rolled back to the parent (orphan-avoidance
# above), agent.session_id is the still-indexed parent and
# old_session_id was cleared — so this is recovery, not an
# un-indexed orphan. Otherwise an earlier step failed before the
# child was created and the warning's original meaning holds.
if locals().get("old_session_id") is None and not in_place:
logger.warning(
"Compression rotation aborted and rolled back to the "
"parent session (%s): %s", agent.session_id or "?", e,
)
else:
logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
# Compaction-boundary bookkeeping, computed once. `old_session_id` is only
# bound in the rotation branch; in-place leaves it unset. `_boundary_parent`
@ -637,6 +696,7 @@ def compress_context(
agent.session_id or "",
boundary_reason="compression",
old_session_id=_boundary_parent,
platform=getattr(agent, "platform", None) or "cli",
conversation_id=getattr(agent, "_gateway_session_key", None),
)
except Exception as _ce_err:

View file

@ -279,6 +279,44 @@ def clear_goal(session_id: str) -> None:
save_goal(session_id, state)
def migrate_goal_to_session(old_session_id: str, new_session_id: str, *, reason: str = "") -> bool:
"""Carry a persistent /goal from a parent session to its continuation.
Context compression rotates ``session_id`` to a fresh child session,
but ``load_goal`` does a flat ``goal:<session_id>`` lookup with no
parent-lineage walk so an active goal silently dies at the
compaction boundary (#33618). Copy the goal onto the new session and
archive the old row as ``cleared`` so exactly one active goal row
exists per logical conversation (avoids the "two active goals"
hazard of a pure copy).
Returns True when a goal was migrated, False when there was nothing
to migrate or the DB was unavailable. Best-effort and never raises
a failure here must not block compression.
"""
if not old_session_id or not new_session_id or old_session_id == new_session_id:
return False
try:
state = load_goal(old_session_id)
if state is None or getattr(state, "status", None) == "cleared":
return False
# Don't clobber a goal already set on the child (e.g. a resumed
# lineage that re-established its own goal).
if load_goal(new_session_id) is not None:
return False
save_goal(new_session_id, state)
# Archive the parent's row so it isn't double-counted as active.
clear_goal(old_session_id)
logger.debug(
"GoalManager: migrated goal %s -> %s (%s)",
old_session_id, new_session_id, reason or "rotation",
)
return True
except Exception as exc: # pragma: no cover - defensive
logger.debug("GoalManager: goal migration failed: %s", exc)
return False
# ──────────────────────────────────────────────────────────────────────
# Judge
# ──────────────────────────────────────────────────────────────────────
@ -907,6 +945,7 @@ __all__ = [
"load_goal",
"save_goal",
"clear_goal",
"migrate_goal_to_session",
"judge_goal",
"run_kanban_goal_loop",
]

View file

@ -0,0 +1,129 @@
"""Compression rotation hardening — state-loss fixes at the compaction boundary.
When auto-compression rotates ``agent.session_id`` to a continuation child,
three pieces of state used to be lost or corrupted:
* #33618 — a persistent ``/goal`` did not follow the rotation (``load_goal``
is a flat per-session lookup with no lineage walk), so it silently died.
* #33906/#33907 — if the child ``create_session`` raised, the outer handler
only warned and let the agent continue on the NEW (un-indexed) id,
producing an orphan session missing from state.db.
* #27633 — the compaction-boundary ``on_session_start`` notification omitted
the ``platform`` kwarg, so context-engine plugins saw ``source=unknown``
for every message after the boundary.
These tests drive the real ``compress_context`` path against a real SessionDB.
"""
from __future__ import annotations
import os
from pathlib import Path
from unittest.mock import MagicMock, patch
from hermes_state import SessionDB
def _build_agent_with_db(db: SessionDB, session_id: str, platform: str = "telegram"):
with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
from run_agent import AIAgent
agent = AIAgent(
api_key="test-key",
base_url="https://openrouter.ai/api/v1",
model="test/model",
platform=platform,
quiet_mode=True,
session_db=db,
session_id=session_id,
skip_context_files=True,
skip_memory=True,
)
compressor = MagicMock()
compressor.compress.return_value = [
{"role": "user", "content": "[CONTEXT COMPACTION] summary"},
{"role": "user", "content": "tail"},
]
compressor.compression_count = 1
compressor.last_prompt_tokens = 0
compressor.last_completion_tokens = 0
compressor._last_summary_error = None
compressor._last_compress_aborted = False
compressor._last_summary_auth_failure = False
compressor._last_aux_model_failure_model = None
compressor._last_aux_model_failure_error = None
agent.context_compressor = compressor
return agent
def _msgs(n=20):
return [{"role": "user", "content": f"m{i}"} for i in range(n)]
class TestGoalMigratesOnRotation:
def test_goal_follows_compression_rotation(self, tmp_path: Path):
db = SessionDB(db_path=tmp_path / "state.db")
parent = "PARENT_GOAL_ROT"
db.create_session(parent, source="cli")
agent = _build_agent_with_db(db, parent)
# Set a persistent goal on the parent via the real persistence path.
with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path / ".hermes")}):
(tmp_path / ".hermes").mkdir(exist_ok=True)
import hermes_cli.goals as goals
goals._DB_CACHE.clear()
# Point the goal DB at the same state.db the agent uses.
with patch.object(goals, "_get_session_db", return_value=db):
goals.save_goal(parent, goals.GoalState(goal="finish the migration"))
agent._compress_context(_msgs(), "sys", approx_tokens=120_000)
child = agent.session_id
assert child != parent # rotation happened
migrated = goals.load_goal(child)
assert migrated is not None
assert migrated.goal == "finish the migration"
goals._DB_CACHE.clear()
class TestOrphanRollbackOnCreateFailure:
def test_rolls_back_to_parent_when_child_create_fails(self, tmp_path: Path):
db = SessionDB(db_path=tmp_path / "state.db")
parent = "PARENT_ORPHAN_ROT"
db.create_session(parent, source="cli")
agent = _build_agent_with_db(db, parent)
# Make the CHILD create_session raise, but let the initial parent
# end_session/reopen work. We patch create_session to blow up.
real_create = db.create_session
def _boom(*a, **k):
raise RuntimeError("FOREIGN KEY constraint failed")
with patch.object(db, "create_session", side_effect=_boom):
agent._compress_context(_msgs(), "sys", approx_tokens=120_000)
# The live id must roll back to the still-indexed parent — NOT a
# phantom child id that has no row in state.db.
assert agent.session_id == parent
assert db.get_session(parent) is not None
_ = real_create # silence unused
class TestPlatformForwardedAtBoundary:
def test_on_session_start_receives_platform(self, tmp_path: Path):
db = SessionDB(db_path=tmp_path / "state.db")
parent = "PARENT_PLATFORM_ROT"
db.create_session(parent, source="telegram")
agent = _build_agent_with_db(db, parent, platform="telegram")
agent._compress_context(_msgs(), "sys", approx_tokens=120_000)
# The boundary notify must forward the platform so context-engine
# plugins don't fall back to source=unknown (#27633).
calls = [c for c in agent.context_compressor.on_session_start.call_args_list]
assert calls, "on_session_start was not called at the boundary"
kwargs = calls[-1].kwargs
assert kwargs.get("platform") == "telegram"
assert kwargs.get("boundary_reason") == "compression"

View file

@ -547,6 +547,47 @@ class TestGoalStateSubgoalsBackcompat:
assert rt.subgoals == ["a", "b", "c"]
class TestMigrateGoalToSession:
"""migrate_goal_to_session carries a /goal from a parent session to its
compression continuation child (#33618). load_goal does a flat
per-session lookup with no lineage walk, so without migration an active
goal silently dies when compression rotates session_id."""
def test_migrates_active_goal_to_child(self, hermes_home):
from hermes_cli.goals import save_goal, load_goal, migrate_goal_to_session, GoalState
save_goal("parent-sid", GoalState(goal="ship the feature"))
assert migrate_goal_to_session("parent-sid", "child-sid", reason="compression") is True
child = load_goal("child-sid")
assert child is not None and child.goal == "ship the feature"
# Parent row archived (cleared) so only the child is active.
parent = load_goal("parent-sid")
assert parent is not None and parent.status == "cleared"
def test_no_goal_to_migrate_returns_false(self, hermes_home):
from hermes_cli.goals import migrate_goal_to_session, load_goal
assert migrate_goal_to_session("empty-parent", "child2") is False
assert load_goal("child2") is None
def test_does_not_clobber_existing_child_goal(self, hermes_home):
from hermes_cli.goals import save_goal, load_goal, migrate_goal_to_session, GoalState
save_goal("p3", GoalState(goal="parent goal"))
save_goal("c3", GoalState(goal="child already has one"))
assert migrate_goal_to_session("p3", "c3") is False
assert load_goal("c3").goal == "child already has one"
def test_same_id_is_noop(self, hermes_home):
from hermes_cli.goals import save_goal, migrate_goal_to_session, GoalState
save_goal("same", GoalState(goal="g"))
assert migrate_goal_to_session("same", "same") is False
def test_cleared_goal_not_migrated(self, hermes_home):
from hermes_cli.goals import save_goal, clear_goal, migrate_goal_to_session, load_goal, GoalState
save_goal("p4", GoalState(goal="done already"))
clear_goal("p4")
assert migrate_goal_to_session("p4", "c4") is False
assert load_goal("c4") is None
class TestGoalManagerSubgoals:
def test_add_subgoal(self, hermes_home):
from hermes_cli.goals import GoalManager