diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 610f0ac5ac6..93055f6402f 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -592,14 +592,62 @@ def compress_context( except Exception: pass agent._session_db_created = False - agent._session_db.create_session( - session_id=agent.session_id, - source=agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"), - model=agent.model, - model_config=agent._session_init_model_config, - parent_session_id=old_session_id, - ) + try: + agent._session_db.create_session( + session_id=agent.session_id, + source=agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"), + model=agent.model, + model_config=agent._session_init_model_config, + parent_session_id=old_session_id, + ) + except Exception as _cs_err: + # The child row could not be created (e.g. FK constraint, + # contended write). Previously the outer handler simply + # warned and let the agent continue on the NEW id — which + # has no row in state.db, producing an orphan: the parent + # is ended, the child is never indexed, and every + # subsequent message is attributed to a session that + # doesn't exist (#33906/#33907). Roll the live id back to + # the parent so the conversation stays attached to a real, + # indexed session instead of a phantom. + logger.warning( + "Compression child session create failed (%s) — " + "rolling back to parent session %s to avoid an orphan.", + _cs_err, old_session_id, + ) + agent.session_id = old_session_id + try: + from gateway.session_context import set_current_session_id + set_current_session_id(agent.session_id) + except Exception: + os.environ["HERMES_SESSION_ID"] = agent.session_id + try: + from hermes_logging import set_session_context + set_session_context(agent.session_id) + except Exception: + pass + # Re-open the parent: it was ended above, but we're + # continuing on it, so it must not stay closed. + try: + agent._session_db.reopen_session(old_session_id) + except Exception: + pass + old_session_id = None # no rotation happened + # The parent row already exists in state.db, so mark the + # session as created — _ensure_db_session would otherwise + # retry a (harmless INSERT OR IGNORE) create next turn. + agent._session_db_created = True + raise agent._session_db_created = True + # Carry a persistent /goal onto the continuation session. + # Compression mints a fresh child id; load_goal does a flat + # per-session lookup with no parent walk, so without this an + # active goal silently dies at the boundary (#33618). + try: + from hermes_cli.goals import migrate_goal_to_session + migrate_goal_to_session(old_session_id, agent.session_id, reason="compression") + except Exception as _goal_err: + logger.debug("Could not migrate goal on compression: %s", _goal_err) # Auto-number the title for the continuation session if old_title: try: @@ -615,7 +663,18 @@ def compress_context( agent._session_db.update_system_prompt(agent.session_id, new_system_prompt) agent._last_flushed_db_idx = 0 except Exception as e: - logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e) + # If the rotation rolled back to the parent (orphan-avoidance + # above), agent.session_id is the still-indexed parent and + # old_session_id was cleared — so this is recovery, not an + # un-indexed orphan. Otherwise an earlier step failed before the + # child was created and the warning's original meaning holds. + if locals().get("old_session_id") is None and not in_place: + logger.warning( + "Compression rotation aborted and rolled back to the " + "parent session (%s): %s", agent.session_id or "?", e, + ) + else: + logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e) # Compaction-boundary bookkeeping, computed once. `old_session_id` is only # bound in the rotation branch; in-place leaves it unset. `_boundary_parent` @@ -637,6 +696,7 @@ def compress_context( agent.session_id or "", boundary_reason="compression", old_session_id=_boundary_parent, + platform=getattr(agent, "platform", None) or "cli", conversation_id=getattr(agent, "_gateway_session_key", None), ) except Exception as _ce_err: diff --git a/hermes_cli/goals.py b/hermes_cli/goals.py index a6a28deaf95..8359466e3a0 100644 --- a/hermes_cli/goals.py +++ b/hermes_cli/goals.py @@ -279,6 +279,44 @@ def clear_goal(session_id: str) -> None: save_goal(session_id, state) +def migrate_goal_to_session(old_session_id: str, new_session_id: str, *, reason: str = "") -> bool: + """Carry a persistent /goal from a parent session to its continuation. + + Context compression rotates ``session_id`` to a fresh child session, + but ``load_goal`` does a flat ``goal:`` lookup with no + parent-lineage walk — so an active goal silently dies at the + compaction boundary (#33618). Copy the goal onto the new session and + archive the old row as ``cleared`` so exactly one active goal row + exists per logical conversation (avoids the "two active goals" + hazard of a pure copy). + + Returns True when a goal was migrated, False when there was nothing + to migrate or the DB was unavailable. Best-effort and never raises — + a failure here must not block compression. + """ + if not old_session_id or not new_session_id or old_session_id == new_session_id: + return False + try: + state = load_goal(old_session_id) + if state is None or getattr(state, "status", None) == "cleared": + return False + # Don't clobber a goal already set on the child (e.g. a resumed + # lineage that re-established its own goal). + if load_goal(new_session_id) is not None: + return False + save_goal(new_session_id, state) + # Archive the parent's row so it isn't double-counted as active. + clear_goal(old_session_id) + logger.debug( + "GoalManager: migrated goal %s -> %s (%s)", + old_session_id, new_session_id, reason or "rotation", + ) + return True + except Exception as exc: # pragma: no cover - defensive + logger.debug("GoalManager: goal migration failed: %s", exc) + return False + + # ────────────────────────────────────────────────────────────────────── # Judge # ────────────────────────────────────────────────────────────────────── @@ -907,6 +945,7 @@ __all__ = [ "load_goal", "save_goal", "clear_goal", + "migrate_goal_to_session", "judge_goal", "run_kanban_goal_loop", ] diff --git a/tests/agent/test_compression_rotation_state.py b/tests/agent/test_compression_rotation_state.py new file mode 100644 index 00000000000..510c485182a --- /dev/null +++ b/tests/agent/test_compression_rotation_state.py @@ -0,0 +1,129 @@ +"""Compression rotation hardening — state-loss fixes at the compaction boundary. + +When auto-compression rotates ``agent.session_id`` to a continuation child, +three pieces of state used to be lost or corrupted: + + * #33618 — a persistent ``/goal`` did not follow the rotation (``load_goal`` + is a flat per-session lookup with no lineage walk), so it silently died. + * #33906/#33907 — if the child ``create_session`` raised, the outer handler + only warned and let the agent continue on the NEW (un-indexed) id, + producing an orphan session missing from state.db. + * #27633 — the compaction-boundary ``on_session_start`` notification omitted + the ``platform`` kwarg, so context-engine plugins saw ``source=unknown`` + for every message after the boundary. + +These tests drive the real ``compress_context`` path against a real SessionDB. +""" + +from __future__ import annotations + +import os +from pathlib import Path +from unittest.mock import MagicMock, patch + +from hermes_state import SessionDB + + +def _build_agent_with_db(db: SessionDB, session_id: str, platform: str = "telegram"): + with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}): + from run_agent import AIAgent + + agent = AIAgent( + api_key="test-key", + base_url="https://openrouter.ai/api/v1", + model="test/model", + platform=platform, + quiet_mode=True, + session_db=db, + session_id=session_id, + skip_context_files=True, + skip_memory=True, + ) + + compressor = MagicMock() + compressor.compress.return_value = [ + {"role": "user", "content": "[CONTEXT COMPACTION] summary"}, + {"role": "user", "content": "tail"}, + ] + compressor.compression_count = 1 + compressor.last_prompt_tokens = 0 + compressor.last_completion_tokens = 0 + compressor._last_summary_error = None + compressor._last_compress_aborted = False + compressor._last_summary_auth_failure = False + compressor._last_aux_model_failure_model = None + compressor._last_aux_model_failure_error = None + agent.context_compressor = compressor + return agent + + +def _msgs(n=20): + return [{"role": "user", "content": f"m{i}"} for i in range(n)] + + +class TestGoalMigratesOnRotation: + def test_goal_follows_compression_rotation(self, tmp_path: Path): + db = SessionDB(db_path=tmp_path / "state.db") + parent = "PARENT_GOAL_ROT" + db.create_session(parent, source="cli") + agent = _build_agent_with_db(db, parent) + + # Set a persistent goal on the parent via the real persistence path. + with patch.dict(os.environ, {"HERMES_HOME": str(tmp_path / ".hermes")}): + (tmp_path / ".hermes").mkdir(exist_ok=True) + import hermes_cli.goals as goals + goals._DB_CACHE.clear() + # Point the goal DB at the same state.db the agent uses. + with patch.object(goals, "_get_session_db", return_value=db): + goals.save_goal(parent, goals.GoalState(goal="finish the migration")) + + agent._compress_context(_msgs(), "sys", approx_tokens=120_000) + child = agent.session_id + assert child != parent # rotation happened + + migrated = goals.load_goal(child) + assert migrated is not None + assert migrated.goal == "finish the migration" + goals._DB_CACHE.clear() + + +class TestOrphanRollbackOnCreateFailure: + def test_rolls_back_to_parent_when_child_create_fails(self, tmp_path: Path): + db = SessionDB(db_path=tmp_path / "state.db") + parent = "PARENT_ORPHAN_ROT" + db.create_session(parent, source="cli") + agent = _build_agent_with_db(db, parent) + + # Make the CHILD create_session raise, but let the initial parent + # end_session/reopen work. We patch create_session to blow up. + real_create = db.create_session + + def _boom(*a, **k): + raise RuntimeError("FOREIGN KEY constraint failed") + + with patch.object(db, "create_session", side_effect=_boom): + agent._compress_context(_msgs(), "sys", approx_tokens=120_000) + + # The live id must roll back to the still-indexed parent — NOT a + # phantom child id that has no row in state.db. + assert agent.session_id == parent + assert db.get_session(parent) is not None + _ = real_create # silence unused + + +class TestPlatformForwardedAtBoundary: + def test_on_session_start_receives_platform(self, tmp_path: Path): + db = SessionDB(db_path=tmp_path / "state.db") + parent = "PARENT_PLATFORM_ROT" + db.create_session(parent, source="telegram") + agent = _build_agent_with_db(db, parent, platform="telegram") + + agent._compress_context(_msgs(), "sys", approx_tokens=120_000) + + # The boundary notify must forward the platform so context-engine + # plugins don't fall back to source=unknown (#27633). + calls = [c for c in agent.context_compressor.on_session_start.call_args_list] + assert calls, "on_session_start was not called at the boundary" + kwargs = calls[-1].kwargs + assert kwargs.get("platform") == "telegram" + assert kwargs.get("boundary_reason") == "compression" diff --git a/tests/hermes_cli/test_goals.py b/tests/hermes_cli/test_goals.py index 0dae684b629..63d00b945ed 100644 --- a/tests/hermes_cli/test_goals.py +++ b/tests/hermes_cli/test_goals.py @@ -547,6 +547,47 @@ class TestGoalStateSubgoalsBackcompat: assert rt.subgoals == ["a", "b", "c"] +class TestMigrateGoalToSession: + """migrate_goal_to_session carries a /goal from a parent session to its + compression continuation child (#33618). load_goal does a flat + per-session lookup with no lineage walk, so without migration an active + goal silently dies when compression rotates session_id.""" + + def test_migrates_active_goal_to_child(self, hermes_home): + from hermes_cli.goals import save_goal, load_goal, migrate_goal_to_session, GoalState + save_goal("parent-sid", GoalState(goal="ship the feature")) + assert migrate_goal_to_session("parent-sid", "child-sid", reason="compression") is True + child = load_goal("child-sid") + assert child is not None and child.goal == "ship the feature" + # Parent row archived (cleared) so only the child is active. + parent = load_goal("parent-sid") + assert parent is not None and parent.status == "cleared" + + def test_no_goal_to_migrate_returns_false(self, hermes_home): + from hermes_cli.goals import migrate_goal_to_session, load_goal + assert migrate_goal_to_session("empty-parent", "child2") is False + assert load_goal("child2") is None + + def test_does_not_clobber_existing_child_goal(self, hermes_home): + from hermes_cli.goals import save_goal, load_goal, migrate_goal_to_session, GoalState + save_goal("p3", GoalState(goal="parent goal")) + save_goal("c3", GoalState(goal="child already has one")) + assert migrate_goal_to_session("p3", "c3") is False + assert load_goal("c3").goal == "child already has one" + + def test_same_id_is_noop(self, hermes_home): + from hermes_cli.goals import save_goal, migrate_goal_to_session, GoalState + save_goal("same", GoalState(goal="g")) + assert migrate_goal_to_session("same", "same") is False + + def test_cleared_goal_not_migrated(self, hermes_home): + from hermes_cli.goals import save_goal, clear_goal, migrate_goal_to_session, load_goal, GoalState + save_goal("p4", GoalState(goal="done already")) + clear_goal("p4") + assert migrate_goal_to_session("p4", "c4") is False + assert load_goal("c4") is None + + class TestGoalManagerSubgoals: def test_add_subgoal(self, hermes_home): from hermes_cli.goals import GoalManager