From d5e2fbf244027620ff7aed2ba36fbe0eb5997cf7 Mon Sep 17 00:00:00 2001 From: konsisumer Date: Thu, 11 Jun 2026 17:12:32 +0200 Subject: [PATCH] fix(agent): frame compaction handoff sections as historical context --- agent/context_compressor.py | 32 ++++++++++++------- tests/agent/test_context_compressor.py | 11 +++++-- ...t_context_compressor_temporal_anchoring.py | 4 +-- tests/agent/test_resume_stale_active_task.py | 21 ++++++------ tests/agent/test_summary_prefix_semantics.py | 22 ++++++++++++- 5 files changed, 62 insertions(+), 28 deletions(-) diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 98d226b46af..86352a6336b 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -34,6 +34,12 @@ from agent.redact import redact_sensitive_text logger = logging.getLogger(__name__) +HISTORICAL_TASK_HEADING = "## Historical Task Snapshot" +HISTORICAL_IN_PROGRESS_HEADING = "## Historical In-Progress State" +HISTORICAL_PENDING_ASKS_HEADING = "## Historical Pending User Asks" +HISTORICAL_REMAINING_WORK_HEADING = "## Historical Remaining Work" + + SUMMARY_PREFIX = ( "[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted " "into the summary below. This is a handoff from a previous context " @@ -43,12 +49,14 @@ SUMMARY_PREFIX = ( "Respond ONLY to the latest user message that appears AFTER this " "summary — that message is the single source of truth for what to do " "right now. " - "If the latest user message is consistent with the '## Active Task' " + f"If the latest user message is consistent with the '{HISTORICAL_TASK_HEADING}' " "section, you may use the summary as background. If the latest user " "message contradicts, supersedes, changes topic from, or in any way " - "diverges from '## Active Task' / '## In Progress' / '## Pending User " - "Asks' / '## Remaining Work', the latest message WINS — discard those " - "stale items entirely and do not 'wrap up the old task first'. " + f"diverges from '{HISTORICAL_TASK_HEADING}' / " + f"'{HISTORICAL_IN_PROGRESS_HEADING}' / " + f"'{HISTORICAL_PENDING_ASKS_HEADING}' / " + f"'{HISTORICAL_REMAINING_WORK_HEADING}', the latest message WINS — " + "discard those stale items entirely and do not 'wrap up the old task first'. " "Reverse signals in the latest message (e.g. 'stop', 'undo', 'roll " "back', 'just verify', 'don't do that anymore', 'never mind', a new " "topic) must immediately end any in-flight work described in the " @@ -1155,7 +1163,7 @@ class ContextCompressor(ContextEngine): ) reason_text = f" Summary failure reason: {reason}." if reason else "" - body = f"""## Active Task + body = f"""{HISTORICAL_TASK_HEADING} {active_task} ## Goal @@ -1172,7 +1180,7 @@ Recovered from a deterministic fallback because the LLM context summarizer was u ## Active State Unknown from deterministic fallback. Inspect current repository/session state if needed. -## In Progress +{HISTORICAL_IN_PROGRESS_HEADING} {active_task} ## Blocked @@ -1184,13 +1192,13 @@ None recoverable from deterministic fallback. ## Resolved Questions None recoverable from deterministic fallback. -## Pending User Asks +{HISTORICAL_PENDING_ASKS_HEADING} {active_task} ## Relevant Files {_bullets(relevant_files, limit=12)} -## Remaining Work +{HISTORICAL_REMAINING_WORK_HEADING} Continue from the most recent unfulfilled user ask and protected tail messages. Verify state with tools before making claims. ## Last Dropped Turns @@ -1312,7 +1320,7 @@ Summary generation was unavailable, so this is a best-effort deterministic fallb _temporal_anchoring_rule = "" # Shared structured template (used by both paths). - _template_sections = f"""## Active Task + _template_sections = f"""{HISTORICAL_TASK_HEADING} [THE SINGLE MOST IMPORTANT FIELD. Capture the user's most recent unfulfilled input verbatim — the exact words they used. This includes: - Explicit task assignments ("refactor the auth module") @@ -1359,7 +1367,7 @@ Be specific with file paths, commands, line numbers, and results.] - Any running processes or servers - Environment details that matter] -## In Progress +{HISTORICAL_IN_PROGRESS_HEADING} [Work currently underway — what was being done when compaction fired] ## Blocked @@ -1371,13 +1379,13 @@ Be specific with file paths, commands, line numbers, and results.] ## Resolved Questions [Questions the user asked that were ALREADY answered — include the answer so it is not repeated] -## Pending User Asks +{HISTORICAL_PENDING_ASKS_HEADING} [Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."] ## Relevant Files [Files read, modified, or created — with brief note on each] -## Remaining Work +{HISTORICAL_REMAINING_WORK_HEADING} [What remains to be done — framed as context, not instructions] ## Critical Context diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 1b4242e0e01..0c56da2687e 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -3,7 +3,11 @@ import pytest from unittest.mock import patch, MagicMock -from agent.context_compressor import ContextCompressor, SUMMARY_PREFIX +from agent.context_compressor import ( + ContextCompressor, + HISTORICAL_TASK_HEADING, + SUMMARY_PREFIX, +) @pytest.fixture() @@ -157,7 +161,7 @@ class TestCompress: result = c.compress(msgs) combined = "\n".join(str(m.get("content", "")) for m in result) - assert "## Active Task" in combined + assert HISTORICAL_TASK_HEADING in combined assert "Please fix the compression summary failure" in combined assert "read_file" in combined assert "agent/context_compressor.py" in combined @@ -1213,7 +1217,8 @@ class TestCompressWithClient: """When the summary lands as standalone role='user' (e.g. head ends with assistant/tool), the message body must include the explicit '--- END OF CONTEXT SUMMARY ---' marker. Without it, weak models - read the verbatim past user request quoted in '## Active Task' as + read the verbatim past user request quoted in the historical task + snapshot as fresh input (#11475, #14521). """ mock_response = MagicMock() diff --git a/tests/agent/test_context_compressor_temporal_anchoring.py b/tests/agent/test_context_compressor_temporal_anchoring.py index 973bf12909f..52101dc56e6 100644 --- a/tests/agent/test_context_compressor_temporal_anchoring.py +++ b/tests/agent/test_context_compressor_temporal_anchoring.py @@ -15,7 +15,7 @@ from datetime import datetime, timezone from unittest.mock import MagicMock, patch import hermes_time -from agent.context_compressor import ContextCompressor +from agent.context_compressor import ContextCompressor, HISTORICAL_TASK_HEADING def _compressor() -> ContextCompressor: @@ -98,7 +98,7 @@ def test_clock_failure_omits_rule_but_compaction_still_runs(): prompt = mock_call.call_args.kwargs["messages"][0]["content"] assert "TEMPORAL ANCHORING" not in prompt # Structured template still intact. - assert "## Active Task" in prompt + assert HISTORICAL_TASK_HEADING in prompt def test_anchoring_rule_uses_date_from_hermes_time_now(): diff --git a/tests/agent/test_resume_stale_active_task.py b/tests/agent/test_resume_stale_active_task.py index 6b22a37c440..9f64880762c 100644 --- a/tests/agent/test_resume_stale_active_task.py +++ b/tests/agent/test_resume_stale_active_task.py @@ -1,9 +1,9 @@ """Regression coverage for #35344: a resumed session must not let a stale -``## Active Task`` from an inherited compaction handoff hijack the reply to a +historical task snapshot from an inherited compaction handoff hijack the reply to a new, unrelated user message. The failure mode (real report): a lineage was compacted, producing a handoff -whose ``## Active Task`` described task A. The lineage was resumed later and +whose historical task snapshot described task A. The lineage was resumed later and the user asked about an unrelated task B. The model answered with A because the handoff's resume directive outranked the fresh ask. @@ -16,14 +16,15 @@ named reverse-signal verbs. Two invariants guard the resume path specifically: pre-fix stale handoff cannot keep its "resume exactly" directive forever. 2. The current handoff prefix contains an unambiguous "latest message wins / - discard stale Active Task" rule, so an unrelated new ask is privileged over - the inherited ``## Active Task``. + discard stale historical task" rule, so an unrelated new ask is privileged over + the inherited task snapshot. These are content/structural assertions (no live model call) — they pin the mechanism that makes the stale task historical rather than active. """ from agent.context_compressor import ( + HISTORICAL_TASK_HEADING, SUMMARY_PREFIX, LEGACY_SUMMARY_PREFIX, ContextCompressor, @@ -48,10 +49,10 @@ _OLD_CONFLICTING_PREFIX = ( def test_latest_message_wins_over_inherited_active_task(): """The handoff must explicitly privilege the latest user message over a - stale ``## Active Task`` — the core #35344 contract.""" + stale historical task snapshot — the core #35344 contract.""" lower = SUMMARY_PREFIX.lower() assert "latest user message" in lower - assert "## active task" in lower + assert HISTORICAL_TASK_HEADING.lower() in lower # Conflict-resolution must be explicit, not implied. assert "wins" in lower or "supersede" in lower assert "discard" in lower @@ -69,7 +70,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix(): prefix when re-normalized on re-compaction — so the "resume exactly" directive cannot survive into a resumed session.""" stale_body = ( - "## Active Task\n" + f"{HISTORICAL_TASK_HEADING}\n" "User asked: 'Migrate the billing module to Stripe'\n\n" "## Goal\nMigrate billing.\n" ) @@ -92,7 +93,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix(): def test_legacy_prefix_handoff_also_renormalized(): """The same upgrade applies to the oldest ``[CONTEXT SUMMARY]:`` handoff format that may sit in a long-lived resumed lineage.""" - legacy = f"{LEGACY_SUMMARY_PREFIX} ## Active Task\nUser asked: 'task A'" + legacy = f"{LEGACY_SUMMARY_PREFIX} {HISTORICAL_TASK_HEADING}\nUser asked: 'task A'" renormalized = ContextCompressor._with_summary_prefix(legacy) assert renormalized.startswith(SUMMARY_PREFIX) assert LEGACY_SUMMARY_PREFIX not in renormalized @@ -107,7 +108,7 @@ def test_inherited_handoff_detected_in_resumed_protected_head(): Task read as live intent).""" messages = [ {"role": "system", "content": "system prompt"}, - {"role": "user", "content": f"{SUMMARY_PREFIX}\n## Active Task\nUser asked: 'task A'"}, + {"role": "user", "content": f"{SUMMARY_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"}, {"role": "assistant", "content": "ok"}, {"role": "user", "content": "Unrelated task B: what's the capital of France?"}, ] @@ -129,7 +130,7 @@ def test_historical_prefixed_handoff_detected_and_stripped(): stale 'resume exactly' text as a fresh turn.""" messages = [ {"role": "system", "content": "system prompt"}, - {"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n## Active Task\nUser asked: 'task A'"}, + {"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"}, {"role": "assistant", "content": "ok"}, {"role": "user", "content": "Unrelated task B"}, ] diff --git a/tests/agent/test_summary_prefix_semantics.py b/tests/agent/test_summary_prefix_semantics.py index 6e8b8f3a7c4..6b3756bf1bb 100644 --- a/tests/agent/test_summary_prefix_semantics.py +++ b/tests/agent/test_summary_prefix_semantics.py @@ -18,7 +18,13 @@ the agent repeatedly re-surfacing already-cancelled work across turns. These tests pin the post-fix invariants so the conflict cannot regress. """ -from agent.context_compressor import SUMMARY_PREFIX +from agent.context_compressor import ( + HISTORICAL_IN_PROGRESS_HEADING, + HISTORICAL_PENDING_ASKS_HEADING, + HISTORICAL_REMAINING_WORK_HEADING, + HISTORICAL_TASK_HEADING, + SUMMARY_PREFIX, +) def test_no_resume_exactly_directive(): @@ -30,10 +36,24 @@ def test_latest_message_wins_on_conflict(): """The prefix must explicitly say latest user message wins on conflict.""" lower = SUMMARY_PREFIX.lower() assert "latest user message" in lower + assert HISTORICAL_TASK_HEADING.lower() in lower + assert HISTORICAL_PENDING_ASKS_HEADING.lower() in lower + assert HISTORICAL_REMAINING_WORK_HEADING.lower() in lower # Must have an explicit conflict-resolution rule. assert "wins" in lower or "supersede" in lower or "discard" in lower +def test_handoff_sections_are_framed_as_historical(): + """The summary headings referenced in the prefix must sound historical, + not like live instructions for the current turn.""" + lower = SUMMARY_PREFIX.lower() + assert "## active task" not in lower + assert "## pending user asks" not in lower + assert "## remaining work" not in lower + assert HISTORICAL_TASK_HEADING.lower() in lower + assert HISTORICAL_IN_PROGRESS_HEADING.lower() in lower + + def test_reverse_signals_called_out(): """Reverse signals (stop/undo/never mind/topic change) must be named so the model recognizes them as cancellation triggers, not just background."""