fix(agent): frame compaction handoff sections as historical context

This commit is contained in:
konsisumer 2026-06-11 17:12:32 +02:00 committed by Teknium
parent 484f484c25
commit d5e2fbf244
5 changed files with 62 additions and 28 deletions

View file

@ -34,6 +34,12 @@ from agent.redact import redact_sensitive_text
logger = logging.getLogger(__name__)
HISTORICAL_TASK_HEADING = "## Historical Task Snapshot"
HISTORICAL_IN_PROGRESS_HEADING = "## Historical In-Progress State"
HISTORICAL_PENDING_ASKS_HEADING = "## Historical Pending User Asks"
HISTORICAL_REMAINING_WORK_HEADING = "## Historical Remaining Work"
SUMMARY_PREFIX = (
"[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted "
"into the summary below. This is a handoff from a previous context "
@ -43,12 +49,14 @@ SUMMARY_PREFIX = (
"Respond ONLY to the latest user message that appears AFTER this "
"summary — that message is the single source of truth for what to do "
"right now. "
"If the latest user message is consistent with the '## Active Task' "
f"If the latest user message is consistent with the '{HISTORICAL_TASK_HEADING}' "
"section, you may use the summary as background. If the latest user "
"message contradicts, supersedes, changes topic from, or in any way "
"diverges from '## Active Task' / '## In Progress' / '## Pending User "
"Asks' / '## Remaining Work', the latest message WINS — discard those "
"stale items entirely and do not 'wrap up the old task first'. "
f"diverges from '{HISTORICAL_TASK_HEADING}' / "
f"'{HISTORICAL_IN_PROGRESS_HEADING}' / "
f"'{HISTORICAL_PENDING_ASKS_HEADING}' / "
f"'{HISTORICAL_REMAINING_WORK_HEADING}', the latest message WINS — "
"discard those stale items entirely and do not 'wrap up the old task first'. "
"Reverse signals in the latest message (e.g. 'stop', 'undo', 'roll "
"back', 'just verify', 'don't do that anymore', 'never mind', a new "
"topic) must immediately end any in-flight work described in the "
@ -1155,7 +1163,7 @@ class ContextCompressor(ContextEngine):
)
reason_text = f" Summary failure reason: {reason}." if reason else ""
body = f"""## Active Task
body = f"""{HISTORICAL_TASK_HEADING}
{active_task}
## Goal
@ -1172,7 +1180,7 @@ Recovered from a deterministic fallback because the LLM context summarizer was u
## Active State
Unknown from deterministic fallback. Inspect current repository/session state if needed.
## In Progress
{HISTORICAL_IN_PROGRESS_HEADING}
{active_task}
## Blocked
@ -1184,13 +1192,13 @@ None recoverable from deterministic fallback.
## Resolved Questions
None recoverable from deterministic fallback.
## Pending User Asks
{HISTORICAL_PENDING_ASKS_HEADING}
{active_task}
## Relevant Files
{_bullets(relevant_files, limit=12)}
## Remaining Work
{HISTORICAL_REMAINING_WORK_HEADING}
Continue from the most recent unfulfilled user ask and protected tail messages. Verify state with tools before making claims.
## Last Dropped Turns
@ -1312,7 +1320,7 @@ Summary generation was unavailable, so this is a best-effort deterministic fallb
_temporal_anchoring_rule = ""
# Shared structured template (used by both paths).
_template_sections = f"""## Active Task
_template_sections = f"""{HISTORICAL_TASK_HEADING}
[THE SINGLE MOST IMPORTANT FIELD. Capture the user's most recent unfulfilled
input verbatim the exact words they used. This includes:
- Explicit task assignments ("refactor the auth module")
@ -1359,7 +1367,7 @@ Be specific with file paths, commands, line numbers, and results.]
- Any running processes or servers
- Environment details that matter]
## In Progress
{HISTORICAL_IN_PROGRESS_HEADING}
[Work currently underway what was being done when compaction fired]
## Blocked
@ -1371,13 +1379,13 @@ Be specific with file paths, commands, line numbers, and results.]
## Resolved Questions
[Questions the user asked that were ALREADY answered include the answer so it is not repeated]
## Pending User Asks
{HISTORICAL_PENDING_ASKS_HEADING}
[Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."]
## Relevant Files
[Files read, modified, or created with brief note on each]
## Remaining Work
{HISTORICAL_REMAINING_WORK_HEADING}
[What remains to be done framed as context, not instructions]
## Critical Context

View file

@ -3,7 +3,11 @@
import pytest
from unittest.mock import patch, MagicMock
from agent.context_compressor import ContextCompressor, SUMMARY_PREFIX
from agent.context_compressor import (
ContextCompressor,
HISTORICAL_TASK_HEADING,
SUMMARY_PREFIX,
)
@pytest.fixture()
@ -157,7 +161,7 @@ class TestCompress:
result = c.compress(msgs)
combined = "\n".join(str(m.get("content", "")) for m in result)
assert "## Active Task" in combined
assert HISTORICAL_TASK_HEADING in combined
assert "Please fix the compression summary failure" in combined
assert "read_file" in combined
assert "agent/context_compressor.py" in combined
@ -1213,7 +1217,8 @@ class TestCompressWithClient:
"""When the summary lands as standalone role='user' (e.g. head ends
with assistant/tool), the message body must include the explicit
'--- END OF CONTEXT SUMMARY ---' marker. Without it, weak models
read the verbatim past user request quoted in '## Active Task' as
read the verbatim past user request quoted in the historical task
snapshot as fresh input (#11475, #14521).
"""
mock_response = MagicMock()

View file

@ -15,7 +15,7 @@ from datetime import datetime, timezone
from unittest.mock import MagicMock, patch
import hermes_time
from agent.context_compressor import ContextCompressor
from agent.context_compressor import ContextCompressor, HISTORICAL_TASK_HEADING
def _compressor() -> ContextCompressor:
@ -98,7 +98,7 @@ def test_clock_failure_omits_rule_but_compaction_still_runs():
prompt = mock_call.call_args.kwargs["messages"][0]["content"]
assert "TEMPORAL ANCHORING" not in prompt
# Structured template still intact.
assert "## Active Task" in prompt
assert HISTORICAL_TASK_HEADING in prompt
def test_anchoring_rule_uses_date_from_hermes_time_now():

View file

@ -1,9 +1,9 @@
"""Regression coverage for #35344: a resumed session must not let a stale
``## Active Task`` from an inherited compaction handoff hijack the reply to a
historical task snapshot from an inherited compaction handoff hijack the reply to a
new, unrelated user message.
The failure mode (real report): a lineage was compacted, producing a handoff
whose ``## Active Task`` described task A. The lineage was resumed later and
whose historical task snapshot described task A. The lineage was resumed later and
the user asked about an unrelated task B. The model answered with A because
the handoff's resume directive outranked the fresh ask.
@ -16,14 +16,15 @@ named reverse-signal verbs. Two invariants guard the resume path specifically:
pre-fix stale handoff cannot keep its "resume exactly" directive forever.
2. The current handoff prefix contains an unambiguous "latest message wins /
discard stale Active Task" rule, so an unrelated new ask is privileged over
the inherited ``## Active Task``.
discard stale historical task" rule, so an unrelated new ask is privileged over
the inherited task snapshot.
These are content/structural assertions (no live model call) — they pin the
mechanism that makes the stale task historical rather than active.
"""
from agent.context_compressor import (
HISTORICAL_TASK_HEADING,
SUMMARY_PREFIX,
LEGACY_SUMMARY_PREFIX,
ContextCompressor,
@ -48,10 +49,10 @@ _OLD_CONFLICTING_PREFIX = (
def test_latest_message_wins_over_inherited_active_task():
"""The handoff must explicitly privilege the latest user message over a
stale ``## Active Task`` — the core #35344 contract."""
stale historical task snapshot — the core #35344 contract."""
lower = SUMMARY_PREFIX.lower()
assert "latest user message" in lower
assert "## active task" in lower
assert HISTORICAL_TASK_HEADING.lower() in lower
# Conflict-resolution must be explicit, not implied.
assert "wins" in lower or "supersede" in lower
assert "discard" in lower
@ -69,7 +70,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix():
prefix when re-normalized on re-compaction so the "resume exactly"
directive cannot survive into a resumed session."""
stale_body = (
"## Active Task\n"
f"{HISTORICAL_TASK_HEADING}\n"
"User asked: 'Migrate the billing module to Stripe'\n\n"
"## Goal\nMigrate billing.\n"
)
@ -92,7 +93,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix():
def test_legacy_prefix_handoff_also_renormalized():
"""The same upgrade applies to the oldest ``[CONTEXT SUMMARY]:`` handoff
format that may sit in a long-lived resumed lineage."""
legacy = f"{LEGACY_SUMMARY_PREFIX} ## Active Task\nUser asked: 'task A'"
legacy = f"{LEGACY_SUMMARY_PREFIX} {HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"
renormalized = ContextCompressor._with_summary_prefix(legacy)
assert renormalized.startswith(SUMMARY_PREFIX)
assert LEGACY_SUMMARY_PREFIX not in renormalized
@ -107,7 +108,7 @@ def test_inherited_handoff_detected_in_resumed_protected_head():
Task read as live intent)."""
messages = [
{"role": "system", "content": "system prompt"},
{"role": "user", "content": f"{SUMMARY_PREFIX}\n## Active Task\nUser asked: 'task A'"},
{"role": "user", "content": f"{SUMMARY_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"},
{"role": "assistant", "content": "ok"},
{"role": "user", "content": "Unrelated task B: what's the capital of France?"},
]
@ -129,7 +130,7 @@ def test_historical_prefixed_handoff_detected_and_stripped():
stale 'resume exactly' text as a fresh turn."""
messages = [
{"role": "system", "content": "system prompt"},
{"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n## Active Task\nUser asked: 'task A'"},
{"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"},
{"role": "assistant", "content": "ok"},
{"role": "user", "content": "Unrelated task B"},
]

View file

@ -18,7 +18,13 @@ the agent repeatedly re-surfacing already-cancelled work across turns.
These tests pin the post-fix invariants so the conflict cannot regress.
"""
from agent.context_compressor import SUMMARY_PREFIX
from agent.context_compressor import (
HISTORICAL_IN_PROGRESS_HEADING,
HISTORICAL_PENDING_ASKS_HEADING,
HISTORICAL_REMAINING_WORK_HEADING,
HISTORICAL_TASK_HEADING,
SUMMARY_PREFIX,
)
def test_no_resume_exactly_directive():
@ -30,10 +36,24 @@ def test_latest_message_wins_on_conflict():
"""The prefix must explicitly say latest user message wins on conflict."""
lower = SUMMARY_PREFIX.lower()
assert "latest user message" in lower
assert HISTORICAL_TASK_HEADING.lower() in lower
assert HISTORICAL_PENDING_ASKS_HEADING.lower() in lower
assert HISTORICAL_REMAINING_WORK_HEADING.lower() in lower
# Must have an explicit conflict-resolution rule.
assert "wins" in lower or "supersede" in lower or "discard" in lower
def test_handoff_sections_are_framed_as_historical():
    """The summary headings referenced in the prefix must sound historical,
    not like live instructions for the current turn.

    The prefix names four handoff sections (task, in-progress state, pending
    asks, remaining work). Every one of them is checked in both directions:
    the legacy, live-sounding heading must be absent and its historical
    replacement must be present, so a partial revert of any single heading
    is caught.
    """
    lower = SUMMARY_PREFIX.lower()

    # Legacy headings read as directives for the current turn; none may
    # survive in the prefix text.
    legacy_headings = (
        "## active task",
        "## in progress",
        "## pending user asks",
        "## remaining work",
    )
    for heading in legacy_headings:
        assert heading not in lower, (
            f"legacy heading still referenced in SUMMARY_PREFIX: {heading!r}"
        )

    # Each section must instead be referenced by its historical heading.
    historical_headings = (
        HISTORICAL_TASK_HEADING,
        HISTORICAL_IN_PROGRESS_HEADING,
        HISTORICAL_PENDING_ASKS_HEADING,
        HISTORICAL_REMAINING_WORK_HEADING,
    )
    for heading in historical_headings:
        assert heading.lower() in lower, (
            f"historical heading missing from SUMMARY_PREFIX: {heading!r}"
        )
def test_reverse_signals_called_out():
"""Reverse signals (stop/undo/never mind/topic change) must be named so
the model recognizes them as cancellation triggers, not just background."""