mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
fix(agent): frame compaction handoff sections as historical context
This commit is contained in:
parent
484f484c25
commit
d5e2fbf244
5 changed files with 62 additions and 28 deletions
|
|
@ -34,6 +34,12 @@ from agent.redact import redact_sensitive_text
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
HISTORICAL_TASK_HEADING = "## Historical Task Snapshot"
|
||||
HISTORICAL_IN_PROGRESS_HEADING = "## Historical In-Progress State"
|
||||
HISTORICAL_PENDING_ASKS_HEADING = "## Historical Pending User Asks"
|
||||
HISTORICAL_REMAINING_WORK_HEADING = "## Historical Remaining Work"
|
||||
|
||||
|
||||
SUMMARY_PREFIX = (
|
||||
"[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted "
|
||||
"into the summary below. This is a handoff from a previous context "
|
||||
|
|
@ -43,12 +49,14 @@ SUMMARY_PREFIX = (
|
|||
"Respond ONLY to the latest user message that appears AFTER this "
|
||||
"summary — that message is the single source of truth for what to do "
|
||||
"right now. "
|
||||
"If the latest user message is consistent with the '## Active Task' "
|
||||
f"If the latest user message is consistent with the '{HISTORICAL_TASK_HEADING}' "
|
||||
"section, you may use the summary as background. If the latest user "
|
||||
"message contradicts, supersedes, changes topic from, or in any way "
|
||||
"diverges from '## Active Task' / '## In Progress' / '## Pending User "
|
||||
"Asks' / '## Remaining Work', the latest message WINS — discard those "
|
||||
"stale items entirely and do not 'wrap up the old task first'. "
|
||||
f"diverges from '{HISTORICAL_TASK_HEADING}' / "
|
||||
f"'{HISTORICAL_IN_PROGRESS_HEADING}' / "
|
||||
f"'{HISTORICAL_PENDING_ASKS_HEADING}' / "
|
||||
f"'{HISTORICAL_REMAINING_WORK_HEADING}', the latest message WINS — "
|
||||
"discard those stale items entirely and do not 'wrap up the old task first'. "
|
||||
"Reverse signals in the latest message (e.g. 'stop', 'undo', 'roll "
|
||||
"back', 'just verify', 'don't do that anymore', 'never mind', a new "
|
||||
"topic) must immediately end any in-flight work described in the "
|
||||
|
|
@ -1155,7 +1163,7 @@ class ContextCompressor(ContextEngine):
|
|||
)
|
||||
|
||||
reason_text = f" Summary failure reason: {reason}." if reason else ""
|
||||
body = f"""## Active Task
|
||||
body = f"""{HISTORICAL_TASK_HEADING}
|
||||
{active_task}
|
||||
|
||||
## Goal
|
||||
|
|
@ -1172,7 +1180,7 @@ Recovered from a deterministic fallback because the LLM context summarizer was u
|
|||
## Active State
|
||||
Unknown from deterministic fallback. Inspect current repository/session state if needed.
|
||||
|
||||
## In Progress
|
||||
{HISTORICAL_IN_PROGRESS_HEADING}
|
||||
{active_task}
|
||||
|
||||
## Blocked
|
||||
|
|
@ -1184,13 +1192,13 @@ None recoverable from deterministic fallback.
|
|||
## Resolved Questions
|
||||
None recoverable from deterministic fallback.
|
||||
|
||||
## Pending User Asks
|
||||
{HISTORICAL_PENDING_ASKS_HEADING}
|
||||
{active_task}
|
||||
|
||||
## Relevant Files
|
||||
{_bullets(relevant_files, limit=12)}
|
||||
|
||||
## Remaining Work
|
||||
{HISTORICAL_REMAINING_WORK_HEADING}
|
||||
Continue from the most recent unfulfilled user ask and protected tail messages. Verify state with tools before making claims.
|
||||
|
||||
## Last Dropped Turns
|
||||
|
|
@ -1312,7 +1320,7 @@ Summary generation was unavailable, so this is a best-effort deterministic fallb
|
|||
_temporal_anchoring_rule = ""
|
||||
|
||||
# Shared structured template (used by both paths).
|
||||
_template_sections = f"""## Active Task
|
||||
_template_sections = f"""{HISTORICAL_TASK_HEADING}
|
||||
[THE SINGLE MOST IMPORTANT FIELD. Capture the user's most recent unfulfilled
|
||||
input verbatim — the exact words they used. This includes:
|
||||
- Explicit task assignments ("refactor the auth module")
|
||||
|
|
@ -1359,7 +1367,7 @@ Be specific with file paths, commands, line numbers, and results.]
|
|||
- Any running processes or servers
|
||||
- Environment details that matter]
|
||||
|
||||
## In Progress
|
||||
{HISTORICAL_IN_PROGRESS_HEADING}
|
||||
[Work currently underway — what was being done when compaction fired]
|
||||
|
||||
## Blocked
|
||||
|
|
@ -1371,13 +1379,13 @@ Be specific with file paths, commands, line numbers, and results.]
|
|||
## Resolved Questions
|
||||
[Questions the user asked that were ALREADY answered — include the answer so it is not repeated]
|
||||
|
||||
## Pending User Asks
|
||||
{HISTORICAL_PENDING_ASKS_HEADING}
|
||||
[Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."]
|
||||
|
||||
## Relevant Files
|
||||
[Files read, modified, or created — with brief note on each]
|
||||
|
||||
## Remaining Work
|
||||
{HISTORICAL_REMAINING_WORK_HEADING}
|
||||
[What remains to be done — framed as context, not instructions]
|
||||
|
||||
## Critical Context
|
||||
|
|
|
|||
|
|
@ -3,7 +3,11 @@
|
|||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from agent.context_compressor import ContextCompressor, SUMMARY_PREFIX
|
||||
from agent.context_compressor import (
|
||||
ContextCompressor,
|
||||
HISTORICAL_TASK_HEADING,
|
||||
SUMMARY_PREFIX,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
|
@ -157,7 +161,7 @@ class TestCompress:
|
|||
result = c.compress(msgs)
|
||||
|
||||
combined = "\n".join(str(m.get("content", "")) for m in result)
|
||||
assert "## Active Task" in combined
|
||||
assert HISTORICAL_TASK_HEADING in combined
|
||||
assert "Please fix the compression summary failure" in combined
|
||||
assert "read_file" in combined
|
||||
assert "agent/context_compressor.py" in combined
|
||||
|
|
@ -1213,7 +1217,8 @@ class TestCompressWithClient:
|
|||
"""When the summary lands as standalone role='user' (e.g. head ends
|
||||
with assistant/tool), the message body must include the explicit
|
||||
'--- END OF CONTEXT SUMMARY ---' marker. Without it, weak models
|
||||
read the verbatim past user request quoted in '## Active Task' as
|
||||
read the verbatim past user request quoted in the historical task
|
||||
snapshot as
|
||||
fresh input (#11475, #14521).
|
||||
"""
|
||||
mock_response = MagicMock()
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ from datetime import datetime, timezone
|
|||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import hermes_time
|
||||
from agent.context_compressor import ContextCompressor
|
||||
from agent.context_compressor import ContextCompressor, HISTORICAL_TASK_HEADING
|
||||
|
||||
|
||||
def _compressor() -> ContextCompressor:
|
||||
|
|
@ -98,7 +98,7 @@ def test_clock_failure_omits_rule_but_compaction_still_runs():
|
|||
prompt = mock_call.call_args.kwargs["messages"][0]["content"]
|
||||
assert "TEMPORAL ANCHORING" not in prompt
|
||||
# Structured template still intact.
|
||||
assert "## Active Task" in prompt
|
||||
assert HISTORICAL_TASK_HEADING in prompt
|
||||
|
||||
|
||||
def test_anchoring_rule_uses_date_from_hermes_time_now():
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
"""Regression coverage for #35344: a resumed session must not let a stale
|
||||
``## Active Task`` from an inherited compaction handoff hijack the reply to a
|
||||
historical task snapshot from an inherited compaction handoff hijack the reply to a
|
||||
new, unrelated user message.
|
||||
|
||||
The failure mode (real report): a lineage was compacted, producing a handoff
|
||||
whose ``## Active Task`` described task A. The lineage was resumed later and
|
||||
whose historical task snapshot described task A. The lineage was resumed later and
|
||||
the user asked about an unrelated task B. The model answered with A because
|
||||
the handoff's resume directive outranked the fresh ask.
|
||||
|
||||
|
|
@ -16,14 +16,15 @@ named reverse-signal verbs. Two invariants guard the resume path specifically:
|
|||
pre-fix stale handoff cannot keep its "resume exactly" directive forever.
|
||||
|
||||
2. The current handoff prefix contains an unambiguous "latest message wins /
|
||||
discard stale Active Task" rule, so an unrelated new ask is privileged over
|
||||
the inherited ``## Active Task``.
|
||||
discard stale historical task" rule, so an unrelated new ask is privileged over
|
||||
the inherited task snapshot.
|
||||
|
||||
These are content/structural assertions (no live model call) — they pin the
|
||||
mechanism that makes the stale task historical rather than active.
|
||||
"""
|
||||
|
||||
from agent.context_compressor import (
|
||||
HISTORICAL_TASK_HEADING,
|
||||
SUMMARY_PREFIX,
|
||||
LEGACY_SUMMARY_PREFIX,
|
||||
ContextCompressor,
|
||||
|
|
@ -48,10 +49,10 @@ _OLD_CONFLICTING_PREFIX = (
|
|||
|
||||
def test_latest_message_wins_over_inherited_active_task():
|
||||
"""The handoff must explicitly privilege the latest user message over a
|
||||
stale ``## Active Task`` — the core #35344 contract."""
|
||||
stale historical task snapshot — the core #35344 contract."""
|
||||
lower = SUMMARY_PREFIX.lower()
|
||||
assert "latest user message" in lower
|
||||
assert "## active task" in lower
|
||||
assert HISTORICAL_TASK_HEADING.lower() in lower
|
||||
# Conflict-resolution must be explicit, not implied.
|
||||
assert "wins" in lower or "supersede" in lower
|
||||
assert "discard" in lower
|
||||
|
|
@ -69,7 +70,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix():
|
|||
prefix when re-normalized on re-compaction — so the "resume exactly"
|
||||
directive cannot survive into a resumed session."""
|
||||
stale_body = (
|
||||
"## Active Task\n"
|
||||
f"{HISTORICAL_TASK_HEADING}\n"
|
||||
"User asked: 'Migrate the billing module to Stripe'\n\n"
|
||||
"## Goal\nMigrate billing.\n"
|
||||
)
|
||||
|
|
@ -92,7 +93,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix():
|
|||
def test_legacy_prefix_handoff_also_renormalized():
|
||||
"""The same upgrade applies to the oldest ``[CONTEXT SUMMARY]:`` handoff
|
||||
format that may sit in a long-lived resumed lineage."""
|
||||
legacy = f"{LEGACY_SUMMARY_PREFIX} ## Active Task\nUser asked: 'task A'"
|
||||
legacy = f"{LEGACY_SUMMARY_PREFIX} {HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"
|
||||
renormalized = ContextCompressor._with_summary_prefix(legacy)
|
||||
assert renormalized.startswith(SUMMARY_PREFIX)
|
||||
assert LEGACY_SUMMARY_PREFIX not in renormalized
|
||||
|
|
@ -107,7 +108,7 @@ def test_inherited_handoff_detected_in_resumed_protected_head():
|
|||
Task read as live intent)."""
|
||||
messages = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": f"{SUMMARY_PREFIX}\n## Active Task\nUser asked: 'task A'"},
|
||||
{"role": "user", "content": f"{SUMMARY_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"},
|
||||
{"role": "assistant", "content": "ok"},
|
||||
{"role": "user", "content": "Unrelated task B: what's the capital of France?"},
|
||||
]
|
||||
|
|
@ -129,7 +130,7 @@ def test_historical_prefixed_handoff_detected_and_stripped():
|
|||
stale 'resume exactly' text as a fresh turn."""
|
||||
messages = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n## Active Task\nUser asked: 'task A'"},
|
||||
{"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"},
|
||||
{"role": "assistant", "content": "ok"},
|
||||
{"role": "user", "content": "Unrelated task B"},
|
||||
]
|
||||
|
|
|
|||
|
|
@ -18,7 +18,13 @@ the agent repeatedly re-surfacing already-cancelled work across turns.
|
|||
These tests pin the post-fix invariants so the conflict cannot regress.
|
||||
"""
|
||||
|
||||
from agent.context_compressor import SUMMARY_PREFIX
|
||||
from agent.context_compressor import (
|
||||
HISTORICAL_IN_PROGRESS_HEADING,
|
||||
HISTORICAL_PENDING_ASKS_HEADING,
|
||||
HISTORICAL_REMAINING_WORK_HEADING,
|
||||
HISTORICAL_TASK_HEADING,
|
||||
SUMMARY_PREFIX,
|
||||
)
|
||||
|
||||
|
||||
def test_no_resume_exactly_directive():
|
||||
|
|
@ -30,10 +36,24 @@ def test_latest_message_wins_on_conflict():
|
|||
"""The prefix must explicitly say latest user message wins on conflict."""
|
||||
lower = SUMMARY_PREFIX.lower()
|
||||
assert "latest user message" in lower
|
||||
assert HISTORICAL_TASK_HEADING.lower() in lower
|
||||
assert HISTORICAL_PENDING_ASKS_HEADING.lower() in lower
|
||||
assert HISTORICAL_REMAINING_WORK_HEADING.lower() in lower
|
||||
# Must have an explicit conflict-resolution rule.
|
||||
assert "wins" in lower or "supersede" in lower or "discard" in lower
|
||||
|
||||
|
||||
def test_handoff_sections_are_framed_as_historical():
|
||||
"""The summary headings referenced in the prefix must sound historical,
|
||||
not like live instructions for the current turn."""
|
||||
lower = SUMMARY_PREFIX.lower()
|
||||
assert "## active task" not in lower
|
||||
assert "## pending user asks" not in lower
|
||||
assert "## remaining work" not in lower
|
||||
assert HISTORICAL_TASK_HEADING.lower() in lower
|
||||
assert HISTORICAL_IN_PROGRESS_HEADING.lower() in lower
|
||||
|
||||
|
||||
def test_reverse_signals_called_out():
|
||||
"""Reverse signals (stop/undo/never mind/topic change) must be named so
|
||||
the model recognizes them as cancellation triggers, not just background."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue