mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-22 10:32:00 +00:00
fix(agent): frame compaction handoff sections as historical context
This commit is contained in:
parent
484f484c25
commit
d5e2fbf244
5 changed files with 62 additions and 28 deletions
|
|
@@ -3,7 +3,11 @@
|
|||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from agent.context_compressor import ContextCompressor, SUMMARY_PREFIX
|
||||
from agent.context_compressor import (
|
||||
ContextCompressor,
|
||||
HISTORICAL_TASK_HEADING,
|
||||
SUMMARY_PREFIX,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
|
@@ -157,7 +161,7 @@ class TestCompress:
|
|||
result = c.compress(msgs)
|
||||
|
||||
combined = "\n".join(str(m.get("content", "")) for m in result)
|
||||
assert "## Active Task" in combined
|
||||
assert HISTORICAL_TASK_HEADING in combined
|
||||
assert "Please fix the compression summary failure" in combined
|
||||
assert "read_file" in combined
|
||||
assert "agent/context_compressor.py" in combined
|
||||
|
|
@@ -1213,7 +1217,8 @@ class TestCompressWithClient:
|
|||
"""When the summary lands as standalone role='user' (e.g. head ends
|
||||
with assistant/tool), the message body must include the explicit
|
||||
'--- END OF CONTEXT SUMMARY ---' marker. Without it, weak models
|
||||
read the verbatim past user request quoted in '## Active Task' as
|
||||
read the verbatim past user request quoted in the historical task
|
||||
snapshot as
|
||||
fresh input (#11475, #14521).
|
||||
"""
|
||||
mock_response = MagicMock()
|
||||
|
|
|
|||
|
|
@@ -15,7 +15,7 @@ from datetime import datetime, timezone
|
|||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import hermes_time
|
||||
from agent.context_compressor import ContextCompressor
|
||||
from agent.context_compressor import ContextCompressor, HISTORICAL_TASK_HEADING
|
||||
|
||||
|
||||
def _compressor() -> ContextCompressor:
|
||||
|
|
@@ -98,7 +98,7 @@ def test_clock_failure_omits_rule_but_compaction_still_runs():
|
|||
prompt = mock_call.call_args.kwargs["messages"][0]["content"]
|
||||
assert "TEMPORAL ANCHORING" not in prompt
|
||||
# Structured template still intact.
|
||||
assert "## Active Task" in prompt
|
||||
assert HISTORICAL_TASK_HEADING in prompt
|
||||
|
||||
|
||||
def test_anchoring_rule_uses_date_from_hermes_time_now():
|
||||
|
|
|
|||
|
|
@@ -1,9 +1,9 @@
|
|||
"""Regression coverage for #35344: a resumed session must not let a stale
|
||||
``## Active Task`` from an inherited compaction handoff hijack the reply to a
|
||||
historical task snapshot from an inherited compaction handoff hijack the reply to a
|
||||
new, unrelated user message.
|
||||
|
||||
The failure mode (real report): a lineage was compacted, producing a handoff
|
||||
whose ``## Active Task`` described task A. The lineage was resumed later and
|
||||
whose historical task snapshot described task A. The lineage was resumed later and
|
||||
the user asked about an unrelated task B. The model answered with A because
|
||||
the handoff's resume directive outranked the fresh ask.
|
||||
|
||||
|
|
@@ -16,14 +16,15 @@ named reverse-signal verbs. Two invariants guard the resume path specifically:
|
|||
pre-fix stale handoff cannot keep its "resume exactly" directive forever.
|
||||
|
||||
2. The current handoff prefix contains an unambiguous "latest message wins /
|
||||
discard stale Active Task" rule, so an unrelated new ask is privileged over
|
||||
the inherited ``## Active Task``.
|
||||
discard stale historical task" rule, so an unrelated new ask is privileged over
|
||||
the inherited task snapshot.
|
||||
|
||||
These are content/structural assertions (no live model call) — they pin the
|
||||
mechanism that makes the stale task historical rather than active.
|
||||
"""
|
||||
|
||||
from agent.context_compressor import (
|
||||
HISTORICAL_TASK_HEADING,
|
||||
SUMMARY_PREFIX,
|
||||
LEGACY_SUMMARY_PREFIX,
|
||||
ContextCompressor,
|
||||
|
|
@@ -48,10 +49,10 @@ _OLD_CONFLICTING_PREFIX = (
|
|||
|
||||
def test_latest_message_wins_over_inherited_active_task():
|
||||
"""The handoff must explicitly privilege the latest user message over a
|
||||
stale ``## Active Task`` — the core #35344 contract."""
|
||||
stale historical task snapshot — the core #35344 contract."""
|
||||
lower = SUMMARY_PREFIX.lower()
|
||||
assert "latest user message" in lower
|
||||
assert "## active task" in lower
|
||||
assert HISTORICAL_TASK_HEADING.lower() in lower
|
||||
# Conflict-resolution must be explicit, not implied.
|
||||
assert "wins" in lower or "supersede" in lower
|
||||
assert "discard" in lower
|
||||
|
|
@@ -69,7 +70,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix():
|
|||
prefix when re-normalized on re-compaction — so the "resume exactly"
|
||||
directive cannot survive into a resumed session."""
|
||||
stale_body = (
|
||||
"## Active Task\n"
|
||||
f"{HISTORICAL_TASK_HEADING}\n"
|
||||
"User asked: 'Migrate the billing module to Stripe'\n\n"
|
||||
"## Goal\nMigrate billing.\n"
|
||||
)
|
||||
|
|
@@ -92,7 +93,7 @@ def test_resumed_stale_handoff_gets_renormalized_to_current_prefix():
|
|||
def test_legacy_prefix_handoff_also_renormalized():
|
||||
"""The same upgrade applies to the oldest ``[CONTEXT SUMMARY]:`` handoff
|
||||
format that may sit in a long-lived resumed lineage."""
|
||||
legacy = f"{LEGACY_SUMMARY_PREFIX} ## Active Task\nUser asked: 'task A'"
|
||||
legacy = f"{LEGACY_SUMMARY_PREFIX} {HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"
|
||||
renormalized = ContextCompressor._with_summary_prefix(legacy)
|
||||
assert renormalized.startswith(SUMMARY_PREFIX)
|
||||
assert LEGACY_SUMMARY_PREFIX not in renormalized
|
||||
|
|
@@ -107,7 +108,7 @@ def test_inherited_handoff_detected_in_resumed_protected_head():
|
|||
Task read as live intent)."""
|
||||
messages = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": f"{SUMMARY_PREFIX}\n## Active Task\nUser asked: 'task A'"},
|
||||
{"role": "user", "content": f"{SUMMARY_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"},
|
||||
{"role": "assistant", "content": "ok"},
|
||||
{"role": "user", "content": "Unrelated task B: what's the capital of France?"},
|
||||
]
|
||||
|
|
@@ -129,7 +130,7 @@ def test_historical_prefixed_handoff_detected_and_stripped():
|
|||
stale 'resume exactly' text as a fresh turn."""
|
||||
messages = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n## Active Task\nUser asked: 'task A'"},
|
||||
{"role": "user", "content": f"{_OLD_CONFLICTING_PREFIX}\n{HISTORICAL_TASK_HEADING}\nUser asked: 'task A'"},
|
||||
{"role": "assistant", "content": "ok"},
|
||||
{"role": "user", "content": "Unrelated task B"},
|
||||
]
|
||||
|
|
|
|||
|
|
@@ -18,7 +18,13 @@ the agent repeatedly re-surfacing already-cancelled work across turns.
|
|||
These tests pin the post-fix invariants so the conflict cannot regress.
|
||||
"""
|
||||
|
||||
from agent.context_compressor import SUMMARY_PREFIX
|
||||
from agent.context_compressor import (
|
||||
HISTORICAL_IN_PROGRESS_HEADING,
|
||||
HISTORICAL_PENDING_ASKS_HEADING,
|
||||
HISTORICAL_REMAINING_WORK_HEADING,
|
||||
HISTORICAL_TASK_HEADING,
|
||||
SUMMARY_PREFIX,
|
||||
)
|
||||
|
||||
|
||||
def test_no_resume_exactly_directive():
|
||||
|
|
@@ -30,10 +36,24 @@ def test_latest_message_wins_on_conflict():
|
|||
"""The prefix must explicitly say latest user message wins on conflict."""
|
||||
lower = SUMMARY_PREFIX.lower()
|
||||
assert "latest user message" in lower
|
||||
assert HISTORICAL_TASK_HEADING.lower() in lower
|
||||
assert HISTORICAL_PENDING_ASKS_HEADING.lower() in lower
|
||||
assert HISTORICAL_REMAINING_WORK_HEADING.lower() in lower
|
||||
# Must have an explicit conflict-resolution rule.
|
||||
assert "wins" in lower or "supersede" in lower or "discard" in lower
|
||||
|
||||
|
||||
def test_handoff_sections_are_framed_as_historical():
|
||||
"""The summary headings referenced in the prefix must sound historical,
|
||||
not like live instructions for the current turn."""
|
||||
lower = SUMMARY_PREFIX.lower()
|
||||
assert "## active task" not in lower
|
||||
assert "## pending user asks" not in lower
|
||||
assert "## remaining work" not in lower
|
||||
assert HISTORICAL_TASK_HEADING.lower() in lower
|
||||
assert HISTORICAL_IN_PROGRESS_HEADING.lower() in lower
|
||||
|
||||
|
||||
def test_reverse_signals_called_out():
|
||||
"""Reverse signals (stop/undo/never mind/topic change) must be named so
|
||||
the model recognizes them as cancellation triggers, not just background."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue