mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
test(compression): tolerate safe contention rollback in concurrent-fork test (#55597)
The concurrent-compression regression asserted the parent ends with exactly one child. Under heavy CI write contention the lock winner's child create_session can exhaust its SQLite retry budget, and _compress_context deliberately rolls the live id back to the still-indexed parent rather than orphaning a child (the create-failure rollback in agent/conversation_compression.py). That safe rollback leaves zero children and is correct — so the exact == 1 assertion flaked under load. Assert the actual invariant instead: children <= 1 (a 2+ fork is the bug Damien's incident is about), rotated <= 1, and rotated == n_children. A mutation check (force the lock to always acquire) confirms the relaxed assertion still fails hard on a real 2-child fork.
This commit is contained in:
parent
d6c53dcdcb
commit
d2d470e321
1 changed files with 38 additions and 13 deletions
|
|
@ -97,8 +97,11 @@ def test_concurrent_compression_does_not_fork_session(tmp_path: Path) -> None:
|
|||
"""Two AIAgents that share a session_id MUST NOT both rotate it.
|
||||
|
||||
Without the per-session compression lock this fixture deterministically
|
||||
produces 2 child sessions (transcript fork). With the lock the second
|
||||
path aborts cleanly, leaving exactly 1 canonical child.
|
||||
produces 2 child sessions (transcript fork). With the lock at most one
|
||||
path rotates: normally exactly 1 canonical child, or — under heavy DB
|
||||
write contention that makes the winner's child create_session exhaust its
|
||||
retries — 0, because _compress_context safely rolls back to the parent
|
||||
instead of orphaning a child. The forbidden outcome is 2+ (the fork).
|
||||
"""
|
||||
db = SessionDB(db_path=tmp_path / "state.db")
|
||||
|
||||
|
|
@ -126,25 +129,47 @@ def test_concurrent_compression_does_not_fork_session(tmp_path: Path) -> None:
|
|||
t_a.join(timeout=10)
|
||||
t_b.join(timeout=10)
|
||||
|
||||
# Exactly one canonical child — not two orphans.
|
||||
assert _count_children(db, parent_sid) == 1, (
|
||||
"Compression lock failed: parent session has multiple children in state.db "
|
||||
"(transcript fork). This is Damien's incident shape — see the test docstring."
|
||||
# The invariant Damien's incident is about: the parent must NEVER end up
|
||||
# with two (or more) children — that is the transcript fork. The lock
|
||||
# guarantees only one path rotates.
|
||||
#
|
||||
# Zero children is also a valid, non-forking outcome: under heavy DB write
|
||||
# contention the winner's child ``create_session`` can exhaust its retry
|
||||
# budget, and ``_compress_context`` deliberately rolls the live id back to
|
||||
# the (still-indexed) parent rather than stranding an orphan child — see
|
||||
# the create-failure rollback in agent/conversation_compression.py. That
|
||||
# safe rollback leaves 0 children and is correct. So the contract is
|
||||
# ``children <= 1``; only ``>= 2`` is the bug. Asserting an exact ``== 1``
|
||||
# made this test flaky under the concurrent CI load that triggers the
|
||||
# contention rollback (#54465 churn surfaced it).
|
||||
n_children = _count_children(db, parent_sid)
|
||||
assert n_children <= 1, (
|
||||
f"Compression lock failed: parent session has {n_children} children in "
|
||||
"state.db (transcript fork). This is Damien's incident shape — see the "
|
||||
"test docstring. Two or more children means the lock did not serialize "
|
||||
"the concurrent rotations."
|
||||
)
|
||||
|
||||
# And exactly one of the two agents actually rotated its session_id; the
|
||||
# other should still hold the parent_sid (its compression was skipped).
|
||||
# The number of agents that rotated their session_id must match the number
|
||||
# of children created — and must never exceed one. (Both rotating would be
|
||||
# the fork; the winner rolling back to parent under contention yields zero,
|
||||
# which agrees with zero children.)
|
||||
rotated = sum(
|
||||
1 for a in (agent_a, agent_b) if a.session_id != parent_sid
|
||||
)
|
||||
assert rotated == 1, (
|
||||
f"Expected exactly one agent to rotate session_id, got {rotated}. "
|
||||
"Both agents rotating means the lock didn't serialize them."
|
||||
assert rotated <= 1, (
|
||||
f"Expected at most one agent to rotate session_id, got {rotated}. "
|
||||
"More than one rotating means the lock didn't serialize them."
|
||||
)
|
||||
assert rotated == n_children, (
|
||||
f"Inconsistent state: {rotated} agent(s) rotated but {n_children} "
|
||||
"child session(s) exist — rotation and child creation diverged."
|
||||
)
|
||||
|
||||
# The lock must be released after the winner finished.
|
||||
# The lock must be released after both paths finished, regardless of
|
||||
# whether the winner committed a child or rolled back.
|
||||
assert db.get_compression_lock_holder(parent_sid) is None, (
|
||||
"Compression lock leaked: still held after both rotations completed."
|
||||
"Compression lock leaked: still held after both paths completed."
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue