hermes-agent/tests/agent/test_compression_concurrent_fork.py
Teknium d2d470e321
test(compression): tolerate safe contention rollback in concurrent-fork test (#55597)
The concurrent-compression regression asserted the parent ends with exactly
one child. Under heavy CI write contention the lock winner's child
create_session can exhaust its SQLite retry budget, and _compress_context
deliberately rolls the live id back to the still-indexed parent rather than
orphaning a child (the create-failure rollback in
agent/conversation_compression.py). That safe rollback leaves zero children
and is correct — so the exact == 1 assertion flaked under load.

Assert the actual invariant instead: children <= 1 (a 2+ fork is the bug
Damien's incident is about), rotated <= 1, and rotated == n_children. A
mutation check (force the lock to always acquire) confirms the relaxed
assertion still fails hard on a real 2-child fork.
2026-06-30 04:22:47 -07:00

640 lines
27 KiB
Python

"""Regression: prevent transcript fork when two paths compress the same session_id.
Damien's incident (Discord, 2026-05-28): a long Hermes session in a Discord
gateway hit the compression threshold at the end of a turn. The parent agent
finished delivering the response and ``conversation_loop.py`` fired
``_spawn_background_review(...)`` — which builds a forked ``AIAgent`` that
inherits ``agent.session_id`` (see ``agent/background_review.py``::
``review_agent.session_id = agent.session_id``). Roughly two seconds later
a synthetic ``Background process proc_… completed`` event arrived and
started a fresh turn on the same parent ``session_id`` (still cached in the
gateway's ``SessionEntry``). Both paths hit preflight compression on the
same parent transcript and called ``_compress_context`` concurrently. Each
ended the parent and created its own CHILD session in ``state.db``, both
parented to the same old id. The gateway's ``SessionEntry`` only caught one
rotation; the other child became an orphan that silently accumulated writes.
Repro shape on Damien's machine:
parent 20260527_234659_e65f0e ended_at=set end_reason='compression'
child 20260528_113619_fc80e1 parent=20260527_234659_e65f0e (in SessionEntry)
child <orphan> parent=20260527_234659_e65f0e (silent writes)
This regression simulates the two concurrent ``compress_context`` calls
against a shared ``state.db`` and asserts that the per-session compression
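lock added in this PR prevents the orphan child. Without the lock the
fixture deterministically produces 2 children; with the lock, at most 1
(normally exactly 1, or 0 when the winner safely rolls back to the parent
under heavy DB write contention).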
"""
from __future__ import annotations
import os
import threading
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from hermes_state import SessionDB
def _build_agent_with_db(db: SessionDB, session_id: str):
    """Return an ``AIAgent`` bound to ``db`` and pinned to ``session_id``.

    The agent's ``context_compressor`` is swapped for a ``MagicMock`` whose
    ``compress`` sleeps 0.25 s and then returns a fixed two-message result,
    and ``compression_in_place`` is forced to ``False``.
    """
    # A fake API key is present in the environment for the import and the
    # constructor call.
    with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
        from run_agent import AIAgent

        agent = AIAgent(
            api_key="test-key",
            base_url="https://openrouter.ai/api/v1",
            model="test/model",
            quiet_mode=True,
            session_db=db,
            session_id=session_id,
            skip_context_files=True,
            skip_memory=True,
        )

    def _delayed_summary(*_args, **_kwargs):
        # The pause keeps two threads' rotations overlapping; without it the
        # OS could happen to serialize them and hide the bug.
        time.sleep(0.25)
        return [
            {"role": "user", "content": text}
            for text in ("[CONTEXT COMPACTION] summary", "tail")
        ]

    # Deterministic stand-in for the compressor: no LLM call is made.
    stub = MagicMock()
    stub.compress.side_effect = _delayed_summary
    stub.configure_mock(
        compression_count=1,
        last_prompt_tokens=0,
        last_completion_tokens=0,
        _last_summary_error=None,
        _last_compress_aborted=False,
        _last_aux_model_failure_model=None,
        _last_aux_model_failure_error=None,
    )
    agent.context_compressor = stub

    # These tests target the ROTATION fallback path (forking, child sessions,
    # lock contention), so in-place compression is pinned off regardless of
    # the global default (which flipped to True in #38763).
    agent.compression_in_place = False
    return agent
def _count_children(db: SessionDB, parent_sid: str) -> int:
    """Return how many ``sessions`` rows have ``parent_session_id == parent_sid``."""
    query = "SELECT id FROM sessions WHERE parent_session_id = ?"
    child_rows = db._conn.execute(query, (parent_sid,)).fetchall()
    return len(child_rows)
def test_concurrent_compression_does_not_fork_session(tmp_path: Path) -> None:
    """Two AIAgents that share a session_id MUST NOT both rotate it.

    Without the per-session compression lock this fixture deterministically
    produces 2 child sessions (transcript fork). With the lock at most one
    path rotates: normally exactly 1 canonical child, or — under heavy DB
    write contention that makes the winner's child create_session exhaust its
    retries — 0, because _compress_context safely rolls back to the parent
    instead of orphaning a child. The forbidden outcome is 2+ (the fork).
    """
    db = SessionDB(db_path=tmp_path / "state.db")
    parent_sid = "PARENT_TEST_SESSION"
    db.create_session(parent_sid, source="discord")

    # Two agents on the same session_id, both wired to the same db —
    # mirrors the parent-turn agent + the background-review fork right
    # after a turn ends.
    agent_a = _build_agent_with_db(db, parent_sid)
    agent_b = _build_agent_with_db(db, parent_sid)
    messages = [{"role": "user", "content": f"m{i}"} for i in range(20)]

    # An exception raised inside a worker thread never reaches the test on
    # its own, so each worker records its failure here and the main thread
    # asserts on it after joining.
    errors: list[BaseException] = []

    def run(agent):
        try:
            agent._compress_context(messages, "sys", approx_tokens=120_000)
        except Exception as exc:
            errors.append(exc)

    t_a = threading.Thread(target=run, args=(agent_a,), name="main_turn")
    t_b = threading.Thread(target=run, args=(agent_b,), name="review_fork")
    workers = (t_a, t_b)
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join(timeout=10)

    # join(timeout=...) returns silently on timeout; without this check the
    # assertions below would inspect a half-finished rotation.
    stuck = [worker.name for worker in workers if worker.is_alive()]
    assert not stuck, f"Compression thread(s) still running after 10s: {stuck}"
    # Neither path is expected to raise — should not happen.
    assert not errors, f"_compress_context raised in a worker thread: {errors!r}"

    # The invariant Damien's incident is about: the parent must NEVER end up
    # with two (or more) children — that is the transcript fork. The lock
    # guarantees only one path rotates.
    #
    # Zero children is also a valid, non-forking outcome: under heavy DB write
    # contention the winner's child ``create_session`` can exhaust its retry
    # budget, and ``_compress_context`` deliberately rolls the live id back to
    # the (still-indexed) parent rather than stranding an orphan child — see
    # the create-failure rollback in agent/conversation_compression.py. That
    # safe rollback leaves 0 children and is correct. So the contract is
    # ``children <= 1``; only ``>= 2`` is the bug. Asserting an exact ``== 1``
    # made this test flaky under the concurrent CI load that triggers the
    # contention rollback (#54465 churn surfaced it).
    n_children = _count_children(db, parent_sid)
    assert n_children <= 1, (
        f"Compression lock failed: parent session has {n_children} children in "
        "state.db (transcript fork). This is Damien's incident shape — see the "
        "test docstring. Two or more children means the lock did not serialize "
        "the concurrent rotations."
    )

    # The number of agents that rotated their session_id must match the number
    # of children created — and must never exceed one. (Both rotating would be
    # the fork; the winner rolling back to parent under contention yields zero,
    # which agrees with zero children.)
    rotated = sum(
        1 for a in (agent_a, agent_b) if a.session_id != parent_sid
    )
    assert rotated <= 1, (
        f"Expected at most one agent to rotate session_id, got {rotated}. "
        "More than one rotating means the lock didn't serialize them."
    )
    assert rotated == n_children, (
        f"Inconsistent state: {rotated} agent(s) rotated but {n_children} "
        "child session(s) exist — rotation and child creation diverged."
    )

    # The lock must be released after both paths finished, regardless of
    # whether the winner committed a child or rolled back.
    assert db.get_compression_lock_holder(parent_sid) is None, (
        "Compression lock leaked: still held after both paths completed."
    )
def test_skipped_compression_returns_messages_unchanged(tmp_path: Path) -> None:
    """A compression call that loses the lock race hands back its input as-is.

    Callers (preflight compression in ``conversation_loop.py``) recognise the
    no-op through ``len(returned) == len(input)`` and stop the auto-compress
    retry loop. Returning the compressed view from the skipped path would
    defeat that check and let the caller mutate the conversation without a
    state.db rotation.
    """
    session = "LOSER_TEST"
    state = SessionDB(db_path=tmp_path / "state.db")
    state.create_session(session, source="discord")

    # Another holder owns the lock before the agent ever tries to compress.
    assert state.try_acquire_compression_lock(session, "external_holder") is True

    agent = _build_agent_with_db(state, session)
    messages = [{"role": "user", "content": text} for text in ("m1", "m2")]
    compressed, _unused = agent._compress_context(
        messages, "sys", approx_tokens=120_000
    )

    # Skipped: same messages come back and the session id is untouched.
    assert compressed is messages or compressed == messages
    assert agent.session_id == session
    # The skip happens before .compress() is ever reached.
    agent.context_compressor.compress.assert_not_called()
def test_lock_refresh_keeps_owner_live_past_initial_ttl(tmp_path: Path, monkeypatch) -> None:
    """The owning compression call must keep its lease alive while it runs.

    Every lock acquisition is forced to a 1 s TTL while the stubbed
    ``compress()`` takes 2 s. A probe made 1.2 s after the lock is observed
    held (i.e. past the initial TTL) must still fail to acquire it. Once the
    owner finishes, exactly one child exists and the lock is released.
    """
    real_try_acquire = SessionDB.try_acquire_compression_lock

    def _short_ttl(self, session_id: str, holder: str, ttl_seconds: float = 300.0) -> bool:
        # Ignore the requested TTL so the lease expires quickly unless refreshed.
        return real_try_acquire(self, session_id, holder, ttl_seconds=1.0)

    monkeypatch.setattr(SessionDB, "try_acquire_compression_lock", _short_ttl)

    db = SessionDB(db_path=tmp_path / "state.db")
    parent_sid = "REFRESH_TEST"
    db.create_session(parent_sid, source="discord")

    agent_a = _build_agent_with_db(db, parent_sid)
    agent_a._compression_lock_ttl_seconds = 1.0
    agent_a._compression_lock_refresh_interval = 0.25

    def _slow_compress(*_a, **_kw):
        # Twice the TTL: the lease only survives if it is refreshed.
        time.sleep(2.0)
        return [
            {"role": "user", "content": "[CONTEXT COMPACTION] summary"},
            {"role": "user", "content": "tail"},
        ]

    agent_a.context_compressor.compress.side_effect = _slow_compress
    messages = [{"role": "user", "content": f"m{i}"} for i in range(20)]

    def run(agent):
        agent._compress_context(messages, "sys", approx_tokens=120_000)

    t_a = threading.Thread(target=run, args=(agent_a,), name="refresh_owner")
    t_a.start()
    try:
        # Monotonic clock: the polling deadline must not move if the system
        # wall clock is adjusted while the test runs.
        deadline = time.monotonic() + 2.0
        while (
            db.get_compression_lock_holder(parent_sid) is None
            and time.monotonic() < deadline
        ):
            time.sleep(0.05)
        assert db.get_compression_lock_holder(parent_sid) is not None

        # Wait past the 1 s initial TTL, then try to steal the lock.
        time.sleep(1.2)
        assert db.try_acquire_compression_lock(
            parent_sid, "refresh_probe", ttl_seconds=1.0
        ) is False, "live owner lease expired and was reclaimable before compression finished"
    finally:
        # Always wait for the owner so it is not still writing to the db
        # when a failed assertion unwinds the test.
        t_a.join(timeout=10)

    assert not t_a.is_alive()
    assert _count_children(db, parent_sid) == 1
    assert db.get_compression_lock_holder(parent_sid) is None
def test_post_compress_exception_stops_lock_refresher(tmp_path: Path, monkeypatch) -> None:
    """An exception from the warning path after compress() must not leak the lock.

    ``_emit_warning`` is replaced with a function that raises, and the
    compressor reports a summary error. The ``RuntimeError`` must propagate,
    and 1.3 s later (past the forced 1 s TTL) a fresh holder must be able to
    acquire the lock.
    """
    original_acquire = SessionDB.try_acquire_compression_lock

    def _acquire_with_short_lease(
        self, session_id: str, holder: str, ttl_seconds: float = 300.0
    ) -> bool:
        # Whatever TTL the caller asks for, grant one second.
        return original_acquire(self, session_id, holder, ttl_seconds=1.0)

    monkeypatch.setattr(
        SessionDB, "try_acquire_compression_lock", _acquire_with_short_lease
    )

    session = "REFRESH_EXCEPTION_TEST"
    state = SessionDB(db_path=tmp_path / "state.db")
    state.create_session(session, source="discord")

    def _exploding_warning(*_args, **_kwargs):
        raise RuntimeError("warn boom")

    agent = _build_agent_with_db(state, session)
    agent._compression_lock_ttl_seconds = 1.0
    agent._compression_lock_refresh_interval = 0.1
    agent.context_compressor._last_summary_error = "summary failed"
    agent._emit_warning = _exploding_warning

    history = [{"role": "user", "content": f"m{i}"} for i in range(20)]
    with pytest.raises(RuntimeError, match="warn boom"):
        agent._compress_context(history, "sys", approx_tokens=120_000)

    # Outlast the 1 s lease; a refresher that kept running would keep it held.
    time.sleep(1.3)
    reacquired = state.try_acquire_compression_lock(session, "probe", ttl_seconds=1.0)
    assert reacquired is True
def test_abort_warning_exception_stops_lock_refresher(tmp_path: Path, monkeypatch) -> None:
    """An exception from the abort-path warning must not leak the refreshed lock.

    The stubbed ``compress()`` marks the compressor as aborted with a summary
    error, and ``_emit_warning`` raises. The ``RuntimeError`` must propagate,
    and 1.3 s later (past the forced 1 s TTL) a fresh holder must be able to
    acquire the lock.
    """
    original_acquire = SessionDB.try_acquire_compression_lock

    def _acquire_with_short_lease(
        self, session_id: str, holder: str, ttl_seconds: float = 300.0
    ) -> bool:
        # Whatever TTL the caller asks for, grant one second.
        return original_acquire(self, session_id, holder, ttl_seconds=1.0)

    monkeypatch.setattr(
        SessionDB, "try_acquire_compression_lock", _acquire_with_short_lease
    )

    session = "REFRESH_ABORT_TEST"
    state = SessionDB(db_path=tmp_path / "state.db")
    state.create_session(session, source="discord")

    agent = _build_agent_with_db(state, session)
    agent._compression_lock_ttl_seconds = 1.0
    agent._compression_lock_refresh_interval = 0.1

    def _compress_then_flag_abort(*_args, **_kwargs):
        # Look the compressor up at call time, then flag the abort on it.
        compressor = agent.context_compressor
        compressor._last_compress_aborted = True
        compressor._last_summary_error = "summary failed"
        return [{"role": "user", "content": "tail"}]

    def _exploding_warning(*_args, **_kwargs):
        raise RuntimeError("abort boom")

    agent.context_compressor.compress.side_effect = _compress_then_flag_abort
    agent._emit_warning = _exploding_warning

    history = [{"role": "user", "content": f"m{i}"} for i in range(20)]
    with pytest.raises(RuntimeError, match="abort boom"):
        agent._compress_context(history, "sys", approx_tokens=120_000)

    # Outlast the 1 s lease; a refresher that kept running would keep it held.
    time.sleep(1.3)
    reacquired = state.try_acquire_compression_lock(session, "probe", ttl_seconds=1.0)
    assert reacquired is True
def test_typeerror_fallback_exception_stops_lock_refresher(tmp_path: Path, monkeypatch) -> None:
    """A failure in the strict-signature fallback must not leak the refreshed lock.

    The stubbed ``compress()`` raises ``TypeError`` when called with a
    ``focus_topic`` or ``force`` keyword and ``RuntimeError`` otherwise. The
    test expects the ``RuntimeError`` to surface (presumably from a retry
    without those keywords), and 1.3 s later a fresh holder must be able to
    acquire the lock.
    """
    original_acquire = SessionDB.try_acquire_compression_lock

    def _acquire_with_short_lease(
        self, session_id: str, holder: str, ttl_seconds: float = 300.0
    ) -> bool:
        # Whatever TTL the caller asks for, grant one second.
        return original_acquire(self, session_id, holder, ttl_seconds=1.0)

    monkeypatch.setattr(
        SessionDB, "try_acquire_compression_lock", _acquire_with_short_lease
    )

    session = "REFRESH_TYPEERROR_TEST"
    state = SessionDB(db_path=tmp_path / "state.db")
    state.create_session(session, source="discord")

    agent = _build_agent_with_db(state, session)
    agent._compression_lock_ttl_seconds = 1.0
    agent._compression_lock_refresh_interval = 0.1

    def _legacy_only_compress(*_args, **kwargs):
        if any(key in kwargs for key in ("focus_topic", "force")):
            raise TypeError("strict signature")
        raise RuntimeError("fallback boom")

    agent.context_compressor.compress.side_effect = _legacy_only_compress

    history = [{"role": "user", "content": f"m{i}"} for i in range(20)]
    with pytest.raises(RuntimeError, match="fallback boom"):
        agent._compress_context(history, "sys", approx_tokens=120_000)

    # Outlast the 1 s lease; a refresher that kept running would keep it held.
    time.sleep(1.3)
    reacquired = state.try_acquire_compression_lock(session, "probe", ttl_seconds=1.0)
    assert reacquired is True
class _NoLockSubsystemDB:
    """Wraps a real SessionDB but simulates a pre-#34351 version skew.

    A long-lived process can hold ``hermes_state.SessionDB`` bound to the
    OLD class in memory (no compression-lock methods) while a lazily
    re-imported ``conversation_compression.py`` calls the NEW lock code.
    ``try_acquire_compression_lock`` then raises ``AttributeError`` — which
    is NOT a ``sqlite3.Error``, so the method's own fail-open guard never
    runs. Before the fix the exception propagated to the outer agent loop,
    which printed the error and retried; compression never succeeded, the
    token count never dropped, and the loop re-triggered compaction forever.

    The three lock methods defined here raise ``AttributeError``; every other
    attribute is forwarded to the wrapped db.
    """

    def __init__(self, real_db: SessionDB) -> None:
        self._real = real_db

    def try_acquire_compression_lock(self, *_a, **_k):  # noqa: D401
        raise AttributeError(
            "'SessionDB' object has no attribute 'try_acquire_compression_lock'"
        )

    def get_compression_lock_holder(self, *_a, **_k):
        raise AttributeError("'SessionDB' object has no attribute 'get_compression_lock_holder'")

    def release_compression_lock(self, *_a, **_k):
        raise AttributeError("'SessionDB' object has no attribute 'release_compression_lock'")

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If ``_real`` itself
        # is missing (instance created without __init__, e.g. via copy or
        # pickle), reading ``self._real`` below would re-enter this method
        # without end, so fail with a plain AttributeError instead.
        if name == "_real":
            raise AttributeError(name)
        # Everything else (create_session, append, rotation helpers) goes to
        # the real db so the post-lock compression + rotation path runs.
        return getattr(self._real, name)
def test_missing_lock_subsystem_fails_open_not_infinite_loop(tmp_path: Path, monkeypatch) -> None:
    """With no lock methods available, compression must fail OPEN and proceed.

    Reproduces the "API call #47/#48/#49 ... has no attribute
    try_acquire_compression_lock" infinite-compaction spin: when the lock
    subsystem is absent, ``_compress_context`` has to skip locking and run
    the compression (so the loop makes progress and terminates) rather than
    let the ``AttributeError`` reach the retry loop.
    """
    session = "SKEW_TEST_SESSION"
    state = SessionDB(db_path=tmp_path / "state.db")
    state.create_session(session, source="discord")

    agent = _build_agent_with_db(state, session)
    # The agent was built against the normal db; only now is the lock-less
    # wrapper swapped in, so just the lock methods are broken.
    agent._session_db = _NoLockSubsystemDB(state)

    def _refresher_must_not_start(*_args, **_kwargs):
        raise AssertionError("lock refresher should not start on fail-open lock skew")

    monkeypatch.setattr(
        "agent.conversation_compression._CompressionLockLeaseRefresher",
        _refresher_must_not_start,
    )

    history = [{"role": "user", "content": f"m{i}"} for i in range(20)]
    # MUST NOT raise AttributeError. Before the fix this raised and the outer
    # loop would retry forever.
    compressed, _unused = agent._compress_context(
        history, "sys", approx_tokens=120_000
    )

    # Compression got past the broken lock and shrank the transcript, so the
    # auto-compress loop would terminate.
    agent.context_compressor.compress.assert_called_once()
    assert len(compressed) < len(history), (
        "Compression made no progress despite failing open — loop would still spin."
    )
    # The session id rotated, i.e. compression completed end-to-end.
    assert agent.session_id != session
def test_review_fork_disables_compression_to_prevent_stale_parent_fork(tmp_path: Path) -> None:
    """The background-review fork must set ``compression_enabled = False``
    so it can never compress the parent it shares a session_id with
    (issue #38727).

    The per-session compression lock only serialises a SAME-WINDOW concurrent
    race. It does NOT stop a stale parent from being compressed again in a
    LATER turn: if ``review_agent`` had won the race, its new child session is
    never adopted by the gateway (the fork is single-lifecycle and dies right
    after one ``run_conversation``), so the foreground path would start the
    next turn from the stale parent and compress it AGAIN — leaving the same
    parent with two sibling children.

    The fix makes the review fork never trigger compression at all. Both
    compression trigger sites in ``agent/conversation_loop.py`` gate on
    ``agent.compression_enabled`` BEFORE calling ``_compress_context``:

      • preflight (``if agent.compression_enabled and len(messages) > ...``)
      • mid-loop  (``if agent.compression_enabled and _compressor.should_compress(...)``)

    so a fork with the flag cleared never reaches the rotation path.

    This test pins the contract at the source: ``_run_review_in_thread``
    must set ``review_agent.compression_enabled = False`` on the fork it
    builds. It calls the real worker synchronously with
    ``AIAgent.run_conversation`` patched (so no LLM call happens) and
    captures the constructed review agent to assert the flag.
    """
    import agent.background_review as br

    captured = {}

    def _fake_run_conversation(self, *_a, **_k):
        # Record what the review agent looks like at the moment it would run.
        captured["compression_enabled"] = self.compression_enabled
        captured["session_id"] = self.session_id
        return {"final_response": "", "messages": []}

    parent_sid = "REVIEW_FORK_FLAG_TEST"
    db = SessionDB(db_path=tmp_path / "state.db")
    # try/finally so the db is closed even when an assertion below fails.
    try:
        db.create_session(parent_sid, source="discord")
        parent = _build_agent_with_db(db, parent_sid)

        # The worker does a local ``from run_agent import AIAgent``; patching
        # the class method covers that import path.
        from run_agent import AIAgent

        with patch.object(AIAgent, "run_conversation", _fake_run_conversation):
            br._run_review_in_thread(
                parent,
                [{"role": "user", "content": "hi"}],
                "review this conversation",
            )

        assert captured, (
            "_run_review_in_thread never reached run_conversation — the spawn path "
            "changed; update this test to capture the review AIAgent."
        )
        assert captured["session_id"] == parent_sid, (
            "Review fork should inherit the parent's session_id (shared id is the "
            "whole reason compression must be disabled)."
        )
        assert captured["compression_enabled"] is False, (
            "FIX REGRESSION: background-review fork did NOT disable compression. "
            "It shares the parent's session_id, so an enabled fork can rotate the "
            "parent into an orphan child (issue #38727). The trigger gates in "
            "conversation_loop.py only short-circuit when compression_enabled is "
            "False — this flag MUST be cleared on the review fork."
        )
    finally:
        db.close()
# ── Lease-refresher bounded-failure tolerance (salvage follow-up, #54465) ────
# A single falsy refresh (transient DB blip) must NOT permanently kill the
# lease — only a *persistent* failure (genuine lost-ownership) should stop the
# refresher after a bounded number of consecutive failures. Without this, one
# escaped lock-contention error silently reintroduces the TTL-expiry wedge the
# PR set out to fix.
class _FlakyRefreshDB:
    """Fake db whose ``refresh_compression_lock`` replays a scripted sequence.

    Each call consumes the next scripted result; once the script is used up
    every further call returns ``True``. ``calls`` counts all invocations.
    """

    def __init__(self, results):
        # Copy so consuming the script never mutates the caller's sequence.
        self._results = list(results)
        self.calls = 0

    def refresh_compression_lock(self, session_id, holder, ttl_seconds=300.0):
        self.calls += 1
        if not self._results:
            # Script exhausted: steady-state success.
            return True
        return self._results.pop(0)
def _no_sleep(refresher, max_ticks: int = 10_000) -> None:
    """Make the refresher loop iterate without real wall-clock sleeps.

    ``_stop.wait(interval)`` returns False (keep looping) instantly instead of
    blocking for the (clamped) interval, so count-based tests stay fast and
    deterministic — the loop's termination is driven by the failure cap / the
    scripted db, not by timing.

    Args:
        refresher: Object whose ``_stop.wait`` attribute is replaced.
        max_ticks: Safety valve. After this many ``wait`` calls the stub
            returns True (stop), so a refresher that has regressed to never
            giving up fails the caller's assertion instead of busy-spinning
            forever and hanging the test run.
    """
    ticks = 0

    def _instant_wait(_interval):
        nonlocal ticks
        ticks += 1
        return ticks > max_ticks

    refresher._stop.wait = _instant_wait  # type: ignore[assignment]
def test_lease_refresher_survives_single_transient_failure() -> None:
    """A lone False between successes must not end the refresh loop.

    Regression for the W1/W2 finding: the original ``if not refreshed: break``
    treated a one-off failure the same as genuine lost-ownership and killed
    the lease on the first hiccup.
    """
    from agent.conversation_compression import _CompressionLockLeaseRefresher

    # Scripted results: success, FAILURE (blip), success; then steady success.
    scripted_db = _FlakyRefreshDB([True, False, True])
    lease = _CompressionLockLeaseRefresher(
        scripted_db,
        "sess",
        "holder",
        ttl_seconds=10.0,
        refresh_interval_seconds=0.001,
    )

    def _stop_after_four_refreshes(_interval):
        # No real sleep: signal "stop" once four refreshes have happened
        # (3 scripted + 1 steady success).
        return scripted_db.calls >= 4

    lease._stop.wait = _stop_after_four_refreshes  # type: ignore[assignment]
    lease._run()

    # Reaching four calls proves the loop carried on past the False at call 2.
    assert scripted_db.calls >= 4, (
        "Lease refresher stopped after a single transient failure — the "
        "bounded-tolerance fix regressed (one blip must not kill the lease)."
    )
def test_lease_refresher_failure_window_is_bounded_by_ttl() -> None:
    """Persistent failure ends the loop within one lease's worth of time.

    The contract is a relationship, not a magic count: the give-up window
    ``cap * refresh_interval`` must be <= the TTL, so a stuck refresher can
    never hold the lock past its TTL. That relationship is asserted directly
    instead of freezing a literal cap.
    """
    from agent.conversation_compression import _CompressionLockLeaseRefresher

    ttl = 10.0
    interval = 2.0  # expected cap: int(10 / 2) = 5
    never_recovers = _FlakyRefreshDB([False] * 50)  # lost ownership
    lease = _CompressionLockLeaseRefresher(
        never_recovers,
        "sess",
        "holder",
        ttl_seconds=ttl,
        refresh_interval_seconds=interval,
    )
    _no_sleep(lease)
    lease._run()

    cap = lease._max_consecutive_failures
    assert cap == int(ttl / interval), "cap must derive from ttl/interval"
    # The loop gave up exactly at the cap: not at the first failure, not never.
    assert never_recovers.calls == cap
    # What keeps the cap honest: total tolerance never exceeds one TTL.
    assert cap * interval <= ttl, (
        f"give-up window {cap * interval}s must not exceed the lease TTL {ttl}s"
    )
def test_lease_refresher_failure_cap_has_floor_of_one() -> None:
    """With interval >= ttl the failure cap is still 1, never 0."""
    from agent.conversation_compression import _CompressionLockLeaseRefresher

    always_failing = _FlakyRefreshDB([False] * 10)
    lease = _CompressionLockLeaseRefresher(
        always_failing,
        "sess",
        "holder",
        ttl_seconds=1.0,
        refresh_interval_seconds=5.0,
    )
    _no_sleep(lease)
    lease._run()

    # Degenerate ratio (1.0 / 5.0) must clamp the cap to one, and the loop
    # must have stopped after that single failed refresh.
    assert lease._max_consecutive_failures == 1
    assert always_failing.calls == 1
def test_lease_refresher_recovers_after_raise() -> None:
    """A refresh that raises once and then succeeds must not end the loop.

    The exception arm has to get the same blip tolerance as a falsy return,
    not merely a "doesn't crash" guarantee.
    """
    from agent.conversation_compression import _CompressionLockLeaseRefresher

    class _RaiseThenOKDB:
        """Fails on the first refresh only — the transient-blip analog."""

        def __init__(self):
            self.calls = 0

        def refresh_compression_lock(self, *a, **k):
            self.calls += 1
            if self.calls > 1:
                return True
            raise RuntimeError("simulated DB hiccup")

    hiccup_db = _RaiseThenOKDB()
    lease = _CompressionLockLeaseRefresher(
        hiccup_db,
        "sess",
        "holder",
        ttl_seconds=10.0,
        refresh_interval_seconds=2.0,
    )

    def _stop_after_four_refreshes(_interval):
        # Let a handful of ticks run past the raise, then signal stop.
        return hiccup_db.calls >= 4

    lease._stop.wait = _stop_after_four_refreshes  # type: ignore[assignment]
    lease._run()  # the RuntimeError must not propagate out of the loop

    # The loop outlived the raise and kept refreshing.
    assert hiccup_db.calls >= 4
def test_lease_refresher_stops_on_persistent_raise() -> None:
    """A refresh that raises on every tick stops at the refresher's failure cap.

    The exception must never propagate out of ``_run`` and the loop must not
    run forever.
    """
    from agent.conversation_compression import _CompressionLockLeaseRefresher

    class _AlwaysRaiseDB:
        """Counts refresh attempts and fails every one of them."""

        def __init__(self):
            self.calls = 0

        def refresh_compression_lock(self, *a, **k):
            self.calls += 1
            raise RuntimeError("simulated DB hiccup")

    broken_db = _AlwaysRaiseDB()
    lease = _CompressionLockLeaseRefresher(
        broken_db,
        "sess",
        "holder",
        ttl_seconds=10.0,
        refresh_interval_seconds=2.0,
    )
    _no_sleep(lease)
    lease._run()  # must not propagate

    # One attempt per tolerated failure, then the loop gave up.
    assert broken_db.calls == lease._max_consecutive_failures