mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
test: de-flake SIGKILL-tree, compression-tip resume, and fallback-cooldown tests
Three CI flakes hit while landing the credential-pool restore fix; all three were timing/wall-clock races in the tests, not product bugs (each passes locally and the assertions are correct): - test_entire_tree_is_sigkilled_not_just_parent: _terminate_host_pid SIGKILLs synchronously, but the test's 4s budget after a 1s in-function SIGTERM grace left almost no slack for the kernel to tear down 3 processes + reparent the children to zombies under loaded-CI scheduling. Widen the wait to 15s and make the liveness predicate tolerant of vanished-pid / zombie races. The assertion never weakens: every tree member must end up dead or zombie. - test_session_resume_follows_compression_tip: appended messages got time.time() timestamps (~now) while the test forced session started_at into the past, so the get_compression_tip MAX(m.timestamp) tiebreaker depended on wall-clock ordering. Pass explicit, well-separated message timestamps so the chain resolution is deterministic by construction. - test_non_retryable_exhaustion_arms_cooldown: asserted the short (5s) exhaustion cooldown with a tight +1.0s slack, which false-fails when wall-clock jitter between the 'before' snapshot and the cooldown computation exceeds a second on a loaded runner. Widen to +30s — still cleanly below the 60s rate-limit window it must distinguish from.
This commit is contained in:
parent
f0de4c6a47
commit
88b3d8638e
3 changed files with 38 additions and 10 deletions
|
|
@ -70,7 +70,12 @@ class TestExhaustionArmsCooldown:
|
|||
cooldown = getattr(agent, "_rate_limited_until", 0)
|
||||
assert cooldown > before
|
||||
# Cooldown is the short exhaustion window, not the 60s rate-limit one.
|
||||
assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 1.0
|
||||
# Use a generous upper slack: the only thing this must distinguish is
|
||||
# the ~5s short window from the 60s rate-limit window, so any margin
|
||||
# well below 60s proves it. A tight +1.0s false-fails on a loaded CI
|
||||
# runner when wall-clock jitter between `before` and the cooldown
|
||||
# computation exceeds a second (GC pause, swap, scheduler contention).
|
||||
assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 30.0
|
||||
|
||||
def test_no_chain_does_not_arm_cooldown(self):
|
||||
"""An empty chain (no fallback configured) must not arm a cooldown —
|
||||
|
|
|
|||
|
|
@ -982,10 +982,16 @@ def test_session_resume_follows_compression_tip(monkeypatch, tmp_path):
|
|||
db = SessionDB(db_path=tmp_path / "state.db")
|
||||
base = int(time.time()) - 10_000
|
||||
db.create_session("parent_root", source="tui")
|
||||
db.append_message("parent_root", role="user", content="pre-compression turn")
|
||||
db.append_message(
|
||||
"parent_root", role="user", content="pre-compression turn",
|
||||
timestamp=base + 10,
|
||||
)
|
||||
db.end_session("parent_root", "compression")
|
||||
db.create_session("cont_tip", source="tui", parent_session_id="parent_root")
|
||||
db.append_message("cont_tip", role="assistant", content="post-compression reply")
|
||||
db.append_message(
|
||||
"cont_tip", role="assistant", content="post-compression reply",
|
||||
timestamp=base + 110,
|
||||
)
|
||||
conn = db._conn
|
||||
assert conn is not None
|
||||
conn.execute(
|
||||
|
|
|
|||
|
|
@ -1568,14 +1568,31 @@ class TestSigkillEscalation:
|
|||
try:
|
||||
ProcessRegistry._terminate_host_pid(parent.pid)
|
||||
|
||||
def _all_dead():
|
||||
return not any(
|
||||
psutil.pid_exists(p)
|
||||
and ProcessRegistry._proc_alive(psutil.Process(p))
|
||||
for p in all_pids
|
||||
)
|
||||
def _pid_dead(p: int) -> bool:
|
||||
# A pid is "dead" for our purposes if it no longer exists OR
|
||||
# exists only as an unreaped zombie (already terminated, just
|
||||
# not reaped by its reparented parent yet). psutil can also
|
||||
# raise mid-probe if the pid vanishes between the existence
|
||||
# check and the status read — treat any such race as dead.
|
||||
try:
|
||||
if not psutil.pid_exists(p):
|
||||
return True
|
||||
return not ProcessRegistry._proc_alive(psutil.Process(p))
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
assert _wait_until(_all_dead, timeout=4.0), (
|
||||
def _all_dead():
|
||||
return all(_pid_dead(p) for p in all_pids)
|
||||
|
||||
# _terminate_host_pid SIGKILLs synchronously before returning, so
|
||||
# the kill signals are already delivered here. The only remaining
|
||||
# wait is the kernel tearing down 3 processes and the reparented
|
||||
# children transitioning to zombie — which can lag on a loaded CI
|
||||
# runner. Give a generous budget (matches the wait() test's 10s)
|
||||
# so this asserts the escalation BEHAVIOR, not the runner's
|
||||
# scheduling latency. The assertion itself never weakens: every
|
||||
# tree member must end up dead/zombie.
|
||||
assert _wait_until(_all_dead, timeout=15.0, interval=0.02), (
|
||||
"entire SIGTERM-ignoring tree (parent + children) must be SIGKILLed"
|
||||
)
|
||||
finally:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue