From 88b3d8638e830c54b4acd7a7bc32689cf57ba67b Mon Sep 17 00:00:00 2001 From: Hermes Agent <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 19:51:19 -0700 Subject: [PATCH] test: de-flake SIGKILL-tree, compression-tip resume, and fallback-cooldown tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three CI flakes hit while landing the credential-pool restore fix; all three were timing/wall-clock races in the tests, not product bugs (each passes locally and the assertions are correct): - test_entire_tree_is_sigkilled_not_just_parent: _terminate_host_pid SIGKILLs synchronously, but the test's 4s budget after a 1s in-function SIGTERM grace left almost no slack for the kernel to tear down 3 processes + reparent the children to zombies under loaded-CI scheduling. Widen the wait to 15s and make the liveness predicate tolerant of vanished-pid / zombie races. The assertion never weakens: every tree member must end up dead or zombie. - test_session_resume_follows_compression_tip: appended messages got time.time() timestamps (~now) while the test forced session started_at into the past, so the get_compression_tip MAX(m.timestamp) tiebreaker depended on wall-clock ordering. Pass explicit, well-separated message timestamps so the chain resolution is deterministic by construction. - test_non_retryable_exhaustion_arms_cooldown: asserted the short (5s) exhaustion cooldown with a tight +1.0s slack, which false-fails when wall-clock jitter between the 'before' snapshot and the cooldown computation exceeds a second on a loaded runner. Widen to +30s — still cleanly below the 60s rate-limit window it must distinguish from. --- ...test_24996_fallback_exhaustion_cooldown.py | 7 ++++- tests/test_tui_gateway_server.py | 10 ++++-- tests/tools/test_process_registry.py | 31 ++++++++++++++----- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py index 8d82a2af679..76481154630 100644 --- a/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py +++ b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py @@ -70,7 +70,12 @@ class TestExhaustionArmsCooldown: cooldown = getattr(agent, "_rate_limited_until", 0) assert cooldown > before # Cooldown is the short exhaustion window, not the 60s rate-limit one. - assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 1.0 + # Use a generous upper slack: the only thing this must distinguish is + # the ~5s short window from the 60s rate-limit window, so any margin + # well below 60s proves it. A tight +1.0s false-fails on a loaded CI + # runner when wall-clock jitter between `before` and the cooldown + # computation exceeds a second (GC pause, swap, scheduler contention). + assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 30.0 def test_no_chain_does_not_arm_cooldown(self): """An empty chain (no fallback configured) must not arm a cooldown — diff --git a/tests/test_tui_gateway_server.py b/tests/test_tui_gateway_server.py index 97f584d0732..92a01b3f41d 100644 --- a/tests/test_tui_gateway_server.py +++ b/tests/test_tui_gateway_server.py @@ -982,10 +982,16 @@ def test_session_resume_follows_compression_tip(monkeypatch, tmp_path): db = SessionDB(db_path=tmp_path / "state.db") base = int(time.time()) - 10_000 db.create_session("parent_root", source="tui") - db.append_message("parent_root", role="user", content="pre-compression turn") + db.append_message( + "parent_root", role="user", content="pre-compression turn", + timestamp=base + 10, + ) db.end_session("parent_root", "compression") db.create_session("cont_tip", source="tui", parent_session_id="parent_root") - db.append_message("cont_tip", role="assistant", content="post-compression reply") + db.append_message( + "cont_tip", role="assistant", content="post-compression reply", + timestamp=base + 110, + ) conn = db._conn assert conn is not None conn.execute( diff --git a/tests/tools/test_process_registry.py b/tests/tools/test_process_registry.py index 020dcd11ae8..659ef2e21e8 100644 --- a/tests/tools/test_process_registry.py +++ b/tests/tools/test_process_registry.py @@ -1568,14 +1568,31 @@ class TestSigkillEscalation: try: ProcessRegistry._terminate_host_pid(parent.pid) - def _all_dead(): - return not any( - psutil.pid_exists(p) - and ProcessRegistry._proc_alive(psutil.Process(p)) - for p in all_pids - ) + def _pid_dead(p: int) -> bool: + # A pid is "dead" for our purposes if it no longer exists OR + # exists only as an unreaped zombie (already terminated, just + # not reaped by its reparented parent yet). psutil can also + # raise mid-probe if the pid vanishes between the existence + # check and the status read — treat any such race as dead. + try: + if not psutil.pid_exists(p): + return True + return not ProcessRegistry._proc_alive(psutil.Process(p)) + except Exception: + return True - assert _wait_until(_all_dead, timeout=4.0), ( + def _all_dead(): + return all(_pid_dead(p) for p in all_pids) + + # _terminate_host_pid SIGKILLs synchronously before returning, so + # the kill signals are already delivered here. The only remaining + # wait is the kernel tearing down 3 processes and the reparented + # children transitioning to zombie — which can lag on a loaded CI + # runner. Give a generous budget (matches the wait() test's 10s) + # so this asserts the escalation BEHAVIOR, not the runner's + # scheduling latency. The assertion itself never weakens: every + # tree member must end up dead/zombie. + assert _wait_until(_all_dead, timeout=15.0, interval=0.02), ( "entire SIGTERM-ignoring tree (parent + children) must be SIGKILLed" ) finally: