test(24996): freeze monotonic clock to de-flake fallback cooldown timing

The exhaustion-cooldown timing assertions relied on a wall-clock budget (before + window + 1.0s). On loaded CI runners the activation calls could exceed the 1s slack, flaking 'Run tests slice 4/8'. Freeze chat_completion_helpers.time.monotonic so the cooldown math is exact and load-independent across all four tests.
2026-07-01 12:02:05 +00:00 · 2026-06-27 19:46:14 -07:00 · 2026-06-27 19:46:14 -07:00 · e418605450
commit e418605450
parent 1ad8b44413
1 changed files with 38 additions and 27 deletions
--- a/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
+++ b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
@ -15,9 +15,7 @@ restore stays gated (and does NOT reset the index) until the cooldown clears.
 Rate-limit / billing failures keep their own 60s cooldown and are unaffected.
 """

-import time
 from unittest.mock import MagicMock, patch
-
 from run_agent import AIAgent
 from agent.error_classifier import FailoverReason
 from agent.chat_completion_helpers import _FALLBACK_EXHAUSTED_COOLDOWN_S
@ -51,31 +49,35 @@ def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
 class TestExhaustionArmsCooldown:
    def test_non_retryable_exhaustion_arms_cooldown(self):
        """Walking a non-empty chain to exhaustion on a non-rate-limit
-        failure arms a short ``_rate_limited_until`` cooldown."""
+        failure arms a short ``_rate_limited_until`` cooldown.
+
+        ``time.monotonic`` is frozen inside ``chat_completion_helpers`` so the
+        cooldown math is exact and independent of CI scheduling latency — the
+        previous wall-clock upper bound (``before + window + 1.0``) flaked on
+        loaded runners when the three activation calls took longer than 1s.
+        """
        fbs = [
            {"provider": "openai", "model": "gpt-4o"},
            {"provider": "zai", "model": "glm-4.7"},
        ]
        agent = _make_agent(fallback_model=fbs)
        agent._rate_limited_until = 0
-        before = time.monotonic()
-        with patch(
-            "agent.auxiliary_client.resolve_provider_client",
-            return_value=(_mock_client(), "resolved"),
+        frozen = 1_000.0
+        with (
+            patch("agent.chat_completion_helpers.time.monotonic", return_value=frozen),
+            patch(
+                "agent.auxiliary_client.resolve_provider_client",
+                return_value=(_mock_client(), "resolved"),
+            ),
        ):
            assert agent._try_activate_fallback() is True   # -> entry 0
            assert agent._try_activate_fallback() is True   # -> entry 1
            # Chain now exhausted; a non-rate-limit failure must arm cooldown.
            assert agent._try_activate_fallback() is False
-        cooldown = getattr(agent, "_rate_limited_until", 0)
-        assert cooldown > before
-        # Cooldown is the short exhaustion window, not the 60s rate-limit one.
-        # Use a generous upper slack: the only thing this must distinguish is
-        # the ~5s short window from the 60s rate-limit window, so any margin
-        # well below 60s proves it. A tight +1.0s false-fails on a loaded CI
-        # runner when wall-clock jitter between `before` and the cooldown
-        # computation exceeds a second (GC pause, swap, scheduler contention).
-        assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 30.0
+            cooldown = getattr(agent, "_rate_limited_until", 0)
+        # Cooldown is exactly the short exhaustion window past the frozen clock,
+        # not the 60s rate-limit one.
+        assert cooldown == frozen + _FALLBACK_EXHAUSTED_COOLDOWN_S

    def test_no_chain_does_not_arm_cooldown(self):
        """An empty chain (no fallback configured) must not arm a cooldown —
@ -92,30 +94,39 @@ class TestExhaustionArmsCooldown:
        fbs = [{"provider": "openai", "model": "gpt-4o"}]
        agent = _make_agent(fallback_model=fbs)
        agent._rate_limited_until = 0
-        before = time.monotonic()
-        with patch(
-            "agent.auxiliary_client.resolve_provider_client",
-            return_value=(_mock_client(), "resolved"),
+        frozen = 1_000.0
+        with (
+            patch("agent.chat_completion_helpers.time.monotonic", return_value=frozen),
+            patch(
+                "agent.auxiliary_client.resolve_provider_client",
+                return_value=(_mock_client(), "resolved"),
+            ),
        ):
            # First activation with rate_limit reason arms the 60s cooldown.
            assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is True
            # Chain exhausted on the next call (also rate_limit) -> still False,
            # and the 60s cooldown must survive (max(), not overwritten down).
            assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is False
-        cooldown = getattr(agent, "_rate_limited_until", 0)
-        assert cooldown > before + 50  # ~60s, far past the short window
+            cooldown = getattr(agent, "_rate_limited_until", 0)
+        # ~60s past the frozen clock, far past the short exhaustion window.
+        assert cooldown == frozen + 60

    def test_cooldown_never_shrinks_existing_window(self):
        """If a longer cooldown is already armed, exhaustion must not reduce
        it (we take the max)."""
        fbs = [{"provider": "openai", "model": "gpt-4o"}]
        agent = _make_agent(fallback_model=fbs)
-        far_future = time.monotonic() + 999
+        frozen = 1_000.0
+        far_future = frozen + 999
        agent._rate_limited_until = far_future
-        with patch(
-            "agent.auxiliary_client.resolve_provider_client",
-            return_value=(_mock_client(), "resolved"),
+        with (
+            patch("agent.chat_completion_helpers.time.monotonic", return_value=frozen),
+            patch(
+                "agent.auxiliary_client.resolve_provider_client",
+                return_value=(_mock_client(), "resolved"),
+            ),
        ):
            assert agent._try_activate_fallback() is True
            assert agent._try_activate_fallback() is False
-        assert getattr(agent, "_rate_limited_until", 0) >= far_future
+            cooldown = getattr(agent, "_rate_limited_until", 0)
+        assert cooldown == far_future