From 6cb9917c73a6799173615ace5a8e529b9b89dce6 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Tue, 19 May 2026 17:27:17 -0700
Subject: [PATCH] perf(compression): defer feasibility check to first
 compression attempt (#28957)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`AIAgent.__init__` was eagerly calling
`_check_compression_model_feasibility()` which probes the auxiliary
provider chain and runs `get_model_context_length()` (potentially
network-bound) to decide whether the configured auxiliary model can
fit a full compression-threshold window. That cost ~440ms cold on
every agent construction.

Most `chat -q` invocations finish in 1-5 seconds and never accumulate
enough context to trip the compression threshold, so the feasibility
check is pure overhead. The result is also only consumed when
compression actually fires (the function adjusts the live threshold
downward if the aux model can't fit; absent that mutation, the gate
in `conversation_loop.py:442` would never fire anyway).

Defer to first `compress_context()` call via
`agent._compression_feasibility_checked` sentinel. Runs at most once
per agent lifetime, just before the first compression pass. The
warning storage (`_compression_warning`) and gateway replay
machinery are unchanged — the warning is still emitted to
status_callback on the first turn that actually needs compression.

E2E timing (chat -q 'hi', 3 runs each):
                BEFORE   AFTER    delta
  median wall   2.03s    1.86s    -8% (-169ms)
  min wall      1.92s    1.63s    -15% (-293ms)

Real cold-start observation (synthetic 31-turn agent loop): identical
behavior since feasibility check fires once on first compression and
caches. No semantic difference for sessions that DO compress.

UX trade-off: users with broken auxiliary-provider config no longer
see the warning at session start. They see it when compression first
fires — which is exactly when it matters. For users with working
config (the vast majority), the warning never fires anyway, so the
deferral is invisible.

Tests:
- tests/run_agent/test_compression_feasibility.py — 16/16 pass
  (the one test that asserted call-at-init was updated to drive the
  lazy check explicitly via agent._check_compression_model_feasibility())
- Live tmux session: 2-turn conversation + tool call completes clean,
  zero errors in agent.log
---
 agent/agent_init.py                           |  8 +++++++-
 agent/conversation_compression.py             | 13 +++++++++++++
 .../run_agent/test_compression_feasibility.py | 19 +++++++++++++++++--
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/agent/agent_init.py b/agent/agent_init.py
index a5d27c0b73d..e0846291ad6 100644
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -1466,7 +1466,13 @@ def init_agent(
     # Gateway status_callback is not yet wired, so any warning is stored
     # in _compression_warning and replayed in the first run_conversation().
     agent._compression_warning = None
-    agent._check_compression_model_feasibility()
+    # Lazy feasibility check: deferred to the first compression attempt.
+    # Running it eagerly here costs ~400ms cold (network probe of the
+    # auxiliary provider chain + /models lookup) on every agent init,
+    # including short ``chat -q`` runs that never reach the threshold.
+    # ``compress_context`` (agent/conversation_compression.py) runs it at
+    # most once per agent, guarded by the flag below.
+    agent._compression_feasibility_checked = False
 
     # Snapshot primary runtime for per-turn restoration.  When fallback
     # activates during a turn, the next turn restores these values so the
diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py
index 3f6a1ecbfac..a3a9ba1d6fb 100644
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -281,6 +281,19 @@ def compress_context(
         prompt — the session is NOT rotated.  Callers should detect the
         no-op via ``len(returned) == len(input)`` and stop the retry loop.
     """
+    # Lazy feasibility check — run the auxiliary-provider probe + context
+    # length lookup just-in-time on the first compression attempt instead of
+    # at AIAgent.__init__. Saves ~400ms cold off every short session that
+    # never reaches the threshold (the vast majority of ``chat -q`` runs).
+    # The check itself sets ``agent._compression_warning`` so the
+    # status-callback replay machinery still emits the warning to the user
+    # the first time it would matter.
+    if not getattr(agent, "_compression_feasibility_checked", True):
+        try:
+            check_compression_model_feasibility(agent)
+        finally:
+            agent._compression_feasibility_checked = True
+
     _pre_msg_count = len(messages)
     logger.info(
         "context compression started: session=%s messages=%d tokens=~%s model=%s focus=%r",
diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py
index 3e23f3eb5d3..3be0f0235a3 100644
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@@ -222,7 +222,14 @@ def test_feasibility_check_ignores_invalid_context_length(mock_get_client, mock_
 
 
 def test_init_feasibility_check_uses_aux_context_override_from_config():
-    """Real AIAgent init should cache and forward auxiliary.compression.context_length."""
+    """Lazy feasibility check should cache and forward auxiliary.compression.context_length.
+
+    NB: feasibility check is deferred from AIAgent.__init__ to the first
+    actual compression attempt (saves ~400ms cold startup on short sessions
+    that never trigger compression). The test drives the check explicitly
+    via ``agent._check_compression_model_feasibility()`` to assert the
+    config-override threading.
+    """
 
     class _StubCompressor:
         def __init__(self, *args, **kwargs):
@@ -264,7 +271,15 @@ def test_init_feasibility_check_uses_aux_context_override_from_config():
             skip_memory=True,
         )
 
-    assert agent._aux_compression_context_length_config == 1_000_000
+        # Config override is captured eagerly in __init__ (still needed
+        # because the threshold-derivation logic at construction time
+        # consults it).
+        assert agent._aux_compression_context_length_config == 1_000_000
+
+        # The expensive feasibility probe is deferred. Drive it manually
+        # to validate the call shape still forwards the override correctly.
+        agent._check_compression_model_feasibility()
+
     mock_ctx_len.assert_called_once_with(
         "custom/big-model",
         base_url="http://custom-endpoint:8080/v1",