diff --git a/agent/agent_init.py b/agent/agent_init.py index a5d27c0b73d..e0846291ad6 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1466,7 +1466,13 @@ def init_agent( # Gateway status_callback is not yet wired, so any warning is stored # in _compression_warning and replayed in the first run_conversation(). agent._compression_warning = None - agent._check_compression_model_feasibility() + # Lazy feasibility check: deferred to the first compression attempt. + # Running it eagerly here costs ~400ms cold (network probe of the + # auxiliary provider chain + /models lookup) on every agent init, + # including short ``chat -q`` runs that never reach the threshold. + # ``compress_context`` runs the check when this flag is False and then + # sets it to True, so it runs at most once per agent. + agent._compression_feasibility_checked = False # Snapshot primary runtime for per-turn restoration. When fallback # activates during a turn, the next turn restores these values so the diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 3f6a1ecbfac..a3a9ba1d6fb 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -281,6 +281,19 @@ def compress_context( prompt — the session is NOT rotated. Callers should detect the no-op via ``len(returned) == len(input)`` and stop the retry loop. """ + # Lazy feasibility check — run the auxiliary-provider probe + context + # length lookup just-in-time on the first compression attempt instead of + # at AIAgent.__init__. Saves ~400ms cold off every short session that + # never reaches the threshold (the vast majority of ``chat -q`` runs). + # The check itself sets ``agent._compression_warning`` so the + # status-callback replay machinery still emits the warning to the user + # the first time it would matter.
+ if not getattr(agent, "_compression_feasibility_checked", True): + try: + check_compression_model_feasibility(agent) + finally: + agent._compression_feasibility_checked = True + _pre_msg_count = len(messages) logger.info( "context compression started: session=%s messages=%d tokens=~%s model=%s focus=%r", diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py index 3e23f3eb5d3..3be0f0235a3 100644 --- a/tests/run_agent/test_compression_feasibility.py +++ b/tests/run_agent/test_compression_feasibility.py @@ -222,7 +222,14 @@ def test_feasibility_check_ignores_invalid_context_length(mock_get_client, mock_ def test_init_feasibility_check_uses_aux_context_override_from_config(): - """Real AIAgent init should cache and forward auxiliary.compression.context_length.""" + """Lazy feasibility check should cache and forward auxiliary.compression.context_length. + + NB: feasibility check is deferred from AIAgent.__init__ to the first + actual compression attempt (saves ~400ms cold startup on short sessions + that never trigger compression). The test drives the check explicitly + via ``agent._check_compression_model_feasibility()`` to assert the + config-override threading. + """ class _StubCompressor: def __init__(self, *args, **kwargs): @@ -264,7 +271,15 @@ def test_init_feasibility_check_uses_aux_context_override_from_config(): skip_memory=True, ) - assert agent._aux_compression_context_length_config == 1_000_000 + # Config override is captured eagerly in __init__ (still needed + # because the threshold-derivation logic at construction time + # consults it). + assert agent._aux_compression_context_length_config == 1_000_000 + + # The expensive feasibility probe is deferred. Drive it manually + # to validate the call shape still forwards the override correctly. + agent._check_compression_model_feasibility() + mock_ctx_len.assert_called_once_with( "custom/big-model", base_url="http://custom-endpoint:8080/v1",