refactor(memory): remove flush_memories entirely (#15696)

The AIAgent.flush_memories pre-compression save, the gateway
_flush_memories_for_session, and everything feeding them are
obsolete now that the background memory/skill review handles
persistent memory extraction.

Problems with flush_memories:

- Pre-dates the background review loop.  It was the only memory-save
  path when introduced; the background review now fires every 10 user
  turns on CLI and gateway alike — far more frequently than flush was
  ever triggered by compression or session reset.
- Blocking and synchronous.  Pre-compression flush ran on the live agent
  before compression, blocking the user-visible response.
- Cache-breaking.  Flush built a temporary conversation prefix
  (system prompt + memory-only tool list) that diverged from the live
  conversation's cached prefix, invalidating prompt caching.  The
  gateway variant spawned a fresh AIAgent with its own clean prompt
  for each finalized session — still cache-breaking, just in a
  different process.
- Redundant.  Background review runs in the live conversation's
  session context, gets the same content, writes to the same memory
  store, and doesn't break the cache.  Everything flush_memories
  claimed to preserve is already covered.

What this removes:

- AIAgent.flush_memories() method (~248 LOC in run_agent.py)
- Pre-compression flush call in _compress_context
- flush_memories call sites in cli.py (/new + exit)
- GatewayRunner._flush_memories_for_session + _async_flush_memories
  (and the 3 call sites: session expiry watcher, /new, /resume)
- 'flush_memories' entry from DEFAULT_CONFIG auxiliary tasks,
  hermes tools UI task list, auxiliary_client docstrings
- _memory_flush_min_turns config + init
- #15631's headroom-deduction math in
  _check_compression_model_feasibility (headroom was only needed
  because flush dragged the full main-agent system prompt along;
  the compression summariser sends a single user-role prompt so
  new_threshold = aux_context is safe again)
- The dedicated test files and assertions that exercised
  flush-specific paths

What this renames (with read-time backcompat on sessions.json):

- SessionEntry.memory_flushed -> SessionEntry.expiry_finalized.
  The session-expiry watcher still uses the flag to avoid re-running
  finalize/eviction on the same expired session; the new name
  reflects what it now actually gates.  from_dict() reads
  'expiry_finalized' first, falls back to the legacy 'memory_flushed'
  key so existing sessions.json files upgrade seamlessly.

Supersedes #15631 and #15638.

Tested: 383 targeted tests pass across run_agent/, agent/, cli/,
and gateway/ session-boundary suites.  No behavior regressions —
background memory review continues to handle persistent memory
extraction on both CLI and gateway.
This commit is contained in:
Teknium 2026-04-25 08:21:14 -07:00 committed by GitHub
parent d635e2df3f
commit ea01bdcebe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 78 additions and 1567 deletions

View file

@ -41,8 +41,6 @@ def _make_agent(
agent.tool_progress_callback = None
agent._compression_warning = None
agent._aux_compression_context_length_config = None
# Tools feed into the headroom calculation in _check_compression_model_feasibility.
# Tests that want to assert specific threshold values can override this.
agent.tools = []
compressor = MagicMock(spec=ContextCompressor)
@ -85,9 +83,8 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
assert "threshold:" in messages[0]
# Warning stored for gateway replay
assert agent._compression_warning is not None
# Threshold on the live compressor was actually lowered, accounting for
# the request-overhead headroom (empty tools list → ~12K headroom only).
assert agent.context_compressor.threshold_tokens == 68_000
# Threshold on the live compressor was actually lowered to aux_context.
assert agent.context_compressor.threshold_tokens == 80_000
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@ -346,93 +343,7 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
assert len(messages) == 1
assert "small-model" in messages[0]
assert "Auto-lowered" in messages[0]
assert agent.context_compressor.threshold_tokens == 87_999
# ── Headroom for system prompt + tool schemas ────────────────────────
@patch("agent.model_metadata.get_model_context_length", return_value=128_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_auto_lowered_threshold_reserves_headroom_for_tools_and_system(mock_get_client, mock_ctx_len):
    """Auto-corrected threshold must sit strictly below the aux context.

    Auxiliary callers (compression summariser, flush_memories) prepend the
    system prompt and tool schemas to the message list, so a full-budget
    message window plus ~25K of system/tool overhead would overflow the
    aux model with HTTP 400.  Regression guard for the
    flush_memories-on-busy-toolset overflow path.
    """
    # 200K main context at 70% puts the threshold at 140K; the aux model
    # pins at 128K, which is below threshold and triggers auto-correct.
    agent = _make_agent(main_context=200_000, threshold_percent=0.70)

    def _fat_tool(idx):
        # One realistically heavy tool schema.
        return {
            "type": "function",
            "function": {
                "name": f"tool_{idx}",
                "description": "x" * 200,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "arg": {"type": "string", "description": "y" * 120},
                    },
                },
            },
        }

    agent.tools = [_fat_tool(i) for i in range(50)]

    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "model-with-128k")
    agent._emit_status = lambda msg: None

    agent._check_compression_model_feasibility()

    new_threshold = agent.context_compressor.threshold_tokens
    # Strictly reserved headroom: new_threshold < aux_context.
    assert new_threshold < 128_000, (
        f"threshold {new_threshold} did not reserve headroom below aux=128,000 "
        f"— system prompt + tools would overflow the aux model"
    )
    # And never below the 64K hard floor.
    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    assert new_threshold >= MINIMUM_CONTEXT_LENGTH
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_headroom_floors_at_minimum_context(mock_get_client, mock_ctx_len):
    """Clamp to the 64K floor rather than dropping below it.

    If the headroom subtraction would push the threshold under the 64K
    minimum, the max(..., MINIMUM_CONTEXT_LENGTH) clamp must keep the
    session running — the aux model is still workable for a smaller
    message window.
    """
    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH

    # Aux at 80K plus enough tool-schema bulk that a naive subtraction
    # would land under 64K.
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)

    def _bulky_tool(idx):
        # Fat description inflates the headroom estimate past 16K total.
        return {
            "type": "function",
            "function": {
                "name": f"tool_{idx}",
                "description": "z" * 2_000,
                "parameters": {},
            },
        }

    agent.tools = [_bulky_tool(i) for i in range(30)]

    stub_client = MagicMock()
    stub_client.base_url = "https://openrouter.ai/api/v1"
    stub_client.api_key = "sk-aux"
    mock_get_client.return_value = (stub_client, "small-aux-model")
    agent._emit_status = lambda msg: None

    agent._check_compression_model_feasibility()

    assert agent.context_compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH
assert agent.context_compressor.threshold_tokens == 99_999
# ── Two-phase: __init__ + run_conversation replay ───────────────────