Merge pull request #27248 from NousResearch/hermes/hermes-27dc9cc2

refactor(run_agent): extract AIAgent internals into agent/ modules (16k→3.8k lines, 76% reduction)
2026-07-14 14:12:44 +00:00 · 2026-05-16 23:52:16 -07:00 · 2026-05-16 23:52:16 -07:00 · 56ad30de17
commit 56ad30de17
parent 7322816efa 563b4d9e51
19 changed files with 14148 additions and 12694 deletions
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
--- a/agent/background_review.py
+++ b/agent/background_review.py
@ -0,0 +1,570 @@
+"""Background memory/skill review — fork the agent to evaluate the turn.
+
+After every turn, ``AIAgent.run_conversation`` may call
+:func:`spawn_background_review` to fire off a daemon thread that replays
+the conversation snapshot in a forked :class:`AIAgent` and asks itself
+"should any skill/memory be saved or updated?".  Writes go straight to
+the memory + skill stores.  Main conversation and prompt cache are never
+touched.
+
+The fork inherits the parent's live runtime (provider, model, base_url,
+credentials, cached system prompt) so it hits the same prefix cache and
+uses the same auth.  It runs with a tool whitelist limited to memory and
+skill management tools; everything else is denied at runtime.
+
+See the ``hermes-agent-dev`` skill (``references/self-improvement-loop.md``)
+for invariants and PR review criteria.
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+# Review-prompt strings — used by ``spawn_background_review_thread`` to build
+# the user-message that the forked review agent receives.  AIAgent exposes
+# them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat;
+# the actual text lives here so future edits are one-place.
+_MEMORY_REVIEW_PROMPT = (
+    "Review the conversation above and consider saving to memory if appropriate.\n\n"
+    "Focus on:\n"
+    "1. Has the user revealed things about themselves — their persona, desires, "
+    "preferences, or personal details worth remembering?\n"
+    "2. Has the user expressed expectations about how you should behave, their work "
+    "style, or ways they want you to operate?\n\n"
+    "If something stands out, save it using the memory tool. "
+    "If nothing is worth saving, just say 'Nothing to save.' and stop."
+)
+
+_SKILL_REVIEW_PROMPT = (
+    "Review the conversation above and update the skill library. Be "
+    "ACTIVE — most sessions produce at least one skill update, even if "
+    "small. A pass that does nothing is a missed learning opportunity, "
+    "not a neutral outcome.\n\n"
+    "Target shape of the library: CLASS-LEVEL skills, each with a rich "
+    "SKILL.md and a `references/` directory for session-specific detail. "
+    "Not a long flat list of narrow one-session-one-skill entries. This "
+    "shapes HOW you update, not WHETHER you update.\n\n"
+    "Signals to look for (any one of these warrants action):\n"
+    "  • User corrected your style, tone, format, legibility, or "
+    "verbosity. Frustration signals like 'stop doing X', 'this is too "
+    "verbose', 'don't format like this', 'why are you explaining', "
+    "'just give me the answer', 'you always do Y and I hate it', or an "
+    "explicit 'remember this' are FIRST-CLASS skill signals, not just "
+    "memory signals. Update the relevant skill(s) to embed the "
+    "preference so the next session starts already knowing.\n"
+    "  • User corrected your workflow, approach, or sequence of steps. "
+    "Encode the correction as a pitfall or explicit step in the skill "
+    "that governs that class of task.\n"
+    "  • Non-trivial technique, fix, workaround, debugging path, or "
+    "tool-usage pattern emerged that a future session would benefit "
+    "from. Capture it.\n"
+    "  • A skill that got loaded or consulted this session turned out "
+    "to be wrong, missing a step, or outdated. Patch it NOW.\n\n"
+    "Preference order — prefer the earliest action that fits, but do "
+    "pick one when a signal above fired:\n"
+    "  1. UPDATE A CURRENTLY-LOADED SKILL. Look back through the "
+    "conversation for skills the user loaded via /skill-name or you "
+    "read via skill_view. If any of them covers the territory of the "
+    "new learning, PATCH that one first. It is the skill that was in "
+    "play, so it's the right one to extend.\n"
+    "  2. UPDATE AN EXISTING UMBRELLA (via skills_list + skill_view). "
+    "If no loaded skill fits but an existing class-level skill does, "
+    "patch it. Add a subsection, a pitfall, or broaden a trigger.\n"
+    "  3. ADD A SUPPORT FILE under an existing umbrella. Skills can be "
+    "packaged with three kinds of support files — use the right "
+    "directory per kind:\n"
+    "     • `references/<topic>.md` — session-specific detail (error "
+    "transcripts, reproduction recipes, provider quirks) AND "
+    "condensed knowledge banks: quoted research, API docs, external "
+    "authoritative excerpts, or domain notes you found while working "
+    "on the problem. Write it concise and for the value of the task, "
+    "not as a full mirror of upstream docs.\n"
+    "     • `templates/<name>.<ext>` — starter files meant to be "
+    "copied and modified (boilerplate configs, scaffolding, a "
+    "known-good example the agent can `reproduce with modifications`).\n"
+    "     • `scripts/<name>.<ext>` — statically re-runnable actions "
+    "the skill can invoke directly (verification scripts, fixture "
+    "generators, deterministic probes, anything the agent should run "
+    "rather than hand-type each time).\n"
+    "     Add support files via skill_manage action=write_file with "
+    "file_path starting 'references/', 'templates/', or 'scripts/'. "
+    "The umbrella's SKILL.md should gain a one-line pointer to any "
+    "new support file so future agents know it exists.\n"
+    "  4. CREATE A NEW CLASS-LEVEL UMBRELLA SKILL when no existing "
+    "skill covers the class. The name MUST be at the class level. "
+    "The name MUST NOT be a specific PR number, error string, feature "
+    "codename, library-alone name, or 'fix-X / debug-Y / audit-Z-today' "
+    "session artifact. If the proposed name only makes sense for "
+    "today's task, it's wrong — fall back to (1), (2), or (3).\n\n"
+    "User-preference embedding (important): when the user expressed a "
+    "style/format/workflow preference, the update belongs in the "
+    "SKILL.md body, not just in memory. Memory captures 'who the user "
+    "is and what the current situation and state of your operations "
+    "are'; skills capture 'how to do this class of task for this "
+    "user'. When they complain about how you handled a task, the "
+    "skill that governs that task needs to carry the lesson.\n\n"
+    "If you notice two existing skills that overlap, note it in your "
+    "reply — the background curator handles consolidation at scale.\n\n"
+    "Do NOT capture (these become persistent self-imposed constraints "
+    "that bite you later when the environment changes):\n"
+    "  • Environment-dependent failures: missing binaries, fresh-install "
+    "errors, post-migration path mismatches, 'command not found', "
+    "unconfigured credentials, uninstalled packages. The user can fix "
+    "these — they are not durable rules.\n"
+    "  • Negative claims about tools or features ('browser tools do not "
+    "work', 'X tool is broken', 'cannot use Y from execute_code'). These "
+    "harden into refusals the agent cites against itself for months "
+    "after the actual problem was fixed.\n"
+    "  • Session-specific transient errors that resolved before the "
+    "conversation ended. If retrying worked, the lesson is the retry "
+    "pattern, not the original failure.\n"
+    "  • One-off task narratives. A user asking 'summarize today's "
+    "market' or 'analyze this PR' is not a class of work that warrants "
+    "a skill.\n\n"
+    "If a tool failed because of setup state, capture the FIX (install "
+    "command, config step, env var to set) under an existing setup or "
+    "troubleshooting skill — never 'this tool does not work' as a "
+    "standalone constraint.\n\n"
+    "'Nothing to save.' is a real option but should NOT be the "
+    "default. If the session ran smoothly with no corrections and "
+    "produced no new technique, just say 'Nothing to save.' and stop. "
+    "Otherwise, act."
+)
+
+_COMBINED_REVIEW_PROMPT = (
+    "Review the conversation above and update two things:\n\n"
+    "**Memory**: who the user is. Did the user reveal persona, "
+    "desires, preferences, personal details, or expectations about "
+    "how you should behave? Save facts about the user and durable "
+    "preferences with the memory tool.\n\n"
+    "**Skills**: how to do this class of task. Be ACTIVE — most "
+    "sessions produce at least one skill update. A pass that does "
+    "nothing is a missed learning opportunity, not a neutral outcome.\n\n"
+    "Target shape of the skill library: CLASS-LEVEL skills with a rich "
+    "SKILL.md and a `references/` directory for session-specific detail. "
+    "Not a long flat list of narrow one-session-one-skill entries.\n\n"
+    "Signals that warrant a skill update (any one is enough):\n"
+    "  • User corrected your style, tone, format, legibility, "
+    "verbosity, or approach. Frustration is a FIRST-CLASS skill "
+    "signal, not just a memory signal. 'stop doing X', 'don't format "
+    "like this', 'I hate when you Y' — embed the lesson in the skill "
+    "that governs that task so the next session starts fixed.\n"
+    "  • Non-trivial technique, fix, workaround, or debugging path "
+    "emerged.\n"
+    "  • A skill that was loaded or consulted turned out wrong, "
+    "missing, or outdated — patch it now.\n\n"
+    "Preference order for skills — pick the earliest that fits:\n"
+    "  1. UPDATE A CURRENTLY-LOADED SKILL. Check what skills were "
+    "loaded via /skill-name or skill_view in the conversation. If one "
+    "of them covers the learning, PATCH it first. It was in play; "
+    "it's the right place.\n"
+    "  2. UPDATE AN EXISTING UMBRELLA (skills_list + skill_view to "
+    "find the right one). Patch it.\n"
+    "  3. ADD A SUPPORT FILE under an existing umbrella via "
+    "skill_manage action=write_file. Three kinds: "
+    "`references/<topic>.md` for session-specific detail OR condensed "
+    "knowledge banks (quoted research, API docs excerpts, domain "
+    "notes) written concise and task-focused; `templates/<name>.<ext>` "
+    "for starter files meant to be copied and modified; "
+    "`scripts/<name>.<ext>` for statically re-runnable actions "
+    "(verification, fixture generators, probes). Add a one-line "
+    "pointer in SKILL.md so future agents find them.\n"
+    "  4. CREATE A NEW CLASS-LEVEL UMBRELLA when nothing exists. "
+    "Name at the class level — NOT a PR number, error string, "
+    "codename, library-alone name, or 'fix-X / debug-Y' session "
+    "artifact. If the name only fits today's task, fall back to (1), "
+    "(2), or (3).\n\n"
+    "User-preference embedding: when the user complains about how "
+    "you handled a task, update the skill that governs that task — "
+    "memory alone isn't enough. Memory says 'who the user is and "
+    "what the current situation and state of your operations are'; "
+    "skills say 'how to do this class of task for this user'. Both "
+    "should carry user-preference lessons when relevant.\n\n"
+    "If you notice overlapping existing skills, mention it — the "
+    "background curator handles consolidation.\n\n"
+    "Do NOT capture as skills (these become persistent self-imposed "
+    "constraints that bite you later when the environment changes):\n"
+    "  • Environment-dependent failures: missing binaries, fresh-install "
+    "errors, post-migration path mismatches, 'command not found', "
+    "unconfigured credentials, uninstalled packages. The user can fix "
+    "these — they are not durable rules.\n"
+    "  • Negative claims about tools or features ('browser tools do not "
+    "work', 'X tool is broken', 'cannot use Y from execute_code'). These "
+    "harden into refusals the agent cites against itself for months "
+    "after the actual problem was fixed.\n"
+    "  • Session-specific transient errors that resolved before the "
+    "conversation ended. If retrying worked, the lesson is the retry "
+    "pattern, not the original failure.\n"
+    "  • One-off task narratives. A user asking 'summarize today's "
+    "market' or 'analyze this PR' is not a class of work that warrants "
+    "a skill.\n\n"
+    "If a tool failed because of setup state, capture the FIX (install "
+    "command, config step, env var to set) under an existing setup or "
+    "troubleshooting skill — never 'this tool does not work' as a "
+    "standalone constraint.\n\n"
+    "Act on whichever of the two dimensions has real signal. If "
+    "genuinely nothing stands out on either, say 'Nothing to save.' "
+    "and stop — but don't reach for that conclusion as a default."
+)
+
+
+
+def summarize_background_review_actions(
+    review_messages: List[Dict],
+    prior_snapshot: List[Dict],
+) -> List[str]:
+    """Build the human-facing action summary for a background review pass.
+
+    Walks the review agent's session messages and collects "successful tool
+    action" descriptions to surface to the user (e.g. "Memory updated").
+    Tool messages already present in ``prior_snapshot`` are skipped so we
+    don't re-surface stale results from the prior conversation that the
+    review agent inherited via ``conversation_history`` (issue #14944).
+
+    Matching is by ``tool_call_id`` when available, with a content-equality
+    fallback for tool messages that lack one.
+    """
+    existing_tool_call_ids = set()
+    existing_tool_contents = set()
+    for prior in prior_snapshot or []:
+        if not isinstance(prior, dict) or prior.get("role") != "tool":
+            continue
+        tcid = prior.get("tool_call_id")
+        if tcid:
+            existing_tool_call_ids.add(tcid)
+        else:
+            content = prior.get("content")
+            if isinstance(content, str):
+                existing_tool_contents.add(content)
+
+    actions: List[str] = []
+    for msg in review_messages or []:
+        if not isinstance(msg, dict) or msg.get("role") != "tool":
+            continue
+        tcid = msg.get("tool_call_id")
+        if tcid and tcid in existing_tool_call_ids:
+            continue
+        if not tcid:
+            content_str = msg.get("content")
+            if isinstance(content_str, str) and content_str in existing_tool_contents:
+                continue
+        try:
+            data = json.loads(msg.get("content", "{}"))
+        except (json.JSONDecodeError, TypeError):
+            continue
+        if not isinstance(data, dict) or not data.get("success"):
+            continue
+        message = data.get("message", "")
+        target = data.get("target", "")
+        if "created" in message.lower():
+            actions.append(message)
+        elif "updated" in message.lower():
+            actions.append(message)
+        elif "added" in message.lower() or (target and "add" in message.lower()):
+            label = "Memory" if target == "memory" else "User profile" if target == "user" else target
+            actions.append(f"{label} updated")
+        elif "Entry added" in message:
+            label = "Memory" if target == "memory" else "User profile" if target == "user" else target
+            actions.append(f"{label} updated")
+        elif "removed" in message.lower() or "replaced" in message.lower():
+            label = "Memory" if target == "memory" else "User profile" if target == "user" else target
+            actions.append(f"{label} updated")
+    return actions
+
+
+def build_memory_write_metadata(
+    agent: Any,
+    *,
+    write_origin: Optional[str] = None,
+    execution_context: Optional[str] = None,
+    task_id: Optional[str] = None,
+    tool_call_id: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Build provenance metadata for external memory-provider mirrors."""
+    metadata: Dict[str, Any] = {
+        "write_origin": write_origin or getattr(agent, "_memory_write_origin", "assistant_tool"),
+        "execution_context": (
+            execution_context
+            or getattr(agent, "_memory_write_context", "foreground")
+        ),
+        "session_id": agent.session_id or "",
+        "parent_session_id": agent._parent_session_id or "",
+        "platform": agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
+        "tool_name": "memory",
+    }
+    if task_id:
+        metadata["task_id"] = task_id
+    if tool_call_id:
+        metadata["tool_call_id"] = tool_call_id
+    return {k: v for k, v in metadata.items() if v not in {None, ""}}
+
+
+def _run_review_in_thread(
+    agent: Any,
+    messages_snapshot: List[Dict],
+    prompt: str,
+) -> None:
+    """Worker function executed in the background-review daemon thread.
+
+    Spawns a forked ``AIAgent`` inheriting the parent's runtime, runs the
+    review prompt, and surfaces a compact action summary back to the user
+    via ``agent._safe_print`` and ``agent.background_review_callback``.
+    """
+    # Local import to avoid a hard circular dep at module load.
+    from run_agent import AIAgent
+    from tools.terminal_tool import set_approval_callback as _set_approval_callback
+
+    # Install a non-interactive approval callback on this worker
+    # thread so any dangerous-command guard the review agent trips
+    # resolves to "deny" instead of falling back to input() -- which
+    # deadlocks against the parent's prompt_toolkit TUI (#15216).
+    # Same pattern as _subagent_auto_deny in tools/delegate_tool.py.
+    def _bg_review_auto_deny(command, description, **kwargs):
+        logger.warning(
+            "Background review auto-denied dangerous command: %s (%s)",
+            command, description,
+        )
+        return "deny"
+    try:
+        _set_approval_callback(_bg_review_auto_deny)
+    except Exception:
+        pass
+
+    review_agent = None
+    review_messages: List[Dict] = []
+    try:
+        with open(os.devnull, "w", encoding="utf-8") as _devnull, \
+             contextlib.redirect_stdout(_devnull), \
+             contextlib.redirect_stderr(_devnull):
+            # Inherit the parent agent's live runtime (provider, model,
+            # base_url, api_key, api_mode) so the fork uses the exact
+            # same credentials the main turn is using.  Without this,
+            # AIAgent.__init__ re-runs auto-resolution from env vars,
+            # which fails for OAuth-only providers, session-scoped
+            # creds, or credential-pool setups where the resolver can't
+            # reconstruct auth from scratch -- producing the spurious
+            # "No LLM provider configured" warning at end of turn.
+            _parent_runtime = agent._current_main_runtime()
+            _parent_api_mode = _parent_runtime.get("api_mode") or None
+            # The review fork needs to call agent-loop tools (memory,
+            # skill_manage). Those tools require Hermes' own dispatch,
+            # which the codex_app_server runtime bypasses entirely
+            # (it runs the turn inside codex's subprocess). So when
+            # the parent is on codex_app_server, downgrade the review
+            # fork to codex_responses — same auth/credentials, but
+            # talks to the OpenAI Responses API directly so Hermes
+            # owns the loop and the agent-loop tools dispatch.
+            if _parent_api_mode == "codex_app_server":
+                _parent_api_mode = "codex_responses"
+            # skip_memory=True keeps the review fork from
+            # touching external memory plugins (honcho, mem0,
+            # supermemory, etc.).  Without it, the fork's
+            # __init__ rebuilds its own _memory_manager from
+            # config, scoped to the parent's session_id, and
+            # run_conversation() then leaks the harness prompt
+            # into the user's real memory namespace via three
+            # ingestion sites: on_turn_start (cadence + turn
+            # message), prefetch_all (recall query), and
+            # sync_all (harness prompt + review output recorded
+            # as a (user, assistant) turn pair).  Built-in
+            # MEMORY.md / USER.md state is re-bound from the
+            # parent below so memory(action="add") writes from
+            # the review still land on disk; the review just
+            # has zero side effects on external providers.
+            review_agent = AIAgent(
+                model=agent.model,
+                max_iterations=16,
+                quiet_mode=True,
+                platform=agent.platform,
+                provider=agent.provider,
+                api_mode=_parent_api_mode,
+                base_url=_parent_runtime.get("base_url") or None,
+                api_key=_parent_runtime.get("api_key") or None,
+                credential_pool=getattr(agent, "_credential_pool", None),
+                parent_session_id=agent.session_id,
+                skip_memory=True,
+            )
+            review_agent._memory_write_origin = "background_review"
+            review_agent._memory_write_context = "background_review"
+            review_agent._memory_store = agent._memory_store
+            review_agent._memory_enabled = agent._memory_enabled
+            review_agent._user_profile_enabled = agent._user_profile_enabled
+            review_agent._memory_nudge_interval = 0
+            review_agent._skill_nudge_interval = 0
+            # Suppress all status/warning emits from the fork so the
+            # user only sees the final successful-action summary.
+            # Without this, mid-review "Iteration budget exhausted",
+            # rate-limit retries, compression warnings, and other
+            # lifecycle messages bubble up through _emit_status ->
+            # _vprint and leak past the stdout redirect (they go via
+            # _print_fn/status_callback, which bypass sys.stdout).
+            review_agent.suppress_status_output = True
+            # Inherit the parent's cached system prompt verbatim so
+            # the review fork's outbound HTTP request hits the same
+            # Anthropic/OpenRouter prefix cache the parent warmed.
+            # Without this, the fork rebuilds the system prompt from
+            # scratch (fresh _hermes_now() timestamp, fresh
+            # session_id, narrower toolset → different skills_prompt)
+            # and the byte-exact prefix-cache key misses. See
+            # issue #25322 and PR #17276 for the full analysis +
+            # measured impact (~26% end-to-end cost reduction on
+            # Sonnet 4.5).
+            review_agent._cached_system_prompt = agent._cached_system_prompt
+            # Defensive: pin session_start + session_id to the
+            # parent's so any code path that re-renders parts of
+            # the system prompt (compression, plugin hooks) still
+            # produces byte-identical output. The cached-prompt
+            # assignment above already short-circuits the normal
+            # rebuild path, but these pins guarantee parity even
+            # if a future code path bypasses the cache.
+            review_agent.session_start = agent.session_start
+            review_agent.session_id = agent.session_id
+
+            from model_tools import get_tool_definitions
+            from hermes_cli.plugins import (
+                set_thread_tool_whitelist,
+                clear_thread_tool_whitelist,
+            )
+
+            review_whitelist = {
+                t["function"]["name"]
+                for t in get_tool_definitions(
+                    enabled_toolsets=["memory", "skills"],
+                    quiet_mode=True,
+                )
+            }
+            set_thread_tool_whitelist(
+                review_whitelist,
+                deny_msg_fmt=(
+                    "Background review denied non-whitelisted tool: "
+                    "{tool_name}. Only memory/skill tools are allowed."
+                ),
+            )
+            try:
+                review_agent.run_conversation(
+                    user_message=(
+                        prompt
+                        + "\n\nYou can only call memory and skill "
+                        "management tools. Other tools will be denied "
+                        "at runtime — do not attempt them."
+                    ),
+                    conversation_history=messages_snapshot,
+                )
+            finally:
+                clear_thread_tool_whitelist()
+
+            # Tear down memory providers while stdout is still
+            # redirected so background thread teardown (Honcho flush,
+            # Hindsight sync, etc.) stays silent.  The finally block
+            # below is a safety net for the exception path.
+            try:
+                review_agent.shutdown_memory_provider()
+            except Exception:
+                pass
+            try:
+                review_agent.close()
+            except Exception:
+                pass
+            review_messages = list(getattr(review_agent, "_session_messages", []))
+            review_agent = None
+
+        # Scan the review agent's messages for successful tool actions
+        # and surface a compact summary to the user. Tool messages
+        # already present in messages_snapshot must be skipped, since
+        # the review agent inherits that history and would otherwise
+        # re-surface stale "created"/"updated" messages from the prior
+        # conversation as if they just happened (issue #14944).
+        actions = summarize_background_review_actions(
+            review_messages,
+            messages_snapshot,
+        )
+
+        if actions:
+            summary = " · ".join(dict.fromkeys(actions))
+            agent._safe_print(
+                f"  💾 Self-improvement review: {summary}"
+            )
+            _bg_cb = agent.background_review_callback
+            if _bg_cb:
+                try:
+                    _bg_cb(
+                        f"💾 Self-improvement review: {summary}"
+                    )
+                except Exception:
+                    pass
+
+    except Exception as e:
+        logger.warning("Background memory/skill review failed: %s", e)
+        agent._emit_auxiliary_failure("background review", e)
+    finally:
+        # Safety-net cleanup for the exception path.  Normal
+        # completion already shut down inside redirect_stdout above.
+        # Re-open devnull here so any teardown output (Honcho flush,
+        # Hindsight sync, background thread joins) stays silent even
+        # on the exception path where redirect_stdout already exited.
+        if review_agent is not None:
+            try:
+                with open(os.devnull, "w", encoding="utf-8") as _fn, \
+                     contextlib.redirect_stdout(_fn), \
+                     contextlib.redirect_stderr(_fn):
+                    try:
+                        review_agent.shutdown_memory_provider()
+                    except Exception:
+                        pass
+                    try:
+                        review_agent.close()
+                    except Exception:
+                        pass
+            except Exception:
+                pass
+        # Clear the approval callback on this bg-review thread so a
+        # recycled thread-id doesn't inherit a stale reference.
+        try:
+            _set_approval_callback(None)
+        except Exception:
+            pass
+
+
+def spawn_background_review_thread(
+    agent: Any,
+    messages_snapshot: List[Dict],
+    review_memory: bool = False,
+    review_skills: bool = False,
+):
+    """Build the review thread target and prompt for a background review.
+
+    Returns a ``(target, prompt)`` tuple.  The caller (``AIAgent._spawn_background_review``)
+    owns the actual ``threading.Thread`` construction so test-level patches
+    of ``run_agent.threading.Thread`` keep working.
+    """
+    # Pick the right prompt based on which triggers fired.  Allow per-agent
+    # override (the prompts moved to module-level constants but old code paths
+    # that set agent._MEMORY_REVIEW_PROMPT etc. directly keep working).
+    if review_memory and review_skills:
+        prompt = getattr(agent, "_COMBINED_REVIEW_PROMPT", _COMBINED_REVIEW_PROMPT)
+    elif review_memory:
+        prompt = getattr(agent, "_MEMORY_REVIEW_PROMPT", _MEMORY_REVIEW_PROMPT)
+    else:
+        prompt = getattr(agent, "_SKILL_REVIEW_PROMPT", _SKILL_REVIEW_PROMPT)
+
+    def _target() -> None:
+        _run_review_in_thread(agent, messages_snapshot, prompt)
+
+    return _target, prompt
+
+
+__all__ = [
+    "_MEMORY_REVIEW_PROMPT",
+    "_SKILL_REVIEW_PROMPT",
+    "_COMBINED_REVIEW_PROMPT",
+    "spawn_background_review_thread",
+    "summarize_background_review_actions",
+    "build_memory_write_metadata",
+]
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
--- a/agent/codex_runtime.py
+++ b/agent/codex_runtime.py
@ -0,0 +1,448 @@
+"""Codex API runtime — App Server and Responses-API streaming paths.
+
+Extracted from :class:`AIAgent` to keep the agent loop file focused.
+Each function takes the parent ``AIAgent`` as its first argument
+(``agent``).  AIAgent keeps thin forwarder methods for backward
+compatibility.
+
+* ``run_codex_app_server_turn`` — drives one turn through the
+  ``codex_app_server`` subprocess client (used when a Codex CLI install
+  is the active provider).
+* ``run_codex_stream`` — streams a Codex Responses API call (the
+  ``codex_responses`` api_mode).
+* ``run_codex_create_stream_fallback`` — recovery path when the
+  Responses ``stream=True`` initial create fails.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from types import SimpleNamespace
+from typing import Any, Dict, List
+
+logger = logging.getLogger(__name__)
+
+
+def run_codex_app_server_turn(
+    agent,
+    *,
+    user_message: str,
+    original_user_message: Any,
+    messages: List[Dict[str, Any]],
+    effective_task_id: str,
+    should_review_memory: bool = False,
+) -> Dict[str, Any]:
+    """Codex app-server runtime path. Hands the entire turn to a `codex
+    app-server` subprocess and projects its events back into Hermes'
+    messages list so memory/skill review keep working.
+
+    Called from run_conversation() when agent.api_mode == "codex_app_server".
+    Returns the same dict shape as the chat_completions path.
+    """
+    from agent.transports.codex_app_server_session import CodexAppServerSession
+
+    # Lazy session: one CodexAppServerSession per AIAgent instance.
+    # Spawned on first turn, reused across turns, closed at AIAgent
+    # shutdown (see _cleanup hook).
+    if not hasattr(agent, "_codex_session") or agent._codex_session is None:
+        cwd = getattr(agent, "session_cwd", None) or os.getcwd()
+        # Approval callback: defer to Hermes' standard prompt flow if a
+        # CLI thread has installed one. Gateway / cron contexts get the
+        # codex-side fail-closed default.
+        try:
+            from tools.terminal_tool import _get_approval_callback
+            approval_callback = _get_approval_callback()
+        except Exception:
+            approval_callback = None
+        agent._codex_session = CodexAppServerSession(
+            cwd=cwd,
+            approval_callback=approval_callback,
+        )
+
+    # NOTE: the user message is ALREADY appended to messages by the
+    # standard run_conversation() flow (line ~11823) before the early
+    # return reaches us. Do NOT append again — that would duplicate.
+
+    try:
+        turn = agent._codex_session.run_turn(user_input=user_message)
+    except Exception as exc:
+        logger.exception("codex app-server turn failed")
+        # Crash → unconditionally drop the session so the next turn
+        # respawns from scratch instead of reusing a dead client.
+        try:
+            agent._codex_session.close()
+        except Exception:
+            pass
+        agent._codex_session = None
+        return {
+            "final_response": (
+                f"Codex app-server turn failed: {exc}. "
+                f"Fall back to default runtime with `/codex-runtime auto`."
+            ),
+            "messages": messages,
+            "api_calls": 0,
+            "completed": False,
+            "partial": True,
+            "error": str(exc),
+        }
+
+    # If the turn signalled the underlying client is wedged (deadline
+    # blown, post-tool watchdog tripped, OAuth refresh died, subprocess
+    # exited), retire the session so the next turn respawns codex
+    # rather than riding the broken process. Mirrors openclaw beta.8's
+    # "retire timed-out app-server clients" fix.
+    if getattr(turn, "should_retire", False):
+        logger.warning(
+            "codex app-server session retired (turn error: %s)",
+            turn.error,
+        )
+        try:
+            agent._codex_session.close()
+        except Exception:
+            pass
+        agent._codex_session = None
+
+    # Splice projected messages into the conversation. The projector emits
+    # standard {role, content, tool_calls, tool_call_id} entries, which
+    # is exactly what curator.py / sessions DB expect.
+    if turn.projected_messages:
+        messages.extend(turn.projected_messages)
+
+    # Counter ticks for the agent-improvement loop.
+    # _turns_since_memory and _user_turn_count are ALREADY incremented
+    # in the run_conversation() pre-loop block (lines ~11793-11817) so we
+    # do NOT touch them here — that would double-count.
+    # Only _iters_since_skill needs explicit increment, since the
+    # chat_completions loop bumps it per tool iteration (line ~12110)
+    # and that loop is bypassed on this path.
+    agent._iters_since_skill = (
+        getattr(agent, "_iters_since_skill", 0) + turn.tool_iterations
+    )
+
+    # Now check the skill nudge AFTER iters were incremented — same
+    # pattern the chat_completions path uses (line ~15432).
+    should_review_skills = False
+    if (
+        agent._skill_nudge_interval > 0
+        and agent._iters_since_skill >= agent._skill_nudge_interval
+        and "skill_manage" in agent.valid_tool_names
+    ):
+        should_review_skills = True
+        agent._iters_since_skill = 0
+
+    # External memory provider sync (mirrors line ~15439). Skipped on
+    # interrupt/error to avoid feeding partial transcripts to memory.
+    if not turn.interrupted and turn.error is None:
+        try:
+            agent._sync_external_memory_for_turn(
+                original_user_message=original_user_message,
+                final_response=turn.final_text,
+                interrupted=False,
+            )
+        except Exception:
+            logger.debug("external memory sync raised", exc_info=True)
+
+    # Background review fork — same cadence + signature as the default
+    # path (line ~15449). Only fires when a trigger actually tripped AND
+    # we have a real final response.
+    if (
+        turn.final_text
+        and not turn.interrupted
+        and (should_review_memory or should_review_skills)
+    ):
+        try:
+            agent._spawn_background_review(
+                messages_snapshot=list(messages),
+                review_memory=should_review_memory,
+                review_skills=should_review_skills,
+            )
+        except Exception:
+            logger.debug("background review spawn raised", exc_info=True)
+
+    return {
+        "final_response": turn.final_text,
+        "messages": messages,
+        "api_calls": 1,  # one app-server "turn" maps to one logical API call
+        "completed": not turn.interrupted and turn.error is None,
+        "partial": turn.interrupted or turn.error is not None,
+        "error": turn.error,
+        "codex_thread_id": turn.thread_id,
+        "codex_turn_id": turn.turn_id,
+    }
+
+
+
+
+def run_codex_stream(agent, api_kwargs: dict, client: Any = None, on_first_delta: callable = None):
+    """Execute one streaming Responses API request and return the final response."""
+    import httpx as _httpx
+
+    active_client = client or agent._ensure_primary_openai_client(reason="codex_stream_direct")
+    max_stream_retries = 1
+    has_tool_calls = False
+    first_delta_fired = False
+    # Accumulate streamed text so we can recover if get_final_response()
+    # returns empty output (e.g. chatgpt.com backend-api sends
+    # response.incomplete instead of response.completed).
+    agent._codex_streamed_text_parts: list = []
+    for attempt in range(max_stream_retries + 1):
+        if agent._interrupt_requested:
+            raise InterruptedError("Agent interrupted before Codex stream retry")
+        collected_output_items: list = []
+        try:
+            with active_client.responses.stream(**api_kwargs) as stream:
+                for event in stream:
+                    agent._touch_activity("receiving stream response")
+                    if agent._interrupt_requested:
+                        break
+                    event_type = getattr(event, "type", "")
+                    # Fire callbacks on text content deltas (suppress during tool calls)
+                    if "output_text.delta" in event_type or event_type == "response.output_text.delta":
+                        delta_text = getattr(event, "delta", "")
+                        if delta_text:
+                            agent._codex_streamed_text_parts.append(delta_text)
+                        if delta_text and not has_tool_calls:
+                            if not first_delta_fired:
+                                first_delta_fired = True
+                                if on_first_delta:
+                                    try:
+                                        on_first_delta()
+                                    except Exception:
+                                        pass
+                            agent._fire_stream_delta(delta_text)
+                    # Track tool calls to suppress text streaming
+                    elif "function_call" in event_type:
+                        has_tool_calls = True
+                    # Fire reasoning callbacks
+                    elif "reasoning" in event_type and "delta" in event_type:
+                        reasoning_text = getattr(event, "delta", "")
+                        if reasoning_text:
+                            agent._fire_reasoning_delta(reasoning_text)
+                    # Collect completed output items — some backends
+                    # (chatgpt.com/backend-api/codex) stream valid items
+                    # via response.output_item.done but the SDK's
+                    # get_final_response() returns an empty output list.
+                    elif event_type == "response.output_item.done":
+                        done_item = getattr(event, "item", None)
+                        if done_item is not None:
+                            collected_output_items.append(done_item)
+                    # Log non-completed terminal events for diagnostics
+                    elif event_type in {"response.incomplete", "response.failed"}:
+                        resp_obj = getattr(event, "response", None)
+                        status = getattr(resp_obj, "status", None) if resp_obj else None
+                        incomplete_details = getattr(resp_obj, "incomplete_details", None) if resp_obj else None
+                        logger.warning(
+                            "Codex Responses stream received terminal event %s "
+                            "(status=%s, incomplete_details=%s, streamed_chars=%d). %s",
+                            event_type, status, incomplete_details,
+                            sum(len(p) for p in agent._codex_streamed_text_parts),
+                            agent._client_log_context(),
+                        )
+                final_response = stream.get_final_response()
+                # PATCH: ChatGPT Codex backend streams valid output items
+                # but get_final_response() can return an empty output list.
+                # Backfill from collected items or synthesize from deltas.
+                _out = getattr(final_response, "output", None)
+                if isinstance(_out, list) and not _out:
+                    if collected_output_items:
+                        final_response.output = list(collected_output_items)
+                        logger.debug(
+                            "Codex stream: backfilled %d output items from stream events",
+                            len(collected_output_items),
+                        )
+                    elif agent._codex_streamed_text_parts and not has_tool_calls:
+                        assembled = "".join(agent._codex_streamed_text_parts)
+                        final_response.output = [SimpleNamespace(
+                            type="message",
+                            role="assistant",
+                            status="completed",
+                            content=[SimpleNamespace(type="output_text", text=assembled)],
+                        )]
+                        logger.debug(
+                            "Codex stream: synthesized output from %d text deltas (%d chars)",
+                            len(agent._codex_streamed_text_parts), len(assembled),
+                        )
+                return final_response
+        except (_httpx.RemoteProtocolError, _httpx.ReadTimeout, _httpx.ConnectError, ConnectionError) as exc:
+            if attempt < max_stream_retries:
+                logger.debug(
+                    "Codex Responses stream transport failed (attempt %s/%s); retrying. %s error=%s",
+                    attempt + 1,
+                    max_stream_retries + 1,
+                    agent._client_log_context(),
+                    exc,
+                )
+                continue
+            logger.debug(
+                "Codex Responses stream transport failed; falling back to create(stream=True). %s error=%s",
+                agent._client_log_context(),
+                exc,
+            )
+            return agent._run_codex_create_stream_fallback(api_kwargs, client=active_client)
+        except RuntimeError as exc:
+            err_text = str(exc)
+            missing_completed = "response.completed" in err_text
+            # The OpenAI SDK's Responses streaming state machine raises
+            # ``RuntimeError("Expected to have received `response.created`
+            # before `<event-type>`")`` when the first SSE event from the
+            # server is anything other than ``response.created`` — and it
+            # discards the event's payload before we can read it.  Three
+            # real-world backends emit a different first frame:
+            #
+            #   * xAI on grok-4.x OAuth — sends ``error`` (issues
+            #     reported around the May 2026 SuperGrok rollout when
+            #     multi-turn conversations replay encrypted reasoning
+            #     content the OAuth tier rejects)
+            #   * codex-lb relays — send ``codex.rate_limits`` (#14634)
+            #   * custom Responses relays — send ``response.in_progress``
+            #     (#8133)
+            #
+            # In all three cases the underlying byte stream is still
+            # readable: a non-stream ``responses.create(stream=True)``
+            # fallback succeeds and surfaces the real provider error as
+            # a normal exception with body+status_code attached, which
+            # ``_summarize_api_error`` can then translate into a useful
+            # user-facing line.  Treat ``response.created`` prelude
+            # errors the same way we already treat ``response.completed``
+            # postlude errors.
+            prelude_error = (
+                "Expected to have received `response.created`" in err_text
+                or "Expected to have received \"response.created\"" in err_text
+            )
+            if (missing_completed or prelude_error) and attempt < max_stream_retries:
+                logger.debug(
+                    "Responses stream %s (attempt %s/%s); retrying. %s",
+                    "prelude rejected" if prelude_error else "closed before completion",
+                    attempt + 1,
+                    max_stream_retries + 1,
+                    agent._client_log_context(),
+                )
+                continue
+            if missing_completed or prelude_error:
+                logger.debug(
+                    "Responses stream %s; falling back to create(stream=True). %s err=%s",
+                    "rejected before response.created" if prelude_error else "did not emit response.completed",
+                    agent._client_log_context(),
+                    err_text,
+                )
+                return agent._run_codex_create_stream_fallback(api_kwargs, client=active_client)
+            raise
+
+
+
+def run_codex_create_stream_fallback(agent, api_kwargs: dict, client: Any = None):
+    """Fallback path for stream completion edge cases on Codex-style Responses backends."""
+    active_client = client or agent._ensure_primary_openai_client(reason="codex_create_stream_fallback")
+    fallback_kwargs = dict(api_kwargs)
+    fallback_kwargs["stream"] = True
+    fallback_kwargs = agent._get_transport().preflight_kwargs(fallback_kwargs, allow_stream=True)
+    stream_or_response = active_client.responses.create(**fallback_kwargs)
+
+    # Compatibility shim for mocks or providers that still return a concrete response.
+    if hasattr(stream_or_response, "output"):
+        return stream_or_response
+    if not hasattr(stream_or_response, "__iter__"):
+        return stream_or_response
+
+    terminal_response = None
+    collected_output_items: list = []
+    collected_text_deltas: list = []
+    try:
+        for event in stream_or_response:
+            agent._touch_activity("receiving stream response")
+            event_type = getattr(event, "type", None)
+            if not event_type and isinstance(event, dict):
+                event_type = event.get("type")
+
+            # ``error`` SSE frames carry the provider's real failure
+            # reason (subscription / quota / model-not-available /
+            # rejected-reasoning-replay) but never appear in the
+            # ``{completed, incomplete, failed}`` terminal set, so the
+            # raw loop below would silently consume them and end with
+            # "did not emit a terminal response".  xAI in particular
+            # emits ``type=error`` as the FIRST frame for OAuth
+            # accounts whose Grok subscription is missing/exhausted —
+            # the SDK's stream helper raises ``RuntimeError(Expected
+            # to have received response.created before error)`` which
+            # the caller catches and routes here, expecting this
+            # fallback to surface the message.  Synthesize an
+            # APIError-shaped exception so ``_summarize_api_error``
+            # and the credential-pool entitlement detector see the
+            # real text instead of a generic RuntimeError.
+            if event_type == "error":
+                err_message = getattr(event, "message", None)
+                if not err_message and isinstance(event, dict):
+                    err_message = event.get("message")
+                err_code = getattr(event, "code", None)
+                if not err_code and isinstance(event, dict):
+                    err_code = event.get("code")
+                err_param = getattr(event, "param", None)
+                if not err_param and isinstance(event, dict):
+                    err_param = event.get("param")
+                err_message = (err_message or "stream emitted error event").strip()
+                from run_agent import _StreamErrorEvent
+                raise _StreamErrorEvent(err_message, code=err_code, param=err_param)
+
+            # Collect output items and text deltas for backfill
+            if event_type == "response.output_item.done":
+                done_item = getattr(event, "item", None)
+                if done_item is None and isinstance(event, dict):
+                    done_item = event.get("item")
+                if done_item is not None:
+                    collected_output_items.append(done_item)
+            elif event_type in {"response.output_text.delta",}:
+                delta = getattr(event, "delta", "")
+                if not delta and isinstance(event, dict):
+                    delta = event.get("delta", "")
+                if delta:
+                    collected_text_deltas.append(delta)
+
+            if event_type not in {"response.completed", "response.incomplete", "response.failed"}:
+                continue
+
+            terminal_response = getattr(event, "response", None)
+            if terminal_response is None and isinstance(event, dict):
+                terminal_response = event.get("response")
+            if terminal_response is not None:
+                # Backfill empty output from collected stream events
+                _out = getattr(terminal_response, "output", None)
+                if isinstance(_out, list) and not _out:
+                    if collected_output_items:
+                        terminal_response.output = list(collected_output_items)
+                        logger.debug(
+                            "Codex fallback stream: backfilled %d output items",
+                            len(collected_output_items),
+                        )
+                    elif collected_text_deltas:
+                        assembled = "".join(collected_text_deltas)
+                        terminal_response.output = [SimpleNamespace(
+                            type="message", role="assistant",
+                            status="completed",
+                            content=[SimpleNamespace(type="output_text", text=assembled)],
+                        )]
+                        logger.debug(
+                            "Codex fallback stream: synthesized from %d deltas (%d chars)",
+                            len(collected_text_deltas), len(assembled),
+                        )
+                return terminal_response
+    finally:
+        close_fn = getattr(stream_or_response, "close", None)
+        if callable(close_fn):
+            try:
+                close_fn()
+            except Exception:
+                pass
+
+    if terminal_response is not None:
+        return terminal_response
+    raise RuntimeError("Responses create(stream=True) fallback did not emit a terminal response.")
+
+
+
+__all__ = [
+    "run_codex_app_server_turn",
+    "run_codex_stream",
+    "run_codex_create_stream_fallback",
+]
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@ -0,0 +1,556 @@
+"""Context compression — extract the AIAgent methods that drive summarisation.
+
+Three concerns live here:
+
+* :func:`check_compression_model_feasibility` — startup probe of the
+  configured auxiliary compression model.  Warns when the aux context
+  window can't fit the main model's compression threshold; auto-lowers
+  the session threshold when possible; hard-rejects auxes below
+  ``MINIMUM_CONTEXT_LENGTH``.
+
+* :func:`replay_compression_warning` — re-emit a stored warning through
+  the gateway ``status_callback`` once it's wired up (the callback is
+  set after :class:`AIAgent` construction).
+
+* :func:`compress_context` — the actual compression call.  Runs the
+  configured compressor, splits the SQLite session, rotates the
+  session_id, notifies plugin context engines / memory providers, and
+  returns the compressed message list and freshly-built system prompt.
+
+* :func:`try_shrink_image_parts_in_messages` — image-too-large recovery
+  helper that re-encodes ``data:image/...;base64,...`` parts at a smaller
+  size so retries can fit under provider ceilings (Anthropic's 5 MB).
+
+``run_agent`` keeps thin wrappers for each so existing call sites
+(``self._compress_context(...)``) keep working.  Tests that exercise
+these paths see no behavioural change.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import tempfile
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import Any, List, Optional, Tuple
+
+from agent.model_metadata import estimate_request_tokens_rough
+
+logger = logging.getLogger(__name__)
+
+
+def check_compression_model_feasibility(agent: Any) -> None:
+    """Warn at session start if the auxiliary compression model's context
+    window is smaller than the main model's compression threshold.
+
+    When the auxiliary model cannot fit the content that needs summarising,
+    compression will either fail outright (the LLM call errors) or produce
+    a severely truncated summary.
+
+    Called during ``AIAgent.__init__`` so CLI users see the warning
+    immediately (via ``_vprint``).  The gateway sets ``status_callback``
+    *after* construction, so :func:`replay_compression_warning` re-sends
+    the stored warning through the callback on the first
+    ``run_conversation()`` call.
+    """
+    if not agent.compression_enabled:
+        return
+    try:
+        from agent.auxiliary_client import (
+            _resolve_task_provider_model,
+            get_text_auxiliary_client,
+        )
+        from agent.model_metadata import (
+            MINIMUM_CONTEXT_LENGTH,
+            get_model_context_length,
+        )
+
+        client, aux_model = get_text_auxiliary_client(
+            "compression",
+            main_runtime=agent._current_main_runtime(),
+        )
+        # Best-effort aux provider label for the warning message. The
+        # configured provider may be "auto", in which case we fall back
+        # to the client's base_url hostname so the user can still tell
+        # where the compression model is actually being called.
+        try:
+            _aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
+        except Exception:
+            _aux_cfg_provider = ""
+        if client is None or not aux_model:
+            if _aux_cfg_provider and _aux_cfg_provider != "auto":
+                msg = (
+                    "⚠ Configured auxiliary compression provider "
+                    f"'{_aux_cfg_provider}' is unavailable — context "
+                    "compression will drop middle turns without a summary. "
+                    "Check auxiliary.compression in config.yaml and "
+                    "reauthenticate that provider."
+                )
+            else:
+                msg = (
+                    "⚠ No auxiliary LLM provider configured — context "
+                    "compression will drop middle turns without a summary. "
+                    "Run `hermes setup` or set OPENROUTER_API_KEY."
+                )
+            agent._compression_warning = msg
+            agent._emit_status(msg)
+            logger.warning(
+                "No auxiliary LLM provider for compression — "
+                "summaries will be unavailable."
+            )
+            return
+
+        aux_base_url = str(getattr(client, "base_url", ""))
+        aux_api_key = str(getattr(client, "api_key", ""))
+
+        aux_context = get_model_context_length(
+            aux_model,
+            base_url=aux_base_url,
+            api_key=aux_api_key,
+            config_context_length=getattr(agent, "_aux_compression_context_length_config", None),
+            # Each model must be resolved with its own provider so that
+            # provider-specific paths (e.g. Bedrock static table, OpenRouter API)
+            # are invoked for the correct client, not inherited from the main model.
+            provider=(_aux_cfg_provider if _aux_cfg_provider and _aux_cfg_provider != "auto" else getattr(agent, "provider", "")),
+            custom_providers=agent._custom_providers,
+        )
+
+        # Hard floor: the auxiliary compression model must have at least
+        # MINIMUM_CONTEXT_LENGTH (64K) tokens of context.  The main model
+        # is already required to meet this floor (checked earlier in
+        # __init__), so the compression model must too — otherwise it
+        # cannot summarise a full threshold-sized window of main-model
+        # content.  Mirrors the main-model rejection pattern.
+        if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
+            raise ValueError(
+                f"Auxiliary compression model {aux_model} has a context "
+                f"window of {aux_context:,} tokens, which is below the "
+                f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
+                f"Agent.  Choose a compression model with at least "
+                f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
+                f"auxiliary.compression.model in config.yaml), or set "
+                f"auxiliary.compression.context_length to override the "
+                f"detected value if it is wrong."
+            )
+
+        threshold = agent.context_compressor.threshold_tokens
+        if aux_context < threshold:
+            # Auto-correct: lower the live session threshold so
+            # compression actually works this session.  The hard floor
+            # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
+            # so the new threshold is always >= 64K.
+            #
+            # The compression summariser sends a single user-role
+            # prompt (no system prompt, no tools) to the aux model, so
+            # new_threshold == aux_context is safe: the request is
+            # the raw messages plus a small summarisation instruction.
+            old_threshold = threshold
+            new_threshold = aux_context
+            agent.context_compressor.threshold_tokens = new_threshold
+            # Keep threshold_percent in sync so future main-model
+            # context_length changes (update_model) re-derive from a
+            # sensible number rather than the original too-high value.
+            main_ctx = agent.context_compressor.context_length
+            if main_ctx:
+                agent.context_compressor.threshold_percent = (
+                    new_threshold / main_ctx
+                )
+            safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
+            # Build human-readable "model (provider)" labels for both
+            # the main model and the compression model so users can
+            # tell at a glance which provider each side is actually
+            # using. When the configured provider is empty or "auto",
+            # fall back to the client's base_url hostname.
+            _main_model = getattr(agent, "model", "") or "?"
+            _main_provider = getattr(agent, "provider", "") or ""
+            _aux_provider_label = (
+                _aux_cfg_provider
+                if _aux_cfg_provider and _aux_cfg_provider != "auto"
+                else ""
+            )
+            if not _aux_provider_label:
+                try:
+                    from urllib.parse import urlparse
+                    _aux_provider_label = (
+                        urlparse(aux_base_url).hostname or aux_base_url
+                    )
+                except Exception:
+                    _aux_provider_label = aux_base_url or "auto"
+            _main_label = (
+                f"{_main_model} ({_main_provider})"
+                if _main_provider
+                else _main_model
+            )
+            _aux_label = f"{aux_model} ({_aux_provider_label})"
+            msg = (
+                f"⚠ Compression model {_aux_label} context is "
+                f"{aux_context:,} tokens, but the main model "
+                f"{_main_label}'s compression threshold was "
+                f"{old_threshold:,} tokens. "
+                f"Auto-lowered this session's threshold to "
+                f"{new_threshold:,} tokens so compression can run.\n"
+                f"  To make this permanent, edit config.yaml — either:\n"
+                f"  1. Use a larger compression model:\n"
+                f"       auxiliary:\n"
+                f"         compression:\n"
+                f"           model: <model-with-{old_threshold:,}+-context>\n"
+                f"  2. Lower the compression threshold:\n"
+                f"       compression:\n"
+                f"         threshold: 0.{safe_pct:02d}"
+            )
+            agent._compression_warning = msg
+            agent._emit_status(msg)
+            logger.warning(
+                "Auxiliary compression model %s has %d token context, "
+                "below the main model's compression threshold of %d "
+                "tokens — auto-lowered session threshold to %d to "
+                "keep compression working.",
+                aux_model,
+                aux_context,
+                old_threshold,
+                new_threshold,
+            )
+    except ValueError:
+        # Hard rejections (aux below minimum context) must propagate
+        # so the session refuses to start.
+        raise
+    except Exception as exc:
+        logger.debug(
+            "Compression feasibility check failed (non-fatal): %s", exc
+        )
+
+
+def replay_compression_warning(agent: Any) -> None:
+    """Re-send the compression warning through ``status_callback``.
+
+    During ``__init__`` the gateway's ``status_callback`` is not yet
+    wired, so ``_emit_status`` only reaches ``_vprint`` (CLI).  This
+    method is called once at the start of the first
+    ``run_conversation()`` — by then the gateway has set the callback,
+    so every platform (Telegram, Discord, Slack, etc.) receives the
+    warning.
+    """
+    msg = getattr(agent, "_compression_warning", None)
+    if msg and agent.status_callback:
+        try:
+            agent.status_callback("lifecycle", msg)
+        except Exception:
+            pass
+
+
+def compress_context(
+    agent: Any,
+    messages: list,
+    system_message: str,
+    *,
+    approx_tokens: Optional[int] = None,
+    task_id: str = "default",
+    focus_topic: Optional[str] = None,
+) -> Tuple[list, str]:
+    """Compress conversation context and split the session in SQLite.
+
+    Args:
+        agent: The owning :class:`AIAgent`.
+        messages: Current message history (will be summarised).
+        system_message: Current system prompt; rebuilt after compression.
+        approx_tokens: Pre-compression token estimate, logged for ops.
+        task_id: Tool task scope (used for clearing file-read dedup state).
+        focus_topic: Optional focus string for guided compression — the
+            summariser will prioritise preserving information related to
+            this topic.  Inspired by Claude Code's ``/compact <focus>``.
+
+    Returns:
+        ``(compressed_messages, new_system_prompt)`` tuple.
+    """
+    _pre_msg_count = len(messages)
+    logger.info(
+        "context compression started: session=%s messages=%d tokens=~%s model=%s focus=%r",
+        agent.session_id or "none", _pre_msg_count,
+        f"{approx_tokens:,}" if approx_tokens else "unknown", agent.model,
+        focus_topic,
+    )
+    agent._emit_status(
+        "🗜️ Compacting context — summarizing earlier conversation so I can continue..."
+    )
+
+    # Notify external memory provider before compression discards context
+    if agent._memory_manager:
+        try:
+            agent._memory_manager.on_pre_compress(messages)
+        except Exception:
+            pass
+
+    try:
+        compressed = agent.context_compressor.compress(messages, current_tokens=approx_tokens, focus_topic=focus_topic)
+    except TypeError:
+        # Plugin context engine with strict signature that doesn't accept
+        # focus_topic — fall back to calling without it.
+        compressed = agent.context_compressor.compress(messages, current_tokens=approx_tokens)
+
+    summary_error = getattr(agent.context_compressor, "_last_summary_error", None)
+    if summary_error:
+        if getattr(agent, "_last_compression_summary_warning", None) != summary_error:
+            agent._last_compression_summary_warning = summary_error
+            agent._emit_warning(
+                f"⚠ Compression summary failed: {summary_error}. "
+                "Inserted a fallback context marker."
+            )
+    else:
+        # No hard failure — but did the configured aux model error out
+        # and get recovered by retrying on main?  Surface that so users
+        # know their auxiliary.compression.model setting is broken even
+        # though compression succeeded.
+        _aux_fail_model = getattr(agent.context_compressor, "_last_aux_model_failure_model", None)
+        _aux_fail_err = getattr(agent.context_compressor, "_last_aux_model_failure_error", None)
+        if _aux_fail_model:
+            # Dedup on (model, error) so we don't spam on every compaction
+            _aux_key = (_aux_fail_model, _aux_fail_err)
+            if getattr(agent, "_last_aux_fallback_warning_key", None) != _aux_key:
+                agent._last_aux_fallback_warning_key = _aux_key
+                agent._emit_warning(
+                    f"ℹ Configured compression model '{_aux_fail_model}' failed "
+                    f"({_aux_fail_err or 'unknown error'}). Recovered using main model — "
+                    "check auxiliary.compression.model in config.yaml."
+                )
+
+    todo_snapshot = agent._todo_store.format_for_injection()
+    if todo_snapshot:
+        compressed.append({"role": "user", "content": todo_snapshot})
+
+    agent._invalidate_system_prompt()
+    new_system_prompt = agent._build_system_prompt(system_message)
+    agent._cached_system_prompt = new_system_prompt
+
+    if agent._session_db:
+        try:
+            # Propagate title to the new session with auto-numbering
+            old_title = agent._session_db.get_session_title(agent.session_id)
+            # Trigger memory extraction on the old session before it rotates.
+            agent.commit_memory_session(messages)
+            agent._session_db.end_session(agent.session_id, "compression")
+            old_session_id = agent.session_id
+            agent.session_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"
+            os.environ["HERMES_SESSION_ID"] = agent.session_id
+            try:
+                from gateway.session_context import _SESSION_ID
+                _SESSION_ID.set(agent.session_id)
+            except Exception:
+                pass
+            # Update session_log_file to point to the new session's JSON file
+            agent.session_log_file = agent.logs_dir / f"session_{agent.session_id}.json"
+            agent._session_db_created = False
+            agent._session_db.create_session(
+                session_id=agent.session_id,
+                source=agent.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
+                model=agent.model,
+                model_config=agent._session_init_model_config,
+                parent_session_id=old_session_id,
+            )
+            agent._session_db_created = True
+            # Auto-number the title for the continuation session
+            if old_title:
+                try:
+                    new_title = agent._session_db.get_next_title_in_lineage(old_title)
+                    agent._session_db.set_session_title(agent.session_id, new_title)
+                except (ValueError, Exception) as e:
+                    logger.debug("Could not propagate title on compression: %s", e)
+            agent._session_db.update_system_prompt(agent.session_id, new_system_prompt)
+            # Reset flush cursor — new session starts with no messages written
+            agent._last_flushed_db_idx = 0
+        except Exception as e:
+            logger.warning("Session DB compression split failed — new session will NOT be indexed: %s", e)
+
+    # Notify the context engine that the session_id rotated because of
+    # compression (not a fresh /new). Plugin engines (e.g. hermes-lcm) use
+    # boundary_reason="compression" to preserve DAG lineage across the
+    # rollover instead of re-initializing fresh per-session state.
+    # See hermes-lcm#68. Built-in ContextCompressor ignores kwargs.
+    try:
+        _old_sid = locals().get("old_session_id")
+        if _old_sid and hasattr(agent.context_compressor, "on_session_start"):
+            agent.context_compressor.on_session_start(
+                agent.session_id or "",
+                boundary_reason="compression",
+                old_session_id=_old_sid,
+            )
+    except Exception as _ce_err:
+        logger.debug("context engine on_session_start (compression): %s", _ce_err)
+
+    # Notify memory providers of the compression-driven session_id rotation
+    # so provider-cached per-session state (Hindsight's _document_id,
+    # accumulated turn buffers, counters) refreshes. reset=False because
+    # the logical conversation continues; only the id and DB row rolled
+    # over. See #6672.
+    try:
+        _old_sid = locals().get("old_session_id")
+        if _old_sid and agent._memory_manager:
+            agent._memory_manager.on_session_switch(
+                agent.session_id or "",
+                parent_session_id=_old_sid,
+                reset=False,
+                reason="compression",
+            )
+    except Exception as _me_err:
+        logger.debug("memory manager on_session_switch (compression): %s", _me_err)
+
+    # Warn on repeated compressions (quality degrades with each pass)
+    _cc = agent.context_compressor.compression_count
+    if _cc >= 2:
+        agent._vprint(
+            f"{agent.log_prefix}⚠️  Session compressed {_cc} times — "
+            f"accuracy may degrade. Consider /new to start fresh.",
+            force=True,
+        )
+
+    # Update token estimate after compaction so pressure calculations
+    # use the post-compression count, not the stale pre-compression one.
+    # Use estimate_request_tokens_rough() so tool schemas are included —
+    # with 50+ tools enabled, schemas alone can add 20-30K tokens, and
+    # omitting them delays the next compression cycle far past the
+    # configured threshold (issue #14695).
+    _compressed_est = estimate_request_tokens_rough(
+        compressed,
+        system_prompt=new_system_prompt or "",
+        tools=agent.tools or None,
+    )
+    agent.context_compressor.last_prompt_tokens = _compressed_est
+    agent.context_compressor.last_completion_tokens = 0
+
+    # Clear the file-read dedup cache.  After compression the original
+    # read content is summarised away — if the model re-reads the same
+    # file it needs the full content, not a "file unchanged" stub.
+    try:
+        from tools.file_tools import reset_file_dedup
+        reset_file_dedup(task_id)
+    except Exception:
+        pass
+
+    logger.info(
+        "context compression done: session=%s messages=%d->%d tokens=~%s",
+        agent.session_id or "none", _pre_msg_count, len(compressed),
+        f"{_compressed_est:,}",
+    )
+    return compressed, new_system_prompt
+
+
+def try_shrink_image_parts_in_messages(api_messages: list) -> bool:
+    """Re-encode all native image parts at a smaller size to recover from
+    image-too-large errors (Anthropic 5 MB, unknown other providers).
+
+    Mutates ``api_messages`` in place. Returns True if any image part was
+    actually replaced, False if there were no image parts to shrink or
+    Pillow couldn't help (caller should surface the original error).
+
+    Strategy: look for ``image_url`` / ``input_image`` parts carrying a
+    ``data:image/...;base64,...`` payload.  For each one whose encoded
+    size exceeds 4 MB (a safe target that slides under Anthropic's 5 MB
+    ceiling with header overhead), write the base64 to a tempfile, call
+    ``vision_tools._resize_image_for_vision`` to produce a smaller data
+    URL, and substitute it in place.
+
+    Non-data-URL images (http/https URLs) are not touched — the provider
+    fetches those itself and the size limit is different.
+    """
+    if not api_messages:
+        return False
+
+    try:
+        from tools.vision_tools import _resize_image_for_vision
+    except Exception as exc:
+        logger.warning("image-shrink recovery: vision_tools unavailable — %s", exc)
+        return False
+
+    # 4 MB target leaves comfortable headroom under Anthropic's 5 MB.
+    # Non-Anthropic providers we haven't observed rejecting are fine with
+    # much larger; shrinking to 4 MB here loses quality but only fires
+    # after a confirmed provider rejection, so the alternative is failure.
+    target_bytes = 4 * 1024 * 1024
+    changed_count = 0
+
+    def _shrink_data_url(url: str) -> Optional[str]:
+        """Return a smaller data URL, or None if shrink can't help."""
+        if not isinstance(url, str) or not url.startswith("data:"):
+            return None
+        if len(url) <= target_bytes:
+            # This specific image wasn't the oversized one.
+            return None
+        try:
+            header, _, data = url.partition(",")
+            mime = "image/jpeg"
+            if header.startswith("data:"):
+                mime_part = header[len("data:"):].split(";", 1)[0].strip()
+                if mime_part.startswith("image/"):
+                    mime = mime_part
+            import base64 as _b64
+            raw = _b64.b64decode(data)
+            suffix = {
+                "image/png": ".png", "image/gif": ".gif", "image/webp": ".webp",
+                "image/jpeg": ".jpg", "image/jpg": ".jpg", "image/bmp": ".bmp",
+            }.get(mime, ".jpg")
+            tmp = tempfile.NamedTemporaryFile(
+                prefix="hermes_shrink_", suffix=suffix, delete=False,
+            )
+            try:
+                tmp.write(raw)
+                tmp.close()
+                resized = _resize_image_for_vision(
+                    Path(tmp.name),
+                    mime_type=mime,
+                    max_base64_bytes=target_bytes,
+                )
+            finally:
+                try:
+                    Path(tmp.name).unlink(missing_ok=True)
+                except Exception:
+                    pass
+            if not resized or len(resized) >= len(url):
+                # Shrink didn't help (or made it bigger — corrupt input?).
+                return None
+            return resized
+        except Exception as exc:
+            logger.warning("image-shrink recovery: re-encode failed — %s", exc)
+            return None
+
+    for msg in api_messages:
+        if not isinstance(msg, dict):
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        for part in content:
+            if not isinstance(part, dict):
+                continue
+            ptype = part.get("type")
+            if ptype not in {"image_url", "input_image"}:
+                continue
+            image_value = part.get("image_url")
+            # OpenAI chat.completions: {"image_url": {"url": "data:..."}}
+            # OpenAI Responses: {"image_url": "data:..."}
+            if isinstance(image_value, dict):
+                url = image_value.get("url", "")
+                resized = _shrink_data_url(url)
+                if resized:
+                    image_value["url"] = resized
+                    changed_count += 1
+            elif isinstance(image_value, str):
+                resized = _shrink_data_url(image_value)
+                if resized:
+                    part["image_url"] = resized
+                    changed_count += 1
+
+    if changed_count:
+        logger.info(
+            "image-shrink recovery: re-encoded %d image part(s) to fit under %.0f MB",
+            changed_count, target_bytes / (1024 * 1024),
+        )
+    return changed_count > 0
+
+
+__all__ = [
+    "check_compression_model_feasibility",
+    "replay_compression_warning",
+    "compress_context",
+    "try_shrink_image_parts_in_messages",
+]
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
--- a/agent/iteration_budget.py
+++ b/agent/iteration_budget.py
@ -0,0 +1,62 @@
+"""Per-agent iteration budget — thread-safe consume/refund counter.
+
+Extracted from ``run_agent.py``.  Each ``AIAgent`` instance (parent or
+subagent) holds an :class:`IterationBudget`; the parent's cap comes from
+``max_iterations`` (default 90), each subagent's cap comes from
+``delegation.max_iterations`` (default 50).
+
+``run_agent`` re-exports ``IterationBudget`` so existing
+``from run_agent import IterationBudget`` imports keep working unchanged.
+"""
+
+from __future__ import annotations
+
+import threading
+
+
+class IterationBudget:
+    """Thread-safe iteration counter for an agent.
+
+    Each agent (parent or subagent) gets its own ``IterationBudget``.
+    The parent's budget is capped at ``max_iterations`` (default 90).
+    Each subagent gets an independent budget capped at
+    ``delegation.max_iterations`` (default 50) — this means total
+    iterations across parent + subagents can exceed the parent's cap.
+    Users control the per-subagent limit via ``delegation.max_iterations``
+    in config.yaml.
+
+    ``execute_code`` (programmatic tool calling) iterations are refunded via
+    :meth:`refund` so they don't eat into the budget.
+    """
+
+    def __init__(self, max_total: int):
+        self.max_total = max_total
+        self._used = 0
+        self._lock = threading.Lock()
+
+    def consume(self) -> bool:
+        """Try to consume one iteration.  Returns True if allowed."""
+        with self._lock:
+            if self._used >= self.max_total:
+                return False
+            self._used += 1
+            return True
+
+    def refund(self) -> None:
+        """Give back one iteration (e.g. for execute_code turns)."""
+        with self._lock:
+            if self._used > 0:
+                self._used -= 1
+
+    @property
+    def used(self) -> int:
+        with self._lock:
+            return self._used
+
+    @property
+    def remaining(self) -> int:
+        with self._lock:
+            return max(0, self.max_total - self._used)
+
+
+__all__ = ["IterationBudget"]
--- a/agent/message_sanitization.py
+++ b/agent/message_sanitization.py
@ -0,0 +1,444 @@
+"""Message and tool-payload sanitization helpers.
+
+Pure functions extracted from ``run_agent.py`` so the AIAgent module can
+stay focused on the conversation loop.  These walk OpenAI-format message
+lists and structured payloads, repairing or stripping problematic
+characters that would otherwise crash ``json.dumps`` inside the OpenAI
+SDK or be rejected by upstream APIs.
+
+All helpers are stateless and side-effect-free except for in-place
+mutation of their input (where documented).  Backward-compatible
+re-exports from ``run_agent`` remain in place so existing imports
+``from run_agent import _sanitize_surrogates`` keep working.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# Lone surrogate code points are invalid in UTF-8 and crash json.dumps
+# inside the OpenAI SDK.  Used by every surrogate-sanitization helper
+# below as well as by run_agent and the CLI for paste-from-clipboard
+# scrubbing.
+_SURROGATE_RE = re.compile(r'[\ud800-\udfff]')
+
+
+def _sanitize_surrogates(text: str) -> str:
+    """Replace lone surrogate code points with U+FFFD (replacement character).
+
+    Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the
+    OpenAI SDK.  This is a fast no-op when the text contains no surrogates.
+    """
+    if _SURROGATE_RE.search(text):
+        return _SURROGATE_RE.sub('\ufffd', text)
+    return text
+
+
+def _sanitize_structure_surrogates(payload: Any) -> bool:
+    """Replace surrogate code points in nested dict/list payloads in-place.
+
+    Mirror of ``_sanitize_structure_non_ascii`` but for surrogate recovery.
+    Used to scrub nested structured fields (e.g. ``reasoning_details`` — an
+    array of dicts with ``summary``/``text`` strings) that flat per-field
+    checks don't reach.  Returns True if any surrogates were replaced.
+    """
+    found = False
+
+    def _walk(node):
+        nonlocal found
+        if isinstance(node, dict):
+            for key, value in node.items():
+                if isinstance(value, str):
+                    if _SURROGATE_RE.search(value):
+                        node[key] = _SURROGATE_RE.sub('\ufffd', value)
+                        found = True
+                elif isinstance(value, (dict, list)):
+                    _walk(value)
+        elif isinstance(node, list):
+            for idx, value in enumerate(node):
+                if isinstance(value, str):
+                    if _SURROGATE_RE.search(value):
+                        node[idx] = _SURROGATE_RE.sub('\ufffd', value)
+                        found = True
+                elif isinstance(value, (dict, list)):
+                    _walk(value)
+
+    _walk(payload)
+    return found
+
+
+def _sanitize_messages_surrogates(messages: list) -> bool:
+    """Sanitize surrogate characters from all string content in a messages list.
+
+    Walks message dicts in-place. Returns True if any surrogates were found
+    and replaced, False otherwise. Covers content/text, name, tool call
+    metadata/arguments, AND any additional string or nested structured fields
+    (``reasoning``, ``reasoning_content``, ``reasoning_details``, etc.) so
+    retries don't fail on a non-content field.  Byte-level reasoning models
+    (xiaomi/mimo, kimi, glm) can emit lone surrogates in reasoning output
+    that flow through to ``api_messages["reasoning_content"]`` on the next
+    turn and crash json.dumps inside the OpenAI SDK.
+    """
+    found = False
+    for msg in messages:
+        if not isinstance(msg, dict):
+            continue
+        content = msg.get("content")
+        if isinstance(content, str) and _SURROGATE_RE.search(content):
+            msg["content"] = _SURROGATE_RE.sub('\ufffd', content)
+            found = True
+        elif isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    text = part.get("text")
+                    if isinstance(text, str) and _SURROGATE_RE.search(text):
+                        part["text"] = _SURROGATE_RE.sub('\ufffd', text)
+                        found = True
+        name = msg.get("name")
+        if isinstance(name, str) and _SURROGATE_RE.search(name):
+            msg["name"] = _SURROGATE_RE.sub('\ufffd', name)
+            found = True
+        tool_calls = msg.get("tool_calls")
+        if isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if not isinstance(tc, dict):
+                    continue
+                tc_id = tc.get("id")
+                if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id):
+                    tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id)
+                    found = True
+                fn = tc.get("function")
+                if isinstance(fn, dict):
+                    fn_name = fn.get("name")
+                    if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name):
+                        fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name)
+                        found = True
+                    fn_args = fn.get("arguments")
+                    if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args):
+                        fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args)
+                        found = True
+        # Walk any additional string / nested fields (reasoning,
+        # reasoning_content, reasoning_details, etc.) — surrogates from
+        # byte-level reasoning models (xiaomi/mimo, kimi, glm) can lurk
+        # in these fields and aren't covered by the per-field checks above.
+        # Matches _sanitize_messages_non_ascii's coverage (PR #10537).
+        for key, value in msg.items():
+            if key in {"content", "name", "tool_calls", "role"}:
+                continue
+            if isinstance(value, str):
+                if _SURROGATE_RE.search(value):
+                    msg[key] = _SURROGATE_RE.sub('\ufffd', value)
+                    found = True
+            elif isinstance(value, (dict, list)):
+                if _sanitize_structure_surrogates(value):
+                    found = True
+    return found
+
+
+def _escape_invalid_chars_in_json_strings(raw: str) -> str:
+    """Escape unescaped control chars inside JSON string values.
+
+    Walks the raw JSON character-by-character, tracking whether we are
+    inside a double-quoted string. Inside strings, replaces literal
+    control characters (0x00-0x1F) that aren't already part of an escape
+    sequence with their ``\\uXXXX`` equivalents. Pass-through for everything
+    else.
+
+    Ported from #12093 — complements the other repair passes in
+    ``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is
+    not enough (e.g. llama.cpp backends that emit literal apostrophes or
+    tabs alongside other malformations).
+    """
+    out: list[str] = []
+    in_string = False
+    i = 0
+    n = len(raw)
+    while i < n:
+        ch = raw[i]
+        if in_string:
+            if ch == "\\" and i + 1 < n:
+                # Already-escaped char — pass through as-is
+                out.append(ch)
+                out.append(raw[i + 1])
+                i += 2
+                continue
+            if ch == '"':
+                in_string = False
+                out.append(ch)
+            elif ord(ch) < 0x20:
+                out.append(f"\\u{ord(ch):04x}")
+            else:
+                out.append(ch)
+        else:
+            if ch == '"':
+                in_string = True
+            out.append(ch)
+        i += 1
+    return "".join(out)
+
+
+def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
+    """Attempt to repair malformed tool_call argument JSON.
+
+    Models like GLM-5.1 via Ollama can produce truncated JSON, trailing
+    commas, Python ``None``, etc.  The API proxy rejects these with HTTP 400
+    "invalid tool call arguments".  This function applies common repairs;
+    if all fail it returns ``"{}"`` so the request succeeds (better than
+    crashing the session).  All repairs are logged at WARNING level.
+    """
+    raw_stripped = raw_args.strip() if isinstance(raw_args, str) else ""
+
+    # Fast-path: empty / whitespace-only -> empty object
+    if not raw_stripped:
+        logger.warning("Sanitized empty tool_call arguments for %s", tool_name)
+        return "{}"
+
+    # Python-literal None -> normalise to {}
+    if raw_stripped == "None":
+        logger.warning("Sanitized Python-None tool_call arguments for %s", tool_name)
+        return "{}"
+
+    # Repair pass 0: llama.cpp backends sometimes emit literal control
+    # characters (tabs, newlines) inside JSON string values. json.loads
+    # with strict=False accepts these and lets us re-serialise the
+    # result into wire-valid JSON without any string surgery. This is
+    # the most common local-model repair case (#12068).
+    try:
+        parsed = json.loads(raw_stripped, strict=False)
+        reserialised = json.dumps(parsed, separators=(",", ":"))
+        if reserialised != raw_stripped:
+            logger.warning(
+                "Repaired unescaped control chars in tool_call arguments for %s",
+                tool_name,
+            )
+        return reserialised
+    except (json.JSONDecodeError, TypeError, ValueError):
+        pass
+
+    # Attempt common JSON repairs
+    fixed = raw_stripped
+    # 1. Strip trailing commas before } or ]
+    fixed = re.sub(r',\s*([}\]])', r'\1', fixed)
+    # 2. Close unclosed structures
+    open_curly = fixed.count('{') - fixed.count('}')
+    open_bracket = fixed.count('[') - fixed.count(']')
+    if open_curly > 0:
+        fixed += '}' * open_curly
+    if open_bracket > 0:
+        fixed += ']' * open_bracket
+    # 3. Remove excess closing braces/brackets (bounded to 50 iterations)
+    for _ in range(50):
+        try:
+            json.loads(fixed)
+            break
+        except json.JSONDecodeError:
+            if fixed.endswith('}') and fixed.count('}') > fixed.count('{'):
+                fixed = fixed[:-1]
+            elif fixed.endswith(']') and fixed.count(']') > fixed.count('['):
+                fixed = fixed[:-1]
+            else:
+                break
+
+    try:
+        json.loads(fixed)
+        logger.warning(
+            "Repaired malformed tool_call arguments for %s: %s → %s",
+            tool_name, raw_stripped[:80], fixed[:80],
+        )
+        return fixed
+    except json.JSONDecodeError:
+        pass
+
+    # Repair pass 4: escape unescaped control chars inside JSON strings,
+    # then retry. Catches cases where strict=False alone fails because
+    # other malformations are present too.
+    try:
+        escaped = _escape_invalid_chars_in_json_strings(fixed)
+        if escaped != fixed:
+            json.loads(escaped)
+            logger.warning(
+                "Repaired control-char-laced tool_call arguments for %s: %s → %s",
+                tool_name, raw_stripped[:80], escaped[:80],
+            )
+            return escaped
+    except (json.JSONDecodeError, TypeError, ValueError):
+        pass
+
+    # Last resort: replace with empty object so the API request doesn't
+    # crash the entire session.
+    logger.warning(
+        "Unrepairable tool_call arguments for %s — "
+        "replaced with empty object (was: %s)",
+        tool_name, raw_stripped[:80],
+    )
+    return "{}"
+
+
+def _strip_non_ascii(text: str) -> str:
+    """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing.
+
+    Used as a last resort when the system encoding is ASCII and can't handle
+    any non-ASCII characters (e.g. LANG=C on Chromebooks).
+    """
+    return text.encode('ascii', errors='ignore').decode('ascii')
+
+
+def _sanitize_messages_non_ascii(messages: list) -> bool:
+    """Strip non-ASCII characters from all string content in a messages list.
+
+    This is a last-resort recovery for systems with ASCII-only encoding
+    (LANG=C, Chromebooks, minimal containers).  Returns True if any
+    non-ASCII content was found and sanitized.
+    """
+    found = False
+    for msg in messages:
+        if not isinstance(msg, dict):
+            continue
+        # Sanitize content (string)
+        content = msg.get("content")
+        if isinstance(content, str):
+            sanitized = _strip_non_ascii(content)
+            if sanitized != content:
+                msg["content"] = sanitized
+                found = True
+        elif isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    text = part.get("text")
+                    if isinstance(text, str):
+                        sanitized = _strip_non_ascii(text)
+                        if sanitized != text:
+                            part["text"] = sanitized
+                            found = True
+        # Sanitize name field (can contain non-ASCII in tool results)
+        name = msg.get("name")
+        if isinstance(name, str):
+            sanitized = _strip_non_ascii(name)
+            if sanitized != name:
+                msg["name"] = sanitized
+                found = True
+        # Sanitize tool_calls
+        tool_calls = msg.get("tool_calls")
+        if isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if isinstance(tc, dict):
+                    fn = tc.get("function", {})
+                    if isinstance(fn, dict):
+                        fn_args = fn.get("arguments")
+                        if isinstance(fn_args, str):
+                            sanitized = _strip_non_ascii(fn_args)
+                            if sanitized != fn_args:
+                                fn["arguments"] = sanitized
+                                found = True
+        # Sanitize any additional top-level string fields (e.g. reasoning_content)
+        for key, value in msg.items():
+            if key in {"content", "name", "tool_calls", "role"}:
+                continue
+            if isinstance(value, str):
+                sanitized = _strip_non_ascii(value)
+                if sanitized != value:
+                    msg[key] = sanitized
+                    found = True
+    return found
+
+
+def _sanitize_tools_non_ascii(tools: list) -> bool:
+    """Strip non-ASCII characters from tool payloads in-place."""
+    return _sanitize_structure_non_ascii(tools)
+
+
+def _strip_images_from_messages(messages: list) -> bool:
+    """Remove image_url content parts from all messages in-place.
+
+    Called when a server signals it does not support images (e.g.
+    "Only 'text' content type is supported.").  Mutates messages so the
+    next API call sends text only.
+
+    Preserves message alternation invariants:
+      * ``tool``-role messages whose content was entirely images are replaced
+        with a plaintext placeholder, NOT deleted — deleting them would leave
+        the paired ``tool_call_id`` on the prior assistant message unmatched,
+        which providers reject with HTTP 400.
+      * Non-tool messages whose content becomes empty are dropped.  In
+        practice this only hits synthetic image-only user messages appended
+        for attachment delivery; real user turns always include text.
+
+    Returns True if any image parts were removed.
+    """
+    found = False
+    to_delete = []
+    for i, msg in enumerate(messages):
+        if not isinstance(msg, dict):
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        new_parts = []
+        for part in content:
+            if isinstance(part, dict) and part.get("type") in {"image_url", "image", "input_image"}:
+                found = True
+            else:
+                new_parts.append(part)
+        if len(new_parts) < len(content):
+            if new_parts:
+                msg["content"] = new_parts
+            elif msg.get("role") == "tool":
+                # Preserve tool_call_id linkage — providers require every
+                # assistant tool_call to have a matching tool response.
+                msg["content"] = "[image content removed — server does not support images]"
+            else:
+                # Synthetic image-only user/assistant message with no text;
+                # safe to drop.
+                to_delete.append(i)
+    for i in reversed(to_delete):
+        del messages[i]
+    return found
+
+
+def _sanitize_structure_non_ascii(payload: Any) -> bool:
+    """Strip non-ASCII characters from nested dict/list payloads in-place."""
+    found = False
+
+    def _walk(node):
+        nonlocal found
+        if isinstance(node, dict):
+            for key, value in node.items():
+                if isinstance(value, str):
+                    sanitized = _strip_non_ascii(value)
+                    if sanitized != value:
+                        node[key] = sanitized
+                        found = True
+                elif isinstance(value, (dict, list)):
+                    _walk(value)
+        elif isinstance(node, list):
+            for idx, value in enumerate(node):
+                if isinstance(value, str):
+                    sanitized = _strip_non_ascii(value)
+                    if sanitized != value:
+                        node[idx] = sanitized
+                        found = True
+                elif isinstance(value, (dict, list)):
+                    _walk(value)
+
+    _walk(payload)
+    return found
+
+
+__all__ = [
+    "_SURROGATE_RE",
+    "_sanitize_surrogates",
+    "_sanitize_structure_surrogates",
+    "_sanitize_messages_surrogates",
+    "_escape_invalid_chars_in_json_strings",
+    "_repair_tool_call_arguments",
+    "_strip_non_ascii",
+    "_sanitize_messages_non_ascii",
+    "_sanitize_tools_non_ascii",
+    "_strip_images_from_messages",
+    "_sanitize_structure_non_ascii",
+]
--- a/agent/process_bootstrap.py
+++ b/agent/process_bootstrap.py
@ -0,0 +1,167 @@
+"""Process-level bootstrap helpers for ``run_agent``.
+
+Three concerns, all tied to ``AIAgent`` boot-time / runtime IO setup:
+
+1. **Lazy OpenAI SDK import** — ``_load_openai_cls`` + ``_OpenAIProxy``
+   defer the 240ms-ish ``from openai import OpenAI`` cost until first use,
+   while preserving ``isinstance(client, OpenAI)`` checks and
+   ``patch("run_agent.OpenAI", ...)`` test patterns.
+
+2. **Crash-resistant stdio** — ``_SafeWriter`` wraps stdout/stderr so
+   ``OSError: Input/output error`` from broken pipes (systemd, Docker,
+   thread teardown races) cannot crash the agent.  ``_install_safe_stdio``
+   applies the wrapper.
+
+3. **HTTP proxy resolution** — ``_get_proxy_from_env`` reads
+   ``HTTPS_PROXY`` / ``HTTP_PROXY`` / ``ALL_PROXY``;
+   ``_get_proxy_for_base_url`` respects ``NO_PROXY`` for the given base URL.
+
+``run_agent`` re-exports every name so existing
+``from run_agent import _get_proxy_from_env`` imports keep working
+unchanged.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import urllib.request
+from typing import Optional
+
+from utils import base_url_hostname, normalize_proxy_url
+
+
+# Cached at module level so we only pay the OpenAI SDK import cost once
+# per process (after the first lazy load).
+_OPENAI_CLS_CACHE = None
+
+
+def _load_openai_cls() -> type:
+    """Import and cache ``openai.OpenAI``."""
+    global _OPENAI_CLS_CACHE
+    if _OPENAI_CLS_CACHE is None:
+        from openai import OpenAI as _cls
+        _OPENAI_CLS_CACHE = _cls
+    return _OPENAI_CLS_CACHE
+
+
+class _OpenAIProxy:
+    """Module-level proxy that looks like ``openai.OpenAI`` but imports lazily."""
+
+    __slots__ = ()
+
+    def __call__(self, *args, **kwargs):
+        return _load_openai_cls()(*args, **kwargs)
+
+    def __instancecheck__(self, obj):
+        return isinstance(obj, _load_openai_cls())
+
+    def __repr__(self):
+        return "<lazy openai.OpenAI proxy>"
+
+
+class _SafeWriter:
+    """Transparent stdio wrapper that catches OSError/ValueError from broken pipes.
+
+    When hermes-agent runs as a systemd service, Docker container, or headless
+    daemon, the stdout/stderr pipe can become unavailable (idle timeout, buffer
+    exhaustion, socket reset). Any print() call then raises
+    ``OSError: [Errno 5] Input/output error``, which can crash agent setup or
+    run_conversation() — especially via double-fault when an except handler
+    also tries to print.
+
+    Additionally, when subagents run in ThreadPoolExecutor threads, the shared
+    stdout handle can close between thread teardown and cleanup, raising
+    ``ValueError: I/O operation on closed file`` instead of OSError.
+
+    This wrapper delegates all writes to the underlying stream and silently
+    catches both OSError and ValueError. It is transparent when the wrapped
+    stream is healthy.
+    """
+
+    __slots__ = ("_inner",)
+
+    def __init__(self, inner):
+        object.__setattr__(self, "_inner", inner)
+
+    def write(self, data):
+        try:
+            return self._inner.write(data)
+        except (OSError, ValueError):
+            return len(data) if isinstance(data, str) else 0
+
+    def flush(self):
+        try:
+            self._inner.flush()
+        except (OSError, ValueError):
+            pass
+
+    def fileno(self):
+        return self._inner.fileno()
+
+    def isatty(self):
+        try:
+            return self._inner.isatty()
+        except (OSError, ValueError):
+            return False
+
+    def __getattr__(self, name):
+        return getattr(self._inner, name)
+
+
+def _get_proxy_from_env() -> Optional[str]:
+    """Read proxy URL from environment variables.
+
+    Checks HTTPS_PROXY, HTTP_PROXY, ALL_PROXY (and lowercase variants) in order.
+    Returns the first valid proxy URL found, or None if no proxy is configured.
+    """
+    for key in ("HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY",
+                "https_proxy", "http_proxy", "all_proxy"):
+        value = os.environ.get(key, "").strip()
+        if value:
+            return normalize_proxy_url(value)
+    return None
+
+
+def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
+    """Return an env-configured proxy unless NO_PROXY excludes this base URL."""
+    proxy = _get_proxy_from_env()
+    if not proxy or not base_url:
+        return proxy
+
+    host = base_url_hostname(base_url)
+    if not host:
+        return proxy
+
+    try:
+        if urllib.request.proxy_bypass_environment(host):
+            return None
+    except Exception:
+        pass
+
+    return proxy
+
+
+def _install_safe_stdio() -> None:
+    """Wrap stdout/stderr so best-effort console output cannot crash the agent."""
+    for stream_name in ("stdout", "stderr"):
+        stream = getattr(sys, stream_name, None)
+        if stream is not None and not isinstance(stream, _SafeWriter):
+            setattr(sys, stream_name, _SafeWriter(stream))
+
+
+# Module-level proxy instance — drops in for ``openai.OpenAI``.  Imported as
+# ``from agent.process_bootstrap import OpenAI`` (or re-exported via
+# ``run_agent`` for legacy tests).
+OpenAI = _OpenAIProxy()
+
+
+__all__ = [
+    "OpenAI",
+    "_OpenAIProxy",
+    "_load_openai_cls",
+    "_SafeWriter",
+    "_install_safe_stdio",
+    "_get_proxy_from_env",
+    "_get_proxy_for_base_url",
+]
--- a/agent/stream_diag.py
+++ b/agent/stream_diag.py
@ -0,0 +1,280 @@
+"""Stream diagnostics — per-attempt counters, exception chains, retry logging.
+
+When a streaming chat-completions request dies mid-response, we want to
+know why: which Cloudflare edge served the request, which OpenRouter
+downstream provider answered, how many bytes/chunks we got before the
+drop, the HTTP status, the underlying httpx error class.  These helpers
+collect that info and emit it both to ``agent.log`` (full detail) and to
+the user-facing status line (compact).
+
+All helpers are extracted from :class:`AIAgent` for cleanliness.
+``run_agent`` keeps thin forwarder methods so existing call sites and
+tests that patch ``run_agent.<helper>`` keep working.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+# Per-attempt stream diagnostic headers.  Lowercased; httpx returns
+# CIMultiDict so case-insensitive lookups already work, but we read .get()
+# on the dict from agent.log for free-form post-hoc analysis.
+STREAM_DIAG_HEADERS = (
+    "cf-ray",
+    "cf-cache-status",
+    "x-openrouter-provider",
+    "x-openrouter-model",
+    "x-openrouter-id",
+    "x-request-id",
+    "x-vercel-id",
+    "via",
+    "server",
+    "x-forwarded-for",
+)
+
+
+def stream_diag_init() -> Dict[str, Any]:
+    """Return a fresh per-attempt diagnostic dict.
+
+    Mutated in-place by the streaming functions and read from the retry
+    block when a stream dies.  Lives on ``request_client_holder`` so it
+    survives across the closure boundary.
+    """
+    return {
+        "started_at": time.time(),
+        "first_chunk_at": None,
+        "chunks": 0,
+        "bytes": 0,
+        "headers": {},
+        "http_status": None,
+    }
+
+
+def stream_diag_capture_response(agent: Any, diag: Dict[str, Any], http_response: Any) -> None:
+    """Snapshot interesting headers + HTTP status from the live stream.
+
+    Called once at stream open (before iterating chunks) so the metadata
+    survives even if the stream dies before any chunk arrives.  Failures
+    are swallowed — diag is best-effort.
+    """
+    if http_response is None or not isinstance(diag, dict):
+        return
+    try:
+        diag["http_status"] = getattr(http_response, "status_code", None)
+    except Exception:
+        pass
+    try:
+        headers = getattr(http_response, "headers", None) or {}
+        captured: Dict[str, str] = {}
+        # Allow per-agent override of the headers list (back-compat).
+        target_headers = getattr(agent, "_STREAM_DIAG_HEADERS", STREAM_DIAG_HEADERS)
+        for name in target_headers:
+            try:
+                val = headers.get(name)
+                if val:
+                    # Truncate single-value to keep log lines bounded.
+                    captured[name] = str(val)[:120]
+            except Exception:
+                continue
+        diag["headers"] = captured
+    except Exception:
+        pass
+
+
+def flatten_exception_chain(error: BaseException) -> str:
+    """Return a compact ``Outer(msg) <- Inner(msg) <- ...`` rendering.
+
+    OpenAI SDK wraps httpx errors as ``APIConnectionError`` /
+    ``APIError`` and only the wrapper's class is visible at the catch
+    site — but the underlying ``RemoteProtocolError`` /
+    ``ConnectError`` / ``ReadError`` is what tells us WHY the stream
+    died.  Walks ``__cause__`` then ``__context__`` (deduped, max 4
+    deep) to surface the chain in one line.
+    """
+    seen: List[BaseException] = []
+    link: Optional[BaseException] = error
+    while link is not None and len(seen) < 4:
+        if link in seen:
+            break
+        seen.append(link)
+        nxt = getattr(link, "__cause__", None) or getattr(
+            link, "__context__", None
+        )
+        if nxt is None or nxt is link:
+            break
+        link = nxt
+    parts: List[str] = []
+    for e in seen:
+        msg = str(e).strip().replace("\n", " ")
+        if len(msg) > 140:
+            msg = msg[:140] + "…"
+        parts.append(f"{type(e).__name__}({msg})" if msg else type(e).__name__)
+    return " <- ".join(parts) if parts else type(error).__name__
+
+
+def log_stream_retry(
+    agent: Any,
+    *,
+    kind: str,
+    error: BaseException,
+    attempt: int,
+    max_attempts: int,
+    mid_tool_call: bool,
+    diag: Optional[Dict[str, Any]] = None,
+) -> None:
+    """Record a transient stream-drop and retry to ``agent.log``.
+
+    Always logs a structured WARNING so users have a breadcrumb regardless
+    of UI verbosity.  Subagents in particular benefit because their
+    retries no longer spam the parent's terminal — but the file log keeps
+    full detail (provider, error class, attempt, base_url, subagent_id).
+
+    When *diag* is provided (the per-attempt stream-diagnostic dict from
+    :func:`stream_diag_init`), the WARNING also captures upstream headers
+    (cf-ray, x-openrouter-provider, x-openrouter-id), HTTP status, bytes
+    streamed before the drop, and elapsed time on the dying attempt.
+    These are the breadcrumbs needed to answer "is one CF edge / one
+    downstream provider responsible, or is it random across runs?"
+    """
+    try:
+        try:
+            _summary = agent._summarize_api_error(error)
+        except Exception:
+            _summary = str(error)
+        if _summary and len(_summary) > 240:
+            _summary = _summary[:240] + "…"
+
+        # Inner-cause chain (httpx errors hide under openai.APIError).
+        try:
+            _chain = flatten_exception_chain(error)
+        except Exception:
+            _chain = type(error).__name__
+
+        # Per-attempt counters and upstream headers.
+        _now = time.time()
+        _bytes = 0
+        _chunks = 0
+        _elapsed = 0.0
+        _ttfb = None
+        _headers_repr = "-"
+        _http_status = "-"
+        if isinstance(diag, dict):
+            try:
+                _bytes = int(diag.get("bytes") or 0)
+                _chunks = int(diag.get("chunks") or 0)
+                _started = float(diag.get("started_at") or _now)
+                _elapsed = max(0.0, _now - _started)
+                _first = diag.get("first_chunk_at")
+                if _first is not None:
+                    _ttfb = max(0.0, float(_first) - _started)
+                headers = diag.get("headers") or {}
+                if isinstance(headers, dict) and headers:
+                    _headers_repr = " ".join(
+                        f"{k}={v}" for k, v in headers.items()
+                    )
+                if diag.get("http_status") is not None:
+                    _http_status = str(diag.get("http_status"))
+            except Exception:
+                pass
+
+        logger.warning(
+            "Stream %s on attempt %s/%s — retrying. "
+            "subagent_id=%s depth=%s provider=%s base_url=%s "
+            "error_type=%s error=%s "
+            "chain=%s "
+            "http_status=%s bytes=%d chunks=%d elapsed=%.2fs ttfb=%s "
+            "upstream=[%s]",
+            kind,
+            attempt,
+            max_attempts,
+            getattr(agent, "_subagent_id", None) or "-",
+            getattr(agent, "_delegate_depth", 0),
+            agent.provider or "-",
+            agent.base_url or "-",
+            type(error).__name__,
+            _summary,
+            _chain,
+            _http_status,
+            _bytes,
+            _chunks,
+            _elapsed,
+            f"{_ttfb:.2f}s" if _ttfb is not None else "-",
+            _headers_repr,
+            extra={"mid_tool_call": mid_tool_call},
+        )
+    except Exception:
+        logger.debug("stream-retry log emit failed", exc_info=True)
+
+
+def emit_stream_drop(
+    agent: Any,
+    *,
+    error: BaseException,
+    attempt: int,
+    max_attempts: int,
+    mid_tool_call: bool,
+    diag: Optional[Dict[str, Any]] = None,
+) -> None:
+    """Emit a single user-visible line for a stream drop+retry.
+
+    Both top-level agents and subagents announce drops in the UI — the
+    parent prefixes subagent lines with ``[subagent-N]`` via ``log_prefix``
+    so they're easy to attribute.  All cases also write a structured
+    WARNING to ``agent.log`` via :func:`log_stream_retry` with the full
+    diagnostic detail (subagent_id, provider, base_url, error_type,
+    cf-ray, x-openrouter-provider, bytes/chunks, elapsed) for post-hoc
+    analysis.
+
+    The user-visible status line is intentionally compact: provider,
+    error class, attempt N/M, plus ``after Xs`` when the stream dropped
+    mid-flight.  Full diagnostic detail goes to ``agent.log`` only —
+    ``hermes logs --level WARNING | grep "Stream drop"`` to inspect.
+    """
+    kind = "drop mid tool-call" if mid_tool_call else "drop"
+    log_stream_retry(
+        agent,
+        kind=kind,
+        error=error,
+        attempt=attempt,
+        max_attempts=max_attempts,
+        mid_tool_call=mid_tool_call,
+        diag=diag,
+    )
+    provider = agent.provider or "provider"
+    # Compose a brief "after Xs" suffix when we have timing data — helps
+    # the user distinguish "couldn't connect" (0s) from "died after 30s
+    # of streaming" (likely upstream idle-kill or proxy timeout).
+    _suffix = ""
+    if isinstance(diag, dict):
+        try:
+            started = diag.get("started_at")
+            if started is not None:
+                _suffix = f" after {max(0.0, time.time() - float(started)):.1f}s"
+        except Exception:
+            pass
+    try:
+        agent._emit_status(
+            f"⚠️ {provider} stream {kind} ({type(error).__name__}){_suffix} "
+            f"— reconnecting, retry {attempt}/{max_attempts}"
+        )
+        agent._touch_activity(
+            f"stream retry {attempt}/{max_attempts} "
+            f"after {type(error).__name__}"
+        )
+    except Exception:
+        pass
+
+
+__all__ = [
+    "STREAM_DIAG_HEADERS",
+    "stream_diag_init",
+    "stream_diag_capture_response",
+    "flatten_exception_chain",
+    "log_stream_retry",
+    "emit_stream_drop",
+]
--- a/agent/system_prompt.py
+++ b/agent/system_prompt.py
@ -0,0 +1,333 @@
+"""System-prompt assembly for :class:`AIAgent`.
+
+The agent's system prompt is built once per session and reused across all
+turns — only context compression triggers a rebuild.  This keeps the
+upstream prefix cache warm.  See ``hermes-agent-dev``'s
+``references/system-prompt-invariant.md`` for the invariants and
+``references/self-improvement-loop.md`` for how the background-review
+fork inherits the cached prompt verbatim.
+
+Three tiers are joined with ``\\n\\n``:
+
+* ``stable``   — identity (SOUL.md or DEFAULT_AGENT_IDENTITY), tool
+  guidance, computer-use guidance, nous subscription block, tool-use
+  enforcement guidance + per-model operational guidance, skills prompt,
+  alibaba model-name workaround, environment hints, platform hints.
+* ``context``  — caller-supplied ``system_message`` plus context files
+  (AGENTS.md / .cursorrules / etc.) discovered under ``TERMINAL_CWD``.
+* ``volatile`` — memory snapshot, USER.md profile, external memory
+  provider block, timestamp/session/model/provider line.
+
+Pure helpers that read the agent's state.  AIAgent keeps thin forwarders.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from typing import Any, Dict, List, Optional
+
+from agent.prompt_builder import (
+    DEFAULT_AGENT_IDENTITY,
+    GOOGLE_MODEL_OPERATIONAL_GUIDANCE,
+    HERMES_AGENT_HELP_GUIDANCE,
+    KANBAN_GUIDANCE,
+    MEMORY_GUIDANCE,
+    OPENAI_MODEL_EXECUTION_GUIDANCE,
+    PLATFORM_HINTS,
+    SESSION_SEARCH_GUIDANCE,
+    SKILLS_GUIDANCE,
+    TOOL_USE_ENFORCEMENT_GUIDANCE,
+    TOOL_USE_ENFORCEMENT_MODELS,
+)
+
+
+def _ra():
+    """Lazy reference to the ``run_agent`` module.
+
+    Helpers like ``load_soul_md``, ``build_environment_hints``,
+    ``build_context_files_prompt``, ``build_nous_subscription_prompt``,
+    ``build_skills_system_prompt`` and ``get_toolset_for_tool`` are
+    imported into ``run_agent``'s namespace.  Many tests
+    ``patch("run_agent.load_soul_md", ...)``; if we imported them
+    directly here those patches would not reach us.  Looking them up
+    through ``run_agent`` on every call preserves the patch contract.
+    """
+    import run_agent
+    return run_agent
+
+
+def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None) -> Dict[str, str]:
+    """Assemble the system prompt as three ordered parts.
+
+    Returns a dict with three keys:
+      * ``stable``   — identity, tool guidance, skills prompt,
+        environment hints, platform hints, model-family operational
+        guidance.
+      * ``context``  — context files (AGENTS.md, .cursorrules, etc.)
+        and caller-supplied system_message.
+      * ``volatile`` — memory snapshot, user profile, external
+        memory provider block, timestamp line.
+
+    Joined into a single string by :func:`build_system_prompt` and
+    cached on ``agent._cached_system_prompt`` for the lifetime of the
+    AIAgent.  Hermes never re-renders parts of this string mid-
+    session — that's the only way to keep upstream prompt caches
+    warm across turns.
+    """
+    # Local import to avoid pulling model_tools at module load.  Tests
+    # patch ``run_agent.get_toolset_for_tool`` and similar helpers, so
+    # we resolve through ``_ra()`` to honor those patches.
+    _r = _ra()
+
+    # ── Stable tier ────────────────────────────────────────────────
+    stable_parts: List[str] = []
+
+    # Try SOUL.md as primary identity unless the caller explicitly skipped it.
+    # Some execution modes (cron) still want HERMES_HOME persona while keeping
+    # cwd project instructions disabled.
+    _soul_loaded = False
+    if agent.load_soul_identity or not agent.skip_context_files:
+        _soul_content = _r.load_soul_md()
+        if _soul_content:
+            stable_parts.append(_soul_content)
+            _soul_loaded = True
+
+    if not _soul_loaded:
+        # Fallback to hardcoded identity
+        stable_parts.append(DEFAULT_AGENT_IDENTITY)
+
+    # Pointer to the hermes-agent skill + docs for user questions about Hermes itself.
+    stable_parts.append(HERMES_AGENT_HELP_GUIDANCE)
+
+    # Tool-aware behavioral guidance: only inject when the tools are loaded
+    tool_guidance = []
+    if "memory" in agent.valid_tool_names:
+        tool_guidance.append(MEMORY_GUIDANCE)
+    if "session_search" in agent.valid_tool_names:
+        tool_guidance.append(SESSION_SEARCH_GUIDANCE)
+    if "skill_manage" in agent.valid_tool_names:
+        tool_guidance.append(SKILLS_GUIDANCE)
+    # Kanban worker/orchestrator lifecycle — only present when the
+    # dispatcher spawned this process (kanban_show check_fn gates on
+    # HERMES_KANBAN_TASK env var). Normal chat sessions never see
+    # this block.
+    if "kanban_show" in agent.valid_tool_names:
+        tool_guidance.append(KANBAN_GUIDANCE)
+    if tool_guidance:
+        stable_parts.append(" ".join(tool_guidance))
+
+    # Computer-use (macOS) — goes in as its own block rather than being
+    # merged into tool_guidance because the content is multi-paragraph.
+    if "computer_use" in agent.valid_tool_names:
+        from agent.prompt_builder import COMPUTER_USE_GUIDANCE
+        stable_parts.append(COMPUTER_USE_GUIDANCE)
+
+    nous_subscription_prompt = _r.build_nous_subscription_prompt(agent.valid_tool_names)
+    if nous_subscription_prompt:
+        stable_parts.append(nous_subscription_prompt)
+    # Tool-use enforcement: tells the model to actually call tools instead
+    # of describing intended actions.  Controlled by config.yaml
+    # agent.tool_use_enforcement:
+    #   "auto" (default) — matches TOOL_USE_ENFORCEMENT_MODELS
+    #   true  — always inject (all models)
+    #   false — never inject
+    #   list  — custom model-name substrings to match
+    if agent.valid_tool_names:
+        _enforce = agent._tool_use_enforcement
+        _inject = False
+        if _enforce is True or (isinstance(_enforce, str) and _enforce.lower() in {"true", "always", "yes", "on"}):
+            _inject = True
+        elif _enforce is False or (isinstance(_enforce, str) and _enforce.lower() in {"false", "never", "no", "off"}):
+            _inject = False
+        elif isinstance(_enforce, list):
+            model_lower = (agent.model or "").lower()
+            _inject = any(p.lower() in model_lower for p in _enforce if isinstance(p, str))
+        else:
+            # "auto" or any unrecognised value — use hardcoded defaults
+            model_lower = (agent.model or "").lower()
+            _inject = any(p in model_lower for p in TOOL_USE_ENFORCEMENT_MODELS)
+        if _inject:
+            stable_parts.append(TOOL_USE_ENFORCEMENT_GUIDANCE)
+            _model_lower = (agent.model or "").lower()
+            # Google model operational guidance (conciseness, absolute
+            # paths, parallel tool calls, verify-before-edit, etc.)
+            if "gemini" in _model_lower or "gemma" in _model_lower:
+                stable_parts.append(GOOGLE_MODEL_OPERATIONAL_GUIDANCE)
+            # OpenAI GPT/Codex execution discipline (tool persistence,
+            # prerequisite checks, verification, anti-hallucination).
+            if "gpt" in _model_lower or "codex" in _model_lower:
+                stable_parts.append(OPENAI_MODEL_EXECUTION_GUIDANCE)
+
+    has_skills_tools = any(name in agent.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage'])
+    if has_skills_tools:
+        avail_toolsets = {
+            toolset
+            for toolset in (
+                _r.get_toolset_for_tool(tool_name) for tool_name in agent.valid_tool_names
+            )
+            if toolset
+        }
+        skills_prompt = _r.build_skills_system_prompt(
+            available_tools=agent.valid_tool_names,
+            available_toolsets=avail_toolsets,
+        )
+    else:
+        skills_prompt = ""
+    if skills_prompt:
+        stable_parts.append(skills_prompt)
+
+    # Alibaba Coding Plan API always returns "glm-4.7" as model name regardless
+    # of the requested model. Inject explicit model identity into the system prompt
+    # so the agent can correctly report which model it is (workaround for API bug).
+    # Stable for the lifetime of an agent instance — model and provider are fixed
+    # at construction time.
+    if agent.provider == "alibaba":
+        _model_short = agent.model.split("/")[-1] if "/" in agent.model else agent.model
+        stable_parts.append(
+            f"You are powered by the model named {_model_short}. "
+            f"The exact model ID is {agent.model}. "
+            f"When asked what model you are, always answer based on this information, "
+            f"not on any model name returned by the API."
+        )
+
+    # Environment hints (WSL, Termux, etc.) — tell the agent about the
+    # execution environment so it can translate paths and adapt behavior.
+    # Stable for the lifetime of the process.
+    _env_hints = _r.build_environment_hints()
+    if _env_hints:
+        stable_parts.append(_env_hints)
+
+    platform_key = (agent.platform or "").lower().strip()
+    if platform_key in PLATFORM_HINTS:
+        stable_parts.append(PLATFORM_HINTS[platform_key])
+    elif platform_key:
+        # Check plugin registry for platform-specific LLM guidance
+        try:
+            from gateway.platform_registry import platform_registry
+            _entry = platform_registry.get(platform_key)
+            if _entry and _entry.platform_hint:
+                stable_parts.append(_entry.platform_hint)
+        except Exception:
+            pass
+
+    # ── Context tier (cwd-dependent, may change between sessions) ─
+    context_parts: List[str] = []
+
+    # Note: ephemeral_system_prompt is NOT included here. It's injected at
+    # API-call time only so it stays out of the cached/stored system prompt.
+    if system_message is not None:
+        context_parts.append(system_message)
+
+    if not agent.skip_context_files:
+        # Use TERMINAL_CWD for context file discovery when set (gateway
+        # mode).  The gateway process runs from the hermes-agent install
+        # dir, so os.getcwd() would pick up the repo's AGENTS.md and
+        # other dev files — inflating token usage by ~10k for no benefit.
+        _context_cwd = os.getenv("TERMINAL_CWD") or None
+        context_files_prompt = _r.build_context_files_prompt(
+            cwd=_context_cwd, skip_soul=_soul_loaded)
+        if context_files_prompt:
+            context_parts.append(context_files_prompt)
+
+    # ── Volatile tier (changes per session/turn — never cached) ───
+    volatile_parts: List[str] = []
+
+    if agent._memory_store:
+        if agent._memory_enabled:
+            mem_block = agent._memory_store.format_for_system_prompt("memory")
+            if mem_block:
+                volatile_parts.append(mem_block)
+        # USER.md is always included when enabled.
+        if agent._user_profile_enabled:
+            user_block = agent._memory_store.format_for_system_prompt("user")
+            if user_block:
+                volatile_parts.append(user_block)
+
+    # External memory provider system prompt block (additive to built-in)
+    if agent._memory_manager:
+        try:
+            _ext_mem_block = agent._memory_manager.build_system_prompt()
+            if _ext_mem_block:
+                volatile_parts.append(_ext_mem_block)
+        except Exception:
+            pass
+
+    from hermes_time import now as _hermes_now
+    now = _hermes_now()
+    timestamp_line = f"Conversation started: {now.strftime('%A, %B %d, %Y %I:%M %p')}"
+    if agent.pass_session_id and agent.session_id:
+        timestamp_line += f"\nSession ID: {agent.session_id}"
+    if agent.model:
+        timestamp_line += f"\nModel: {agent.model}"
+    if agent.provider:
+        timestamp_line += f"\nProvider: {agent.provider}"
+    volatile_parts.append(timestamp_line)
+
+    return {
+        "stable":   "\n\n".join(p.strip() for p in stable_parts   if p and p.strip()),
+        "context":  "\n\n".join(p.strip() for p in context_parts  if p and p.strip()),
+        "volatile": "\n\n".join(p.strip() for p in volatile_parts if p and p.strip()),
+    }
+
+
+def build_system_prompt(agent: Any, system_message: Optional[str] = None) -> str:
+    """Assemble the full system prompt from all layers.
+
+    Called once per session (cached on ``agent._cached_system_prompt``) and
+    only rebuilt after context compression events. This ensures the system
+    prompt is stable across all turns in a session, maximizing prefix cache
+    hits.
+
+    Layers are ordered cache-friendly: stable identity/guidance first,
+    then session-stable context files, then per-call volatile content
+    (memory, USER profile, timestamp).  The whole string is treated as
+    one cached block — Hermes never rebuilds or reinjects parts of it
+    mid-session, which is the only way to keep upstream prompt caches
+    warm across turns.
+    """
+    parts = build_system_prompt_parts(agent, system_message=system_message)
+    return "\n\n".join(p for p in (parts["stable"], parts["context"], parts["volatile"]) if p)
+
+
+def invalidate_system_prompt(agent: Any) -> None:
+    """Invalidate the cached system prompt, forcing a rebuild on the next turn.
+
+    Called after context compression events. Also reloads memory from disk
+    so the rebuilt prompt captures any writes from this session.
+    """
+    agent._cached_system_prompt = None
+    if agent._memory_store:
+        agent._memory_store.load_from_disk()
+
+
+def format_tools_for_system_message(agent: Any) -> str:
+    """Format tool definitions for the system message in the trajectory format.
+
+    Returns:
+        str: JSON string representation of tool definitions
+    """
+    if not agent.tools:
+        return "[]"
+
+    # Convert tool definitions to the format expected in trajectories
+    formatted_tools = []
+    for tool in agent.tools:
+        func = tool["function"]
+        formatted_tool = {
+            "name": func["name"],
+            "description": func.get("description", ""),
+            "parameters": func.get("parameters", {}),
+            "required": None  # Match the format in the example
+        }
+        formatted_tools.append(formatted_tool)
+
+    return json.dumps(formatted_tools, ensure_ascii=False)
+
+
+__all__ = [
+    "build_system_prompt_parts",
+    "build_system_prompt",
+    "invalidate_system_prompt",
+    "format_tools_for_system_message",
+]
--- a/agent/tool_dispatch_helpers.py
+++ b/agent/tool_dispatch_helpers.py
@ -0,0 +1,336 @@
+"""Tool-dispatch helpers — parallelism gating, multimodal envelopes, mutation tracking.
+
+Pure module-level utilities extracted from ``run_agent.py``:
+
+* ``_is_destructive_command`` — terminal-command heuristic used to gate
+  parallel batch dispatch.
+* ``_should_parallelize_tool_batch`` / ``_extract_parallel_scope_path`` /
+  ``_paths_overlap`` — the rules engine deciding when a multi-tool batch
+  can run concurrently.
+* ``_is_multimodal_tool_result`` / ``_multimodal_text_summary`` /
+  ``_append_subdir_hint_to_multimodal`` — envelope helpers for the
+  ``{"_multimodal": True, "content": [...], "text_summary": ...}`` dict
+  shape returned by tools like ``computer_use``.
+* ``_extract_file_mutation_targets`` / ``_extract_error_preview`` —
+  per-turn file-mutation verifier inputs.
+* ``_trajectory_normalize_msg`` — strip image blobs from a message for
+  trajectory saving.
+
+All helpers are stateless.  ``run_agent`` re-exports each name so existing
+``from run_agent import ...`` imports in tests and other modules keep
+working unchanged.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from agent.tool_result_classification import (
+    FILE_MUTATING_TOOL_NAMES as _FILE_MUTATING_TOOLS,
+)
+
+logger = logging.getLogger(__name__)
+
+# Tools that must never run concurrently (interactive / user-facing).
+# When any of these appear in a batch, we fall back to sequential execution.
+_NEVER_PARALLEL_TOOLS = frozenset({"clarify"})
+
+# Read-only tools with no shared mutable session state.
+_PARALLEL_SAFE_TOOLS = frozenset({
+    "ha_get_state",
+    "ha_list_entities",
+    "ha_list_services",
+    "read_file",
+    "search_files",
+    "session_search",
+    "skill_view",
+    "skills_list",
+    "vision_analyze",
+    "web_extract",
+    "web_search",
+})
+
+# File tools can run concurrently when they target independent paths.
+_PATH_SCOPED_TOOLS = frozenset({"read_file", "write_file", "patch"})
+
+# Patterns that indicate a terminal command may modify/delete files.
+_DESTRUCTIVE_PATTERNS = re.compile(
+    r"""(?:^|\s|&&|\|\||;|`)(?:
+        rm\s|rmdir\s|
+        cp\s|install\s|
+        mv\s|
+        sed\s+-i|
+        truncate\s|
+        dd\s|
+        shred\s|
+        git\s+(?:reset|clean|checkout)\s
+    )""",
+    re.VERBOSE,
+)
+# Output redirects that overwrite files (> but not >>)
+_REDIRECT_OVERWRITE = re.compile(r'[^>]>[^>]|^>[^>]')
+
+
+def _is_destructive_command(cmd: str) -> bool:
+    """Heuristic: does this terminal command look like it modifies/deletes files?"""
+    if not cmd:
+        return False
+    if _DESTRUCTIVE_PATTERNS.search(cmd):
+        return True
+    if _REDIRECT_OVERWRITE.search(cmd):
+        return True
+    return False
+
+
+def _is_mcp_tool_parallel_safe(tool_name: str) -> bool:
+    """Check if an MCP tool comes from a server with parallel tool calls enabled.
+
+    Lazy-imports from ``tools.mcp_tool`` to avoid circular dependencies.
+    Returns False if the MCP module is not available.
+    """
+    try:
+        from tools.mcp_tool import is_mcp_tool_parallel_safe
+        return is_mcp_tool_parallel_safe(tool_name)
+    except Exception:
+        return False
+
+
+def _should_parallelize_tool_batch(tool_calls) -> bool:
+    """Return True when a tool-call batch is safe to run concurrently."""
+    if len(tool_calls) <= 1:
+        return False
+
+    tool_names = [tc.function.name for tc in tool_calls]
+    if any(name in _NEVER_PARALLEL_TOOLS for name in tool_names):
+        return False
+
+    reserved_paths: list[Path] = []
+    for tool_call in tool_calls:
+        tool_name = tool_call.function.name
+        try:
+            function_args = json.loads(tool_call.function.arguments)
+        except Exception:
+            logging.debug(
+                "Could not parse args for %s — defaulting to sequential; raw=%s",
+                tool_name,
+                tool_call.function.arguments[:200],
+            )
+            return False
+        if not isinstance(function_args, dict):
+            logging.debug(
+                "Non-dict args for %s (%s) — defaulting to sequential",
+                tool_name,
+                type(function_args).__name__,
+            )
+            return False
+
+        if tool_name in _PATH_SCOPED_TOOLS:
+            scoped_path = _extract_parallel_scope_path(tool_name, function_args)
+            if scoped_path is None:
+                return False
+            if any(_paths_overlap(scoped_path, existing) for existing in reserved_paths):
+                return False
+            reserved_paths.append(scoped_path)
+            continue
+
+        if tool_name not in _PARALLEL_SAFE_TOOLS:
+            # Check if it's an MCP tool from a server that opted into parallel calls.
+            if not _is_mcp_tool_parallel_safe(tool_name):
+                return False
+
+    return True
+
+
+def _extract_parallel_scope_path(tool_name: str, function_args: dict) -> Optional[Path]:
+    """Return the normalized file target for path-scoped tools."""
+    if tool_name not in _PATH_SCOPED_TOOLS:
+        return None
+
+    raw_path = function_args.get("path")
+    if not isinstance(raw_path, str) or not raw_path.strip():
+        return None
+
+    expanded = Path(raw_path).expanduser()
+    if expanded.is_absolute():
+        return Path(os.path.abspath(str(expanded)))
+
+    # Avoid resolve(); the file may not exist yet.
+    return Path(os.path.abspath(str(Path.cwd() / expanded)))
+
+
+def _paths_overlap(left: Path, right: Path) -> bool:
+    """Return True when two paths may refer to the same subtree."""
+    left_parts = left.parts
+    right_parts = right.parts
+    if not left_parts or not right_parts:
+        # Empty paths shouldn't reach here (guarded upstream), but be safe.
+        return bool(left_parts) == bool(right_parts) and bool(left_parts)
+    common_len = min(len(left_parts), len(right_parts))
+    return left_parts[:common_len] == right_parts[:common_len]
+
+
+def _is_multimodal_tool_result(value: Any) -> bool:
+    """True if the value is a multimodal tool result envelope.
+
+    Multimodal handlers (e.g. tools/computer_use) return a dict with
+    `_multimodal=True`, a `content` key holding OpenAI-style content
+    parts, and an optional `text_summary` for string-only fallbacks.
+    """
+    return (
+        isinstance(value, dict)
+        and value.get("_multimodal") is True
+        and isinstance(value.get("content"), list)
+    )
+
+
+def _multimodal_text_summary(value: Any) -> str:
+    """Extract a plain text view of a multimodal tool result.
+
+    Used wherever downstream code needs a string — logging, previews,
+    persistence size heuristics, fall-back content for providers that
+    don't support multipart tool messages.
+    """
+    if _is_multimodal_tool_result(value):
+        if value.get("text_summary"):
+            return str(value["text_summary"])
+        parts = []
+        for p in value.get("content") or []:
+            if isinstance(p, dict) and p.get("type") == "text":
+                parts.append(str(p.get("text", "")))
+        if parts:
+            return "\n".join(parts)
+        return "[multimodal tool result]"
+    if isinstance(value, str):
+        return value
+    try:
+        return json.dumps(value, default=str)
+    except Exception:
+        return str(value)
+
+
+def _append_subdir_hint_to_multimodal(value: Dict[str, Any], hint: str) -> None:
+    """Mutate a multimodal tool-result envelope to append a subdir hint.
+
+    The hint is added to the first text part so the model sees it; image
+    parts are left untouched. `text_summary` is also updated for
+    string-fallback callers.
+    """
+    if not _is_multimodal_tool_result(value):
+        return
+    parts = value.get("content") or []
+    for p in parts:
+        if isinstance(p, dict) and p.get("type") == "text":
+            p["text"] = str(p.get("text", "")) + hint
+            break
+    else:
+        parts.insert(0, {"type": "text", "text": hint})
+        value["content"] = parts
+    if isinstance(value.get("text_summary"), str):
+        value["text_summary"] = value["text_summary"] + hint
+
+
+def _extract_file_mutation_targets(tool_name: str, args: Dict[str, Any]) -> List[str]:
+    """Return the file paths a ``write_file`` or ``patch`` call is targeting.
+
+    For ``write_file`` and ``patch`` in replace mode this is just ``args["path"]``.
+    For ``patch`` in V4A patch mode we parse the patch content for
+    ``*** Update File:`` / ``*** Add File:`` / ``*** Delete File:`` headers so
+    the verifier can track each file in a multi-file patch separately.
+    """
+    if tool_name not in _FILE_MUTATING_TOOLS:
+        return []
+    if tool_name == "write_file":
+        p = args.get("path")
+        return [str(p)] if p else []
+    # tool_name == "patch"
+    mode = args.get("mode") or "replace"
+    if mode == "replace":
+        p = args.get("path")
+        return [str(p)] if p else []
+    if mode == "patch":
+        body = args.get("patch") or ""
+        if not isinstance(body, str) or not body:
+            return []
+        paths: List[str] = []
+        for _m in re.finditer(
+            r'^\*\*\*\s+(?:Update|Add|Delete)\s+File:\s*(.+)$',
+            body,
+            re.MULTILINE,
+        ):
+            p = _m.group(1).strip()
+            if p:
+                paths.append(p)
+        return paths
+    return []
+
+
+def _extract_error_preview(result: Any, max_len: int = 180) -> str:
+    """Pull a one-line error summary out of a tool result for footer display."""
+    text = _multimodal_text_summary(result) if result is not None else ""
+    if not isinstance(text, str):
+        try:
+            text = str(text)
+        except Exception:
+            return ""
+    # Try to parse JSON and pull the ``error`` field — tool handlers return
+    # ``{"success": false, "error": "..."}``; raw string wins if parse fails.
+    stripped = text.strip()
+    if stripped.startswith("{"):
+        try:
+            data = json.loads(stripped)
+            if isinstance(data, dict) and isinstance(data.get("error"), str):
+                text = data["error"]
+        except Exception:
+            pass
+    # Collapse whitespace, trim to max_len.
+    text = " ".join(text.split())
+    if len(text) > max_len:
+        text = text[: max_len - 1] + "…"
+    return text
+
+
+def _trajectory_normalize_msg(msg: Dict[str, Any]) -> Dict[str, Any]:
+    """Strip image blobs from a message for trajectory saving.
+
+    Returns a shallow copy with multimodal tool results replaced by their
+    text_summary, and image parts in content lists replaced by
+    `[screenshot]` placeholders. Keeps the message schema otherwise intact.
+    """
+    if not isinstance(msg, dict):
+        return msg
+    content = msg.get("content")
+    if _is_multimodal_tool_result(content):
+        return {**msg, "content": _multimodal_text_summary(content)}
+    if isinstance(content, list):
+        cleaned = []
+        for p in content:
+            if isinstance(p, dict) and p.get("type") in {"image", "image_url", "input_image"}:
+                cleaned.append({"type": "text", "text": "[screenshot]"})
+            else:
+                cleaned.append(p)
+        return {**msg, "content": cleaned}
+    return msg
+
+
+__all__ = [
+    "_NEVER_PARALLEL_TOOLS",
+    "_PARALLEL_SAFE_TOOLS",
+    "_PATH_SCOPED_TOOLS",
+    "_DESTRUCTIVE_PATTERNS",
+    "_REDIRECT_OVERWRITE",
+    "_is_destructive_command",
+    "_should_parallelize_tool_batch",
+    "_extract_parallel_scope_path",
+    "_paths_overlap",
+    "_is_multimodal_tool_result",
+    "_multimodal_text_summary",
+    "_append_subdir_hint_to_multimodal",
+    "_extract_file_mutation_targets",
+    "_extract_error_preview",
+    "_trajectory_normalize_msg",
+]
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@ -0,0 +1,920 @@
+"""Tool-call execution — sequential and concurrent dispatch.
+
+Both AIAgent methods (``_execute_tool_calls_sequential`` and
+``_execute_tool_calls_concurrent``) live here as module-level
+functions that take the parent ``AIAgent`` as their first argument.
+
+``run_agent`` keeps thin wrappers so existing call sites work; tests
+that patch ``run_agent._set_interrupt`` are honored because the
+extracted functions reach back through the ``run_agent`` module via
+``_ra()`` for that symbol.
+"""
+
+from __future__ import annotations
+
+import concurrent.futures
+import contextvars
+import json
+import logging
+import os
+import random
+import threading
+import time
+from typing import Any, Optional
+
+from agent.display import (
+    KawaiiSpinner,
+    build_tool_preview as _build_tool_preview,
+    get_cute_tool_message as _get_cute_tool_message_impl,
+    get_tool_emoji as _get_tool_emoji,
+    _detect_tool_failure,
+)
+from agent.tool_guardrails import ToolGuardrailDecision
+from agent.tool_dispatch_helpers import (
+    _is_destructive_command,
+    _is_multimodal_tool_result,
+    _multimodal_text_summary,
+    _append_subdir_hint_to_multimodal,
+)
+from tools.terminal_tool import (
+    _get_approval_callback,
+    _get_sudo_password_callback,
+    set_approval_callback as _set_approval_callback,
+    set_sudo_password_callback as _set_sudo_password_callback,
+    get_active_env,
+)
+from tools.tool_result_storage import (
+    maybe_persist_tool_result,
+    enforce_turn_budget,
+)
+
+logger = logging.getLogger(__name__)
+
+# Maximum number of concurrent worker threads for parallel tool execution.
+# Mirrors the constant in ``run_agent`` for tests/imports that look here.
+_MAX_TOOL_WORKERS = 8
+
+
+def _ra():
+    """Lazy reference to ``run_agent`` so patches like ``run_agent._set_interrupt`` work."""
+    import run_agent
+    return run_agent
+
+
+def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
+    """Execute multiple tool calls concurrently using a thread pool.
+
+    Results are collected in the original tool-call order and appended to
+    messages so the API sees them in the expected sequence.
+    """
+    tool_calls = assistant_message.tool_calls
+    num_tools = len(tool_calls)
+
+    # ── Pre-flight: interrupt check ──────────────────────────────────
+    if agent._interrupt_requested:
+        print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
+        for tc in tool_calls:
+            messages.append({
+                "role": "tool",
+                "name": tc.function.name,
+                "content": f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
+                "tool_call_id": tc.id,
+            })
+        return
+
+    # ── Parse args + pre-execution bookkeeping ───────────────────────
+    parsed_calls = []  # list of (tool_call, function_name, function_args)
+    for tool_call in tool_calls:
+        function_name = tool_call.function.name
+
+        # Reset nudge counters
+        if function_name == "memory":
+            agent._turns_since_memory = 0
+        elif function_name == "skill_manage":
+            agent._iters_since_skill = 0
+
+        try:
+            function_args = json.loads(tool_call.function.arguments)
+        except json.JSONDecodeError:
+            function_args = {}
+        if not isinstance(function_args, dict):
+            function_args = {}
+
+        # Checkpoint for file-mutating tools
+        if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
+            try:
+                file_path = function_args.get("path", "")
+                if file_path:
+                    work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
+                    agent._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
+            except Exception:
+                pass
+
+        # Checkpoint before destructive terminal commands
+        if function_name == "terminal" and agent._checkpoint_mgr.enabled:
+            try:
+                cmd = function_args.get("command", "")
+                if _is_destructive_command(cmd):
+                    cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
+                    agent._checkpoint_mgr.ensure_checkpoint(
+                        cwd, f"before terminal: {cmd[:60]}"
+                    )
+            except Exception:
+                pass
+
+        block_result = None
+        blocked_by_guardrail = False
+        try:
+            from hermes_cli.plugins import get_pre_tool_call_block_message
+            block_message = get_pre_tool_call_block_message(
+                function_name, function_args, task_id=effective_task_id or "",
+            )
+        except Exception:
+            block_message = None
+
+        if block_message is not None:
+            block_result = json.dumps({"error": block_message}, ensure_ascii=False)
+        else:
+            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
+            if not guardrail_decision.allows_execution:
+                block_result = agent._guardrail_block_result(guardrail_decision)
+                blocked_by_guardrail = True
+
+        parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail))
+
+    # ── Logging / callbacks ──────────────────────────────────────────
+    tool_names_str = ", ".join(name for _, name, _, _, _ in parsed_calls)
+    if not agent.quiet_mode:
+        print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
+        for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
+            args_str = json.dumps(args, ensure_ascii=False)
+            if agent.verbose_logging:
+                print(f"  📞 Tool {i}: {name}({list(args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
+            else:
+                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
+                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
+
+    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
+        if block_result is not None:
+            continue
+        if agent.tool_progress_callback:
+            try:
+                preview = _build_tool_preview(name, args)
+                agent.tool_progress_callback("tool.started", name, preview, args)
+            except Exception as cb_err:
+                logging.debug(f"Tool progress callback error: {cb_err}")
+
+    for tc, name, args, block_result, blocked_by_guardrail in parsed_calls:
+        if block_result is not None:
+            continue
+        if agent.tool_start_callback:
+            try:
+                agent.tool_start_callback(tc.id, name, args)
+            except Exception as cb_err:
+                logging.debug(f"Tool start callback error: {cb_err}")
+
+    # ── Concurrent execution ─────────────────────────────────────────
+    # Each slot holds (function_name, function_args, function_result, duration, error_flag, blocked_flag)
+    results = [None] * num_tools
+    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
+        if block_result is not None:
+            results[i] = (name, args, block_result, 0.0, True, True)
+
+    # Touch activity before launching workers so the gateway knows
+    # we're executing tools (not stuck).
+    agent._current_tool = tool_names_str
+    agent._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")
+
+    # Capture CLI callbacks from the agent thread so worker threads can
+    # register them locally.  Without this, _get_approval_callback() in
+    # terminal_tool returns None in ThreadPoolExecutor workers, causing
+    # the dangerous-command prompt to fall back to input() — which
+    # deadlocks against prompt_toolkit's raw terminal mode (#13617).
+    _parent_approval_cb = _get_approval_callback()
+    _parent_sudo_cb = _get_sudo_password_callback()
+
+    def _run_tool(index, tool_call, function_name, function_args):
+        """Worker function executed in a thread."""
+        # Register this worker tid so the agent can fan out an interrupt
+        # to it — see AIAgent.interrupt().  Must happen first thing, and
+        # must be paired with discard + clear in the finally block.
+        _worker_tid = threading.current_thread().ident
+        with agent._tool_worker_threads_lock:
+            agent._tool_worker_threads.add(_worker_tid)
+        # Race: if the agent was interrupted between fan-out (which
+        # snapshotted an empty/earlier set) and our registration, apply
+        # the interrupt to our own tid now so is_interrupted() inside
+        # the tool returns True on the next poll.
+        if agent._interrupt_requested:
+            try:
+                _ra()._set_interrupt(True, _worker_tid)
+            except Exception:
+                pass
+        # Set the activity callback on THIS worker thread so
+        # _wait_for_process (terminal commands) can fire heartbeats.
+        # The callback is thread-local; the main thread's callback
+        # is invisible to worker threads.
+        try:
+            from tools.environments.base import set_activity_callback
+            set_activity_callback(agent._touch_activity)
+        except Exception:
+            pass
+        # Propagate approval/sudo callbacks to this worker thread.
+        # Mirrors cli.py run_agent() pattern (GHSA-qg5c-hvr5-hjgr).
+        if _parent_approval_cb is not None:
+            try:
+                _set_approval_callback(_parent_approval_cb)
+            except Exception:
+                pass
+        if _parent_sudo_cb is not None:
+            try:
+                _set_sudo_password_callback(_parent_sudo_cb)
+            except Exception:
+                pass
+        start = time.time()
+        try:
+            result = agent._invoke_tool(
+                function_name,
+                function_args,
+                effective_task_id,
+                tool_call.id,
+                messages=messages,
+                pre_tool_block_checked=True,
+            )
+        except Exception as tool_error:
+            result = f"Error executing tool '{function_name}': {tool_error}"
+            logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
+        duration = time.time() - start
+        is_error, _ = _detect_tool_failure(function_name, result)
+        if is_error:
+            logger.info("tool %s failed (%.2fs): %s", function_name, duration, result[:200])
+        else:
+            logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
+        results[index] = (function_name, function_args, result, duration, is_error, False)
+        # Tear down worker-tid tracking.  Clear any interrupt bit we may
+        # have set so the next task scheduled onto this recycled tid
+        # starts with a clean slate.
+        with agent._tool_worker_threads_lock:
+            agent._tool_worker_threads.discard(_worker_tid)
+        try:
+            _ra()._set_interrupt(False, _worker_tid)
+        except Exception:
+            pass
+        # Clear thread-local callbacks so a recycled worker thread
+        # doesn't hold stale references to a disposed CLI instance.
+        try:
+            _set_approval_callback(None)
+            _set_sudo_password_callback(None)
+        except Exception:
+            pass
+
+    # Start spinner for CLI mode (skip when TUI handles tool progress)
+    spinner = None
+    if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
+        face = random.choice(KawaiiSpinner.get_waiting_faces())
+        spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots', print_fn=agent._print_fn)
+        spinner.start()
+
+    try:
+        runnable_calls = [
+            (i, tc, name, args)
+            for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls)
+            if block_result is None
+        ]
+        futures = []
+        if runnable_calls:
+            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                for i, tc, name, args in runnable_calls:
+                    # Propagate ContextVars (e.g. _approval_session_key); mirrors asyncio.to_thread.
+                    ctx = contextvars.copy_context()
+                    f = executor.submit(ctx.run, _run_tool, i, tc, name, args)
+                    futures.append(f)
+
+                # Wait for all to complete with periodic heartbeats so the
+                # gateway's inactivity monitor doesn't kill us during long
+                # concurrent tool batches. Also check for user interrupts
+                # so we don't block indefinitely when the user sends /stop
+                # or a new message during concurrent tool execution.
+                _conc_start = time.time()
+                _interrupt_logged = False
+                while True:
+                    done, not_done = concurrent.futures.wait(
+                        futures, timeout=5.0,
+                    )
+                    if not not_done:
+                        break
+
+                    # Check for interrupt — the per-thread interrupt signal
+                    # already causes individual tools (terminal, execute_code)
+                    # to abort, but tools without interrupt checks (web_search,
+                    # read_file) will run to completion. Cancel any futures
+                    # that haven't started yet so we don't block on them.
+                    if agent._interrupt_requested:
+                        if not _interrupt_logged:
+                            _interrupt_logged = True
+                            agent._vprint(
+                                f"{agent.log_prefix}⚡ Interrupt: cancelling "
+                                f"{len(not_done)} pending concurrent tool(s)",
+                                force=True,
+                            )
+                        for f in not_done:
+                            f.cancel()
+                        # Give already-running tools a moment to notice the
+                        # per-thread interrupt signal and exit gracefully.
+                        concurrent.futures.wait(not_done, timeout=3.0)
+                        break
+
+                    _conc_elapsed = int(time.time() - _conc_start)
+                    # Heartbeat every ~30s (6 × 5s poll intervals)
+                    if _conc_elapsed > 0 and _conc_elapsed % 30 < 6:
+                        _still_running = [
+                            parsed_calls[futures.index(f)][1]
+                            for f in not_done
+                            if f in futures
+                        ]
+                        agent._touch_activity(
+                            f"concurrent tools running ({_conc_elapsed}s, "
+                            f"{len(not_done)} remaining: {', '.join(_still_running[:3])})"
+                        )
+    finally:
+        if spinner:
+            # Build a summary message for the spinner stop
+            completed = sum(1 for r in results if r is not None)
+            total_dur = sum(r[3] for r in results if r is not None)
+            spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total")
+
+    # ── Post-execution: display per-tool results ─────────────────────
+    for i, (tc, name, args, block_result, blocked_by_guardrail) in enumerate(parsed_calls):
+        r = results[i]
+        blocked = False
+        if r is None:
+            # Tool was cancelled (interrupt) or thread didn't return
+            if agent._interrupt_requested:
+                function_result = f"[Tool execution cancelled — {name} was skipped due to user interrupt]"
+            else:
+                function_result = f"Error executing tool '{name}': thread did not return a result"
+            tool_duration = 0.0
+        else:
+            function_name, function_args, function_result, tool_duration, is_error, blocked = r
+
+            if not blocked:
+                function_result = agent._append_guardrail_observation(
+                    function_name,
+                    function_args,
+                    function_result,
+                    failed=is_error,
+                )
+
+            if is_error:
+                _err_text = _multimodal_text_summary(function_result)
+                result_preview = _err_text[:200] if len(_err_text) > 200 else _err_text
+                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
+
+            # Track file-mutation outcome for the turn-end verifier.
+            # `blocked` calls never actually ran — don't let a guardrail
+            # block count as either a failure or a success.
+            if not blocked:
+                try:
+                    agent._record_file_mutation_result(
+                        function_name, function_args, function_result, is_error,
+                    )
+                except Exception as _ver_err:
+                    logging.debug("file-mutation verifier record failed: %s", _ver_err)
+
+            if not blocked and agent.tool_progress_callback:
+                try:
+                    agent.tool_progress_callback(
+                        "tool.completed", function_name, None, None,
+                        duration=tool_duration, is_error=is_error,
+                    )
+                except Exception as cb_err:
+                    logging.debug(f"Tool progress callback error: {cb_err}")
+
+            if agent.verbose_logging:
+                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
+                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")
+
+        # Print cute message per tool
+        if agent._should_emit_quiet_tool_messages():
+            cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
+            agent._safe_print(f"  {cute_msg}")
+        elif not agent.quiet_mode:
+            _preview_str = _multimodal_text_summary(function_result)
+            if agent.verbose_logging:
+                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s")
+                print(agent._wrap_verbose("Result: ", _preview_str))
+            else:
+                response_preview = _preview_str[:agent.log_prefix_chars] + "..." if len(_preview_str) > agent.log_prefix_chars else _preview_str
+                print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")
+
+        agent._current_tool = None
+        agent._touch_activity(f"tool completed: {name} ({tool_duration:.1f}s)")
+
+        if not blocked and agent.tool_complete_callback:
+            try:
+                agent.tool_complete_callback(tc.id, name, args, function_result)
+            except Exception as cb_err:
+                logging.debug(f"Tool complete callback error: {cb_err}")
+
+        function_result = maybe_persist_tool_result(
+            content=function_result,
+            tool_name=name,
+            tool_use_id=tc.id,
+            env=get_active_env(effective_task_id),
+        ) if not _is_multimodal_tool_result(function_result) else function_result
+
+        subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
+        if subdir_hints:
+            if _is_multimodal_tool_result(function_result):
+                # Append the hint to the text summary part so the model
+                # still sees it; don't touch the image blocks.
+                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
+            else:
+                function_result += subdir_hints
+
+        # Unwrap _multimodal dicts to an OpenAI-style content list so any
+        # vision-capable provider receives [{type:text},{type:image_url}]
+        # rather than a raw Python dict.  The Anthropic adapter already
+        # accepts content lists; vision-capable OpenAI-compatible servers
+        # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
+        # Text-only servers get a string-safe fallback here so a rejected
+        # image tool result never poisons canonical session history.
+        # String results pass through unchanged.
+        _tool_content = agent._tool_result_content_for_active_model(name, function_result)
+        tool_msg = {
+            "role": "tool",
+            "name": name,
+            "content": _tool_content,
+            "tool_call_id": tc.id,
+        }
+        messages.append(tool_msg)
+
+        # ── Per-tool /steer drain ───────────────────────────────────
+        # Same as the sequential path: drain between each collected
+        # result so the steer lands as early as possible.
+        agent._apply_pending_steer_to_tool_results(messages, 1)
+
+    # ── Per-turn aggregate budget enforcement ─────────────────────────
+    num_tools = len(parsed_calls)
+    if num_tools > 0:
+        turn_tool_msgs = messages[-num_tools:]
+        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
+
+    # ── /steer injection ──────────────────────────────────────────────
+    # Append any pending user steer text to the last tool result so the
+    # agent sees it on its next iteration. Runs AFTER budget enforcement
+    # so the steer marker is never truncated. See steer() for details.
+    if num_tools > 0:
+        agent._apply_pending_steer_to_tool_results(messages, num_tools)
+
+
+
+def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
+    """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
+    for i, tool_call in enumerate(assistant_message.tool_calls, 1):
+        # SAFETY: check interrupt BEFORE starting each tool.
+        # If the user sent "stop" during a previous tool's execution,
+        # do NOT start any more tools -- skip them all immediately.
+        if agent._interrupt_requested:
+            remaining_calls = assistant_message.tool_calls[i-1:]
+            if remaining_calls:
+                agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
+            for skipped_tc in remaining_calls:
+                skipped_name = skipped_tc.function.name
+                skip_msg = {
+                    "role": "tool",
+                    "name": skipped_name,
+                    "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
+                    "tool_call_id": skipped_tc.id,
+                }
+                messages.append(skip_msg)
+            break
+
+        function_name = tool_call.function.name
+
+        try:
+            function_args = json.loads(tool_call.function.arguments)
+        except json.JSONDecodeError as e:
+            logging.warning(f"Unexpected JSON error after validation: {e}")
+            function_args = {}
+        if not isinstance(function_args, dict):
+            function_args = {}
+
+        # Check plugin hooks for a block directive before executing.
+        _block_msg: Optional[str] = None
+        try:
+            from hermes_cli.plugins import get_pre_tool_call_block_message
+            _block_msg = get_pre_tool_call_block_message(
+                function_name, function_args, task_id=effective_task_id or "",
+            )
+        except Exception:
+            pass
+
+        _guardrail_block_decision: ToolGuardrailDecision | None = None
+        if _block_msg is None:
+            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
+            if not guardrail_decision.allows_execution:
+                _guardrail_block_decision = guardrail_decision
+
+        _execution_blocked = _block_msg is not None or _guardrail_block_decision is not None
+
+        if _execution_blocked:
+            # Tool blocked by plugin or guardrail policy — skip counters,
+            # callbacks, checkpointing, activity mutation, and real execution.
+            pass
+        # Reset nudge counters when the relevant tool is actually used
+        elif function_name == "memory":
+            agent._turns_since_memory = 0
+        elif function_name == "skill_manage":
+            agent._iters_since_skill = 0
+
+        if not agent.quiet_mode:
+            args_str = json.dumps(function_args, ensure_ascii=False)
+            if agent.verbose_logging:
+                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False)))
+            else:
+                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
+                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
+
+        if not _execution_blocked:
+            agent._current_tool = function_name
+            agent._touch_activity(f"executing tool: {function_name}")
+
+        # Set activity callback for long-running tool execution (terminal
+        # commands, etc.) so the gateway's inactivity monitor doesn't kill
+        # the agent while a command is running.
+        if not _execution_blocked:
+            try:
+                from tools.environments.base import set_activity_callback
+                set_activity_callback(agent._touch_activity)
+            except Exception:
+                pass
+
+        if not _execution_blocked and agent.tool_progress_callback:
+            try:
+                preview = _build_tool_preview(function_name, function_args)
+                agent.tool_progress_callback("tool.started", function_name, preview, function_args)
+            except Exception as cb_err:
+                logging.debug(f"Tool progress callback error: {cb_err}")
+
+        if not _execution_blocked and agent.tool_start_callback:
+            try:
+                agent.tool_start_callback(tool_call.id, function_name, function_args)
+            except Exception as cb_err:
+                logging.debug(f"Tool start callback error: {cb_err}")
+
+        # Checkpoint: snapshot working dir before file-mutating tools
+        if not _execution_blocked and function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
+            try:
+                file_path = function_args.get("path", "")
+                if file_path:
+                    work_dir = agent._checkpoint_mgr.get_working_dir_for_path(file_path)
+                    agent._checkpoint_mgr.ensure_checkpoint(
+                        work_dir, f"before {function_name}"
+                    )
+            except Exception:
+                pass  # never block tool execution
+
+        # Checkpoint before destructive terminal commands
+        if not _execution_blocked and function_name == "terminal" and agent._checkpoint_mgr.enabled:
+            try:
+                cmd = function_args.get("command", "")
+                if _is_destructive_command(cmd):
+                    cwd = function_args.get("workdir") or os.getenv("TERMINAL_CWD", os.getcwd())
+                    agent._checkpoint_mgr.ensure_checkpoint(
+                        cwd, f"before terminal: {cmd[:60]}"
+                    )
+            except Exception:
+                pass  # never block tool execution
+
+        tool_start_time = time.time()
+
+        if _block_msg is not None:
+            # Tool blocked by plugin policy — return error without executing.
+            function_result = json.dumps({"error": _block_msg}, ensure_ascii=False)
+            tool_duration = 0.0
+        elif _guardrail_block_decision is not None:
+            # Tool blocked by tool-loop guardrail — synthesize exactly one
+            # tool result for the original tool_call_id without executing.
+            function_result = agent._guardrail_block_result(_guardrail_block_decision)
+            tool_duration = 0.0
+        elif function_name == "todo":
+            from tools.todo_tool import todo_tool as _todo_tool
+            function_result = _todo_tool(
+                todos=function_args.get("todos"),
+                merge=function_args.get("merge", False),
+                store=agent._todo_store,
+            )
+            tool_duration = time.time() - tool_start_time
+            if agent._should_emit_quiet_tool_messages():
+                agent._vprint(f"  {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
+        elif function_name == "session_search":
+            session_db = agent._get_session_db_for_recall()
+            if not session_db:
+                from hermes_state import format_session_db_unavailable
+                function_result = json.dumps({"success": False, "error": format_session_db_unavailable()})
+            else:
+                from tools.session_search_tool import session_search as _session_search
+                function_result = _session_search(
+                    query=function_args.get("query", ""),
+                    role_filter=function_args.get("role_filter"),
+                    limit=function_args.get("limit", 3),
+                    db=session_db,
+                    current_session_id=agent.session_id,
+                )
+            tool_duration = time.time() - tool_start_time
+            if agent._should_emit_quiet_tool_messages():
+                agent._vprint(f"  {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
+        elif function_name == "memory":
+            target = function_args.get("target", "memory")
+            from tools.memory_tool import memory_tool as _memory_tool
+            function_result = _memory_tool(
+                action=function_args.get("action"),
+                target=target,
+                content=function_args.get("content"),
+                old_text=function_args.get("old_text"),
+                store=agent._memory_store,
+            )
+            # Bridge: notify external memory provider of built-in memory writes
+            if agent._memory_manager and function_args.get("action") in {"add", "replace"}:
+                try:
+                    agent._memory_manager.on_memory_write(
+                        function_args.get("action", ""),
+                        target,
+                        function_args.get("content", ""),
+                        metadata=agent._build_memory_write_metadata(
+                            task_id=effective_task_id,
+                            tool_call_id=getattr(tool_call, "id", None),
+                        ),
+                    )
+                except Exception:
+                    pass
+            tool_duration = time.time() - tool_start_time
+            if agent._should_emit_quiet_tool_messages():
+                agent._vprint(f"  {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
+        elif function_name == "clarify":
+            from tools.clarify_tool import clarify_tool as _clarify_tool
+            function_result = _clarify_tool(
+                question=function_args.get("question", ""),
+                choices=function_args.get("choices"),
+                callback=agent.clarify_callback,
+            )
+            tool_duration = time.time() - tool_start_time
+            if agent._should_emit_quiet_tool_messages():
+                agent._vprint(f"  {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
+        elif function_name == "delegate_task":
+            tasks_arg = function_args.get("tasks")
+            if tasks_arg and isinstance(tasks_arg, list):
+                spinner_label = f"🔀 delegating {len(tasks_arg)} tasks"
+            else:
+                goal_preview = (function_args.get("goal") or "")[:30]
+                spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating"
+            spinner = None
+            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
+                face = random.choice(KawaiiSpinner.get_waiting_faces())
+                spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots', print_fn=agent._print_fn)
+                spinner.start()
+            agent._delegate_spinner = spinner
+            _delegate_result = None
+            try:
+                function_result = agent._dispatch_delegate_task(function_args)
+                _delegate_result = function_result
+            finally:
+                agent._delegate_spinner = None
+                tool_duration = time.time() - tool_start_time
+                cute_msg = _get_cute_tool_message_impl('delegate_task', function_args, tool_duration, result=_delegate_result)
+                if spinner:
+                    spinner.stop(cute_msg)
+                elif agent._should_emit_quiet_tool_messages():
+                    agent._vprint(f"  {cute_msg}")
+        elif agent._context_engine_tool_names and function_name in agent._context_engine_tool_names:
+            # Context engine tools (lcm_grep, lcm_describe, lcm_expand, etc.)
+            spinner = None
+            if agent._should_emit_quiet_tool_messages():
+                face = random.choice(KawaiiSpinner.get_waiting_faces())
+                emoji = _get_tool_emoji(function_name)
+                preview = _build_tool_preview(function_name, function_args) or function_name
+                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
+                spinner.start()
+            _ce_result = None
+            try:
+                function_result = agent.context_compressor.handle_tool_call(function_name, function_args, messages=messages)
+                _ce_result = function_result
+            except Exception as tool_error:
+                function_result = json.dumps({"error": f"Context engine tool '{function_name}' failed: {tool_error}"})
+                logger.error("context_engine.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True)
+            finally:
+                tool_duration = time.time() - tool_start_time
+                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_ce_result)
+                if spinner:
+                    spinner.stop(cute_msg)
+                elif agent._should_emit_quiet_tool_messages():
+                    agent._vprint(f"  {cute_msg}")
+        elif agent._memory_manager and agent._memory_manager.has_tool(function_name):
+            # Memory provider tools (hindsight_retain, honcho_search, etc.)
+            # These are not in the tool registry — route through MemoryManager.
+            spinner = None
+            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
+                face = random.choice(KawaiiSpinner.get_waiting_faces())
+                emoji = _get_tool_emoji(function_name)
+                preview = _build_tool_preview(function_name, function_args) or function_name
+                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
+                spinner.start()
+            _mem_result = None
+            try:
+                function_result = agent._memory_manager.handle_tool_call(function_name, function_args)
+                _mem_result = function_result
+            except Exception as tool_error:
+                function_result = json.dumps({"error": f"Memory tool '{function_name}' failed: {tool_error}"})
+                logger.error("memory_manager.handle_tool_call raised for %s: %s", function_name, tool_error, exc_info=True)
+            finally:
+                tool_duration = time.time() - tool_start_time
+                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_mem_result)
+                if spinner:
+                    spinner.stop(cute_msg)
+                elif agent._should_emit_quiet_tool_messages():
+                    agent._vprint(f"  {cute_msg}")
+        elif agent.quiet_mode:
+            spinner = None
+            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
+                face = random.choice(KawaiiSpinner.get_waiting_faces())
+                emoji = _get_tool_emoji(function_name)
+                preview = _build_tool_preview(function_name, function_args) or function_name
+                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
+                spinner.start()
+            _spinner_result = None
+            try:
+                function_result = _ra().handle_function_call(
+                    function_name, function_args, effective_task_id,
+                    tool_call_id=tool_call.id,
+                    session_id=agent.session_id or "",
+                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
+                    skip_pre_tool_call_hook=True,
+                )
+                _spinner_result = function_result
+            except Exception as tool_error:
+                function_result = f"Error executing tool '{function_name}': {tool_error}"
+                logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
+            finally:
+                tool_duration = time.time() - tool_start_time
+                cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
+                if spinner:
+                    spinner.stop(cute_msg)
+                elif agent._should_emit_quiet_tool_messages():
+                    agent._vprint(f"  {cute_msg}")
+        else:
+            try:
+                function_result = _ra().handle_function_call(
+                    function_name, function_args, effective_task_id,
+                    tool_call_id=tool_call.id,
+                    session_id=agent.session_id or "",
+                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
+                    skip_pre_tool_call_hook=True,
+                )
+            except Exception as tool_error:
+                function_result = f"Error executing tool '{function_name}': {tool_error}"
+                logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
+            tool_duration = time.time() - tool_start_time
+
+        if isinstance(function_result, str):
+            result_preview = function_result if agent.verbose_logging else (
+                function_result[:200] if len(function_result) > 200 else function_result
+            )
+            _result_len = len(function_result)
+        else:
+            # Multimodal dict result (_multimodal=True) — not sliceable as string
+            result_preview = function_result
+            _result_len = len(str(function_result))
+
+        # Log tool errors to the persistent error log so [error] tags
+        # in the UI always have a corresponding detailed entry on disk.
+        _is_error_result, _ = _detect_tool_failure(function_name, function_result)
+        if not _execution_blocked:
+            function_result = agent._append_guardrail_observation(
+                function_name,
+                function_args,
+                function_result,
+                failed=_is_error_result,
+            )
+            result_preview = function_result if agent.verbose_logging else (
+                function_result[:200] if len(function_result) > 200 else function_result
+            )
+        if _is_error_result:
+            logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
+        else:
+            logger.info("tool %s completed (%.2fs, %d chars)", function_name, tool_duration, _result_len)
+
+        # Track file-mutation outcome for the turn-end verifier.  See
+        # the concurrent path for the rationale; both paths must feed
+        # the same state so the footer reflects every tool call in the
+        # turn, not just the parallel ones.
+        if not _execution_blocked:
+            try:
+                agent._record_file_mutation_result(
+                    function_name, function_args, function_result, _is_error_result,
+                )
+            except Exception as _ver_err:
+                logging.debug("file-mutation verifier record failed: %s", _ver_err)
+
+        if not _execution_blocked and agent.tool_progress_callback:
+            try:
+                agent.tool_progress_callback(
+                    "tool.completed", function_name, None, None,
+                    duration=tool_duration, is_error=_is_error_result,
+                )
+            except Exception as cb_err:
+                logging.debug(f"Tool progress callback error: {cb_err}")
+
+        agent._current_tool = None
+        agent._touch_activity(f"tool completed: {function_name} ({tool_duration:.1f}s)")
+
+        if agent.verbose_logging:
+            logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
+            _log_result = _multimodal_text_summary(function_result)
+            logging.debug(f"Tool result ({len(_log_result)} chars): {_log_result}")
+
+        if not _execution_blocked and agent.tool_complete_callback:
+            try:
+                agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result)
+            except Exception as cb_err:
+                logging.debug(f"Tool complete callback error: {cb_err}")
+
+        function_result = maybe_persist_tool_result(
+            content=function_result,
+            tool_name=function_name,
+            tool_use_id=tool_call.id,
+            env=get_active_env(effective_task_id),
+        ) if not _is_multimodal_tool_result(function_result) else function_result
+
+        # Discover subdirectory context files from tool arguments
+        subdir_hints = agent._subdirectory_hints.check_tool_call(function_name, function_args)
+        if subdir_hints:
+            if _is_multimodal_tool_result(function_result):
+                _append_subdir_hint_to_multimodal(function_result, subdir_hints)
+            else:
+                function_result += subdir_hints
+
+        # Unwrap _multimodal dicts to an OpenAI-style content list
+        # (see parallel path for rationale). String results pass through.
+        _tool_content = agent._tool_result_content_for_active_model(function_name, function_result)
+        tool_msg = {
+            "role": "tool",
+            "name": function_name,
+            "content": _tool_content,
+            "tool_call_id": tool_call.id
+        }
+        messages.append(tool_msg)
+
+        # ── Per-tool /steer drain ───────────────────────────────────
+        # Drain pending steer BETWEEN individual tool calls so the
+        # injection lands as soon as a tool finishes — not after the
+        # entire batch.  The model sees it on the next API iteration.
+        agent._apply_pending_steer_to_tool_results(messages, 1)
+
+        if not agent.quiet_mode:
+            if agent.verbose_logging:
+                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
+                print(agent._wrap_verbose("Result: ", function_result))
+            else:
+                _fr_str = function_result if isinstance(function_result, str) else str(function_result)
+                response_preview = _fr_str[:agent.log_prefix_chars] + "..." if len(_fr_str) > agent.log_prefix_chars else _fr_str
+                print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")
+
+        if agent._interrupt_requested and i < len(assistant_message.tool_calls):
+            remaining = len(assistant_message.tool_calls) - i
+            agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True)
+            for skipped_tc in assistant_message.tool_calls[i:]:
+                skipped_name = skipped_tc.function.name
+                skip_msg = {
+                    "role": "tool",
+                    "name": skipped_name,
+                    "content": f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
+                    "tool_call_id": skipped_tc.id
+                }
+                messages.append(skip_msg)
+            break
+
+        if agent.tool_delay > 0 and i < len(assistant_message.tool_calls):
+            time.sleep(agent.tool_delay)
+
+    # ── Per-turn aggregate budget enforcement ─────────────────────────
+    num_tools_seq = len(assistant_message.tool_calls)
+    if num_tools_seq > 0:
+        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
+
+    # ── /steer injection ──────────────────────────────────────────────
+    # See _execute_tool_calls_parallel for the rationale. Same hook,
+    # applied to sequential execution as well.
+    if num_tools_seq > 0:
+        agent._apply_pending_steer_to_tool_results(messages, num_tools_seq)
+
+
+
+
+__all__ = [
+    "execute_tool_calls_concurrent",
+    "execute_tool_calls_sequential",
+]
--- a/run_agent.py
+++ b/run_agent.py
--- a/tests/run_agent/test_jsondecodeerror_retryable.py
+++ b/tests/run_agent/test_jsondecodeerror_retryable.py
@ -73,15 +73,20 @@ class TestAgentLoopSourceStillHasCarveOut:
    revert that happens to leave the test file intact."""

    def test_run_agent_excludes_jsondecodeerror_from_local_validation(self):
-        import run_agent
        import inspect
-        src = inspect.getsource(run_agent)
+        from agent import conversation_loop
+        # The agent loop body lives in agent/conversation_loop.py after
+        # the run_agent.py refactor.  Assert the carve-out is present in
+        # the extracted module specifically — if it ever moves back or
+        # disappears, this fails loudly rather than silently passing
+        # against a non-existent inline replica.
+        src = inspect.getsource(conversation_loop)
        # The predicate we care about must reference json.JSONDecodeError
        # in its exclusion tuple. We check for the specific co-occurrence
        # rather than the literal string so harmless reformatting doesn't
        # break us.
        assert "is_local_validation_error" in src
        assert "JSONDecodeError" in src, (
-            "run_agent.py must carve out json.JSONDecodeError from the "
-            "is_local_validation_error classification — see #14782."
+            "agent/conversation_loop.py must carve out json.JSONDecodeError "
+            "from the is_local_validation_error classification — see #14782."
        )
--- a/tests/run_agent/test_memory_nudge_counter_hydration.py
+++ b/tests/run_agent/test_memory_nudge_counter_hydration.py
@ -120,10 +120,22 @@ def test_production_code_contains_hydration_block():
    """Smoke test: confirm the hydration code is actually wired into
    run_conversation(). If someone deletes it, tests above still pass
    against the inline replica — this fails them awake.
+
+    After the run_agent.py refactor the agent-loop body lives in
+    ``agent/conversation_loop.py`` and uses ``agent.X`` rather than
+    ``self.X``.  Assert the block is present in the extracted module
+    specifically — if it ever drifts back into run_agent.py or
+    disappears entirely, this guard fails loudly.
    """
    from pathlib import Path
-    src = Path(__file__).resolve().parents[2] / "run_agent.py"
-    content = src.read_text(encoding="utf-8")
+    repo = Path(__file__).resolve().parents[2]
+    cl_path = repo / "agent" / "conversation_loop.py"
+    src_cl = cl_path.read_text(encoding="utf-8")
    # Anchor on the unique comment + the modulo line.
-    assert "Hydrate per-session nudge counters from persisted history" in content
-    assert "self._turns_since_memory = prior_user_turns % self._memory_nudge_interval" in content
+    assert "Hydrate per-session nudge counters from persisted history" in src_cl, (
+        f"Hydration comment missing from {cl_path}"
+    )
+    assert (
+        "agent._turns_since_memory = prior_user_turns % agent._memory_nudge_interval"
+        in src_cl
+    ), f"Hydration modulo assignment missing from {cl_path}"
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -4879,23 +4879,26 @@ class TestAnthropicInterruptHandler:
    def test_interruptible_has_anthropic_branch(self):
        """The interrupt handler must check api_mode == 'anthropic_messages'."""
        import inspect
-        source = inspect.getsource(AIAgent._interruptible_api_call)
+        from agent.chat_completion_helpers import interruptible_api_call
+        source = inspect.getsource(interruptible_api_call)
        assert "anthropic_messages" in source, \
-            "_interruptible_api_call must handle Anthropic interrupt (api_mode check)"
+            "interruptible_api_call must handle Anthropic interrupt (api_mode check)"

    def test_interruptible_rebuilds_anthropic_client(self):
        """After interrupting, the Anthropic client should be rebuilt."""
        import inspect
-        source = inspect.getsource(AIAgent._interruptible_api_call)
+        from agent.chat_completion_helpers import interruptible_api_call
+        source = inspect.getsource(interruptible_api_call)
        assert "build_anthropic_client" in source, \
-            "_interruptible_api_call must rebuild Anthropic client after interrupt"
+            "interruptible_api_call must rebuild Anthropic client after interrupt"

    def test_streaming_has_anthropic_branch(self):
        """_streaming_api_call must also handle Anthropic interrupt."""
        import inspect
-        source = inspect.getsource(AIAgent._interruptible_streaming_api_call)
+        from agent.chat_completion_helpers import interruptible_streaming_api_call
+        source = inspect.getsource(interruptible_streaming_api_call)
        assert "anthropic_messages" in source, \
-            "_streaming_api_call must handle Anthropic interrupt"
+            "interruptible_streaming_api_call must handle Anthropic interrupt"


 # ---------------------------------------------------------------------------
@ -5304,14 +5307,20 @@ class TestMemoryNudgeCounterPersistence:
    def test_counters_not_reset_in_preamble(self):
        """The run_conversation preamble must not zero the nudge counters."""
        import inspect
-        src = inspect.getsource(AIAgent.run_conversation)
+        from agent.conversation_loop import run_conversation as _rc
+        src = inspect.getsource(_rc)
        # The preamble resets many fields (retry counts, budget, etc.)
        # before the main loop. Find that reset block and verify our
        # counters aren't in it. The reset block ends at iteration_budget.
-        preamble_end = src.index("self.iteration_budget = IterationBudget")
+        # The extracted body uses ``agent.X`` (not ``self.X``).  Anchor
+        # exactly on ``agent.iteration_budget = IterationBudget`` so an
+        # unrelated identifier ending in ``iteration_budget`` (e.g.
+        # ``_iteration_budget`` or ``shared_iteration_budget``) can't
+        # match the boundary.
+        preamble_end = src.index("agent.iteration_budget = IterationBudget")
        preamble = src[:preamble_end]
-        assert "self._turns_since_memory = 0" not in preamble
-        assert "self._iters_since_skill = 0" not in preamble
+        assert "agent._turns_since_memory = 0" not in preamble
+        assert "agent._iters_since_skill = 0" not in preamble


 class TestDeadRetryCode:
@ -5319,7 +5328,8 @@ class TestDeadRetryCode:

    def test_no_unreachable_max_retries_after_backoff(self):
        import inspect
-        source = inspect.getsource(AIAgent.run_conversation)
+        from agent.conversation_loop import run_conversation as _rc
+        source = inspect.getsource(_rc)
        occurrences = source.count("if retry_count >= max_retries:")
        assert occurrences == 2, (
            f"Expected 2 occurrences of 'if retry_count >= max_retries:' "
@ -5357,7 +5367,8 @@ class TestMemoryContextSanitization:
        a literal <memory-context> tag we don't silently delete their text.
        The streaming scrubber + plugin-side scrub cover real leak paths."""
        import inspect
-        src = inspect.getsource(AIAgent.run_conversation)
+        from agent.conversation_loop import run_conversation as _rc
+        src = inspect.getsource(_rc)
        assert "sanitize_context(user_message)" not in src
        assert "sanitize_context(persist_user_message)" not in src

@ -5393,7 +5404,8 @@ class TestMemoryProviderTurnStart:
    def test_on_turn_start_called_before_prefetch(self):
        """Source-level check: on_turn_start appears before prefetch_all in run_conversation."""
        import inspect
-        src = inspect.getsource(AIAgent.run_conversation)
+        from agent.conversation_loop import run_conversation as _rc
+        src = inspect.getsource(_rc)
        # Find the actual method calls, not comments
        idx_turn_start = src.index(".on_turn_start(")
        idx_prefetch = src.index(".prefetch_all(")
@ -5403,7 +5415,10 @@ class TestMemoryProviderTurnStart:
        )

    def test_on_turn_start_uses_user_turn_count(self):
-        """Source-level check: on_turn_start receives self._user_turn_count."""
+        """Source-level check: on_turn_start receives the user_turn_count."""
        import inspect
-        src = inspect.getsource(AIAgent.run_conversation)
-        assert "on_turn_start(self._user_turn_count" in src
+        from agent.conversation_loop import run_conversation as _rc
+        src = inspect.getsource(_rc)
+        # The extracted body uses ``agent.X`` rather than ``self.X``;
+        # assert the extracted-form spelling directly.
+        assert "on_turn_start(agent._user_turn_count" in src
--- a/tests/run_agent/test_tool_executor_contextvar_propagation.py
+++ b/tests/run_agent/test_tool_executor_contextvar_propagation.py
@ -152,19 +152,28 @@ def test_run_agent_concurrent_executor_wraps_submit_with_copy_context():
    import inspect

    import run_agent
+    from agent import tool_executor as tool_executor_module

-    src_path = inspect.getsourcefile(run_agent)
-    assert src_path is not None
-    tree = ast.parse(open(src_path, encoding="utf-8").read())
+    # Source for both modules — the concurrent-executor body lives in
+    # ``agent/tool_executor.py`` after the run_agent.py refactor (PR
+    # following #16660).  Search both so this guard keeps firing
+    # regardless of where the call site lives.
+    sources = []
+    for mod in (run_agent, tool_executor_module):
+        src_path = inspect.getsourcefile(mod)
+        assert src_path is not None
+        sources.append((src_path, open(src_path, encoding="utf-8").read()))

    submit_calls_in_agent: list[ast.Call] = []
-    for node in ast.walk(tree):
-        if not isinstance(node, ast.Call):
-            continue
-        func = node.func
-        # Match executor.submit(...) style calls.
-        if isinstance(func, ast.Attribute) and func.attr == "submit":
-            submit_calls_in_agent.append(node)
+    for _src_path, src_text in sources:
+        tree = ast.parse(src_text)
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.Call):
+                continue
+            func = node.func
+            # Match executor.submit(...) style calls.
+            if isinstance(func, ast.Attribute) and func.attr == "submit":
+                submit_calls_in_agent.append(node)

    # Filter to the submit call inside the concurrent tool executor —
    # identifiable by passing `_run_tool` as its target. Other submit()