diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py new file mode 100644 index 00000000000..c95f1b63385 --- /dev/null +++ b/agent/conversation_loop.py @@ -0,0 +1,3964 @@ +"""The agent conversation loop β€” extracted from ``run_agent.AIAgent``. + +This is the biggest single chunk pulled out of ``run_agent.py``: the +roughly 3,900-line :func:`run_conversation` body that drives one user +turn through the agent (model call, tool dispatch, retries, fallbacks, +compression, post-turn hooks, background memory/skill review nudges). + +The function takes the parent ``AIAgent`` instance as its first +argument (``agent``) and accesses its state via attribute lookup. +``_ra().AIAgent.run_conversation`` is now a thin forwarder. + +Symbols that production code or tests patch on ``run_agent`` directly +(``handle_function_call``, ``_set_interrupt``, ``OpenAI``, ...) are +resolved through :func:`_ra` so those patches keep working. +""" + +from __future__ import annotations + +import json +import logging +import os +import random +import re +import ssl +import threading +import time +import uuid +from typing import Any, Dict, List, Optional + +from agent.anthropic_adapter import _is_oauth_token +from agent.auxiliary_client import set_runtime_main +from agent.codex_responses_adapter import _summarize_user_message_for_log +from agent.display import KawaiiSpinner +from agent.error_classifier import FailoverReason, classify_api_error +from agent.iteration_budget import IterationBudget +from agent.memory_manager import build_memory_context_block +from agent.message_sanitization import ( + _repair_tool_call_arguments, + _sanitize_messages_non_ascii, + _sanitize_messages_surrogates, + _sanitize_structure_non_ascii, + _sanitize_structure_surrogates, + _sanitize_surrogates, + _sanitize_tools_non_ascii, + _strip_images_from_messages, + _strip_non_ascii, +) +from agent.model_metadata import ( + estimate_messages_tokens_rough, + estimate_request_tokens_rough, + get_next_probe_tier, + parse_available_output_tokens_from_error, + parse_context_limit_from_error, + save_context_length, +) +from agent.nous_rate_guard import ( + clear_nous_rate_limit, + is_genuine_nous_rate_limit, + nous_rate_limit_remaining, + record_nous_rate_limit, +) +from agent.process_bootstrap import _install_safe_stdio +from agent.prompt_caching import apply_anthropic_cache_control +from agent.retry_utils import jittered_backoff +from agent.trajectory import has_incomplete_scratchpad +from agent.usage_pricing import estimate_usage_cost, normalize_usage +from hermes_constants import display_hermes_home as _dhh_fn +from hermes_logging import set_session_context +from tools.schema_sanitizer import strip_pattern_and_format +from tools.skill_provenance import set_current_write_origin +from utils import base_url_host_matches, env_var_enabled + +logger = logging.getLogger(__name__) + + +def _ra(): + """Lazy reference to ``run_agent`` so callers can patch + ``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` / + ``run_agent.OpenAI`` and have those patches reach this code path. + """ + import run_agent + return run_agent + + +def run_conversation( + agent, + user_message: str, + system_message: str = None, + conversation_history: List[Dict[str, Any]] = None, + task_id: str = None, + stream_callback: Optional[callable] = None, + persist_user_message: Optional[str] = None, +) -> Dict[str, Any]: + """ + Run a complete conversation with tool calling until completion. + + Args: + user_message (str): The user's message/question + system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided) + conversation_history (List[Dict]): Previous conversation messages (optional) + task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided) + stream_callback: Optional callback invoked with each text delta during streaming. + Used by the TTS pipeline to start audio generation before the full response. + When None (default), API calls use the standard non-streaming path. + persist_user_message: Optional clean user message to store in + transcripts/history when user_message contains API-only + synthetic prefixes. + or queuing follow-up prefetch work. + + Returns: + Dict: Complete conversation result with final response and message history + """ + # Guard stdio against OSError from broken pipes (systemd/headless/daemon). + # Installed once, transparent when streams are healthy, prevents crash on write. + _install_safe_stdio() + + agent._ensure_db_session() + + # Tell auxiliary_client what the live main provider/model are for + # this turn. Used by tools whose behaviour depends on the active + # main model (e.g. vision_analyze's native fast path) so they see + # the CLI/gateway override instead of the stale config.yaml + # default. Idempotent β€” fine to call every turn. + try: + from agent.auxiliary_client import set_runtime_main + set_runtime_main( + getattr(agent, "provider", "") or "", + getattr(agent, "model", "") or "", + ) + except Exception: + pass + + # Tag all log records on this thread with the session ID so + # ``hermes logs --session `` can filter a single conversation. + from hermes_logging import set_session_context + set_session_context(agent.session_id) + + # Bind the skill write-origin ContextVar for this thread so tool + # handlers (e.g. skill_manage create) can tell whether they are + # running inside the background agent-improvement review fork vs. + # a foreground user-directed turn. Set at the top of each call; + # the review fork runs on its own thread with a fresh context, + # so the foreground value here does not leak into it. + from tools.skill_provenance import set_current_write_origin + set_current_write_origin(getattr(agent, "_memory_write_origin", "assistant_tool")) + + # If the previous turn activated fallback, restore the primary + # runtime so this turn gets a fresh attempt with the preferred model. + # No-op when _fallback_activated is False (gateway, first turn, etc.). + agent._restore_primary_runtime() + + # Sanitize surrogate characters from user input. Clipboard paste from + # rich-text editors (Google Docs, Word, etc.) can inject lone surrogates + # that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK. + if isinstance(user_message, str): + user_message = _sanitize_surrogates(user_message) + if isinstance(persist_user_message, str): + persist_user_message = _sanitize_surrogates(persist_user_message) + + # Store stream callback for _interruptible_api_call to pick up + agent._stream_callback = stream_callback + agent._persist_user_message_idx = None + agent._persist_user_message_override = persist_user_message + # Generate unique task_id if not provided to isolate VMs between concurrent tasks + effective_task_id = task_id or str(uuid.uuid4()) + # Expose the active task_id so tools running mid-turn (e.g. delegate_task + # in delegate_tool.py) can identify this agent for the cross-agent file + # state registry. Set BEFORE any tool dispatch so snapshots taken at + # child-launch time see the parent's real id, not None. + agent._current_task_id = effective_task_id + + # Reset retry counters and iteration budget at the start of each turn + # so subagent usage from a previous turn doesn't eat into the next one. + agent._invalid_tool_retries = 0 + agent._invalid_json_retries = 0 + agent._empty_content_retries = 0 + agent._incomplete_scratchpad_retries = 0 + agent._codex_incomplete_retries = 0 + agent._thinking_prefill_retries = 0 + agent._post_tool_empty_retried = False + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + agent._mute_post_response = False + agent._unicode_sanitization_passes = 0 + agent._tool_guardrails.reset_for_turn() + agent._tool_guardrail_halt_decision = None + # True until the server rejects an image_url content part with an error + # like "Only 'text' content type is supported." Set to False on first + # rejection and kept False for the rest of the session so we never re-send + # images to a text-only endpoint. Scoped per `_run()` call, not per instance. + agent._vision_supported = True + + # Pre-turn connection health check: detect and clean up dead TCP + # connections left over from provider outages or dropped streams. + # This prevents the next API call from hanging on a zombie socket. + if agent.api_mode != "anthropic_messages": + try: + if agent._cleanup_dead_connections(): + agent._emit_status( + "πŸ”Œ Detected stale connections from a previous provider " + "issue β€” cleaned up automatically. Proceeding with fresh " + "connection." + ) + except Exception: + pass + # Replay compression warning through status_callback for gateway + # platforms (the callback was not wired during __init__). + if agent._compression_warning: + agent._replay_compression_warning() + agent._compression_warning = None # send once + + # NOTE: _turns_since_memory and _iters_since_skill are NOT reset here. + # They are initialized in __init__ and must persist across run_conversation + # calls so that nudge logic accumulates correctly in CLI mode. + agent.iteration_budget = IterationBudget(agent.max_iterations) + + # Log conversation turn start for debugging/observability + _preview_text = _summarize_user_message_for_log(user_message) + _msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text + _msg_preview = _msg_preview.replace("\n", " ") + logger.info( + "conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r", + agent.session_id or "none", agent.model, agent.provider or "unknown", + agent.platform or "unknown", len(conversation_history or []), + _msg_preview, + ) + + # Initialize conversation (copy to avoid mutating the caller's list) + messages = list(conversation_history) if conversation_history else [] + + # Hydrate todo store from conversation history (gateway creates a fresh + # AIAgent per message, so the in-memory store is empty -- we need to + # recover the todo state from the most recent todo tool response in history) + if conversation_history and not agent._todo_store.has_items(): + agent._hydrate_todo_store(conversation_history) + + # Hydrate per-session nudge counters from persisted history. + # Gateway creates a fresh AIAgent per inbound message (cache miss / + # 1h idle eviction / config-signature mismatch / process restart), so + # _turns_since_memory and _user_turn_count start at 0 every turn and + # the memory.nudge_interval trigger may never be reached. Reconstruct + # an effective count from prior user turns in conversation_history. + # Idempotent: a cached agent that already accumulated counters keeps + # them; only a freshly-built agent with empty in-memory state hydrates. + # See issue #22357. + if conversation_history and agent._user_turn_count == 0: + prior_user_turns = sum( + 1 for m in conversation_history if m.get("role") == "user" + ) + if prior_user_turns > 0: + agent._user_turn_count = prior_user_turns + if agent._memory_nudge_interval > 0 and agent._turns_since_memory == 0: + # % preserves original 1-in-N cadence rather than firing a + # review immediately on resume (which would surprise users + # whose session happened to land just past a multiple of N). + agent._turns_since_memory = prior_user_turns % agent._memory_nudge_interval + + + # Prefill messages (few-shot priming) are injected at API-call time only, + # never stored in the messages list. This keeps them ephemeral: they won't + # be saved to session DB, session logs, or batch trajectories, but they're + # automatically re-applied on every API call (including session continuations). + + # Track user turns for memory flush and periodic nudge logic + agent._user_turn_count += 1 + + # Reset the streaming context scrubber at the top of each turn so a + # hung span from a prior interrupted stream can't taint this turn's + # output. + scrubber = getattr(agent, "_stream_context_scrubber", None) + if scrubber is not None: + scrubber.reset() + # Reset the think scrubber for the same reason β€” an interrupted + # prior stream may have left us inside an unterminated block. + think_scrubber = getattr(agent, "_stream_think_scrubber", None) + if think_scrubber is not None: + think_scrubber.reset() + + # Preserve the original user message (no nudge injection). + original_user_message = persist_user_message if persist_user_message is not None else user_message + + # Track memory nudge trigger (turn-based, checked here). + # Skill trigger is checked AFTER the agent loop completes, based on + # how many tool iterations THIS turn used. + _should_review_memory = False + if (agent._memory_nudge_interval > 0 + and "memory" in agent.valid_tool_names + and agent._memory_store): + agent._turns_since_memory += 1 + if agent._turns_since_memory >= agent._memory_nudge_interval: + _should_review_memory = True + agent._turns_since_memory = 0 + + # Add user message + user_msg = {"role": "user", "content": user_message} + messages.append(user_msg) + current_turn_user_idx = len(messages) - 1 + agent._persist_user_message_idx = current_turn_user_idx + + if not agent.quiet_mode: + _print_preview = _summarize_user_message_for_log(user_message) + agent._safe_print(f"πŸ’¬ Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'") + + # ── System prompt (cached per session for prefix caching) ── + # Built once on first call, reused for all subsequent calls. + # Only rebuilt after context compression events (which invalidate + # the cache and reload memory from disk). + # + # For continuing sessions (gateway creates a fresh AIAgent per + # message), we load the stored system prompt from the session DB + # instead of rebuilding. Rebuilding would pick up memory changes + # from disk that the model already knows about (it wrote them!), + # producing a different system prompt and breaking the Anthropic + # prefix cache. + if agent._cached_system_prompt is None: + stored_prompt = None + if conversation_history and agent._session_db: + try: + session_row = agent._session_db.get_session(agent.session_id) + if session_row: + stored_prompt = session_row.get("system_prompt") or None + except Exception: + pass # Fall through to build fresh + + if stored_prompt: + # Continuing session β€” reuse the exact system prompt from + # the previous turn so the Anthropic cache prefix matches. + agent._cached_system_prompt = stored_prompt + else: + # First turn of a new session β€” build from scratch. + agent._cached_system_prompt = agent._build_system_prompt(system_message) + # Plugin hook: on_session_start + # Fired once when a brand-new session is created (not on + # continuation). Plugins can use this to initialise + # session-scoped state (e.g. warm a memory cache). + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _invoke_hook( + "on_session_start", + session_id=agent.session_id, + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + except Exception as exc: + logger.warning("on_session_start hook failed: %s", exc) + + # Store the system prompt snapshot in SQLite + if agent._session_db: + try: + agent._session_db.update_system_prompt(agent.session_id, agent._cached_system_prompt) + except Exception as e: + logger.debug("Session DB update_system_prompt failed: %s", e) + + active_system_prompt = agent._cached_system_prompt + + # ── Preflight context compression ── + # Before entering the main loop, check if the loaded conversation + # history already exceeds the model's context threshold. This handles + # cases where a user switches to a model with a smaller context window + # while having a large existing session β€” compress proactively rather + # than waiting for an API error (which might be caught as a non-retryable + # 4xx and abort the request entirely). + if ( + agent.compression_enabled + and len(messages) > agent.context_compressor.protect_first_n + + agent.context_compressor.protect_last_n + 1 + ): + # Include tool schema tokens β€” with many tools these can add + # 20-30K+ tokens that the old sys+msg estimate missed entirely. + _preflight_tokens = estimate_request_tokens_rough( + messages, + system_prompt=active_system_prompt or "", + tools=agent.tools or None, + ) + + if _preflight_tokens >= agent.context_compressor.threshold_tokens: + logger.info( + "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", + f"{_preflight_tokens:,}", + f"{agent.context_compressor.threshold_tokens:,}", + agent.model, + f"{agent.context_compressor.context_length:,}", + ) + agent._emit_status( + f"πŸ“¦ Preflight compression: ~{_preflight_tokens:,} tokens " + f">= {agent.context_compressor.threshold_tokens:,} threshold. " + "This may take a moment." + ) + # May need multiple passes for very large sessions with small + # context windows (each pass summarises the middle N turns). + for _pass in range(3): + _orig_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, approx_tokens=_preflight_tokens, + task_id=effective_task_id, + ) + if len(messages) >= _orig_len: + break # Cannot compress further + # Compression created a new session β€” clear the history + # reference so _flush_messages_to_session_db writes ALL + # compressed messages to the new session's SQLite, not + # skipping them because conversation_history is still the + # pre-compression length. + conversation_history = None + # Fix: reset retry counters after compression so the model + # gets a fresh budget on the compressed context. Without + # this, pre-compression retries carry over and the model + # hits "(empty)" immediately after compression-induced + # context loss. + agent._empty_content_retries = 0 + agent._thinking_prefill_retries = 0 + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + agent._mute_post_response = False + # Re-estimate after compression + _preflight_tokens = estimate_request_tokens_rough( + messages, + system_prompt=active_system_prompt or "", + tools=agent.tools or None, + ) + if _preflight_tokens < agent.context_compressor.threshold_tokens: + break # Under threshold + + # Plugin hook: pre_llm_call + # Fired once per turn before the tool-calling loop. Plugins can + # return a dict with a ``context`` key (or a plain string) whose + # value is appended to the current turn's user message. + # + # Context is ALWAYS injected into the user message, never the + # system prompt. This preserves the prompt cache prefix β€” the + # system prompt stays identical across turns so cached tokens + # are reused. The system prompt is Hermes's territory; plugins + # contribute context alongside the user's input. + # + # All injected context is ephemeral (not persisted to session DB). + _plugin_user_context = "" + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _pre_results = _invoke_hook( + "pre_llm_call", + session_id=agent.session_id, + user_message=original_user_message, + conversation_history=list(messages), + is_first_turn=(not bool(conversation_history)), + model=agent.model, + platform=getattr(agent, "platform", None) or "", + sender_id=getattr(agent, "_user_id", None) or "", + ) + _ctx_parts: list[str] = [] + for r in _pre_results: + if isinstance(r, dict) and r.get("context"): + _ctx_parts.append(str(r["context"])) + elif isinstance(r, str) and r.strip(): + _ctx_parts.append(r) + if _ctx_parts: + _plugin_user_context = "\n\n".join(_ctx_parts) + except Exception as exc: + logger.warning("pre_llm_call hook failed: %s", exc) + + # Main conversation loop + api_call_count = 0 + final_response = None + interrupted = False + codex_ack_continuations = 0 + length_continue_retries = 0 + truncated_tool_call_retries = 0 + truncated_response_prefix = "" + compression_attempts = 0 + _turn_exit_reason = "unknown" # Diagnostic: why the loop ended + + # Per-turn file-mutation verifier state. Keyed by resolved path; + # each failed ``write_file`` / ``patch`` call records the error + # preview. Later successful writes to the same path remove the + # entry (the model recovered). At end-of-turn, any entries still + # present are surfaced in an advisory footer so the model cannot + # over-claim success while the file is actually unchanged on disk. + agent._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {} + + # Record the execution thread so interrupt()/clear_interrupt() can + # scope the tool-level interrupt signal to THIS agent's thread only. + # Must be set before any thread-scoped interrupt syncing. + agent._execution_thread_id = threading.current_thread().ident + + # Always clear stale per-thread state from a previous turn. If an + # interrupt arrived before startup finished, preserve it and bind it + # to this execution thread now instead of dropping it on the floor. + _ra()._set_interrupt(False, agent._execution_thread_id) + if agent._interrupt_requested: + _ra()._set_interrupt(True, agent._execution_thread_id) + agent._interrupt_thread_signal_pending = False + else: + agent._interrupt_message = None + agent._interrupt_thread_signal_pending = False + + # Notify memory providers of the new turn so cadence tracking works. + # Must happen BEFORE prefetch_all() so providers know which turn it is + # and can gate context/dialectic refresh via contextCadence/dialecticCadence. + if agent._memory_manager: + try: + _turn_msg = original_user_message if isinstance(original_user_message, str) else "" + agent._memory_manager.on_turn_start(agent._user_turn_count, _turn_msg) + except Exception: + pass + + # External memory provider: prefetch once before the tool loop. + # Reuse the cached result on every iteration to avoid re-calling + # prefetch_all() on each tool call (10 tool calls = 10x latency + cost). + # Use original_user_message (clean input) β€” user_message may contain + # injected skill content that bloats / breaks provider queries. + _ext_prefetch_cache = "" + if agent._memory_manager: + try: + _query = original_user_message if isinstance(original_user_message, str) else "" + _ext_prefetch_cache = agent._memory_manager.prefetch_all(_query) or "" + except Exception: + pass + + # Optional opt-in runtime: if api_mode == codex_app_server, hand the + # turn to the codex app-server subprocess (terminal/file ops/patching + # all run inside Codex). Default Hermes path is bypassed entirely. + # See agent/transports/codex_app_server_session.py for the adapter + # and references/codex-app-server-runtime.md for the rationale. + if agent.api_mode == "codex_app_server": + return agent._run_codex_app_server_turn( + user_message=user_message, + original_user_message=original_user_message, + messages=messages, + effective_task_id=effective_task_id, + should_review_memory=_should_review_memory, + ) + + while (api_call_count < agent.max_iterations and agent.iteration_budget.remaining > 0) or agent._budget_grace_call: + # Reset per-turn checkpoint dedup so each iteration can take one snapshot + agent._checkpoint_mgr.new_turn() + + # Check for interrupt request (e.g., user sent new message) + if agent._interrupt_requested: + interrupted = True + _turn_exit_reason = "interrupted_by_user" + if not agent.quiet_mode: + agent._safe_print("\n⚑ Breaking out of tool loop due to interrupt...") + break + + api_call_count += 1 + agent._api_call_count = api_call_count + agent._touch_activity(f"starting API call #{api_call_count}") + + # Grace call: the budget is exhausted but we gave the model one + # more chance. Consume the grace flag so the loop exits after + # this iteration regardless of outcome. + if agent._budget_grace_call: + agent._budget_grace_call = False + elif not agent.iteration_budget.consume(): + _turn_exit_reason = "budget_exhausted" + if not agent.quiet_mode: + agent._safe_print(f"\n⚠️ Iteration budget exhausted ({agent.iteration_budget.used}/{agent.iteration_budget.max_total} iterations used)") + break + + # Fire step_callback for gateway hooks (agent:step event) + if agent.step_callback is not None: + try: + prev_tools = [] + for _idx, _m in enumerate(reversed(messages)): + if _m.get("role") == "assistant" and _m.get("tool_calls"): + _fwd_start = len(messages) - _idx + _results_by_id = {} + for _tm in messages[_fwd_start:]: + if _tm.get("role") != "tool": + break + _tcid = _tm.get("tool_call_id") + if _tcid: + _results_by_id[_tcid] = _tm.get("content", "") + prev_tools = [ + { + "name": tc["function"]["name"], + "result": _results_by_id.get(tc.get("id")), + "arguments": tc["function"].get("arguments"), + } + for tc in _m["tool_calls"] + if isinstance(tc, dict) + ] + break + agent.step_callback(api_call_count, prev_tools) + except Exception as _step_err: + logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err) + + # Track tool-calling iterations for skill nudge. + # Counter resets whenever skill_manage is actually used. + if (agent._skill_nudge_interval > 0 + and "skill_manage" in agent.valid_tool_names): + agent._iters_since_skill += 1 + + # ── Pre-API-call /steer drain ────────────────────────────────── + # If a /steer arrived during the previous API call (while the model + # was thinking), drain it now β€” before we build api_messages β€” so + # the model sees the steer text on THIS iteration. Without this, + # steers sent during an API call only land after the NEXT tool batch, + # which may never come if the model returns a final response. + # + # We scan backwards for the last tool-role message in the messages + # list. If found, the steer is appended there. If not (first + # iteration, no tools yet), the steer stays pending for the next + # tool batch β€” injecting into a user message would break role + # alternation, and there's no tool output to piggyback on. + _pre_api_steer = agent._drain_pending_steer() + if _pre_api_steer: + _injected = False + for _si in range(len(messages) - 1, -1, -1): + _sm = messages[_si] + if isinstance(_sm, dict) and _sm.get("role") == "tool": + marker = f"\n\nUser guidance: {_pre_api_steer}" + existing = _sm.get("content", "") + if isinstance(existing, str): + _sm["content"] = existing + marker + else: + # Multimodal content blocks β€” append text block + try: + blocks = list(existing) if existing else [] + blocks.append({"type": "text", "text": marker}) + _sm["content"] = blocks + except Exception: + pass + _injected = True + logger.debug( + "Pre-API-call steer drain: injected into tool msg at index %d", + _si, + ) + break + if not _injected: + # No tool message to inject into β€” put it back so + # the post-tool-execution drain picks it up later. + _lock = getattr(agent, "_pending_steer_lock", None) + if _lock is not None: + with _lock: + if agent._pending_steer: + agent._pending_steer = agent._pending_steer + "\n" + _pre_api_steer + else: + agent._pending_steer = _pre_api_steer + else: + existing = getattr(agent, "_pending_steer", None) + agent._pending_steer = (existing + "\n" + _pre_api_steer) if existing else _pre_api_steer + + # Prepare messages for API call + # If we have an ephemeral system prompt, prepend it to the messages + # Note: Reasoning is embedded in content via tags for trajectory storage. + # However, providers like Moonshot AI require a separate 'reasoning_content' field + # on assistant messages with tool_calls. We handle both cases here. + request_logger = getattr(agent, "logger", None) or logging.getLogger(__name__) + repaired_tool_calls = agent._sanitize_tool_call_arguments( + messages, + logger=request_logger, + session_id=agent.session_id, + ) + if repaired_tool_calls > 0: + request_logger.info( + "Sanitized %s corrupted tool_call arguments before request (session=%s)", + repaired_tool_calls, + agent.session_id or "-", + ) + + # Defensive: repair malformed role-alternation before API call. + # Catches cases where the history got wedged into a + # ``tool β†’ user`` or ``user β†’ user`` tail (e.g. after empty- + # response scaffolding was stripped and a new user message + # landed after an orphan tool result). Most providers return + # empty content on malformed sequences, which would otherwise + # retrigger the empty-retry loop indefinitely. + repaired_seq = agent._repair_message_sequence(messages) + if repaired_seq > 0: + request_logger.info( + "Repaired %s message-alternation violations before request (session=%s)", + repaired_seq, + agent.session_id or "-", + ) + + api_messages = [] + for idx, msg in enumerate(messages): + api_msg = msg.copy() + + # Inject ephemeral context into the current turn's user message. + # Sources: memory manager prefetch + plugin pre_llm_call hooks + # with target="user_message" (the default). Both are + # API-call-time only β€” the original message in `messages` is + # never mutated, so nothing leaks into session persistence. + if idx == current_turn_user_idx and msg.get("role") == "user": + _injections = [] + if _ext_prefetch_cache: + _fenced = build_memory_context_block(_ext_prefetch_cache) + if _fenced: + _injections.append(_fenced) + if _plugin_user_context: + _injections.append(_plugin_user_context) + if _injections: + _base = api_msg.get("content", "") + if isinstance(_base, str): + api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections) + + # For ALL assistant messages, pass reasoning back to the API + # This ensures multi-turn reasoning context is preserved + agent._copy_reasoning_content_for_api(msg, api_msg) + + # Remove 'reasoning' field - it's for trajectory storage only + # We've copied it to 'reasoning_content' for the API above + if "reasoning" in api_msg: + api_msg.pop("reasoning") + # Remove finish_reason - not accepted by strict APIs (e.g. Mistral) + if "finish_reason" in api_msg: + api_msg.pop("finish_reason") + # Strip internal thinking-prefill marker + api_msg.pop("_thinking_prefill", None) + # Strip Codex Responses API fields (call_id, response_item_id) for + # strict providers like Mistral, Fireworks, etc. that reject unknown fields. + # Uses new dicts so the internal messages list retains the fields + # for Codex Responses compatibility. + if agent._should_sanitize_tool_calls(): + agent._sanitize_tool_calls_for_strict_api(api_msg) + # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context + # The signature field helps maintain reasoning continuity + api_messages.append(api_msg) + + # Build the final system message: cached prompt + ephemeral system prompt. + # Ephemeral additions are API-call-time only (not persisted to session DB). + # External recall context is injected into the user message, not the system + # prompt, so the stable cache prefix remains unchanged. + # + # NOTE: Plugin context from pre_llm_call hooks is injected into the + # user message (see injection block above), NOT the system prompt. + # This is intentional β€” system prompt modifications break the prompt + # cache prefix. The system prompt is reserved for Hermes internals. + # + # Hermes invariant: the system prompt is built ONCE per session + # (cached on ``_cached_system_prompt``) and replayed verbatim on + # every turn. We send it as a single content string so the + # bytes are byte-stable across turns and upstream prompt caches + # stay warm. + effective_system = active_system_prompt or "" + if agent.ephemeral_system_prompt: + effective_system = (effective_system + "\n\n" + agent.ephemeral_system_prompt).strip() + if effective_system: + api_messages = [{"role": "system", "content": effective_system}] + api_messages + + # Inject ephemeral prefill messages right after the system prompt + # but before conversation history. Same API-call-time-only pattern. + if agent.prefill_messages: + sys_offset = 1 if (api_messages and api_messages[0].get("role") == "system") else 0 + for idx, pfm in enumerate(agent.prefill_messages): + api_messages.insert(sys_offset + idx, pfm.copy()) + + # Apply Anthropic prompt caching for Claude models on native + # Anthropic, OpenRouter, and third-party Anthropic-compatible + # gateways. Auto-detected: if ``_use_prompt_caching`` is set, + # inject cache_control breakpoints (system + last 3 messages) + # to reduce input token costs by ~75% on multi-turn + # conversations. + if agent._use_prompt_caching: + api_messages = apply_anthropic_cache_control( + api_messages, + cache_ttl=agent._cache_ttl, + native_anthropic=agent._use_native_cache_layout, + ) + + # Safety net: strip orphaned tool results / add stubs for missing + # results before sending to the API. Runs unconditionally β€” not + # gated on context_compressor β€” so orphans from session loading or + # manual message manipulation are always caught. + api_messages = agent._sanitize_api_messages(api_messages) + + # Drop thinking-only assistant turns (reasoning but no visible + # output and no tool_calls) and merge any adjacent user messages + # left behind. Prevents Anthropic 400s ("The final block in an + # assistant message cannot be `thinking`.") and equivalent errors + # from third-party Anthropic-compatible gateways that can't replay + # a thinking-only turn. Runs on the per-call copy only β€” the + # stored conversation history keeps the reasoning block for the + # UI transcript and session persistence. + api_messages = agent._drop_thinking_only_and_merge_users(api_messages) + + # Normalize message whitespace and tool-call JSON for consistent + # prefix matching. Ensures bit-perfect prefixes across turns, + # which enables KV cache reuse on local inference servers + # (llama.cpp, vLLM, Ollama) and improves cache hit rates for + # cloud providers. Operates on api_messages (the API copy) so + # the original conversation history in `messages` is untouched. + for am in api_messages: + if isinstance(am.get("content"), str): + am["content"] = am["content"].strip() + for am in api_messages: + tcs = am.get("tool_calls") + if not tcs: + continue + new_tcs = [] + for tc in tcs: + if isinstance(tc, dict) and "function" in tc: + try: + args_obj = json.loads(tc["function"]["arguments"]) + tc = {**tc, "function": { + **tc["function"], + "arguments": json.dumps( + args_obj, separators=(",", ":"), + sort_keys=True, + ), + }} + except Exception: + tc["function"]["arguments"] = _repair_tool_call_arguments( + tc["function"]["arguments"], + tc["function"].get("name", "?"), + ) + new_tcs.append(tc) + am["tool_calls"] = new_tcs + + # Proactively strip any surrogate characters before the API call. + # Models served via Ollama (Kimi K2.5, GLM-5, Qwen) can return + # lone surrogates (U+D800-U+DFFF) that crash json.dumps() inside + # the OpenAI SDK. Sanitizing here prevents the 3-retry cycle. + _sanitize_messages_surrogates(api_messages) + + # Calculate approximate request size for logging + total_chars = sum(len(str(msg)) for msg in api_messages) + approx_tokens = estimate_messages_tokens_rough(api_messages) + + # Thinking spinner for quiet mode (animated during API call) + thinking_spinner = None + + if not agent.quiet_mode: + agent._vprint(f"\n{agent.log_prefix}πŸ”„ Making API call #{api_call_count}/{agent.max_iterations}...") + agent._vprint(f"{agent.log_prefix} πŸ“Š Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") + agent._vprint(f"{agent.log_prefix} πŸ”§ Available tools: {len(agent.tools) if agent.tools else 0}") + else: + # Animated thinking spinner in quiet mode + face = random.choice(KawaiiSpinner.get_thinking_faces()) + verb = random.choice(KawaiiSpinner.get_thinking_verbs()) + if agent.thinking_callback: + # CLI TUI mode: use prompt_toolkit widget instead of raw spinner + # (works in both streaming and non-streaming modes) + agent.thinking_callback(f"{face} {verb}...") + elif not agent._has_stream_consumers() and agent._should_start_quiet_spinner(): + # Raw KawaiiSpinner only when no streaming consumers and the + # spinner output has a safe sink. + spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star']) + thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type, print_fn=agent._print_fn) + thinking_spinner.start() + + # Log request details if verbose + if agent.verbose_logging: + logging.debug(f"API Request - Model: {agent.model}, Messages: {len(messages)}, Tools: {len(agent.tools) if agent.tools else 0}") + logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}") + logging.debug(f"Total message size: ~{approx_tokens:,} tokens") + + api_start_time = time.time() + retry_count = 0 + max_retries = agent._api_max_retries + primary_recovery_attempted = False + max_compression_attempts = 3 + codex_auth_retry_attempted=False + anthropic_auth_retry_attempted=False + nous_auth_retry_attempted=False + copilot_auth_retry_attempted=False + thinking_sig_retry_attempted = False + image_shrink_retry_attempted = False + oauth_1m_beta_retry_attempted = False + llama_cpp_grammar_retry_attempted = False + has_retried_429 = False + restart_with_compressed_messages = False + restart_with_length_continuation = False + + finish_reason = "stop" + response = None # Guard against UnboundLocalError if all retries fail + api_kwargs = None # Guard against UnboundLocalError in except handler + + while retry_count < max_retries: + # ── Nous Portal rate limit guard ────────────────────── + # If another session already recorded that Nous is rate- + # limited, skip the API call entirely. Each attempt + # (including SDK-level retries) counts against RPH and + # deepens the rate limit hole. + if agent.provider == "nous": + try: + from agent.nous_rate_guard import ( + nous_rate_limit_remaining, + format_remaining as _fmt_nous_remaining, + ) + _nous_remaining = nous_rate_limit_remaining() + if _nous_remaining is not None and _nous_remaining > 0: + _nous_msg = ( + f"Nous Portal rate limit active β€” " + f"resets in {_fmt_nous_remaining(_nous_remaining)}." + ) + agent._vprint( + f"{agent.log_prefix}⏳ {_nous_msg} Trying fallback...", + force=True, + ) + agent._emit_status(f"⏳ {_nous_msg}") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + # No fallback available β€” return with clear message + agent._persist_session(messages, conversation_history) + return { + "final_response": ( + f"⏳ {_nous_msg}\n\n" + "No fallback provider available. " + "Try again after the reset, or add a " + "fallback provider in config.yaml." + ), + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": _nous_msg, + } + except ImportError: + pass + except Exception: + pass # Never let rate guard break the agent loop + + try: + agent._reset_stream_delivery_tracking() + api_kwargs = agent._build_api_kwargs(api_messages) + if agent._force_ascii_payload: + _sanitize_structure_non_ascii(api_kwargs) + if agent.api_mode == "codex_responses": + api_kwargs = agent._get_transport().preflight_kwargs(api_kwargs, allow_stream=False) + + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _invoke_hook( + "pre_api_request", + task_id=effective_task_id, + session_id=agent.session_id or "", + platform=agent.platform or "", + model=agent.model, + provider=agent.provider, + base_url=agent.base_url, + api_mode=agent.api_mode, + api_call_count=api_call_count, + message_count=len(api_messages), + tool_count=len(agent.tools or []), + approx_input_tokens=approx_tokens, + request_char_count=total_chars, + max_tokens=agent.max_tokens, + ) + except Exception: + pass + + if env_var_enabled("HERMES_DUMP_REQUESTS"): + agent._dump_api_request_debug(api_kwargs, reason="preflight") + + # Always prefer the streaming path β€” even without stream + # consumers. Streaming gives us fine-grained health + # checking (90s stale-stream detection, 60s read timeout) + # that the non-streaming path lacks. Without this, + # subagents and other quiet-mode callers can hang + # indefinitely when the provider keeps the connection + # alive with SSE pings but never delivers a response. + # The streaming path is a no-op for callbacks when no + # consumers are registered, and falls back to non- + # streaming automatically if the provider doesn't + # support it. + def _stop_spinner(): + nonlocal thinking_spinner + if thinking_spinner: + thinking_spinner.stop("") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + _use_streaming = True + # Provider signaled "stream not supported" on a previous + # attempt β€” switch to non-streaming for the rest of this + # session instead of re-failing every retry. + if getattr(agent, "_disable_streaming", False): + _use_streaming = False + # CopilotACPClient communicates via subprocess stdio and + # returns a plain SimpleNamespace β€” not an iterable + # stream. Mirror the ACP exclusion used for Responses + # API upgrade (lines ~1083-1085). + elif ( + agent.provider == "copilot-acp" + or str(agent.base_url or "").lower().startswith("acp://copilot") + or str(agent.base_url or "").lower().startswith("acp+tcp://") + ): + _use_streaming = False + elif not agent._has_stream_consumers(): + # No display/TTS consumer. Still prefer streaming for + # health checking, but skip for Mock clients in tests + # (mocks return SimpleNamespace, not stream iterators). + from unittest.mock import Mock + if isinstance(getattr(agent, "client", None), Mock): + _use_streaming = False + + if _use_streaming: + response = agent._interruptible_streaming_api_call( + api_kwargs, on_first_delta=_stop_spinner + ) + else: + response = agent._interruptible_api_call(api_kwargs) + + api_duration = time.time() - api_start_time + + # Stop thinking spinner silently -- the response box or tool + # execution messages that follow are more informative. + if thinking_spinner: + thinking_spinner.stop("") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + if not agent.quiet_mode: + agent._vprint(f"{agent.log_prefix}⏱️ API call completed in {api_duration:.2f}s") + + if agent.verbose_logging: + # Log response with provider info if available + resp_model = getattr(response, 'model', 'N/A') if response else 'N/A' + logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") + + # Validate response shape before proceeding + response_invalid = False + error_details = [] + if agent.api_mode == "codex_responses": + _ct_v = agent._get_transport() + if not _ct_v.validate_response(response): + if response is None: + response_invalid = True + error_details.append("response is None") + else: + # Provider returned a terminal failure (e.g. quota exhaustion). + # Treat as invalid so the fallback chain is triggered instead of + # letting the error bubble up outside the retry/fallback loop. + _codex_resp_status = str(getattr(response, "status", "") or "").strip().lower() + if _codex_resp_status in {"failed", "cancelled"}: + _codex_error_obj = getattr(response, "error", None) + _codex_error_msg = ( + _codex_error_obj.get("message") if isinstance(_codex_error_obj, dict) + else str(_codex_error_obj) if _codex_error_obj + else f"Responses API returned status '{_codex_resp_status}'" + ) + logging.warning( + "Codex response status='%s' (error=%s). Routing to fallback. %s", + _codex_resp_status, _codex_error_msg, + agent._client_log_context(), + ) + response_invalid = True + error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}") + else: + # output_text fallback: stream backfill may have failed + # but normalize can still recover from output_text + _out_text = getattr(response, "output_text", None) + _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else "" + if _out_text_stripped: + logger.debug( + "Codex response.output is empty but output_text is present " + "(%d chars); deferring to normalization.", + len(_out_text_stripped), + ) + else: + _resp_status = getattr(response, "status", None) + _resp_incomplete = getattr(response, "incomplete_details", None) + logger.warning( + "Codex response.output is empty after stream backfill " + "(status=%s, incomplete_details=%s, model=%s). %s", + _resp_status, _resp_incomplete, + getattr(response, "model", None), + f"api_mode={agent.api_mode} provider={agent.provider}", + ) + response_invalid = True + error_details.append("response.output is empty") + elif agent.api_mode == "anthropic_messages": + _tv = agent._get_transport() + if not _tv.validate_response(response): + response_invalid = True + if response is None: + error_details.append("response is None") + else: + error_details.append("response.content invalid (not a non-empty list)") + elif agent.api_mode == "bedrock_converse": + _btv = agent._get_transport() + if not _btv.validate_response(response): + response_invalid = True + if response is None: + error_details.append("response is None") + else: + error_details.append("Bedrock response invalid (no output or choices)") + else: + _ctv = agent._get_transport() + if not _ctv.validate_response(response): + response_invalid = True + if response is None: + error_details.append("response is None") + elif not hasattr(response, 'choices'): + error_details.append("response has no 'choices' attribute") + elif response.choices is None: + error_details.append("response.choices is None") + else: + error_details.append("response.choices is empty") + + if response_invalid: + # Stop spinner before printing error messages + if thinking_spinner: + thinking_spinner.stop("(Β΄;Ο‰;`) oops, retrying...") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + # Invalid response β€” could be rate limiting, provider timeout, + # upstream server error, or malformed response. + retry_count += 1 + + # Eager fallback: empty/malformed responses are a common + # rate-limit symptom. Switch to fallback immediately + # rather than retrying with extended backoff. + if agent._fallback_index < len(agent._fallback_chain): + agent._emit_status("⚠️ Empty/malformed response β€” switching to fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + + # Check for error field in response (some providers include this) + error_msg = "Unknown" + provider_name = "Unknown" + if response and hasattr(response, 'error') and response.error: + error_msg = str(response.error) + # Try to extract provider from error metadata + if hasattr(response.error, 'metadata') and response.error.metadata: + provider_name = response.error.metadata.get('provider_name', 'Unknown') + elif response and hasattr(response, 'message') and response.message: + error_msg = str(response.message) + + # Try to get provider from model field (OpenRouter often returns actual model used) + if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model: + provider_name = f"model={response.model}" + + # Check for x-openrouter-provider or similar metadata + if provider_name == "Unknown" and response: + # Log all response attributes for debugging + resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')} + if agent.verbose_logging: + logging.debug(f"Response attributes for invalid response: {resp_attrs}") + + # Extract error code from response for contextual diagnostics + _resp_error_code = None + if response and hasattr(response, 'error') and response.error: + _code_raw = getattr(response.error, 'code', None) + if _code_raw is None and isinstance(response.error, dict): + _code_raw = response.error.get('code') + if _code_raw is not None: + try: + _resp_error_code = int(_code_raw) + except (TypeError, ValueError): + pass + + # Build a human-readable failure hint from the error code + # and response time, instead of always assuming rate limiting. + if _resp_error_code == 524: + _failure_hint = f"upstream provider timed out (Cloudflare 524, {api_duration:.0f}s)" + elif _resp_error_code == 504: + _failure_hint = f"upstream gateway timeout (504, {api_duration:.0f}s)" + elif _resp_error_code == 429: + _failure_hint = f"rate limited by upstream provider (429)" + elif _resp_error_code in {500, 502}: + _failure_hint = f"upstream server error ({_resp_error_code}, {api_duration:.0f}s)" + elif _resp_error_code in {503, 529}: + _failure_hint = f"upstream provider overloaded ({_resp_error_code})" + elif _resp_error_code is not None: + _failure_hint = f"upstream error (code {_resp_error_code}, {api_duration:.0f}s)" + elif api_duration < 10: + _failure_hint = f"fast response ({api_duration:.1f}s) β€” likely rate limited" + elif api_duration > 60: + _failure_hint = f"slow response ({api_duration:.0f}s) β€” likely upstream timeout" + else: + _failure_hint = f"response time {api_duration:.1f}s" + + agent._vprint(f"{agent.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True) + agent._vprint(f"{agent.log_prefix} 🏒 Provider: {provider_name}", force=True) + cleaned_provider_error = agent._clean_error_message(error_msg) + agent._vprint(f"{agent.log_prefix} πŸ“ Provider message: {cleaned_provider_error}", force=True) + agent._vprint(f"{agent.log_prefix} ⏱️ {_failure_hint}", force=True) + + if retry_count >= max_retries: + # Try fallback before giving up + agent._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses β€” trying fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + agent._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.") + logging.error(f"{agent.log_prefix}Invalid API response after {max_retries} retries.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Invalid API response after {max_retries} retries: {_failure_hint}", + "failed": True # Mark as failure for filtering + } + + # Backoff before retry β€” jittered exponential: 5s base, 120s cap + wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0) + agent._vprint(f"{agent.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True) + logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") + + # Sleep in small increments to stay responsive to interrupts + sleep_end = time.time() + wait_time + _backoff_touch_counter = 0 + while time.time() < sleep_end: + if agent._interrupt_requested: + agent._vprint(f"{agent.log_prefix}⚑ Interrupt detected during retry wait, aborting.", force=True) + agent._persist_session(messages, conversation_history) + agent.clear_interrupt() + return { + "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).", + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "interrupted": True, + } + time.sleep(0.2) + # Touch activity every ~30s so the gateway's inactivity + # monitor knows we're alive during backoff waits. + _backoff_touch_counter += 1 + if _backoff_touch_counter % 150 == 0: # 150 Γ— 0.2s = 30s + agent._touch_activity( + f"retry backoff ({retry_count}/{max_retries}), " + f"{int(sleep_end - time.time())}s remaining" + ) + continue # Retry the API call + + # Check finish_reason before proceeding + if agent.api_mode == "codex_responses": + status = getattr(response, "status", None) + incomplete_details = getattr(response, "incomplete_details", None) + incomplete_reason = None + if isinstance(incomplete_details, dict): + incomplete_reason = incomplete_details.get("reason") + else: + incomplete_reason = getattr(incomplete_details, "reason", None) + if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}: + finish_reason = "length" + else: + finish_reason = "stop" + elif agent.api_mode == "anthropic_messages": + _tfr = agent._get_transport() + finish_reason = _tfr.map_finish_reason(response.stop_reason) + elif agent.api_mode == "bedrock_converse": + # Bedrock response already normalized at dispatch β€” use transport + _bt_fr = agent._get_transport() + _bedrock_result = _bt_fr.normalize_response(response) + finish_reason = _bedrock_result.finish_reason + else: + _cc_fr = agent._get_transport() + _finish_result = _cc_fr.normalize_response(response) + finish_reason = _finish_result.finish_reason + assistant_message = _finish_result + if agent._should_treat_stop_as_truncated( + finish_reason, + assistant_message, + messages, + ): + agent._vprint( + f"{agent.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated", + force=True, + ) + finish_reason = "length" + + if finish_reason == "length": + agent._vprint(f"{agent.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True) + + # Normalize the truncated response to a single OpenAI-style + # message shape so text-continuation and tool-call retry + # work uniformly across chat_completions, bedrock_converse, + # and anthropic_messages. For Anthropic we use the same + # adapter the agent loop already relies on so the rebuilt + # interim assistant message is byte-identical to what + # would have been appended in the non-truncated path. + _trunc_msg = None + _trunc_transport = agent._get_transport() + if agent.api_mode == "anthropic_messages": + _trunc_result = _trunc_transport.normalize_response( + response, strip_tool_prefix=agent._is_anthropic_oauth + ) + else: + _trunc_result = _trunc_transport.normalize_response(response) + _trunc_msg = _trunc_result + + _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None + _trunc_has_tool_calls = bool(getattr(_trunc_msg, "tool_calls", None)) if _trunc_msg else False + + # ── Detect thinking-budget exhaustion ────────────── + # When the model spends ALL output tokens on reasoning + # and has none left for the response, continuation + # retries are pointless. Detect this early and give a + # targeted error instead of wasting 3 API calls. + # A response is "thinking exhausted" only when the model + # actually produced reasoning blocks but no visible text after + # them. Models that do not use tags (e.g. GLM-4.7 on + # NVIDIA Build, minimax) may return content=None or an empty + # string for unrelated reasons β€” treat those as normal + # truncations that deserve continuation retries, not as + # thinking-budget exhaustion. + _has_think_tags = bool( + _trunc_content and re.search( + r'<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)[^>]*>', + _trunc_content, + re.IGNORECASE, + ) + ) + _thinking_exhausted = ( + not _trunc_has_tool_calls + and _has_think_tags + and ( + (_trunc_content is not None and not agent._has_content_after_think_block(_trunc_content)) + or _trunc_content is None + ) + ) + + if _thinking_exhausted: + _exhaust_error = ( + "Model used all output tokens on reasoning with none left " + "for the response. Try lowering reasoning effort or " + "increasing max_tokens." + ) + agent._vprint( + f"{agent.log_prefix}πŸ’­ Reasoning exhausted the output token budget β€” " + f"no visible response was produced.", + force=True, + ) + # Return a user-friendly message as the response so + # CLI (response box) and gateway (chat message) both + # display it naturally instead of a suppressed error. + _exhaust_response = ( + "⚠️ **Thinking Budget Exhausted**\n\n" + "The model used all its output tokens on reasoning " + "and had none left for the actual response.\n\n" + "To fix this:\n" + "β†’ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n" + "β†’ Or switch to a larger/non-reasoning model with `/model`" + ) + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": _exhaust_response, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": _exhaust_error, + } + + if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: + assistant_message = _trunc_msg + if assistant_message is not None and not _trunc_has_tool_calls: + length_continue_retries += 1 + interim_msg = agent._build_assistant_message(assistant_message, finish_reason) + messages.append(interim_msg) + if assistant_message.content: + truncated_response_prefix += assistant_message.content + + if length_continue_retries < 3: + agent._vprint( + f"{agent.log_prefix}↻ Requesting continuation " + f"({length_continue_retries}/3)..." + ) + continue_msg = { + "role": "user", + "content": ( + "[System: Your previous response was truncated by the output " + "length limit. Continue exactly where you left off. Do not " + "restart or repeat prior text. Finish the answer directly.]" + ), + } + messages.append(continue_msg) + agent._session_messages = messages + agent._save_session_log(messages) + restart_with_length_continuation = True + break + + partial_response = agent._strip_think_blocks(truncated_response_prefix).strip() + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": partial_response or None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response remained truncated after 3 continuation attempts", + } + + if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: + assistant_message = _trunc_msg + if assistant_message is not None and _trunc_has_tool_calls: + if truncated_tool_call_retries < 1: + truncated_tool_call_retries += 1 + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call detected β€” retrying API call...", + force=True, + ) + # Don't append the broken response to messages; + # just re-run the same API call from the current + # message state, giving the model another chance. + continue + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call response detected again β€” refusing to execute incomplete tool arguments.", + force=True, + ) + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit", + } + + # If we have prior messages, roll back to last complete state + if len(messages) > 1: + agent._vprint(f"{agent.log_prefix} βͺ Rolling back to last complete assistant turn") + rolled_back_messages = agent._get_messages_up_to_last_assistant(messages) + + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + + return { + "final_response": None, + "messages": rolled_back_messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit" + } + else: + # First message was truncated - mark as failed + agent._vprint(f"{agent.log_prefix}❌ First response truncated - cannot recover", force=True) + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": "First response truncated due to output length limit" + } + + # Track actual token usage from response for context management + if hasattr(response, 'usage') and response.usage: + canonical_usage = normalize_usage( + response.usage, + provider=agent.provider, + api_mode=agent.api_mode, + ) + prompt_tokens = canonical_usage.prompt_tokens + completion_tokens = canonical_usage.output_tokens + total_tokens = canonical_usage.total_tokens + usage_dict = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + } + agent.context_compressor.update_from_response(usage_dict) + + # Cache discovered context length after successful call. + # Only persist limits confirmed by the provider (parsed + # from the error message), not guessed probe tiers. + if getattr(agent.context_compressor, "_context_probed", False): + ctx = agent.context_compressor.context_length + if getattr(agent.context_compressor, "_context_probe_persistable", False): + save_context_length(agent.model, agent.base_url, ctx) + agent._safe_print(f"{agent.log_prefix}πŸ’Ύ Cached context length: {ctx:,} tokens for {agent.model}") + agent.context_compressor._context_probed = False + agent.context_compressor._context_probe_persistable = False + + agent.session_prompt_tokens += prompt_tokens + agent.session_completion_tokens += completion_tokens + agent.session_total_tokens += total_tokens + agent.session_api_calls += 1 + agent.session_input_tokens += canonical_usage.input_tokens + agent.session_output_tokens += canonical_usage.output_tokens + agent.session_cache_read_tokens += canonical_usage.cache_read_tokens + agent.session_cache_write_tokens += canonical_usage.cache_write_tokens + agent.session_reasoning_tokens += canonical_usage.reasoning_tokens + + # Log API call details for debugging/observability + _cache_pct = "" + if canonical_usage.cache_read_tokens and prompt_tokens: + _cache_pct = f" cache={canonical_usage.cache_read_tokens}/{prompt_tokens} ({100*canonical_usage.cache_read_tokens/prompt_tokens:.0f}%)" + logger.info( + "API call #%d: model=%s provider=%s in=%d out=%d total=%d latency=%.1fs%s", + agent.session_api_calls, agent.model, agent.provider or "unknown", + prompt_tokens, completion_tokens, total_tokens, + api_duration, _cache_pct, + ) + + cost_result = estimate_usage_cost( + agent.model, + canonical_usage, + provider=agent.provider, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + ) + if cost_result.amount_usd is not None: + agent.session_estimated_cost_usd += float(cost_result.amount_usd) + agent.session_cost_status = cost_result.status + agent.session_cost_source = cost_result.source + + # Persist token counts to session DB for /insights. + # Do this for every platform with a session_id so non-CLI + # sessions (gateway, cron, delegated runs) cannot lose + # token/accounting data if a higher-level persistence path + # is skipped or fails. Gateway/session-store writes use + # absolute totals, so they safely overwrite these per-call + # deltas instead of double-counting them. + if agent._session_db and agent.session_id: + try: + # Ensure the session row exists before attempting UPDATE. + # Under concurrent load (cron/kanban), the initial + # _ensure_db_session() may have failed due to SQLite + # locking. Retry here so per-call token deltas are + # not silently lost (UPDATE on a non-existent row + # affects 0 rows without error). + if not agent._session_db_created: + agent._ensure_db_session() + agent._session_db.update_token_counts( + agent.session_id, + input_tokens=canonical_usage.input_tokens, + output_tokens=canonical_usage.output_tokens, + cache_read_tokens=canonical_usage.cache_read_tokens, + cache_write_tokens=canonical_usage.cache_write_tokens, + reasoning_tokens=canonical_usage.reasoning_tokens, + estimated_cost_usd=float(cost_result.amount_usd) + if cost_result.amount_usd is not None else None, + cost_status=cost_result.status, + cost_source=cost_result.source, + billing_provider=agent.provider, + billing_base_url=agent.base_url, + billing_mode="subscription_included" + if cost_result.status == "included" else None, + model=agent.model, + api_call_count=1, + ) + except Exception as e: + # Log token persistence failures so they're + # visible in agent.log β€” silent loss here is + # the root cause of undercounted analytics. + logger.debug( + "Token persistence failed (session=%s, tokens=%d): %s", + agent.session_id, total_tokens, e, + ) + + if agent.verbose_logging: + logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}") + + # Surface cache hit stats for any provider that reports + # them β€” not just those where we inject cache_control + # markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic + # server-side prefix caching and return + # ``prompt_tokens_details.cached_tokens``; users + # previously could not see their cache % because this + # line was gated on ``_use_prompt_caching``, which is + # only True for Anthropic-style marker injection. + # ``canonical_usage`` is already normalised from all + # three API shapes (Anthropic / Codex / OpenAI-chat) + # so we can rely on its values directly. + cached = canonical_usage.cache_read_tokens + written = canonical_usage.cache_write_tokens + prompt = usage_dict["prompt_tokens"] + if (cached or written) and not agent.quiet_mode: + hit_pct = (cached / prompt * 100) if prompt > 0 else 0 + agent._vprint( + f"{agent.log_prefix} πŸ’Ύ Cache: " + f"{cached:,}/{prompt:,} tokens " + f"({hit_pct:.0f}% hit, {written:,} written)" + ) + + has_retried_429 = False # Reset on success + # Clear Nous rate limit state on successful request β€” + # proves the limit has reset and other sessions can + # resume hitting Nous. + if agent.provider == "nous": + try: + from agent.nous_rate_guard import clear_nous_rate_limit + clear_nous_rate_limit() + except Exception: + pass + agent._touch_activity(f"API call #{api_call_count} completed") + break # Success, exit retry loop + + except InterruptedError: + if thinking_spinner: + thinking_spinner.stop("") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + api_elapsed = time.time() - api_start_time + agent._vprint(f"{agent.log_prefix}⚑ Interrupted during API call.", force=True) + agent._persist_session(messages, conversation_history) + interrupted = True + final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)." + break + + except Exception as api_error: + # Stop spinner before printing error messages + if thinking_spinner: + thinking_spinner.stop("(β•₯_β•₯) error, retrying...") + thinking_spinner = None + if agent.thinking_callback: + agent.thinking_callback("") + + # ----------------------------------------------------------- + # UnicodeEncodeError recovery. Two common causes: + # 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste + # (Google Docs, rich-text editors) β€” sanitize and retry. + # 2. ASCII codec on systems with LANG=C or non-UTF-8 locale + # (e.g. Chromebooks) β€” any non-ASCII character fails. + # Detect via the error message mentioning 'ascii' codec. + # We sanitize messages in-place and may retry twice: + # first to strip surrogates, then once more for pure + # ASCII-only locale sanitization if needed. + # ----------------------------------------------------------- + if isinstance(api_error, UnicodeEncodeError) and getattr(agent, '_unicode_sanitization_passes', 0) < 2: + _err_str = str(api_error).lower() + _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str + # Detect surrogate errors β€” utf-8 codec refusing to + # encode U+D800..U+DFFF. The error text is: + # "'utf-8' codec can't encode characters in position + # N-M: surrogates not allowed" + _is_surrogate_error = ( + "surrogate" in _err_str + or ("'utf-8'" in _err_str and not _is_ascii_codec) + ) + # Sanitize surrogates from both the canonical `messages` + # list AND `api_messages` (the API-copy, which may carry + # `reasoning_content`/`reasoning_details` transformed + # from `reasoning` β€” fields the canonical list doesn't + # have directly). Also clean `api_kwargs` if built and + # `prefill_messages` if present. Mirrors the ASCII + # codec recovery below. + _surrogates_found = _sanitize_messages_surrogates(messages) + if isinstance(api_messages, list): + if _sanitize_messages_surrogates(api_messages): + _surrogates_found = True + if isinstance(api_kwargs, dict): + if _sanitize_structure_surrogates(api_kwargs): + _surrogates_found = True + if isinstance(getattr(agent, "prefill_messages", None), list): + if _sanitize_messages_surrogates(agent.prefill_messages): + _surrogates_found = True + # Gate the retry on the error type, not on whether we + # found anything β€” _force_ascii_payload / the extended + # surrogate walker above cover all known paths, but a + # new transformed field could still slip through. If + # the error was a surrogate encode failure, always let + # the retry run; the proactive sanitizer at line ~8781 + # runs again on the next iteration. Bounded by + # _unicode_sanitization_passes < 2 (outer guard). + if _surrogates_found or _is_surrogate_error: + agent._unicode_sanitization_passes += 1 + if _surrogates_found: + agent._vprint( + f"{agent.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...", + force=True, + ) + else: + agent._vprint( + f"{agent.log_prefix}⚠️ Surrogate encoding error β€” retrying after full-payload sanitization...", + force=True, + ) + continue + if _is_ascii_codec: + agent._force_ascii_payload = True + # ASCII codec: the system encoding can't handle + # non-ASCII characters at all. Sanitize all + # non-ASCII content from messages/tool schemas and retry. + # Sanitize both the canonical `messages` list and + # `api_messages` (the API-copy built before the retry + # loop, which may contain extra fields like + # reasoning_content that are not in `messages`). + _messages_sanitized = _sanitize_messages_non_ascii(messages) + if isinstance(api_messages, list): + _sanitize_messages_non_ascii(api_messages) + # Also sanitize the last api_kwargs if already built, + # so a leftover non-ASCII value in a transformed field + # (e.g. extra_body, reasoning_content) doesn't survive + # into the next attempt via _build_api_kwargs cache paths. + if isinstance(api_kwargs, dict): + _sanitize_structure_non_ascii(api_kwargs) + _prefill_sanitized = False + if isinstance(getattr(agent, "prefill_messages", None), list): + _prefill_sanitized = _sanitize_messages_non_ascii(agent.prefill_messages) + + _tools_sanitized = False + if isinstance(getattr(agent, "tools", None), list): + _tools_sanitized = _sanitize_tools_non_ascii(agent.tools) + + _system_sanitized = False + if isinstance(active_system_prompt, str): + _sanitized_system = _strip_non_ascii(active_system_prompt) + if _sanitized_system != active_system_prompt: + active_system_prompt = _sanitized_system + agent._cached_system_prompt = _sanitized_system + _system_sanitized = True + if isinstance(getattr(agent, "ephemeral_system_prompt", None), str): + _sanitized_ephemeral = _strip_non_ascii(agent.ephemeral_system_prompt) + if _sanitized_ephemeral != agent.ephemeral_system_prompt: + agent.ephemeral_system_prompt = _sanitized_ephemeral + _system_sanitized = True + + _headers_sanitized = False + _default_headers = ( + agent._client_kwargs.get("default_headers") + if isinstance(getattr(agent, "_client_kwargs", None), dict) + else None + ) + if isinstance(_default_headers, dict): + _headers_sanitized = _sanitize_structure_non_ascii(_default_headers) + + # Sanitize the API key β€” non-ASCII characters in + # credentials (e.g. Κ‹ instead of v from a bad + # copy-paste) cause httpx to fail when encoding + # the Authorization header as ASCII. This is the + # most common cause of persistent UnicodeEncodeError + # that survives message/tool sanitization (#6843). + _credential_sanitized = False + _raw_key = getattr(agent, "api_key", None) or "" + if _raw_key: + _clean_key = _strip_non_ascii(_raw_key) + if _clean_key != _raw_key: + agent.api_key = _clean_key + if isinstance(getattr(agent, "_client_kwargs", None), dict): + agent._client_kwargs["api_key"] = _clean_key + # Also update the live client β€” it holds its + # own copy of api_key which auth_headers reads + # dynamically on every request. + if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"): + agent.client.api_key = _clean_key + _credential_sanitized = True + agent._vprint( + f"{agent.log_prefix}⚠️ API key contained non-ASCII characters " + f"(bad copy-paste?) β€” stripped them. If auth fails, " + f"re-copy the key from your provider's dashboard.", + force=True, + ) + + # Always retry on ASCII codec detection β€” + # _force_ascii_payload guarantees the full + # api_kwargs payload is sanitized on the + # next iteration (line ~8475). Even when + # per-component checks above find nothing + # (e.g. non-ASCII only in api_messages' + # reasoning_content), the flag catches it. + # Bounded by _unicode_sanitization_passes < 2. + agent._unicode_sanitization_passes += 1 + _any_sanitized = ( + _messages_sanitized + or _prefill_sanitized + or _tools_sanitized + or _system_sanitized + or _headers_sanitized + or _credential_sanitized + ) + if _any_sanitized: + agent._vprint( + f"{agent.log_prefix}⚠️ System encoding is ASCII β€” stripped non-ASCII characters from request payload. Retrying...", + force=True, + ) + else: + agent._vprint( + f"{agent.log_prefix}⚠️ System encoding is ASCII β€” enabling full-payload sanitization for retry...", + force=True, + ) + continue + + # ── Image-rejection recovery ────────────────────────────── + # Some providers (mlx-lm, text-only endpoints, text-only + # fallbacks on multimodal models) reject any message that + # contains image_url content with a 4xx error like + # "Only 'text' content type is supported." On first hit, + # strip all images from the message list, mark the session + # as vision-unsupported, and retry with text only. + # + # Detection is best-effort English phrase matching β€” a + # locale-translated or heavily-reworded upstream error + # will bypass this guard and fall through to the normal + # error handler. Expand the phrase list when new + # provider wordings are observed in the wild. + _err_body = "" + try: + _err_body = str(getattr(api_error, "body", None) or + getattr(api_error, "message", None) or + str(api_error)) + except Exception: + pass + _err_status = getattr(api_error, "status_code", None) + _IMAGE_REJECTION_PHRASES = ( + "only 'text' content type is supported", + "only text content type is supported", + "image_url is not supported", + "image content is not supported", + "multimodal is not supported", + "multimodal content is not supported", + "multimodal input is not supported", + "vision is not supported", + "vision input is not supported", + "does not support images", + "does not support image input", + "does not support multimodal", + "does not support vision", + "model does not support image", + # ChatGPT-account Codex backend + # (https://chatgpt.com/backend-api/codex) rejects + # data:image/...base64 URLs in input_image fields + # with HTTP 400 "Invalid 'input[N].content[K].image_url'. + # Expected a valid URL, but got a value with an + # invalid format." The OpenAI Responses API on the + # public endpoint accepts data URLs, but the + # ChatGPT-account variant does not. Without this + # phrase the agent cascaded into compression / + # context-too-large recovery instead of just + # stripping the images. Match is narrow on + # purpose β€” keyed on the field-path apostrophe so + # we don't false-trip on other URL validation + # errors. (issue #23570) + "image_url'. expected", + # DeepSeek's OpenAI-compatible API reports text-only + # request-body variants as: + # "unknown variant `image_url`, expected `text`". + "unknown variant `image_url`, expected `text`", + "unknown variant image_url, expected text", + ) + _err_lower = _err_body.lower() + _looks_like_image_rejection = any( + p in _err_lower for p in _IMAGE_REJECTION_PHRASES + ) + # 4xx-only gate: never interpret 5xx/timeout as "server + # said no to images" β€” those are transient and must + # route to the normal retry path. + _status_ok = _err_status is None or (400 <= int(_err_status) < 500) + if ( + getattr(agent, "_vision_supported", True) + and _looks_like_image_rejection + and _status_ok + ): + agent._vision_supported = False + _imgs_removed = _strip_images_from_messages(messages) + if isinstance(api_messages, list): + _strip_images_from_messages(api_messages) + agent._vprint( + f"{agent.log_prefix}⚠️ Server rejected image content β€” " + f"switching to text-only mode for this session" + + (". Stripped images from history and retrying." if _imgs_removed else "."), + force=True, + ) + continue + + status_code = getattr(api_error, "status_code", None) + error_context = agent._extract_api_error_context(api_error) + + # ── Classify the error for structured recovery decisions ── + _compressor = getattr(agent, "context_compressor", None) + _ctx_len = getattr(_compressor, "context_length", 200000) if _compressor else 200000 + classified = classify_api_error( + api_error, + provider=getattr(agent, "provider", "") or "", + model=getattr(agent, "model", "") or "", + approx_tokens=approx_tokens, + context_length=_ctx_len, + num_messages=len(api_messages) if api_messages else 0, + ) + logger.debug( + "Error classified: reason=%s status=%s retryable=%s compress=%s rotate=%s fallback=%s", + classified.reason.value, classified.status_code, + classified.retryable, classified.should_compress, + classified.should_rotate_credential, classified.should_fallback, + ) + + recovered_with_pool, has_retried_429 = agent._recover_with_credential_pool( + status_code=status_code, + has_retried_429=has_retried_429, + classified_reason=classified.reason, + error_context=error_context, + ) + if recovered_with_pool: + continue + + # Image-too-large recovery: shrink oversized native image + # parts in-place and retry once. Triggered by Anthropic's + # per-image 5 MB ceiling (400 with "image exceeds 5 MB + # maximum") or any other provider that complains about + # image size. If shrink fails or a second attempt still + # fails, fall through to normal error handling. + if ( + classified.reason == FailoverReason.image_too_large + and not image_shrink_retry_attempted + ): + image_shrink_retry_attempted = True + if agent._try_shrink_image_parts_in_messages(api_messages): + agent._vprint( + f"{agent.log_prefix}πŸ“ Image(s) exceeded provider size limit β€” " + f"shrank and retrying...", + force=True, + ) + continue + else: + logger.info( + "image-shrink recovery: no data-URL image parts found " + "or shrink didn't reduce size; surfacing original error." + ) + + # Anthropic OAuth subscription rejected the 1M-context beta + # header ("long context beta is not yet available for this + # subscription"). Disable the beta for the rest of this + # session, rebuild the client, and retry once. 1M-capable + # subscriptions never hit this branch β€” they accept the + # beta and keep full 1M context. See PR #17680 for the + # original report (we chose reactive recovery over the + # proposed unconditional omit so capable subscriptions + # don't silently lose the capability). + if ( + classified.reason == FailoverReason.oauth_long_context_beta_forbidden + and agent.api_mode == "anthropic_messages" + and agent._is_anthropic_oauth + and not oauth_1m_beta_retry_attempted + ): + oauth_1m_beta_retry_attempted = True + if not getattr(agent, "_oauth_1m_beta_disabled", False): + agent._oauth_1m_beta_disabled = True + try: + agent._anthropic_client.close() + except Exception: + pass + agent._rebuild_anthropic_client() + agent._vprint( + f"{agent.log_prefix}πŸ”• OAuth subscription doesn't support " + f"the 1M-context beta β€” disabled for this session and retrying...", + force=True, + ) + continue + + if ( + agent.api_mode == "codex_responses" + and agent.provider == "openai-codex" + and status_code == 401 + and not codex_auth_retry_attempted + ): + codex_auth_retry_attempted = True + if agent._try_refresh_codex_client_credentials(force=True): + agent._vprint(f"{agent.log_prefix}πŸ” Codex auth refreshed after 401. Retrying request...") + continue + if ( + agent.api_mode == "chat_completions" + and agent.provider == "nous" + and status_code == 401 + and not nous_auth_retry_attempted + ): + nous_auth_retry_attempted = True + if agent._try_refresh_nous_client_credentials(force=True): + print(f"{agent.log_prefix}πŸ” Nous agent key refreshed after 401. Retrying request...") + continue + # Credential refresh didn't help β€” show diagnostic info. + # Most common causes: Portal OAuth expired/revoked, + # account out of credits, or agent key blocked. + from hermes_constants import display_hermes_home as _dhh_fn + _dhh = _dhh_fn() + _body_text = "" + try: + _body = getattr(api_error, "body", None) or getattr(api_error, "response", None) + if _body is not None: + _body_text = str(_body)[:200] + except Exception: + pass + print(f"{agent.log_prefix}πŸ” Nous 401 β€” Portal authentication failed.") + if _body_text: + print(f"{agent.log_prefix} Response: {_body_text}") + print(f"{agent.log_prefix} Most likely: Portal OAuth expired, account out of credits, or agent key revoked.") + print(f"{agent.log_prefix} Troubleshooting:") + print(f"{agent.log_prefix} β€’ Re-authenticate: hermes login --provider nous") + print(f"{agent.log_prefix} β€’ Check credits / billing: https://portal.nousresearch.com") + print(f"{agent.log_prefix} β€’ Verify stored credentials: {_dhh}/auth.json") + print(f"{agent.log_prefix} β€’ Switch providers temporarily: /model --provider openrouter") + if ( + agent.provider == "copilot" + and status_code == 401 + and not copilot_auth_retry_attempted + ): + copilot_auth_retry_attempted = True + if agent._try_refresh_copilot_client_credentials(): + agent._vprint(f"{agent.log_prefix}πŸ” Copilot credentials refreshed after 401. Retrying request...") + continue + if ( + agent.api_mode == "anthropic_messages" + and status_code == 401 + and hasattr(agent, '_anthropic_api_key') + and not anthropic_auth_retry_attempted + ): + anthropic_auth_retry_attempted = True + from agent.anthropic_adapter import _is_oauth_token + if agent._try_refresh_anthropic_client_credentials(): + print(f"{agent.log_prefix}πŸ” Anthropic credentials refreshed after 401. Retrying request...") + continue + # Credential refresh didn't help β€” show diagnostic info + key = agent._anthropic_api_key + auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)" + print(f"{agent.log_prefix}πŸ” Anthropic 401 β€” authentication failed.") + print(f"{agent.log_prefix} Auth method: {auth_method}") + print(f"{agent.log_prefix} Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{agent.log_prefix} Token: (empty or short)") + print(f"{agent.log_prefix} Troubleshooting:") + from hermes_constants import display_hermes_home as _dhh_fn + _dhh = _dhh_fn() + print(f"{agent.log_prefix} β€’ Check ANTHROPIC_TOKEN in {_dhh}/.env for Hermes-managed OAuth/setup tokens") + print(f"{agent.log_prefix} β€’ Check ANTHROPIC_API_KEY in {_dhh}/.env for API keys or legacy token values") + print(f"{agent.log_prefix} β€’ For API keys: verify at https://platform.claude.com/settings/keys") + print(f"{agent.log_prefix} β€’ For Claude Code: run 'claude /login' to refresh, then retry") + print(f"{agent.log_prefix} β€’ Legacy cleanup: hermes config set ANTHROPIC_TOKEN \"\"") + print(f"{agent.log_prefix} β€’ Clear stale keys: hermes config set ANTHROPIC_API_KEY \"\"") + + # ── Thinking block signature recovery ───────────────── + # Anthropic signs thinking blocks against the full turn + # content. Any upstream mutation (context compression, + # session truncation, message merging) invalidates the + # signature β†’ HTTP 400. Recovery: strip reasoning_details + # from all messages so the next retry sends no thinking + # blocks at all. One-shot β€” don't retry infinitely. + if ( + classified.reason == FailoverReason.thinking_signature + and not thinking_sig_retry_attempted + ): + thinking_sig_retry_attempted = True + for _m in messages: + if isinstance(_m, dict): + _m.pop("reasoning_details", None) + agent._vprint( + f"{agent.log_prefix}⚠️ Thinking block signature invalid β€” " + f"stripped all thinking blocks, retrying...", + force=True, + ) + logging.warning( + "%sThinking block signature recovery: stripped " + "reasoning_details from %d messages", + agent.log_prefix, len(messages), + ) + continue + + # ── llama.cpp grammar-parse recovery ────────────────── + # llama.cpp's ``json-schema-to-grammar`` converter rejects + # regex escape classes (``\d``, ``\w``, ``\s``) and most + # ``format`` values in tool schemas. MCP servers emit + # these routinely for date/phone/email params. Recovery: + # strip ``pattern``/``format`` from ``agent.tools`` and + # retry once. We keep the keywords by default so cloud + # providers get the full prompting hints; this branch + # fires only for users on llama.cpp's OAI server. + if ( + classified.reason == FailoverReason.llama_cpp_grammar_pattern + and not llama_cpp_grammar_retry_attempted + ): + llama_cpp_grammar_retry_attempted = True + try: + from tools.schema_sanitizer import strip_pattern_and_format + _, _stripped = strip_pattern_and_format(agent.tools) + except Exception as _strip_exc: # pragma: no cover β€” defensive + logging.warning( + "%sllama.cpp grammar recovery: strip helper failed: %s", + agent.log_prefix, _strip_exc, + ) + _stripped = 0 + if _stripped: + agent._vprint( + f"{agent.log_prefix}⚠️ llama.cpp rejected tool schema grammar β€” " + f"stripped {_stripped} pattern/format keyword(s), retrying...", + force=True, + ) + logging.warning( + "%sllama.cpp grammar recovery: stripped %d " + "pattern/format keyword(s) from tool schemas", + agent.log_prefix, _stripped, + ) + continue + # No keywords found to strip β€” fall through to normal + # retry path rather than loop forever on the same error. + logging.warning( + "%sllama.cpp grammar error but no pattern/format " + "keywords to strip β€” falling through to normal retry", + agent.log_prefix, + ) + + retry_count += 1 + elapsed_time = time.time() - api_start_time + agent._touch_activity( + f"API error recovery (attempt {retry_count}/{max_retries})" + ) + + error_type = type(api_error).__name__ + error_msg = str(api_error).lower() + _error_summary = agent._summarize_api_error(api_error) + logger.warning( + "API call failed (attempt %s/%s) error_type=%s %s summary=%s", + retry_count, + max_retries, + error_type, + agent._client_log_context(), + _error_summary, + ) + + _provider = getattr(agent, "provider", "unknown") + _base = getattr(agent, "base_url", "unknown") + _model = getattr(agent, "model", "unknown") + _status_code_str = f" [HTTP {status_code}]" if status_code else "" + agent._vprint(f"{agent.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True) + agent._vprint(f"{agent.log_prefix} πŸ”Œ Provider: {_provider} Model: {_model}", force=True) + agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True) + agent._vprint(f"{agent.log_prefix} πŸ“ Error: {_error_summary}", force=True) + if status_code and status_code < 500: + _err_body = getattr(api_error, "body", None) + _err_body_str = str(_err_body)[:300] if _err_body else None + if _err_body_str: + agent._vprint(f"{agent.log_prefix} πŸ“‹ Details: {_err_body_str}", force=True) + agent._vprint(f"{agent.log_prefix} ⏱️ Elapsed: {elapsed_time:.2f}s Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens") + + # Actionable hint for OpenRouter "no tool endpoints" error. + # This fires regardless of whether fallback succeeds β€” the + # user needs to know WHY their model failed so they can fix + # their provider routing, not just silently fall back. + if ( + agent._is_openrouter_url() + and "support tool use" in error_msg + ): + agent._vprint( + f"{agent.log_prefix} πŸ’‘ No OpenRouter providers for {_model} support tool calling with your current settings.", + force=True, + ) + if agent.providers_allowed: + agent._vprint( + f"{agent.log_prefix} Your provider_routing.only restriction is filtering out tool-capable providers.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Try removing the restriction or adding providers that support tools for this model.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Check which providers support tools: https://openrouter.ai/models/{_model}", + force=True, + ) + + # Check for interrupt before deciding to retry + if agent._interrupt_requested: + agent._vprint(f"{agent.log_prefix}⚑ Interrupt detected during error handling, aborting retries.", force=True) + agent._persist_session(messages, conversation_history) + agent.clear_interrupt() + return { + "final_response": f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))}).", + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "interrupted": True, + } + + # Check for 413 payload-too-large BEFORE generic 4xx handler. + # A 413 is a payload-size error β€” the correct response is to + # compress history and retry, not abort immediately. + status_code = getattr(api_error, "status_code", None) + + # ── Anthropic Sonnet long-context tier gate ─────────── + # Anthropic returns HTTP 429 "Extra usage is required for + # long context requests" when a Claude Max (or similar) + # subscription doesn't include the 1M-context tier. This + # is NOT a transient rate limit β€” retrying or switching + # credentials won't help. Reduce context to 200k (the + # standard tier) and compress. + if classified.reason == FailoverReason.long_context_tier: + _reduced_ctx = 200000 + compressor = agent.context_compressor + old_ctx = compressor.context_length + if old_ctx > _reduced_ctx: + compressor.update_model( + model=agent.model, + context_length=_reduced_ctx, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + provider=agent.provider, + ) + # Context probing flags β€” only set on built-in + # compressor (plugin engines manage their own). + if hasattr(compressor, "_context_probed"): + compressor._context_probed = True + # Don't persist β€” this is a subscription-tier + # limitation, not a model capability. If the + # user later enables extra usage the 1M limit + # should come back automatically. + compressor._context_probe_persistable = False + agent._vprint( + f"{agent.log_prefix}⚠️ Anthropic long-context tier " + f"requires extra usage β€” reducing context: " + f"{old_ctx:,} β†’ {_reduced_ctx:,} tokens", + force=True, + ) + + compression_attempts += 1 + if compression_attempts <= max_compression_attempts: + original_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, + approx_tokens=approx_tokens, + task_id=effective_task_id, + ) + # Compression created a new session β€” clear history + # so _flush_messages_to_session_db writes compressed + # messages to the new session, not skipping them. + conversation_history = None + if len(messages) < original_len or old_ctx > _reduced_ctx: + agent._emit_status( + f"πŸ—œοΈ Context reduced to {_reduced_ctx:,} tokens " + f"(was {old_ctx:,}), retrying..." + ) + time.sleep(2) + restart_with_compressed_messages = True + break + # Fall through to normal error handling if compression + # is exhausted or didn't help. + + # Eager fallback for rate-limit errors (429 or quota exhaustion). + # When a fallback model is configured, switch immediately instead + # of burning through retries with exponential backoff -- the + # primary provider won't recover within the retry window. + is_rate_limited = classified.reason in { + FailoverReason.rate_limit, + FailoverReason.billing, + } + if is_rate_limited and agent._fallback_index < len(agent._fallback_chain): + # Don't eagerly fallback if credential pool rotation may + # still recover. See _pool_may_recover_from_rate_limit + # for the single-credential-pool and CloudCode-quota + # exceptions. Fixes #11314 and #13636. + pool_may_recover = _pool_may_recover_from_rate_limit( + agent._credential_pool, + provider=agent.provider, + base_url=getattr(agent, "base_url", None), + ) + if not pool_may_recover: + agent._emit_status("⚠️ Rate limited β€” switching to fallback provider...") + if agent._try_activate_fallback(reason=classified.reason): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + + # ── Nous Portal: record rate limit & skip retries ───── + # When Nous returns a 429 that is a genuine account- + # level rate limit, record the reset time to a shared + # file so ALL sessions (cron, gateway, auxiliary) know + # not to pile on, then skip further retries -- each + # one burns another RPH request and deepens the hole. + # The retry loop's top-of-iteration guard will catch + # this on the next pass and try fallback or bail. + # + # IMPORTANT: Nous Portal multiplexes multiple upstream + # providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can + # also mean an UPSTREAM provider is out of capacity + # for one specific model -- transient, clears in + # seconds, nothing to do with the caller's quota. + # Tripping the cross-session breaker on that would + # block every Nous model for minutes. We use + # ``is_genuine_nous_rate_limit`` to tell the two + # apart via the 429's own x-ratelimit-* headers and + # the last-known-good state captured on the previous + # successful response. + if ( + is_rate_limited + and agent.provider == "nous" + and classified.reason == FailoverReason.rate_limit + and not recovered_with_pool + ): + _genuine_nous_rate_limit = False + try: + from agent.nous_rate_guard import ( + is_genuine_nous_rate_limit, + record_nous_rate_limit, + ) + _err_resp = getattr(api_error, "response", None) + _err_hdrs = ( + getattr(_err_resp, "headers", None) + if _err_resp else None + ) + _genuine_nous_rate_limit = is_genuine_nous_rate_limit( + headers=_err_hdrs, + last_known_state=agent._rate_limit_state, + ) + if _genuine_nous_rate_limit: + record_nous_rate_limit( + headers=_err_hdrs, + error_context=error_context, + ) + else: + logging.info( + "Nous 429 looks like upstream capacity " + "(no exhausted bucket in headers or " + "last-known state) -- not tripping " + "cross-session breaker." + ) + except Exception: + pass + if _genuine_nous_rate_limit: + # Skip straight to max_retries -- the + # top-of-loop guard will handle fallback or + # bail cleanly. + retry_count = max_retries + continue + # Upstream capacity 429: fall through to normal + # retry logic. A different model (or the same + # model a moment later) will typically succeed. + + is_payload_too_large = ( + classified.reason == FailoverReason.payload_too_large + ) + + if is_payload_too_large: + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True) + agent._vprint(f"{agent.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}413 compression failed after {max_compression_attempts} attempts.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + agent._emit_status(f"⚠️ Request payload too large (413) β€” compression attempt {compression_attempts}/{max_compression_attempts}...") + + original_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, approx_tokens=approx_tokens, + task_id=effective_task_id, + ) + # Compression created a new session β€” clear history + # so _flush_messages_to_session_db writes compressed + # messages to the new session, not skipping them. + conversation_history = None + + if len(messages) < original_len: + agent._emit_status(f"πŸ—œοΈ Compressed {original_len} β†’ {len(messages)} messages, retrying...") + time.sleep(2) # Brief pause between compression retries + restart_with_compressed_messages = True + break + else: + agent._vprint(f"{agent.log_prefix}❌ Payload too large and cannot compress further.", force=True) + agent._vprint(f"{agent.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}413 payload too large. Cannot compress further.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": "Request payload too large (413). Cannot compress further.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + + # Check for context-length errors BEFORE generic 4xx handler. + # The classifier detects context overflow from: explicit error + # messages, generic 400 + large session heuristic (#1630), and + # server disconnect + large session pattern (#2153). + is_context_length_error = ( + classified.reason == FailoverReason.context_overflow + ) + + if is_context_length_error: + compressor = agent.context_compressor + old_ctx = compressor.context_length + + # ── Distinguish two very different errors ─────────── + # 1. "Prompt too long": the INPUT exceeds the context window. + # Fix: reduce context_length + compress history. + # 2. "max_tokens too large": input is fine, but + # input_tokens + requested max_tokens > context_window. + # Fix: reduce max_tokens (the OUTPUT cap) for this call. + # Do NOT shrink context_length β€” the window is unchanged. + # + # Note: max_tokens = output token cap (one response). + # context_length = total window (input + output combined). + available_out = parse_available_output_tokens_from_error(error_msg) + if available_out is not None: + # Error is purely about the output cap being too large. + # Cap output to the available space and retry without + # touching context_length or triggering compression. + safe_out = max(1, available_out - 64) # small safety margin + agent._ephemeral_max_output_tokens = safe_out + agent._vprint( + f"{agent.log_prefix}⚠️ Output cap too large for current prompt β€” " + f"retrying with max_tokens={safe_out:,} " + f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})", + force=True, + ) + # Still count against compression_attempts so we don't + # loop forever if the error keeps recurring. + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) + agent._vprint(f"{agent.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + restart_with_compressed_messages = True + break + + # Error is about the INPUT being too large β€” reduce context_length. + # Try to parse the actual limit from the error message + parsed_limit = parse_context_limit_from_error(error_msg) + _provider_lower = (getattr(agent, "provider", "") or "").lower() + _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower() + is_minimax_provider = ( + _provider_lower in {"minimax", "minimax-cn"} + or _base_lower.startswith(( + "https://api.minimax.io/anthropic", + "https://api.minimaxi.com/anthropic", + )) + ) + minimax_delta_only_overflow = ( + is_minimax_provider + and parsed_limit is None + and "context window exceeds limit (" in error_msg + ) + if parsed_limit and parsed_limit < old_ctx: + new_ctx = parsed_limit + agent._vprint(f"{agent.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True) + elif minimax_delta_only_overflow: + new_ctx = old_ctx + agent._vprint( + f"{agent.log_prefix}Provider reported overflow amount only; " + f"keeping context_length at {old_ctx:,} tokens and compressing.", + force=True, + ) + else: + # Step down to the next probe tier + new_ctx = get_next_probe_tier(old_ctx) + + if new_ctx and new_ctx < old_ctx: + compressor.update_model( + model=agent.model, + context_length=new_ctx, + base_url=agent.base_url, + api_key=getattr(agent, "api_key", ""), + provider=agent.provider, + ) + # Context probing flags β€” only set on built-in + # compressor (plugin engines manage their own). + if hasattr(compressor, "_context_probed"): + compressor._context_probed = True + # Only persist limits parsed from the provider's + # error message (a real number). Guessed fallback + # tiers from get_next_probe_tier() should stay + # in-memory only β€” persisting them pollutes the + # cache with wrong values. + compressor._context_probe_persistable = bool( + parsed_limit and parsed_limit == new_ctx + ) + agent._vprint(f"{agent.log_prefix}⚠️ Context length exceeded β€” stepping down: {old_ctx:,} β†’ {new_ctx:,} tokens", force=True) + else: + agent._vprint(f"{agent.log_prefix}⚠️ Context length exceeded at minimum tier β€” attempting compression...", force=True) + + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) + agent._vprint(f"{agent.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + agent._emit_status(f"πŸ—œοΈ Context too large (~{approx_tokens:,} tokens) β€” compressing ({compression_attempts}/{max_compression_attempts})...") + + original_len = len(messages) + messages, active_system_prompt = agent._compress_context( + messages, system_message, approx_tokens=approx_tokens, + task_id=effective_task_id, + ) + # Compression created a new session β€” clear history + # so _flush_messages_to_session_db writes compressed + # messages to the new session, not skipping them. + conversation_history = None + + if len(messages) < original_len or new_ctx and new_ctx < old_ctx: + if len(messages) < original_len: + agent._emit_status(f"πŸ—œοΈ Compressed {original_len} β†’ {len(messages)} messages, retrying...") + time.sleep(2) # Brief pause between compression retries + restart_with_compressed_messages = True + break + else: + # Can't compress further and already at minimum tier + agent._vprint(f"{agent.log_prefix}❌ Context length exceeded and cannot compress further.", force=True) + agent._vprint(f"{agent.log_prefix} πŸ’‘ The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True) + logging.error(f"{agent.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.") + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.", + "partial": True, + "failed": True, + "compression_exhausted": True, + } + + # Check for non-retryable client errors. The classifier + # already accounts for 413, 429, 529 (transient), context + # overflow, and generic-400 heuristics. Local validation + # errors (ValueError, TypeError) are programming bugs. + # Exclude UnicodeEncodeError β€” it's a ValueError subclass + # but is handled separately by the surrogate sanitization + # path above. Exclude json.JSONDecodeError β€” also a + # ValueError subclass, but it indicates a transient + # provider/network failure (malformed response body, + # truncated stream, routing layer corruption), not a + # local programming bug, and should be retried (#14782). + is_local_validation_error = ( + isinstance(api_error, (ValueError, TypeError)) + and not isinstance( + api_error, (UnicodeEncodeError, json.JSONDecodeError) + ) + # ssl.SSLError (and its subclass SSLCertVerificationError) + # inherits from OSError *and* ValueError via Python MRO, + # so the isinstance(ValueError) check above would + # misclassify a TLS transport failure as a local + # programming bug and abort without retrying. Exclude + # ssl.SSLError explicitly so the error classifier's + # retryable=True mapping takes effect instead. + and not isinstance(api_error, ssl.SSLError) + ) + is_client_error = ( + is_local_validation_error + or ( + not classified.retryable + and not classified.should_compress + and classified.reason not in { + FailoverReason.rate_limit, + FailoverReason.billing, + FailoverReason.overloaded, + FailoverReason.context_overflow, + FailoverReason.payload_too_large, + FailoverReason.long_context_tier, + FailoverReason.thinking_signature, + } + ) + ) and not is_context_length_error + + if is_client_error: + # Try fallback before aborting β€” a different provider + # may not have the same issue (rate limit, auth, etc.) + agent._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) β€” trying fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + if api_kwargs is not None: + agent._dump_api_request_debug( + api_kwargs, reason="non_retryable_client_error", error=api_error, + ) + agent._emit_status( + f"❌ Non-retryable error (HTTP {status_code}): " + f"{agent._summarize_api_error(api_error)}" + ) + agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True) + agent._vprint(f"{agent.log_prefix} πŸ”Œ Provider: {_provider} Model: {_model}", force=True) + agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True) + # Actionable guidance for common auth errors + if classified.is_auth or classified.reason == FailoverReason.billing: + if _provider == "openai-codex" and status_code == 401: + agent._vprint(f"{agent.log_prefix} πŸ’‘ Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True) + agent._vprint(f"{agent.log_prefix} refreshed by another client (Codex CLI, VS Code). To fix:", force=True) + agent._vprint(f"{agent.log_prefix} 1. Run `codex` in your terminal to generate fresh tokens.", force=True) + agent._vprint(f"{agent.log_prefix} 2. Then run `hermes auth` to re-authenticate.", force=True) + else: + agent._vprint(f"{agent.log_prefix} πŸ’‘ Your API key was rejected by the provider. Check:", force=True) + agent._vprint(f"{agent.log_prefix} β€’ Is the key valid? Run: hermes setup", force=True) + agent._vprint(f"{agent.log_prefix} β€’ Does your account have access to {_model}?", force=True) + if base_url_host_matches(str(_base), "openrouter.ai"): + agent._vprint(f"{agent.log_prefix} β€’ Check credits: https://openrouter.ai/settings/credits", force=True) + else: + agent._vprint(f"{agent.log_prefix} πŸ’‘ This type of error won't be fixed by retrying.", force=True) + logging.error(f"{agent.log_prefix}Non-retryable client error: {api_error}") + # Skip session persistence when the error is likely + # context-overflow related (status 400 + large session). + # Persisting the failed user message would make the + # session even larger, causing the same failure on the + # next attempt. (#1630) + if status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80): + agent._vprint( + f"{agent.log_prefix}⚠️ Skipping session persistence " + f"for large failed session to prevent growth loop.", + force=True, + ) + else: + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": str(api_error), + } + + if retry_count >= max_retries: + # Before falling back, try rebuilding the primary + # client once for transient transport errors (stale + # connection pool, TCP reset). Only attempted once + # per API call block. + if not primary_recovery_attempted and agent._try_recover_primary_transport( + api_error, retry_count=retry_count, max_retries=max_retries, + ): + primary_recovery_attempted = True + retry_count = 0 + continue + # Try fallback before giving up entirely + agent._emit_status(f"⚠️ Max retries ({max_retries}) exhausted β€” trying fallback...") + if agent._try_activate_fallback(): + retry_count = 0 + compression_attempts = 0 + primary_recovery_attempted = False + continue + _final_summary = agent._summarize_api_error(api_error) + if is_rate_limited: + agent._emit_status(f"❌ Rate limited after {max_retries} retries β€” {_final_summary}") + else: + agent._emit_status(f"❌ API failed after {max_retries} retries β€” {_final_summary}") + agent._vprint(f"{agent.log_prefix} πŸ’€ Final error: {_final_summary}", force=True) + + # Detect SSE stream-drop pattern (e.g. "Network + # connection lost") and surface actionable guidance. + # This typically happens when the model generates a + # very large tool call (write_file with huge content) + # and the proxy/CDN drops the stream mid-response. + _is_stream_drop = ( + not getattr(api_error, "status_code", None) + and any(p in error_msg for p in ( + "connection lost", "connection reset", + "connection closed", "network connection", + "network error", "terminated", + )) + ) + if _is_stream_drop: + agent._vprint( + f"{agent.log_prefix} πŸ’‘ The provider's stream " + f"connection keeps dropping. This often happens " + f"when the model tries to write a very large " + f"file in a single tool call.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Try asking the model " + f"to use execute_code with Python's open() for " + f"large files, or to write the file in smaller " + f"sections.", + force=True, + ) + + logging.error( + "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s", + agent.log_prefix, max_retries, _final_summary, + _provider, _model, len(api_messages), f"{approx_tokens:,}", + ) + if api_kwargs is not None: + agent._dump_api_request_debug( + api_kwargs, reason="max_retries_exhausted", error=api_error, + ) + agent._persist_session(messages, conversation_history) + _final_response = f"API call failed after {max_retries} retries: {_final_summary}" + if _is_stream_drop: + _final_response += ( + "\n\nThe provider's stream connection keeps " + "dropping β€” this often happens when generating " + "very large tool call responses (e.g. write_file " + "with long content). Try asking me to use " + "execute_code with Python's open() for large " + "files, or to write in smaller sections." + ) + return { + "final_response": _final_response, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": _final_summary, + } + + # For rate limits, respect the Retry-After header if present + _retry_after = None + if is_rate_limited: + _resp_headers = getattr(getattr(api_error, "response", None), "headers", None) + if _resp_headers and hasattr(_resp_headers, "get"): + _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After") + if _ra_raw: + try: + _retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes + except (TypeError, ValueError): + pass + wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0) + if is_rate_limited: + agent._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...") + else: + agent._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...") + logger.warning( + "Retrying API call in %ss (attempt %s/%s) %s error=%s", + wait_time, + retry_count, + max_retries, + agent._client_log_context(), + api_error, + ) + # Sleep in small increments so we can respond to interrupts quickly + # instead of blocking the entire wait_time in one sleep() call + sleep_end = time.time() + wait_time + _backoff_touch_counter = 0 + while time.time() < sleep_end: + if agent._interrupt_requested: + agent._vprint(f"{agent.log_prefix}⚑ Interrupt detected during retry wait, aborting.", force=True) + agent._persist_session(messages, conversation_history) + agent.clear_interrupt() + return { + "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).", + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "interrupted": True, + } + time.sleep(0.2) # Check interrupt every 200ms + # Touch activity every ~30s so the gateway's inactivity + # monitor knows we're alive during backoff waits. + _backoff_touch_counter += 1 + if _backoff_touch_counter % 150 == 0: # 150 Γ— 0.2s = 30s + agent._touch_activity( + f"error retry backoff ({retry_count}/{max_retries}), " + f"{int(sleep_end - time.time())}s remaining" + ) + + # If the API call was interrupted, skip response processing + if interrupted: + _turn_exit_reason = "interrupted_during_api_call" + break + + if restart_with_compressed_messages: + api_call_count -= 1 + agent.iteration_budget.refund() + # Count compression restarts toward the retry limit to prevent + # infinite loops when compression reduces messages but not enough + # to fit the context window. + retry_count += 1 + restart_with_compressed_messages = False + continue + + if restart_with_length_continuation: + # Progressively boost the output token budget on each retry. + # Retry 1 β†’ 2Γ— base, retry 2 β†’ 3Γ— base, capped at 32 768. + # Applies to all providers via _ephemeral_max_output_tokens. + _boost_base = agent.max_tokens if agent.max_tokens else 4096 + _boost = _boost_base * (length_continue_retries + 1) + agent._ephemeral_max_output_tokens = min(_boost, 32768) + continue + + # Guard: if all retries exhausted without a successful response + # (e.g. repeated context-length errors that exhausted retry_count), + # the `response` variable is still None. Break out cleanly. + if response is None: + _turn_exit_reason = "all_retries_exhausted_no_response" + print(f"{agent.log_prefix}❌ All API retries exhausted with no successful response.") + agent._persist_session(messages, conversation_history) + break + + try: + _transport = agent._get_transport() + _normalize_kwargs = {} + if agent.api_mode == "anthropic_messages": + _normalize_kwargs["strip_tool_prefix"] = agent._is_anthropic_oauth + normalized = _transport.normalize_response(response, **_normalize_kwargs) + assistant_message = normalized + finish_reason = normalized.finish_reason + + # Normalize content to string β€” some OpenAI-compatible servers + # (llama-server, etc.) return content as a dict or list instead + # of a plain string, which crashes downstream .strip() calls. + if assistant_message.content is not None and not isinstance(assistant_message.content, str): + raw = assistant_message.content + if isinstance(raw, dict): + assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw) + elif isinstance(raw, list): + # Multimodal content list β€” extract text parts + parts = [] + for part in raw: + if isinstance(part, str): + parts.append(part) + elif isinstance(part, dict) and part.get("type") == "text": + parts.append(part.get("text", "")) + elif isinstance(part, dict) and "text" in part: + parts.append(str(part["text"])) + assistant_message.content = "\n".join(parts) + else: + assistant_message.content = str(raw) + + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _assistant_tool_calls = getattr(assistant_message, "tool_calls", None) or [] + _assistant_text = assistant_message.content or "" + _invoke_hook( + "post_api_request", + task_id=effective_task_id, + session_id=agent.session_id or "", + platform=agent.platform or "", + model=agent.model, + provider=agent.provider, + base_url=agent.base_url, + api_mode=agent.api_mode, + api_call_count=api_call_count, + api_duration=api_duration, + finish_reason=finish_reason, + message_count=len(api_messages), + response_model=getattr(response, "model", None), + usage=agent._usage_summary_for_api_request_hook(response), + assistant_content_chars=len(_assistant_text), + assistant_tool_call_count=len(_assistant_tool_calls), + ) + except Exception: + pass + + # Handle assistant response + if assistant_message.content and not agent.quiet_mode: + if agent.verbose_logging: + agent._vprint(f"{agent.log_prefix}πŸ€– Assistant: {assistant_message.content}") + else: + agent._vprint(f"{agent.log_prefix}πŸ€– Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}") + + # Notify progress callback of model's thinking (used by subagent + # delegation to relay the child's reasoning to the parent display). + if (assistant_message.content and agent.tool_progress_callback): + _think_text = assistant_message.content.strip() + # Strip reasoning XML tags that shouldn't leak to parent display + _think_text = re.sub( + r'', '', _think_text + ).strip() + # For subagents: relay first line to parent display (existing behaviour). + # For all agents with a structured callback: emit reasoning.available event. + first_line = _think_text.split('\n')[0][:80] if _think_text else "" + if first_line and getattr(agent, '_delegate_depth', 0) > 0: + try: + agent.tool_progress_callback("_thinking", first_line) + except Exception: + pass + elif _think_text: + try: + agent.tool_progress_callback("reasoning.available", "_thinking", _think_text[:500], None) + except Exception: + pass + + # Check for incomplete (opened but never closed) + # This means the model ran out of output tokens mid-reasoning β€” retry up to 2 times + if has_incomplete_scratchpad(assistant_message.content or ""): + agent._incomplete_scratchpad_retries += 1 + + agent._vprint(f"{agent.log_prefix}⚠️ Incomplete detected (opened but never closed)") + + if agent._incomplete_scratchpad_retries <= 2: + agent._vprint(f"{agent.log_prefix}πŸ”„ Retrying API call ({agent._incomplete_scratchpad_retries}/2)...") + # Don't add the broken message, just retry + continue + else: + # Max retries - discard this turn and save as partial + agent._vprint(f"{agent.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True) + agent._incomplete_scratchpad_retries = 0 + + rolled_back_messages = agent._get_messages_up_to_last_assistant(messages) + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + + return { + "final_response": None, + "messages": rolled_back_messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Incomplete REASONING_SCRATCHPAD after 2 retries" + } + + # Reset incomplete scratchpad counter on clean response + agent._incomplete_scratchpad_retries = 0 + + if agent.api_mode == "codex_responses" and finish_reason == "incomplete": + agent._codex_incomplete_retries += 1 + + interim_msg = agent._build_assistant_message(assistant_message, finish_reason) + interim_has_content = bool((interim_msg.get("content") or "").strip()) + interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False + interim_has_codex_reasoning = bool(interim_msg.get("codex_reasoning_items")) + interim_has_codex_message_items = bool(interim_msg.get("codex_message_items")) + + if ( + interim_has_content + or interim_has_reasoning + or interim_has_codex_reasoning + or interim_has_codex_message_items + ): + last_msg = messages[-1] if messages else None + # Duplicate detection: two consecutive incomplete assistant + # messages with identical content AND reasoning are collapsed. + # For provider-state-only changes (encrypted reasoning + # items or replayable message ids/phases/statuses differ + # while visible content/reasoning are unchanged), compare + # those opaque payloads too so we don't silently drop the + # newer continuation state. + last_codex_items = last_msg.get("codex_reasoning_items") if isinstance(last_msg, dict) else None + interim_codex_items = interim_msg.get("codex_reasoning_items") + last_codex_message_items = last_msg.get("codex_message_items") if isinstance(last_msg, dict) else None + interim_codex_message_items = interim_msg.get("codex_message_items") + duplicate_interim = ( + isinstance(last_msg, dict) + and last_msg.get("role") == "assistant" + and last_msg.get("finish_reason") == "incomplete" + and (last_msg.get("content") or "") == (interim_msg.get("content") or "") + and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "") + and last_codex_items == interim_codex_items + and last_codex_message_items == interim_codex_message_items + ) + if not duplicate_interim: + messages.append(interim_msg) + agent._emit_interim_assistant_message(interim_msg) + + if agent._codex_incomplete_retries < 3: + if not agent.quiet_mode: + agent._vprint(f"{agent.log_prefix}↻ Codex response incomplete; continuing turn ({agent._codex_incomplete_retries}/3)") + agent._session_messages = messages + agent._save_session_log(messages) + continue + + agent._codex_incomplete_retries = 0 + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Codex response remained incomplete after 3 continuation attempts", + } + elif hasattr(agent, "_codex_incomplete_retries"): + agent._codex_incomplete_retries = 0 + + # Check for tool calls + if assistant_message.tool_calls: + if not agent.quiet_mode: + agent._vprint(f"{agent.log_prefix}πŸ”§ Processing {len(assistant_message.tool_calls)} tool call(s)...") + + if agent.verbose_logging: + for tc in assistant_message.tool_calls: + logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") + + # Validate tool call names - detect model hallucinations + # Repair mismatched tool names before validating + for tc in assistant_message.tool_calls: + if tc.function.name not in agent.valid_tool_names: + repaired = agent._repair_tool_call(tc.function.name) + if repaired: + print(f"{agent.log_prefix}πŸ”§ Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'") + tc.function.name = repaired + invalid_tool_calls = [ + tc.function.name for tc in assistant_message.tool_calls + if tc.function.name not in agent.valid_tool_names + ] + if invalid_tool_calls: + # Track retries for invalid tool calls + agent._invalid_tool_retries += 1 + + # Return helpful error to model β€” model can agent-correct next turn + available = ", ".join(sorted(agent.valid_tool_names)) + invalid_name = invalid_tool_calls[0] + invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name + agent._vprint(f"{agent.log_prefix}⚠️ Unknown tool '{invalid_preview}' β€” sending error to model for agent-correction ({agent._invalid_tool_retries}/3)") + + if agent._invalid_tool_retries >= 3: + agent._vprint(f"{agent.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True) + agent._invalid_tool_retries = 0 + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": f"Model generated invalid tool call: {invalid_preview}" + } + + assistant_msg = agent._build_assistant_message(assistant_message, finish_reason) + messages.append(assistant_msg) + for tc in assistant_message.tool_calls: + if tc.function.name not in agent.valid_tool_names: + content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}" + else: + content = "Skipped: another tool call in this turn used an invalid name. Please retry this tool call." + messages.append({ + "role": "tool", + "name": tc.function.name, + "tool_call_id": tc.id, + "content": content, + }) + continue + # Reset retry counter on successful tool call validation + agent._invalid_tool_retries = 0 + + # Validate tool call arguments are valid JSON + # Handle empty strings as empty objects (common model quirk) + invalid_json_args = [] + for tc in assistant_message.tool_calls: + args = tc.function.arguments + if isinstance(args, (dict, list)): + tc.function.arguments = json.dumps(args) + continue + if args is not None and not isinstance(args, str): + tc.function.arguments = str(args) + args = tc.function.arguments + # Treat empty/whitespace strings as empty object + if not args or not args.strip(): + tc.function.arguments = "{}" + continue + try: + json.loads(args) + except json.JSONDecodeError as e: + invalid_json_args.append((tc.function.name, str(e))) + + if invalid_json_args: + # Check if the invalid JSON is due to truncation rather + # than a model formatting mistake. Routers sometimes + # rewrite finish_reason from "length" to "tool_calls", + # hiding the truncation from the length handler above. + # Detect truncation: args that don't end with } or ] + # (after stripping whitespace) are cut off mid-stream. + _truncated = any( + not (tc.function.arguments or "").rstrip().endswith(("}", "]")) + for tc in assistant_message.tool_calls + if tc.function.name in {n for n, _ in invalid_json_args} + ) + if _truncated: + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call arguments detected " + f"(finish_reason={finish_reason!r}) β€” refusing to execute.", + force=True, + ) + agent._invalid_json_retries = 0 + agent._cleanup_task_resources(effective_task_id) + agent._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": "Response truncated due to output length limit", + } + + # Track retries for invalid JSON arguments + agent._invalid_json_retries += 1 + + tool_name, error_msg = invalid_json_args[0] + agent._vprint(f"{agent.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") + + if agent._invalid_json_retries < 3: + agent._vprint(f"{agent.log_prefix}πŸ”„ Retrying API call ({agent._invalid_json_retries}/3)...") + # Don't add anything to messages, just retry the API call + continue + else: + # Instead of returning partial, inject tool error results so the model can recover. + # Using tool results (not user messages) preserves role alternation. + agent._vprint(f"{agent.log_prefix}⚠️ Injecting recovery tool results for invalid JSON...") + agent._invalid_json_retries = 0 # Reset for next attempt + + # Append the assistant message with its (broken) tool_calls + recovery_assistant = agent._build_assistant_message(assistant_message, finish_reason) + messages.append(recovery_assistant) + + # Respond with tool error results for each tool call + invalid_names = {name for name, _ in invalid_json_args} + for tc in assistant_message.tool_calls: + if tc.function.name in invalid_names: + err = next(e for n, e in invalid_json_args if n == tc.function.name) + tool_result = ( + f"Error: Invalid JSON arguments. {err}. " + f"For tools with no required parameters, use an empty object: {{}}. " + f"Please retry with valid JSON." + ) + else: + tool_result = "Skipped: other tool call in this response had invalid JSON." + messages.append({ + "role": "tool", + "name": tc.function.name, + "tool_call_id": tc.id, + "content": tool_result, + }) + continue + + # Reset retry counter on successful JSON validation + agent._invalid_json_retries = 0 + + # ── Post-call guardrails ────────────────────────── + assistant_message.tool_calls = agent._cap_delegate_task_calls( + assistant_message.tool_calls + ) + assistant_message.tool_calls = agent._deduplicate_tool_calls( + assistant_message.tool_calls + ) + + assistant_msg = agent._build_assistant_message(assistant_message, finish_reason) + + # If this turn has both content AND tool_calls, capture the content + # as a fallback final response. Common pattern: model delivers its + # answer and calls memory/skill tools as a side-effect in the same + # turn. If the follow-up turn after tools is empty, we use this. + turn_content = assistant_message.content or "" + if turn_content and agent._has_content_after_think_block(turn_content): + agent._last_content_with_tools = turn_content + # Only mute subsequent output when EVERY tool call in + # this turn is post-response housekeeping (memory, todo, + # skill_manage, etc.). If any substantive tool is present + # (search_files, read_file, write_file, terminal, ...), + # keep output visible so the user sees progress. + _HOUSEKEEPING_TOOLS = frozenset({ + "memory", "todo", "skill_manage", "session_search", + }) + _all_housekeeping = all( + tc.function.name in _HOUSEKEEPING_TOOLS + for tc in assistant_message.tool_calls + ) + agent._last_content_tools_all_housekeeping = _all_housekeeping + if _all_housekeeping and agent._has_stream_consumers(): + agent._mute_post_response = True + elif agent._should_emit_quiet_tool_messages(): + clean = agent._strip_think_blocks(turn_content).strip() + if clean: + agent._vprint(f" β”Š πŸ’¬ {clean}") + + # Pop thinking-only prefill message(s) before appending + # (tool-call path β€” same rationale as the final-response path). + _had_prefill = False + while ( + messages + and isinstance(messages[-1], dict) + and messages[-1].get("_thinking_prefill") + ): + messages.pop() + _had_prefill = True + + # Reset prefill counter when tool calls follow a prefill + # recovery. Without this, the counter accumulates across + # the whole conversation β€” a model that intermittently + # empties (empty β†’ prefill β†’ tools β†’ empty β†’ prefill β†’ + # tools) burns both prefill attempts and the third empty + # gets zero recovery. Resetting here treats each tool- + # call success as a fresh start. + if _had_prefill: + agent._thinking_prefill_retries = 0 + agent._empty_content_retries = 0 + # Successful tool execution β€” reset the post-tool nudge + # flag so it can fire again if the model goes empty on + # a LATER tool round. + agent._post_tool_empty_retried = False + + messages.append(assistant_msg) + agent._emit_interim_assistant_message(assistant_msg) + + # Close any open streaming display (response box, reasoning + # box) before tool execution begins. Intermediate turns may + # have streamed early content that opened the response box; + # flushing here prevents it from wrapping tool feed lines. + # Only signal the display callback β€” TTS (_stream_callback) + # should NOT receive None (it uses None as end-of-stream). + if agent.stream_delta_callback: + try: + agent.stream_delta_callback(None) + except Exception: + pass + + agent._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count) + + if agent._tool_guardrail_halt_decision is not None: + decision = agent._tool_guardrail_halt_decision + _turn_exit_reason = "guardrail_halt" + final_response = agent._toolguard_controlled_halt_response(decision) + agent._emit_status( + f"⚠️ Tool guardrail halted {decision.tool_name}: {decision.code}" + ) + messages.append({"role": "assistant", "content": final_response}) + break + + # Reset per-turn retry counters after successful tool + # execution so a single truncation doesn't poison the + # entire conversation. + truncated_tool_call_retries = 0 + + # Signal that a paragraph break is needed before the next + # streamed text. We don't emit it immediately because + # multiple consecutive tool iterations would stack up + # redundant blank lines. Instead, _fire_stream_delta() + # will prepend a single "\n\n" the next time real text + # arrives. + agent._stream_needs_break = True + + # Refund the iteration if the ONLY tool(s) called were + # execute_code (programmatic tool calling). These are + # cheap RPC-style calls that shouldn't eat the budget. + _tc_names = {tc.function.name for tc in assistant_message.tool_calls} + if _tc_names == {"execute_code"}: + agent.iteration_budget.refund() + + # Use real token counts from the API response to decide + # compression. prompt_tokens + completion_tokens is the + # actual context size the provider reported plus the + # assistant turn β€” a tight lower bound for the next prompt. + # Tool results appended above aren't counted yet, but the + # threshold (default 50%) leaves ample headroom; if tool + # results push past it, the next API call will report the + # real total and trigger compression then. + # + # If last_prompt_tokens is 0 (stale after API disconnect + # or provider returned no usage data), fall back to rough + # estimate to avoid missing compression. Without this, + # a session can grow unbounded after disconnects because + # should_compress(0) never fires. (#2153) + _compressor = agent.context_compressor + if _compressor.last_prompt_tokens > 0: + # Only use prompt_tokens β€” completion/reasoning + # tokens don't consume context window space. + # Thinking models (GLM-5.1, QwQ, DeepSeek R1) + # inflate completion_tokens with reasoning, + # causing premature compression. (#12026) + _real_tokens = _compressor.last_prompt_tokens + else: + # Include tool schemas β€” with 50+ tools enabled + # these add 20-30K tokens the messages-only + # estimate misses, which can skip compression + # past the configured threshold (#14695). + _real_tokens = estimate_request_tokens_rough( + messages, tools=agent.tools or None + ) + + if agent.compression_enabled and _compressor.should_compress(_real_tokens): + agent._safe_print(" ⟳ compacting context…") + messages, active_system_prompt = agent._compress_context( + messages, system_message, + approx_tokens=agent.context_compressor.last_prompt_tokens, + task_id=effective_task_id, + ) + # Compression created a new session β€” clear history so + # _flush_messages_to_session_db writes compressed messages + # to the new session (see preflight compression comment). + conversation_history = None + + # Save session log incrementally (so progress is visible even if interrupted) + agent._session_messages = messages + agent._save_session_log(messages) + + # Continue loop for next response + continue + + else: + # No tool calls - this is the final response + final_response = assistant_message.content or "" + + # Fix: unmute output when entering the no-tool-call branch + # so the user can see empty-response warnings and recovery + # status messages. _mute_post_response was set during a + # prior housekeeping tool turn and should not silence the + # final response path. + agent._mute_post_response = False + + # Check if response only has think block with no actual content after it + if not agent._has_content_after_think_block(final_response): + # ── Partial stream recovery ───────────────────── + # If content was already streamed to the user before + # the connection died, use it as the final response + # instead of falling through to prior-turn fallback + # or wasting API calls on retries. + _partial_streamed = ( + getattr(agent, "_current_streamed_assistant_text", "") or "" + ) + if agent._has_content_after_think_block(_partial_streamed): + _turn_exit_reason = "partial_stream_recovery" + _recovered = agent._strip_think_blocks(_partial_streamed).strip() + logger.info( + "Partial stream content delivered (%d chars) " + "β€” using as final response", + len(_recovered), + ) + agent._emit_status( + "↻ Stream interrupted β€” using delivered content " + "as final response" + ) + final_response = _recovered + agent._response_was_previewed = True + break + + # If the previous turn already delivered real content alongside + # HOUSEKEEPING tool calls (e.g. "You're welcome!" + memory save), + # the model has nothing more to say. Use the earlier content + # immediately instead of wasting API calls on retries. + # NOTE: Only use this shortcut when ALL tools in that turn were + # housekeeping (memory, todo, etc.). When substantive tools + # were called (terminal, search_files, etc.), the content was + # likely mid-task narration ("I'll scan the directory...") and + # the empty follow-up means the model choked β€” let the + # post-tool nudge below handle that instead of exiting early. + fallback = getattr(agent, '_last_content_with_tools', None) + if fallback and getattr(agent, '_last_content_tools_all_housekeeping', False): + _turn_exit_reason = "fallback_prior_turn_content" + logger.info("Empty follow-up after tool calls β€” using prior turn content as final response") + agent._emit_status("↻ Empty response after tool calls β€” using earlier content as final answer") + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + agent._empty_content_retries = 0 + # Do NOT modify the assistant message content β€” the + # old code injected "Calling the X tools..." which + # poisoned the conversation history. Just use the + # fallback text as the final response and break. + final_response = agent._strip_think_blocks(fallback).strip() + agent._response_was_previewed = True + break + + # ── Post-tool-call empty response nudge ─────────── + # The model returned empty after executing tool calls. + # This covers two cases: + # (a) No prior-turn content at all β€” model went silent + # (b) Prior turn had content + SUBSTANTIVE tools (the + # fallback above was skipped because the content + # was mid-task narration, not a final answer) + # Instead of giving up, nudge the model to continue by + # appending a user-level hint. This is the #9400 case: + # weaker models (mimo-v2-pro, GLM-5, etc.) sometimes + # return empty after tool results instead of continuing + # to the next step. One retry with a nudge usually + # fixes it. + _prior_was_tool = any( + m.get("role") == "tool" + for m in messages[-5:] # check recent messages + ) + # Detect Qwen3/Ollama-style in-content thinking blocks. + # Ollama puts in the content field (not in + # reasoning_content), so _has_structured below would + # miss it. We check here so thinking-only responses + # after tool calls route to prefill instead of nudge. + _has_inline_thinking = bool( + re.search( + r'||', + final_response or "", + re.IGNORECASE, + ) + ) + if ( + _prior_was_tool + and not getattr(agent, "_post_tool_empty_retried", False) + and not _has_inline_thinking # thinking model still working β€” let prefill handle + ): + agent._post_tool_empty_retried = True + # Clear stale narration so it doesn't resurface + # on a later empty response after the nudge. + agent._last_content_with_tools = None + agent._last_content_tools_all_housekeeping = False + logger.info( + "Empty response after tool calls β€” nudging model " + "to continue processing" + ) + agent._emit_status( + "⚠️ Model returned empty after tool calls β€” " + "nudging to continue" + ) + # Append the empty assistant message first so the + # message sequence stays valid: + # tool(result) β†’ assistant("(empty)") β†’ user(nudge) + # Without this, we'd have tool β†’ user which most + # APIs reject as an invalid sequence. + _nudge_msg = agent._build_assistant_message(assistant_message, finish_reason) + _nudge_msg["content"] = "(empty)" + _nudge_msg["_empty_recovery_synthetic"] = True + messages.append(_nudge_msg) + messages.append({ + "role": "user", + "content": ( + "You just executed tool calls but returned an " + "empty response. Please process the tool " + "results above and continue with the task." + ), + "_empty_recovery_synthetic": True, + }) + continue + + # ── Thinking-only prefill continuation ────────── + # The model produced structured reasoning (via API + # fields) but no visible text content. Rather than + # giving up, append the assistant message as-is and + # continue β€” the model will see its own reasoning + # on the next turn and produce the text portion. + # Inspired by clawdbot's "incomplete-text" recovery. + # Also covers Qwen3/Ollama in-content blocks + # (detected above as _has_inline_thinking). + _has_structured = bool( + getattr(assistant_message, "reasoning", None) + or getattr(assistant_message, "reasoning_content", None) + or getattr(assistant_message, "reasoning_details", None) + or _has_inline_thinking + ) + if _has_structured and agent._thinking_prefill_retries < 2: + agent._thinking_prefill_retries += 1 + logger.info( + "Thinking-only response (no visible content) β€” " + "prefilling to continue (%d/2)", + agent._thinking_prefill_retries, + ) + agent._emit_status( + f"↻ Thinking-only response β€” prefilling to continue " + f"({agent._thinking_prefill_retries}/2)" + ) + interim_msg = agent._build_assistant_message( + assistant_message, "incomplete" + ) + interim_msg["_thinking_prefill"] = True + messages.append(interim_msg) + agent._session_messages = messages + agent._save_session_log(messages) + continue + + # ── Empty response retry ────────────────────── + # Model returned nothing usable. Retry up to 3 + # times before attempting fallback. This covers + # both truly empty responses (no content, no + # reasoning) AND reasoning-only responses after + # prefill exhaustion β€” models like mimo-v2-pro + # always populate reasoning fields via OpenRouter, + # so the old `not _has_structured` guard blocked + # retries for every reasoning model after prefill. + _truly_empty = not agent._strip_think_blocks( + final_response + ).strip() + _prefill_exhausted = ( + _has_structured + and agent._thinking_prefill_retries >= 2 + ) + if _truly_empty and (not _has_structured or _prefill_exhausted) and agent._empty_content_retries < 3: + agent._empty_content_retries += 1 + logger.warning( + "Empty response (no content or reasoning) β€” " + "retry %d/3 (model=%s)", + agent._empty_content_retries, agent.model, + ) + agent._emit_status( + f"⚠️ Empty response from model β€” retrying " + f"({agent._empty_content_retries}/3)" + ) + continue + + # ── Exhausted retries β€” try fallback provider ── + # Before giving up with "(empty)", attempt to + # switch to the next provider in the fallback + # chain. This covers the case where a model + # (e.g. GLM-4.5-Air) consistently returns empty + # due to context degradation or provider issues. + if _truly_empty and agent._fallback_chain: + logger.warning( + "Empty response after %d retries β€” " + "attempting fallback (model=%s, provider=%s)", + agent._empty_content_retries, agent.model, + agent.provider, + ) + agent._emit_status( + "⚠️ Model returning empty responses β€” " + "switching to fallback provider..." + ) + if agent._try_activate_fallback(): + agent._empty_content_retries = 0 + agent._emit_status( + f"↻ Switched to fallback: {agent.model} " + f"({agent.provider})" + ) + logger.info( + "Fallback activated after empty responses: " + "now using %s on %s", + agent.model, agent.provider, + ) + continue + + # Exhausted retries and fallback chain (or no + # fallback configured). Fall through to the + # "(empty)" terminal. + _turn_exit_reason = "empty_response_exhausted" + reasoning_text = agent._extract_reasoning(assistant_message) + agent._drop_trailing_empty_response_scaffolding(messages) + assistant_msg = agent._build_assistant_message(assistant_message, finish_reason) + assistant_msg["content"] = "(empty)" + # This is a user-facing failure sentinel for the gateway, + # not real assistant content. Persisting it makes later + # "continue" turns replay assistant("(empty)") as if it + # were a meaningful model response, which can keep long + # tool-heavy sessions stuck in empty-response loops. + assistant_msg["_empty_terminal_sentinel"] = True + messages.append(assistant_msg) + + if reasoning_text: + reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text + logger.warning( + "Reasoning-only response (no visible content) " + "after exhausting retries and fallback. " + "Reasoning: %s", reasoning_preview, + ) + agent._emit_status( + "⚠️ Model produced reasoning but no visible " + "response after all retries. Returning empty." + ) + else: + logger.warning( + "Empty response (no content or reasoning) " + "after %d retries. No fallback available. " + "model=%s provider=%s", + agent._empty_content_retries, agent.model, + agent.provider, + ) + agent._emit_status( + "❌ Model returned no content after all retries" + + (" and fallback attempts." if agent._fallback_chain else + ". No fallback providers configured.") + ) + + final_response = "(empty)" + break + + # Reset retry counter/signature on successful content + agent._empty_content_retries = 0 + agent._thinking_prefill_retries = 0 + + if ( + agent.api_mode == "codex_responses" + and agent.valid_tool_names + and codex_ack_continuations < 2 + and agent._looks_like_codex_intermediate_ack( + user_message=user_message, + assistant_content=final_response, + messages=messages, + ) + ): + codex_ack_continuations += 1 + interim_msg = agent._build_assistant_message(assistant_message, "incomplete") + messages.append(interim_msg) + agent._emit_interim_assistant_message(interim_msg) + + continue_msg = { + "role": "user", + "content": ( + "[System: Continue now. Execute the required tool calls and only " + "send your final answer after completing the task.]" + ), + } + messages.append(continue_msg) + agent._session_messages = messages + agent._save_session_log(messages) + continue + + codex_ack_continuations = 0 + + if truncated_response_prefix: + final_response = truncated_response_prefix + final_response + truncated_response_prefix = "" + length_continue_retries = 0 + + final_response = agent._strip_think_blocks(final_response).strip() + + final_msg = agent._build_assistant_message(assistant_message, finish_reason) + + # Pop thinking-only prefill and empty-response retry + # scaffolding before appending the final response. These + # internal turns are only for the next API retry and should + # not become durable transcript context. + while ( + messages + and isinstance(messages[-1], dict) + and ( + messages[-1].get("_thinking_prefill") + or messages[-1].get("_empty_recovery_synthetic") + or messages[-1].get("_empty_terminal_sentinel") + ) + ): + messages.pop() + + messages.append(final_msg) + + _turn_exit_reason = f"text_response(finish_reason={finish_reason})" + if not agent.quiet_mode: + agent._safe_print(f"πŸŽ‰ Conversation completed after {api_call_count} OpenAI-compatible API call(s)") + break + + except Exception as e: + error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}" + try: + print(f"❌ {error_msg}") + except (OSError, ValueError): + logger.error(error_msg) + + logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True) + + # If an assistant message with tool_calls was already appended, + # the API expects a role="tool" result for every tool_call_id. + # Fill in error results for any that weren't answered yet. + for idx in range(len(messages) - 1, -1, -1): + msg = messages[idx] + if not isinstance(msg, dict): + break + if msg.get("role") == "tool": + continue + if msg.get("role") == "assistant" and msg.get("tool_calls"): + answered_ids = { + m["tool_call_id"] + for m in messages[idx + 1:] + if isinstance(m, dict) and m.get("role") == "tool" + } + for tc in msg["tool_calls"]: + if not tc or not isinstance(tc, dict): continue + if tc["id"] not in answered_ids: + err_msg = { + "role": "tool", + "name": _ra().AIAgent._get_tool_call_name_static(tc), + "tool_call_id": tc["id"], + "content": f"Error executing tool: {error_msg}", + } + messages.append(err_msg) + break + + # Non-tool errors don't need a synthetic message injected. + # The error is already printed to the user (line above), and + # the retry loop continues. Injecting a fake user/assistant + # message pollutes history, burns tokens, and risks violating + # role-alternation invariants. + + # If we're near the limit, break to avoid infinite loops + if api_call_count >= agent.max_iterations - 1: + _turn_exit_reason = f"error_near_max_iterations({error_msg[:80]})" + final_response = f"I apologize, but I encountered repeated errors: {error_msg}" + # Append as assistant so the history stays valid for + # session resume (avoids consecutive user messages). + messages.append({"role": "assistant", "content": final_response}) + break + + if final_response is None and ( + api_call_count >= agent.max_iterations + or agent.iteration_budget.remaining <= 0 + ): + # Budget exhausted β€” ask the model for a summary via one extra + # API call with tools stripped. _handle_max_iterations injects a + # user message and makes a single toolless request. + _turn_exit_reason = f"max_iterations_reached({api_call_count}/{agent.max_iterations})" + agent._emit_status( + f"⚠️ Iteration budget exhausted ({api_call_count}/{agent.max_iterations}) " + "β€” asking model to summarise" + ) + if not agent.quiet_mode: + agent._safe_print( + f"\n⚠️ Iteration budget exhausted ({api_call_count}/{agent.max_iterations}) " + "β€” requesting summary..." + ) + final_response = agent._handle_max_iterations(messages, api_call_count) + + # If running as a kanban worker, block the task so the dispatcher + # knows the worker could not complete (rather than treating it as a + # protocol violation). The agent loop strips tools before calling + # _handle_max_iterations, so the model cannot call kanban_block + # itself β€” we must do it on its behalf. + _kanban_task = os.environ.get("HERMES_KANBAN_TASK") + if _kanban_task: + try: + _ra().handle_function_call( + "kanban_block", + { + "task_id": _kanban_task, + "reason": ( + f"Iteration budget exhausted " + f"({api_call_count}/{agent.max_iterations}) β€” " + "task could not complete within the allowed " + "iterations" + ), + }, + task_id=effective_task_id, + ) + logger.info( + "kanban_block called for task %s after iteration " + "exhaustion (%d/%d)", + _kanban_task, api_call_count, agent.max_iterations, + ) + except Exception: + logger.warning( + "Failed to call kanban_block after iteration " + "exhaustion for task %s", + _kanban_task, + exc_info=True, + ) + + # Determine if conversation completed successfully + completed = final_response is not None and api_call_count < agent.max_iterations + + # Save trajectory if enabled. ``user_message`` may be a multimodal + # list of parts; the trajectory format wants a plain string. + agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed) + + # Clean up VM and browser for this task after conversation completes + agent._cleanup_task_resources(effective_task_id) + + # Persist session to both JSON log and SQLite only after private retry + # scaffolding has been removed. Otherwise a later user "continue" turn + # can replay assistant("(empty)") / recovery nudges and fall into the + # same empty-response loop again. + agent._drop_trailing_empty_response_scaffolding(messages) + agent._persist_session(messages, conversation_history) + + # ── Turn-exit diagnostic log ───────────────────────────────────── + # Always logged at INFO so agent.log captures WHY every turn ended. + # When the last message is a tool result (agent was mid-work), log + # at WARNING β€” this is the "just stops" scenario users report. + _last_msg_role = messages[-1].get("role") if messages else None + _last_tool_name = None + if _last_msg_role == "tool": + # Walk back to find the assistant message with the tool call + for _m in reversed(messages): + if _m.get("role") == "assistant" and _m.get("tool_calls"): + _tcs = _m["tool_calls"] + if _tcs and isinstance(_tcs[0], dict): + _last_tool_name = _tcs[-1].get("function", {}).get("name") + break + + _turn_tool_count = sum( + 1 for m in messages + if isinstance(m, dict) and m.get("role") == "assistant" and m.get("tool_calls") + ) + _resp_len = len(final_response) if final_response else 0 + _budget_used = agent.iteration_budget.used if agent.iteration_budget else 0 + _budget_max = agent.iteration_budget.max_total if agent.iteration_budget else 0 + + _diag_msg = ( + "Turn ended: reason=%s model=%s api_calls=%d/%d budget=%d/%d " + "tool_turns=%d last_msg_role=%s response_len=%d session=%s" + ) + _diag_args = ( + _turn_exit_reason, agent.model, api_call_count, agent.max_iterations, + _budget_used, _budget_max, + _turn_tool_count, _last_msg_role, _resp_len, + agent.session_id or "none", + ) + + if _last_msg_role == "tool" and not interrupted: + # Agent was mid-work β€” this is the "just stops" case. + logger.warning( + "Turn ended with pending tool result (agent may appear stuck). " + + _diag_msg + " last_tool=%s", + *_diag_args, _last_tool_name, + ) + else: + logger.info(_diag_msg, *_diag_args) + + # File-mutation verifier footer. + # If one or more ``write_file`` / ``patch`` calls failed during this + # turn and were never superseded by a successful write to the same + # path, append an advisory footer to the assistant response. This + # catches the specific case β€” reported by Ben Eng (#15524-adjacent) + # β€” where a model issues a batch of parallel patches, half of them + # fail with "Could not find old_string", and the model summarises + # the turn claiming every file was edited. The user then has to + # manually run ``git status`` to catch the lie. With this footer + # the truth is surfaced on every turn, so over-claiming is + # structurally impossible past the model. + # + # Gate: only applied when a real text response exists for this + # turn and the user didn't interrupt. Empty/interrupted turns + # already have other surface text that shouldn't be augmented. + if final_response and not interrupted: + try: + _failed = getattr(agent, "_turn_failed_file_mutations", None) or {} + if _failed and agent._file_mutation_verifier_enabled(): + footer = agent._format_file_mutation_failure_footer(_failed) + if footer: + final_response = final_response.rstrip() + "\n\n" + footer + except Exception as _ver_err: + logger.debug("file-mutation verifier footer failed: %s", _ver_err) + + # Plugin hook: transform_llm_output + # Fired once per turn after the tool-calling loop completes. + # Plugins can transform the LLM's output text before it's returned. + # First hook to return a string wins; None/empty return leaves text unchanged. + if final_response and not interrupted: + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _transform_results = _invoke_hook( + "transform_llm_output", + response_text=final_response, + session_id=agent.session_id or "", + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + for _hook_result in _transform_results: + if isinstance(_hook_result, str) and _hook_result: + final_response = _hook_result + break # First non-empty string wins + except Exception as exc: + logger.warning("transform_llm_output hook failed: %s", exc) + + # Plugin hook: post_llm_call + # Fired once per turn after the tool-calling loop completes. + # Plugins can use this to persist conversation data (e.g. sync + # to an external memory system). + if final_response and not interrupted: + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _invoke_hook( + "post_llm_call", + session_id=agent.session_id, + user_message=original_user_message, + assistant_response=final_response, + conversation_history=list(messages), + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + except Exception as exc: + logger.warning("post_llm_call hook failed: %s", exc) + + # Extract reasoning from the CURRENT turn only. Walk backwards + # but stop at the user message that started this turn β€” anything + # earlier is from a prior turn and must not leak into the reasoning + # box (confusing stale display; #17055). Within the current turn + # we still want the *most recent* non-empty reasoning: many + # providers (Claude thinking, DeepSeek v4, Codex Responses) emit + # reasoning on the tool-call step and leave the final-answer step + # with reasoning=None, so picking only the last assistant would + # silently drop legitimate same-turn reasoning. + last_reasoning = None + for msg in reversed(messages): + if msg.get("role") == "user": + break # turn boundary β€” don't cross into prior turns + if msg.get("role") == "assistant" and msg.get("reasoning"): + last_reasoning = msg["reasoning"] + break + + # Build result with interrupt info if applicable + result = { + "final_response": final_response, + "last_reasoning": last_reasoning, + "messages": messages, + "api_calls": api_call_count, + "completed": completed, + "turn_exit_reason": _turn_exit_reason, + "partial": False, # True only when stopped due to invalid tool calls + "interrupted": interrupted, + "response_previewed": getattr(agent, "_response_was_previewed", False), + "model": agent.model, + "provider": agent.provider, + "base_url": agent.base_url, + "input_tokens": agent.session_input_tokens, + "output_tokens": agent.session_output_tokens, + "cache_read_tokens": agent.session_cache_read_tokens, + "cache_write_tokens": agent.session_cache_write_tokens, + "reasoning_tokens": agent.session_reasoning_tokens, + "prompt_tokens": agent.session_prompt_tokens, + "completion_tokens": agent.session_completion_tokens, + "total_tokens": agent.session_total_tokens, + "last_prompt_tokens": getattr(agent.context_compressor, "last_prompt_tokens", 0) or 0, + "estimated_cost_usd": agent.session_estimated_cost_usd, + "cost_status": agent.session_cost_status, + "cost_source": agent.session_cost_source, + } + if agent._tool_guardrail_halt_decision is not None: + result["guardrail"] = agent._tool_guardrail_halt_decision.to_metadata() + # If a /steer landed after the final assistant turn (no more tool + # batches to drain into), hand it back to the caller so it can be + # delivered as the next user turn instead of being silently lost. + _leftover_steer = agent._drain_pending_steer() + if _leftover_steer: + result["pending_steer"] = _leftover_steer + agent._response_was_previewed = False + + # Include interrupt message if one triggered the interrupt + if interrupted and agent._interrupt_message: + result["interrupt_message"] = agent._interrupt_message + + # Clear interrupt state after handling + agent.clear_interrupt() + + # Clear stream callback so it doesn't leak into future calls + agent._stream_callback = None + + # Check skill trigger NOW β€” based on how many tool iterations THIS turn used. + _should_review_skills = False + if (agent._skill_nudge_interval > 0 + and agent._iters_since_skill >= agent._skill_nudge_interval + and "skill_manage" in agent.valid_tool_names): + _should_review_skills = True + agent._iters_since_skill = 0 + + # External memory provider: sync the completed turn + queue next prefetch. + agent._sync_external_memory_for_turn( + original_user_message=original_user_message, + final_response=final_response, + interrupted=interrupted, + ) + + # Background memory/skill review β€” runs AFTER the response is delivered + # so it never competes with the user's task for model attention. + if final_response and not interrupted and (_should_review_memory or _should_review_skills): + try: + agent._spawn_background_review( + messages_snapshot=list(messages), + review_memory=_should_review_memory, + review_skills=_should_review_skills, + ) + except Exception: + pass # Background review is best-effort + + # Note: Memory provider on_session_end() + shutdown_all() are NOT + # called here β€” run_conversation() is called once per user message in + # multi-turn sessions. Shutting down after every turn would kill the + # provider before the second message. Actual session-end cleanup is + # handled by the CLI (atexit / /reset) and gateway (session expiry / + # _reset_session). + + # Plugin hook: on_session_end + # Fired at the very end of every run_conversation call. + # Plugins can use this for cleanup, flushing buffers, etc. + try: + from hermes_cli.plugins import invoke_hook as _invoke_hook + _invoke_hook( + "on_session_end", + session_id=agent.session_id, + completed=completed, + interrupted=interrupted, + model=agent.model, + platform=getattr(agent, "platform", None) or "", + ) + except Exception as exc: + logger.warning("on_session_end hook failed: %s", exc) + + return result + + + +__all__ = ["run_conversation"] diff --git a/run_agent.py b/run_agent.py index 8ea73167ac9..b13eb851175 100644 --- a/run_agent.py +++ b/run_agent.py @@ -5694,3873 +5694,9 @@ class AIAgent: stream_callback: Optional[callable] = None, persist_user_message: Optional[str] = None, ) -> Dict[str, Any]: - """ - Run a complete conversation with tool calling until completion. - - Args: - user_message (str): The user's message/question - system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided) - conversation_history (List[Dict]): Previous conversation messages (optional) - task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided) - stream_callback: Optional callback invoked with each text delta during streaming. - Used by the TTS pipeline to start audio generation before the full response. - When None (default), API calls use the standard non-streaming path. - persist_user_message: Optional clean user message to store in - transcripts/history when user_message contains API-only - synthetic prefixes. - or queuing follow-up prefetch work. - - Returns: - Dict: Complete conversation result with final response and message history - """ - # Guard stdio against OSError from broken pipes (systemd/headless/daemon). - # Installed once, transparent when streams are healthy, prevents crash on write. - _install_safe_stdio() - - self._ensure_db_session() - - # Tell auxiliary_client what the live main provider/model are for - # this turn. Used by tools whose behaviour depends on the active - # main model (e.g. vision_analyze's native fast path) so they see - # the CLI/gateway override instead of the stale config.yaml - # default. Idempotent β€” fine to call every turn. - try: - from agent.auxiliary_client import set_runtime_main - set_runtime_main( - getattr(self, "provider", "") or "", - getattr(self, "model", "") or "", - ) - except Exception: - pass - - # Tag all log records on this thread with the session ID so - # ``hermes logs --session `` can filter a single conversation. - from hermes_logging import set_session_context - set_session_context(self.session_id) - - # Bind the skill write-origin ContextVar for this thread so tool - # handlers (e.g. skill_manage create) can tell whether they are - # running inside the background self-improvement review fork vs. - # a foreground user-directed turn. Set at the top of each call; - # the review fork runs on its own thread with a fresh context, - # so the foreground value here does not leak into it. - from tools.skill_provenance import set_current_write_origin - set_current_write_origin(getattr(self, "_memory_write_origin", "assistant_tool")) - - # If the previous turn activated fallback, restore the primary - # runtime so this turn gets a fresh attempt with the preferred model. - # No-op when _fallback_activated is False (gateway, first turn, etc.). - self._restore_primary_runtime() - - # Sanitize surrogate characters from user input. Clipboard paste from - # rich-text editors (Google Docs, Word, etc.) can inject lone surrogates - # that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK. - if isinstance(user_message, str): - user_message = _sanitize_surrogates(user_message) - if isinstance(persist_user_message, str): - persist_user_message = _sanitize_surrogates(persist_user_message) - - # Store stream callback for _interruptible_api_call to pick up - self._stream_callback = stream_callback - self._persist_user_message_idx = None - self._persist_user_message_override = persist_user_message - # Generate unique task_id if not provided to isolate VMs between concurrent tasks - effective_task_id = task_id or str(uuid.uuid4()) - # Expose the active task_id so tools running mid-turn (e.g. delegate_task - # in delegate_tool.py) can identify this agent for the cross-agent file - # state registry. Set BEFORE any tool dispatch so snapshots taken at - # child-launch time see the parent's real id, not None. - self._current_task_id = effective_task_id - - # Reset retry counters and iteration budget at the start of each turn - # so subagent usage from a previous turn doesn't eat into the next one. - self._invalid_tool_retries = 0 - self._invalid_json_retries = 0 - self._empty_content_retries = 0 - self._incomplete_scratchpad_retries = 0 - self._codex_incomplete_retries = 0 - self._thinking_prefill_retries = 0 - self._post_tool_empty_retried = False - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - self._mute_post_response = False - self._unicode_sanitization_passes = 0 - self._tool_guardrails.reset_for_turn() - self._tool_guardrail_halt_decision = None - # True until the server rejects an image_url content part with an error - # like "Only 'text' content type is supported." Set to False on first - # rejection and kept False for the rest of the session so we never re-send - # images to a text-only endpoint. Scoped per `_run()` call, not per instance. - self._vision_supported = True - - # Pre-turn connection health check: detect and clean up dead TCP - # connections left over from provider outages or dropped streams. - # This prevents the next API call from hanging on a zombie socket. - if self.api_mode != "anthropic_messages": - try: - if self._cleanup_dead_connections(): - self._emit_status( - "πŸ”Œ Detected stale connections from a previous provider " - "issue β€” cleaned up automatically. Proceeding with fresh " - "connection." - ) - except Exception: - pass - # Replay compression warning through status_callback for gateway - # platforms (the callback was not wired during __init__). - if self._compression_warning: - self._replay_compression_warning() - self._compression_warning = None # send once - - # NOTE: _turns_since_memory and _iters_since_skill are NOT reset here. - # They are initialized in __init__ and must persist across run_conversation - # calls so that nudge logic accumulates correctly in CLI mode. - self.iteration_budget = IterationBudget(self.max_iterations) - - # Log conversation turn start for debugging/observability - _preview_text = _summarize_user_message_for_log(user_message) - _msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text - _msg_preview = _msg_preview.replace("\n", " ") - logger.info( - "conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r", - self.session_id or "none", self.model, self.provider or "unknown", - self.platform or "unknown", len(conversation_history or []), - _msg_preview, - ) - - # Initialize conversation (copy to avoid mutating the caller's list) - messages = list(conversation_history) if conversation_history else [] - - # Hydrate todo store from conversation history (gateway creates a fresh - # AIAgent per message, so the in-memory store is empty -- we need to - # recover the todo state from the most recent todo tool response in history) - if conversation_history and not self._todo_store.has_items(): - self._hydrate_todo_store(conversation_history) - - # Hydrate per-session nudge counters from persisted history. - # Gateway creates a fresh AIAgent per inbound message (cache miss / - # 1h idle eviction / config-signature mismatch / process restart), so - # _turns_since_memory and _user_turn_count start at 0 every turn and - # the memory.nudge_interval trigger may never be reached. Reconstruct - # an effective count from prior user turns in conversation_history. - # Idempotent: a cached agent that already accumulated counters keeps - # them; only a freshly-built agent with empty in-memory state hydrates. - # See issue #22357. - if conversation_history and self._user_turn_count == 0: - prior_user_turns = sum( - 1 for m in conversation_history if m.get("role") == "user" - ) - if prior_user_turns > 0: - self._user_turn_count = prior_user_turns - if self._memory_nudge_interval > 0 and self._turns_since_memory == 0: - # % preserves original 1-in-N cadence rather than firing a - # review immediately on resume (which would surprise users - # whose session happened to land just past a multiple of N). - self._turns_since_memory = prior_user_turns % self._memory_nudge_interval - - - # Prefill messages (few-shot priming) are injected at API-call time only, - # never stored in the messages list. This keeps them ephemeral: they won't - # be saved to session DB, session logs, or batch trajectories, but they're - # automatically re-applied on every API call (including session continuations). - - # Track user turns for memory flush and periodic nudge logic - self._user_turn_count += 1 - - # Reset the streaming context scrubber at the top of each turn so a - # hung span from a prior interrupted stream can't taint this turn's - # output. - scrubber = getattr(self, "_stream_context_scrubber", None) - if scrubber is not None: - scrubber.reset() - # Reset the think scrubber for the same reason β€” an interrupted - # prior stream may have left us inside an unterminated block. - think_scrubber = getattr(self, "_stream_think_scrubber", None) - if think_scrubber is not None: - think_scrubber.reset() - - # Preserve the original user message (no nudge injection). - original_user_message = persist_user_message if persist_user_message is not None else user_message - - # Track memory nudge trigger (turn-based, checked here). - # Skill trigger is checked AFTER the agent loop completes, based on - # how many tool iterations THIS turn used. - _should_review_memory = False - if (self._memory_nudge_interval > 0 - and "memory" in self.valid_tool_names - and self._memory_store): - self._turns_since_memory += 1 - if self._turns_since_memory >= self._memory_nudge_interval: - _should_review_memory = True - self._turns_since_memory = 0 - - # Add user message - user_msg = {"role": "user", "content": user_message} - messages.append(user_msg) - current_turn_user_idx = len(messages) - 1 - self._persist_user_message_idx = current_turn_user_idx - - if not self.quiet_mode: - _print_preview = _summarize_user_message_for_log(user_message) - self._safe_print(f"πŸ’¬ Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'") - - # ── System prompt (cached per session for prefix caching) ── - # Built once on first call, reused for all subsequent calls. - # Only rebuilt after context compression events (which invalidate - # the cache and reload memory from disk). - # - # For continuing sessions (gateway creates a fresh AIAgent per - # message), we load the stored system prompt from the session DB - # instead of rebuilding. Rebuilding would pick up memory changes - # from disk that the model already knows about (it wrote them!), - # producing a different system prompt and breaking the Anthropic - # prefix cache. - if self._cached_system_prompt is None: - stored_prompt = None - if conversation_history and self._session_db: - try: - session_row = self._session_db.get_session(self.session_id) - if session_row: - stored_prompt = session_row.get("system_prompt") or None - except Exception: - pass # Fall through to build fresh - - if stored_prompt: - # Continuing session β€” reuse the exact system prompt from - # the previous turn so the Anthropic cache prefix matches. - self._cached_system_prompt = stored_prompt - else: - # First turn of a new session β€” build from scratch. - self._cached_system_prompt = self._build_system_prompt(system_message) - # Plugin hook: on_session_start - # Fired once when a brand-new session is created (not on - # continuation). Plugins can use this to initialise - # session-scoped state (e.g. warm a memory cache). - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "on_session_start", - session_id=self.session_id, - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - except Exception as exc: - logger.warning("on_session_start hook failed: %s", exc) - - # Store the system prompt snapshot in SQLite - if self._session_db: - try: - self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt) - except Exception as e: - logger.debug("Session DB update_system_prompt failed: %s", e) - - active_system_prompt = self._cached_system_prompt - - # ── Preflight context compression ── - # Before entering the main loop, check if the loaded conversation - # history already exceeds the model's context threshold. This handles - # cases where a user switches to a model with a smaller context window - # while having a large existing session β€” compress proactively rather - # than waiting for an API error (which might be caught as a non-retryable - # 4xx and abort the request entirely). - if ( - self.compression_enabled - and len(messages) > self.context_compressor.protect_first_n - + self.context_compressor.protect_last_n + 1 - ): - # Include tool schema tokens β€” with many tools these can add - # 20-30K+ tokens that the old sys+msg estimate missed entirely. - _preflight_tokens = estimate_request_tokens_rough( - messages, - system_prompt=active_system_prompt or "", - tools=self.tools or None, - ) - - if _preflight_tokens >= self.context_compressor.threshold_tokens: - logger.info( - "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", - f"{_preflight_tokens:,}", - f"{self.context_compressor.threshold_tokens:,}", - self.model, - f"{self.context_compressor.context_length:,}", - ) - self._emit_status( - f"πŸ“¦ Preflight compression: ~{_preflight_tokens:,} tokens " - f">= {self.context_compressor.threshold_tokens:,} threshold. " - "This may take a moment." - ) - # May need multiple passes for very large sessions with small - # context windows (each pass summarises the middle N turns). - for _pass in range(3): - _orig_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, approx_tokens=_preflight_tokens, - task_id=effective_task_id, - ) - if len(messages) >= _orig_len: - break # Cannot compress further - # Compression created a new session β€” clear the history - # reference so _flush_messages_to_session_db writes ALL - # compressed messages to the new session's SQLite, not - # skipping them because conversation_history is still the - # pre-compression length. - conversation_history = None - # Fix: reset retry counters after compression so the model - # gets a fresh budget on the compressed context. Without - # this, pre-compression retries carry over and the model - # hits "(empty)" immediately after compression-induced - # context loss. - self._empty_content_retries = 0 - self._thinking_prefill_retries = 0 - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - self._mute_post_response = False - # Re-estimate after compression - _preflight_tokens = estimate_request_tokens_rough( - messages, - system_prompt=active_system_prompt or "", - tools=self.tools or None, - ) - if _preflight_tokens < self.context_compressor.threshold_tokens: - break # Under threshold - - # Plugin hook: pre_llm_call - # Fired once per turn before the tool-calling loop. Plugins can - # return a dict with a ``context`` key (or a plain string) whose - # value is appended to the current turn's user message. - # - # Context is ALWAYS injected into the user message, never the - # system prompt. This preserves the prompt cache prefix β€” the - # system prompt stays identical across turns so cached tokens - # are reused. The system prompt is Hermes's territory; plugins - # contribute context alongside the user's input. - # - # All injected context is ephemeral (not persisted to session DB). - _plugin_user_context = "" - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _pre_results = _invoke_hook( - "pre_llm_call", - session_id=self.session_id, - user_message=original_user_message, - conversation_history=list(messages), - is_first_turn=(not bool(conversation_history)), - model=self.model, - platform=getattr(self, "platform", None) or "", - sender_id=getattr(self, "_user_id", None) or "", - ) - _ctx_parts: list[str] = [] - for r in _pre_results: - if isinstance(r, dict) and r.get("context"): - _ctx_parts.append(str(r["context"])) - elif isinstance(r, str) and r.strip(): - _ctx_parts.append(r) - if _ctx_parts: - _plugin_user_context = "\n\n".join(_ctx_parts) - except Exception as exc: - logger.warning("pre_llm_call hook failed: %s", exc) - - # Main conversation loop - api_call_count = 0 - final_response = None - interrupted = False - codex_ack_continuations = 0 - length_continue_retries = 0 - truncated_tool_call_retries = 0 - truncated_response_prefix = "" - compression_attempts = 0 - _turn_exit_reason = "unknown" # Diagnostic: why the loop ended - - # Per-turn file-mutation verifier state. Keyed by resolved path; - # each failed ``write_file`` / ``patch`` call records the error - # preview. Later successful writes to the same path remove the - # entry (the model recovered). At end-of-turn, any entries still - # present are surfaced in an advisory footer so the model cannot - # over-claim success while the file is actually unchanged on disk. - self._turn_failed_file_mutations: Dict[str, Dict[str, Any]] = {} - - # Record the execution thread so interrupt()/clear_interrupt() can - # scope the tool-level interrupt signal to THIS agent's thread only. - # Must be set before any thread-scoped interrupt syncing. - self._execution_thread_id = threading.current_thread().ident - - # Always clear stale per-thread state from a previous turn. If an - # interrupt arrived before startup finished, preserve it and bind it - # to this execution thread now instead of dropping it on the floor. - _set_interrupt(False, self._execution_thread_id) - if self._interrupt_requested: - _set_interrupt(True, self._execution_thread_id) - self._interrupt_thread_signal_pending = False - else: - self._interrupt_message = None - self._interrupt_thread_signal_pending = False - - # Notify memory providers of the new turn so cadence tracking works. - # Must happen BEFORE prefetch_all() so providers know which turn it is - # and can gate context/dialectic refresh via contextCadence/dialecticCadence. - if self._memory_manager: - try: - _turn_msg = original_user_message if isinstance(original_user_message, str) else "" - self._memory_manager.on_turn_start(self._user_turn_count, _turn_msg) - except Exception: - pass - - # External memory provider: prefetch once before the tool loop. - # Reuse the cached result on every iteration to avoid re-calling - # prefetch_all() on each tool call (10 tool calls = 10x latency + cost). - # Use original_user_message (clean input) β€” user_message may contain - # injected skill content that bloats / breaks provider queries. - _ext_prefetch_cache = "" - if self._memory_manager: - try: - _query = original_user_message if isinstance(original_user_message, str) else "" - _ext_prefetch_cache = self._memory_manager.prefetch_all(_query) or "" - except Exception: - pass - - # Optional opt-in runtime: if api_mode == codex_app_server, hand the - # turn to the codex app-server subprocess (terminal/file ops/patching - # all run inside Codex). Default Hermes path is bypassed entirely. - # See agent/transports/codex_app_server_session.py for the adapter - # and references/codex-app-server-runtime.md for the rationale. - if self.api_mode == "codex_app_server": - return self._run_codex_app_server_turn( - user_message=user_message, - original_user_message=original_user_message, - messages=messages, - effective_task_id=effective_task_id, - should_review_memory=_should_review_memory, - ) - - while (api_call_count < self.max_iterations and self.iteration_budget.remaining > 0) or self._budget_grace_call: - # Reset per-turn checkpoint dedup so each iteration can take one snapshot - self._checkpoint_mgr.new_turn() - - # Check for interrupt request (e.g., user sent new message) - if self._interrupt_requested: - interrupted = True - _turn_exit_reason = "interrupted_by_user" - if not self.quiet_mode: - self._safe_print("\n⚑ Breaking out of tool loop due to interrupt...") - break - - api_call_count += 1 - self._api_call_count = api_call_count - self._touch_activity(f"starting API call #{api_call_count}") - - # Grace call: the budget is exhausted but we gave the model one - # more chance. Consume the grace flag so the loop exits after - # this iteration regardless of outcome. - if self._budget_grace_call: - self._budget_grace_call = False - elif not self.iteration_budget.consume(): - _turn_exit_reason = "budget_exhausted" - if not self.quiet_mode: - self._safe_print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)") - break - - # Fire step_callback for gateway hooks (agent:step event) - if self.step_callback is not None: - try: - prev_tools = [] - for _idx, _m in enumerate(reversed(messages)): - if _m.get("role") == "assistant" and _m.get("tool_calls"): - _fwd_start = len(messages) - _idx - _results_by_id = {} - for _tm in messages[_fwd_start:]: - if _tm.get("role") != "tool": - break - _tcid = _tm.get("tool_call_id") - if _tcid: - _results_by_id[_tcid] = _tm.get("content", "") - prev_tools = [ - { - "name": tc["function"]["name"], - "result": _results_by_id.get(tc.get("id")), - "arguments": tc["function"].get("arguments"), - } - for tc in _m["tool_calls"] - if isinstance(tc, dict) - ] - break - self.step_callback(api_call_count, prev_tools) - except Exception as _step_err: - logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err) - - # Track tool-calling iterations for skill nudge. - # Counter resets whenever skill_manage is actually used. - if (self._skill_nudge_interval > 0 - and "skill_manage" in self.valid_tool_names): - self._iters_since_skill += 1 - - # ── Pre-API-call /steer drain ────────────────────────────────── - # If a /steer arrived during the previous API call (while the model - # was thinking), drain it now β€” before we build api_messages β€” so - # the model sees the steer text on THIS iteration. Without this, - # steers sent during an API call only land after the NEXT tool batch, - # which may never come if the model returns a final response. - # - # We scan backwards for the last tool-role message in the messages - # list. If found, the steer is appended there. If not (first - # iteration, no tools yet), the steer stays pending for the next - # tool batch β€” injecting into a user message would break role - # alternation, and there's no tool output to piggyback on. - _pre_api_steer = self._drain_pending_steer() - if _pre_api_steer: - _injected = False - for _si in range(len(messages) - 1, -1, -1): - _sm = messages[_si] - if isinstance(_sm, dict) and _sm.get("role") == "tool": - marker = f"\n\nUser guidance: {_pre_api_steer}" - existing = _sm.get("content", "") - if isinstance(existing, str): - _sm["content"] = existing + marker - else: - # Multimodal content blocks β€” append text block - try: - blocks = list(existing) if existing else [] - blocks.append({"type": "text", "text": marker}) - _sm["content"] = blocks - except Exception: - pass - _injected = True - logger.debug( - "Pre-API-call steer drain: injected into tool msg at index %d", - _si, - ) - break - if not _injected: - # No tool message to inject into β€” put it back so - # the post-tool-execution drain picks it up later. - _lock = getattr(self, "_pending_steer_lock", None) - if _lock is not None: - with _lock: - if self._pending_steer: - self._pending_steer = self._pending_steer + "\n" + _pre_api_steer - else: - self._pending_steer = _pre_api_steer - else: - existing = getattr(self, "_pending_steer", None) - self._pending_steer = (existing + "\n" + _pre_api_steer) if existing else _pre_api_steer - - # Prepare messages for API call - # If we have an ephemeral system prompt, prepend it to the messages - # Note: Reasoning is embedded in content via tags for trajectory storage. - # However, providers like Moonshot AI require a separate 'reasoning_content' field - # on assistant messages with tool_calls. We handle both cases here. - request_logger = getattr(self, "logger", None) or logging.getLogger(__name__) - repaired_tool_calls = self._sanitize_tool_call_arguments( - messages, - logger=request_logger, - session_id=self.session_id, - ) - if repaired_tool_calls > 0: - request_logger.info( - "Sanitized %s corrupted tool_call arguments before request (session=%s)", - repaired_tool_calls, - self.session_id or "-", - ) - - # Defensive: repair malformed role-alternation before API call. - # Catches cases where the history got wedged into a - # ``tool β†’ user`` or ``user β†’ user`` tail (e.g. after empty- - # response scaffolding was stripped and a new user message - # landed after an orphan tool result). Most providers return - # empty content on malformed sequences, which would otherwise - # retrigger the empty-retry loop indefinitely. - repaired_seq = self._repair_message_sequence(messages) - if repaired_seq > 0: - request_logger.info( - "Repaired %s message-alternation violations before request (session=%s)", - repaired_seq, - self.session_id or "-", - ) - - api_messages = [] - for idx, msg in enumerate(messages): - api_msg = msg.copy() - - # Inject ephemeral context into the current turn's user message. - # Sources: memory manager prefetch + plugin pre_llm_call hooks - # with target="user_message" (the default). Both are - # API-call-time only β€” the original message in `messages` is - # never mutated, so nothing leaks into session persistence. - if idx == current_turn_user_idx and msg.get("role") == "user": - _injections = [] - if _ext_prefetch_cache: - _fenced = build_memory_context_block(_ext_prefetch_cache) - if _fenced: - _injections.append(_fenced) - if _plugin_user_context: - _injections.append(_plugin_user_context) - if _injections: - _base = api_msg.get("content", "") - if isinstance(_base, str): - api_msg["content"] = _base + "\n\n" + "\n\n".join(_injections) - - # For ALL assistant messages, pass reasoning back to the API - # This ensures multi-turn reasoning context is preserved - self._copy_reasoning_content_for_api(msg, api_msg) - - # Remove 'reasoning' field - it's for trajectory storage only - # We've copied it to 'reasoning_content' for the API above - if "reasoning" in api_msg: - api_msg.pop("reasoning") - # Remove finish_reason - not accepted by strict APIs (e.g. Mistral) - if "finish_reason" in api_msg: - api_msg.pop("finish_reason") - # Strip internal thinking-prefill marker - api_msg.pop("_thinking_prefill", None) - # Strip Codex Responses API fields (call_id, response_item_id) for - # strict providers like Mistral, Fireworks, etc. that reject unknown fields. - # Uses new dicts so the internal messages list retains the fields - # for Codex Responses compatibility. - if self._should_sanitize_tool_calls(): - self._sanitize_tool_calls_for_strict_api(api_msg) - # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context - # The signature field helps maintain reasoning continuity - api_messages.append(api_msg) - - # Build the final system message: cached prompt + ephemeral system prompt. - # Ephemeral additions are API-call-time only (not persisted to session DB). - # External recall context is injected into the user message, not the system - # prompt, so the stable cache prefix remains unchanged. - # - # NOTE: Plugin context from pre_llm_call hooks is injected into the - # user message (see injection block above), NOT the system prompt. - # This is intentional β€” system prompt modifications break the prompt - # cache prefix. The system prompt is reserved for Hermes internals. - # - # Hermes invariant: the system prompt is built ONCE per session - # (cached on ``_cached_system_prompt``) and replayed verbatim on - # every turn. We send it as a single content string so the - # bytes are byte-stable across turns and upstream prompt caches - # stay warm. - effective_system = active_system_prompt or "" - if self.ephemeral_system_prompt: - effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip() - if effective_system: - api_messages = [{"role": "system", "content": effective_system}] + api_messages - - # Inject ephemeral prefill messages right after the system prompt - # but before conversation history. Same API-call-time-only pattern. - if self.prefill_messages: - sys_offset = 1 if (api_messages and api_messages[0].get("role") == "system") else 0 - for idx, pfm in enumerate(self.prefill_messages): - api_messages.insert(sys_offset + idx, pfm.copy()) - - # Apply Anthropic prompt caching for Claude models on native - # Anthropic, OpenRouter, and third-party Anthropic-compatible - # gateways. Auto-detected: if ``_use_prompt_caching`` is set, - # inject cache_control breakpoints (system + last 3 messages) - # to reduce input token costs by ~75% on multi-turn - # conversations. - if self._use_prompt_caching: - api_messages = apply_anthropic_cache_control( - api_messages, - cache_ttl=self._cache_ttl, - native_anthropic=self._use_native_cache_layout, - ) - - # Safety net: strip orphaned tool results / add stubs for missing - # results before sending to the API. Runs unconditionally β€” not - # gated on context_compressor β€” so orphans from session loading or - # manual message manipulation are always caught. - api_messages = self._sanitize_api_messages(api_messages) - - # Drop thinking-only assistant turns (reasoning but no visible - # output and no tool_calls) and merge any adjacent user messages - # left behind. Prevents Anthropic 400s ("The final block in an - # assistant message cannot be `thinking`.") and equivalent errors - # from third-party Anthropic-compatible gateways that can't replay - # a thinking-only turn. Runs on the per-call copy only β€” the - # stored conversation history keeps the reasoning block for the - # UI transcript and session persistence. - api_messages = self._drop_thinking_only_and_merge_users(api_messages) - - # Normalize message whitespace and tool-call JSON for consistent - # prefix matching. Ensures bit-perfect prefixes across turns, - # which enables KV cache reuse on local inference servers - # (llama.cpp, vLLM, Ollama) and improves cache hit rates for - # cloud providers. Operates on api_messages (the API copy) so - # the original conversation history in `messages` is untouched. - for am in api_messages: - if isinstance(am.get("content"), str): - am["content"] = am["content"].strip() - for am in api_messages: - tcs = am.get("tool_calls") - if not tcs: - continue - new_tcs = [] - for tc in tcs: - if isinstance(tc, dict) and "function" in tc: - try: - args_obj = json.loads(tc["function"]["arguments"]) - tc = {**tc, "function": { - **tc["function"], - "arguments": json.dumps( - args_obj, separators=(",", ":"), - sort_keys=True, - ), - }} - except Exception: - tc["function"]["arguments"] = _repair_tool_call_arguments( - tc["function"]["arguments"], - tc["function"].get("name", "?"), - ) - new_tcs.append(tc) - am["tool_calls"] = new_tcs - - # Proactively strip any surrogate characters before the API call. - # Models served via Ollama (Kimi K2.5, GLM-5, Qwen) can return - # lone surrogates (U+D800-U+DFFF) that crash json.dumps() inside - # the OpenAI SDK. Sanitizing here prevents the 3-retry cycle. - _sanitize_messages_surrogates(api_messages) - - # Calculate approximate request size for logging - total_chars = sum(len(str(msg)) for msg in api_messages) - approx_tokens = estimate_messages_tokens_rough(api_messages) - - # Thinking spinner for quiet mode (animated during API call) - thinking_spinner = None - - if not self.quiet_mode: - self._vprint(f"\n{self.log_prefix}πŸ”„ Making API call #{api_call_count}/{self.max_iterations}...") - self._vprint(f"{self.log_prefix} πŸ“Š Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") - self._vprint(f"{self.log_prefix} πŸ”§ Available tools: {len(self.tools) if self.tools else 0}") - else: - # Animated thinking spinner in quiet mode - face = random.choice(KawaiiSpinner.get_thinking_faces()) - verb = random.choice(KawaiiSpinner.get_thinking_verbs()) - if self.thinking_callback: - # CLI TUI mode: use prompt_toolkit widget instead of raw spinner - # (works in both streaming and non-streaming modes) - self.thinking_callback(f"{face} {verb}...") - elif not self._has_stream_consumers() and self._should_start_quiet_spinner(): - # Raw KawaiiSpinner only when no streaming consumers and the - # spinner output has a safe sink. - spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star']) - thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type, print_fn=self._print_fn) - thinking_spinner.start() - - # Log request details if verbose - if self.verbose_logging: - logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}") - logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}") - logging.debug(f"Total message size: ~{approx_tokens:,} tokens") - - api_start_time = time.time() - retry_count = 0 - max_retries = self._api_max_retries - primary_recovery_attempted = False - max_compression_attempts = 3 - codex_auth_retry_attempted=False - anthropic_auth_retry_attempted=False - nous_auth_retry_attempted=False - copilot_auth_retry_attempted=False - thinking_sig_retry_attempted = False - image_shrink_retry_attempted = False - oauth_1m_beta_retry_attempted = False - llama_cpp_grammar_retry_attempted = False - has_retried_429 = False - restart_with_compressed_messages = False - restart_with_length_continuation = False - - finish_reason = "stop" - response = None # Guard against UnboundLocalError if all retries fail - api_kwargs = None # Guard against UnboundLocalError in except handler - - while retry_count < max_retries: - # ── Nous Portal rate limit guard ────────────────────── - # If another session already recorded that Nous is rate- - # limited, skip the API call entirely. Each attempt - # (including SDK-level retries) counts against RPH and - # deepens the rate limit hole. - if self.provider == "nous": - try: - from agent.nous_rate_guard import ( - nous_rate_limit_remaining, - format_remaining as _fmt_nous_remaining, - ) - _nous_remaining = nous_rate_limit_remaining() - if _nous_remaining is not None and _nous_remaining > 0: - _nous_msg = ( - f"Nous Portal rate limit active β€” " - f"resets in {_fmt_nous_remaining(_nous_remaining)}." - ) - self._vprint( - f"{self.log_prefix}⏳ {_nous_msg} Trying fallback...", - force=True, - ) - self._emit_status(f"⏳ {_nous_msg}") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - # No fallback available β€” return with clear message - self._persist_session(messages, conversation_history) - return { - "final_response": ( - f"⏳ {_nous_msg}\n\n" - "No fallback provider available. " - "Try again after the reset, or add a " - "fallback provider in config.yaml." - ), - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": _nous_msg, - } - except ImportError: - pass - except Exception: - pass # Never let rate guard break the agent loop - - try: - self._reset_stream_delivery_tracking() - api_kwargs = self._build_api_kwargs(api_messages) - if self._force_ascii_payload: - _sanitize_structure_non_ascii(api_kwargs) - if self.api_mode == "codex_responses": - api_kwargs = self._get_transport().preflight_kwargs(api_kwargs, allow_stream=False) - - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "pre_api_request", - task_id=effective_task_id, - session_id=self.session_id or "", - platform=self.platform or "", - model=self.model, - provider=self.provider, - base_url=self.base_url, - api_mode=self.api_mode, - api_call_count=api_call_count, - message_count=len(api_messages), - tool_count=len(self.tools or []), - approx_input_tokens=approx_tokens, - request_char_count=total_chars, - max_tokens=self.max_tokens, - ) - except Exception: - pass - - if env_var_enabled("HERMES_DUMP_REQUESTS"): - self._dump_api_request_debug(api_kwargs, reason="preflight") - - # Always prefer the streaming path β€” even without stream - # consumers. Streaming gives us fine-grained health - # checking (90s stale-stream detection, 60s read timeout) - # that the non-streaming path lacks. Without this, - # subagents and other quiet-mode callers can hang - # indefinitely when the provider keeps the connection - # alive with SSE pings but never delivers a response. - # The streaming path is a no-op for callbacks when no - # consumers are registered, and falls back to non- - # streaming automatically if the provider doesn't - # support it. - def _stop_spinner(): - nonlocal thinking_spinner - if thinking_spinner: - thinking_spinner.stop("") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - _use_streaming = True - # Provider signaled "stream not supported" on a previous - # attempt β€” switch to non-streaming for the rest of this - # session instead of re-failing every retry. - if getattr(self, "_disable_streaming", False): - _use_streaming = False - # CopilotACPClient communicates via subprocess stdio and - # returns a plain SimpleNamespace β€” not an iterable - # stream. Mirror the ACP exclusion used for Responses - # API upgrade (lines ~1083-1085). - elif ( - self.provider == "copilot-acp" - or str(self.base_url or "").lower().startswith("acp://copilot") - or str(self.base_url or "").lower().startswith("acp+tcp://") - ): - _use_streaming = False - elif not self._has_stream_consumers(): - # No display/TTS consumer. Still prefer streaming for - # health checking, but skip for Mock clients in tests - # (mocks return SimpleNamespace, not stream iterators). - from unittest.mock import Mock - if isinstance(getattr(self, "client", None), Mock): - _use_streaming = False - - if _use_streaming: - response = self._interruptible_streaming_api_call( - api_kwargs, on_first_delta=_stop_spinner - ) - else: - response = self._interruptible_api_call(api_kwargs) - - api_duration = time.time() - api_start_time - - # Stop thinking spinner silently -- the response box or tool - # execution messages that follow are more informative. - if thinking_spinner: - thinking_spinner.stop("") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - if not self.quiet_mode: - self._vprint(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s") - - if self.verbose_logging: - # Log response with provider info if available - resp_model = getattr(response, 'model', 'N/A') if response else 'N/A' - logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") - - # Validate response shape before proceeding - response_invalid = False - error_details = [] - if self.api_mode == "codex_responses": - _ct_v = self._get_transport() - if not _ct_v.validate_response(response): - if response is None: - response_invalid = True - error_details.append("response is None") - else: - # Provider returned a terminal failure (e.g. quota exhaustion). - # Treat as invalid so the fallback chain is triggered instead of - # letting the error bubble up outside the retry/fallback loop. - _codex_resp_status = str(getattr(response, "status", "") or "").strip().lower() - if _codex_resp_status in {"failed", "cancelled"}: - _codex_error_obj = getattr(response, "error", None) - _codex_error_msg = ( - _codex_error_obj.get("message") if isinstance(_codex_error_obj, dict) - else str(_codex_error_obj) if _codex_error_obj - else f"Responses API returned status '{_codex_resp_status}'" - ) - logging.warning( - "Codex response status='%s' (error=%s). Routing to fallback. %s", - _codex_resp_status, _codex_error_msg, - self._client_log_context(), - ) - response_invalid = True - error_details.append(f"response.status={_codex_resp_status}: {_codex_error_msg}") - else: - # output_text fallback: stream backfill may have failed - # but normalize can still recover from output_text - _out_text = getattr(response, "output_text", None) - _out_text_stripped = _out_text.strip() if isinstance(_out_text, str) else "" - if _out_text_stripped: - logger.debug( - "Codex response.output is empty but output_text is present " - "(%d chars); deferring to normalization.", - len(_out_text_stripped), - ) - else: - _resp_status = getattr(response, "status", None) - _resp_incomplete = getattr(response, "incomplete_details", None) - logger.warning( - "Codex response.output is empty after stream backfill " - "(status=%s, incomplete_details=%s, model=%s). %s", - _resp_status, _resp_incomplete, - getattr(response, "model", None), - f"api_mode={self.api_mode} provider={self.provider}", - ) - response_invalid = True - error_details.append("response.output is empty") - elif self.api_mode == "anthropic_messages": - _tv = self._get_transport() - if not _tv.validate_response(response): - response_invalid = True - if response is None: - error_details.append("response is None") - else: - error_details.append("response.content invalid (not a non-empty list)") - elif self.api_mode == "bedrock_converse": - _btv = self._get_transport() - if not _btv.validate_response(response): - response_invalid = True - if response is None: - error_details.append("response is None") - else: - error_details.append("Bedrock response invalid (no output or choices)") - else: - _ctv = self._get_transport() - if not _ctv.validate_response(response): - response_invalid = True - if response is None: - error_details.append("response is None") - elif not hasattr(response, 'choices'): - error_details.append("response has no 'choices' attribute") - elif response.choices is None: - error_details.append("response.choices is None") - else: - error_details.append("response.choices is empty") - - if response_invalid: - # Stop spinner before printing error messages - if thinking_spinner: - thinking_spinner.stop("(Β΄;Ο‰;`) oops, retrying...") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - # Invalid response β€” could be rate limiting, provider timeout, - # upstream server error, or malformed response. - retry_count += 1 - - # Eager fallback: empty/malformed responses are a common - # rate-limit symptom. Switch to fallback immediately - # rather than retrying with extended backoff. - if self._fallback_index < len(self._fallback_chain): - self._emit_status("⚠️ Empty/malformed response β€” switching to fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - - # Check for error field in response (some providers include this) - error_msg = "Unknown" - provider_name = "Unknown" - if response and hasattr(response, 'error') and response.error: - error_msg = str(response.error) - # Try to extract provider from error metadata - if hasattr(response.error, 'metadata') and response.error.metadata: - provider_name = response.error.metadata.get('provider_name', 'Unknown') - elif response and hasattr(response, 'message') and response.message: - error_msg = str(response.message) - - # Try to get provider from model field (OpenRouter often returns actual model used) - if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model: - provider_name = f"model={response.model}" - - # Check for x-openrouter-provider or similar metadata - if provider_name == "Unknown" and response: - # Log all response attributes for debugging - resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')} - if self.verbose_logging: - logging.debug(f"Response attributes for invalid response: {resp_attrs}") - - # Extract error code from response for contextual diagnostics - _resp_error_code = None - if response and hasattr(response, 'error') and response.error: - _code_raw = getattr(response.error, 'code', None) - if _code_raw is None and isinstance(response.error, dict): - _code_raw = response.error.get('code') - if _code_raw is not None: - try: - _resp_error_code = int(_code_raw) - except (TypeError, ValueError): - pass - - # Build a human-readable failure hint from the error code - # and response time, instead of always assuming rate limiting. - if _resp_error_code == 524: - _failure_hint = f"upstream provider timed out (Cloudflare 524, {api_duration:.0f}s)" - elif _resp_error_code == 504: - _failure_hint = f"upstream gateway timeout (504, {api_duration:.0f}s)" - elif _resp_error_code == 429: - _failure_hint = f"rate limited by upstream provider (429)" - elif _resp_error_code in {500, 502}: - _failure_hint = f"upstream server error ({_resp_error_code}, {api_duration:.0f}s)" - elif _resp_error_code in {503, 529}: - _failure_hint = f"upstream provider overloaded ({_resp_error_code})" - elif _resp_error_code is not None: - _failure_hint = f"upstream error (code {_resp_error_code}, {api_duration:.0f}s)" - elif api_duration < 10: - _failure_hint = f"fast response ({api_duration:.1f}s) β€” likely rate limited" - elif api_duration > 60: - _failure_hint = f"slow response ({api_duration:.0f}s) β€” likely upstream timeout" - else: - _failure_hint = f"response time {api_duration:.1f}s" - - self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True) - self._vprint(f"{self.log_prefix} 🏒 Provider: {provider_name}", force=True) - cleaned_provider_error = self._clean_error_message(error_msg) - self._vprint(f"{self.log_prefix} πŸ“ Provider message: {cleaned_provider_error}", force=True) - self._vprint(f"{self.log_prefix} ⏱️ {_failure_hint}", force=True) - - if retry_count >= max_retries: - # Try fallback before giving up - self._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses β€” trying fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - self._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.") - logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Invalid API response after {max_retries} retries: {_failure_hint}", - "failed": True # Mark as failure for filtering - } - - # Backoff before retry β€” jittered exponential: 5s base, 120s cap - wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0) - self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True) - logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") - - # Sleep in small increments to stay responsive to interrupts - sleep_end = time.time() + wait_time - _backoff_touch_counter = 0 - while time.time() < sleep_end: - if self._interrupt_requested: - self._vprint(f"{self.log_prefix}⚑ Interrupt detected during retry wait, aborting.", force=True) - self._persist_session(messages, conversation_history) - self.clear_interrupt() - return { - "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).", - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "interrupted": True, - } - time.sleep(0.2) - # Touch activity every ~30s so the gateway's inactivity - # monitor knows we're alive during backoff waits. - _backoff_touch_counter += 1 - if _backoff_touch_counter % 150 == 0: # 150 Γ— 0.2s = 30s - self._touch_activity( - f"retry backoff ({retry_count}/{max_retries}), " - f"{int(sleep_end - time.time())}s remaining" - ) - continue # Retry the API call - - # Check finish_reason before proceeding - if self.api_mode == "codex_responses": - status = getattr(response, "status", None) - incomplete_details = getattr(response, "incomplete_details", None) - incomplete_reason = None - if isinstance(incomplete_details, dict): - incomplete_reason = incomplete_details.get("reason") - else: - incomplete_reason = getattr(incomplete_details, "reason", None) - if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}: - finish_reason = "length" - else: - finish_reason = "stop" - elif self.api_mode == "anthropic_messages": - _tfr = self._get_transport() - finish_reason = _tfr.map_finish_reason(response.stop_reason) - elif self.api_mode == "bedrock_converse": - # Bedrock response already normalized at dispatch β€” use transport - _bt_fr = self._get_transport() - _bedrock_result = _bt_fr.normalize_response(response) - finish_reason = _bedrock_result.finish_reason - else: - _cc_fr = self._get_transport() - _finish_result = _cc_fr.normalize_response(response) - finish_reason = _finish_result.finish_reason - assistant_message = _finish_result - if self._should_treat_stop_as_truncated( - finish_reason, - assistant_message, - messages, - ): - self._vprint( - f"{self.log_prefix}⚠️ Treating suspicious Ollama/GLM stop response as truncated", - force=True, - ) - finish_reason = "length" - - if finish_reason == "length": - self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True) - - # Normalize the truncated response to a single OpenAI-style - # message shape so text-continuation and tool-call retry - # work uniformly across chat_completions, bedrock_converse, - # and anthropic_messages. For Anthropic we use the same - # adapter the agent loop already relies on so the rebuilt - # interim assistant message is byte-identical to what - # would have been appended in the non-truncated path. - _trunc_msg = None - _trunc_transport = self._get_transport() - if self.api_mode == "anthropic_messages": - _trunc_result = _trunc_transport.normalize_response( - response, strip_tool_prefix=self._is_anthropic_oauth - ) - else: - _trunc_result = _trunc_transport.normalize_response(response) - _trunc_msg = _trunc_result - - _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None - _trunc_has_tool_calls = bool(getattr(_trunc_msg, "tool_calls", None)) if _trunc_msg else False - - # ── Detect thinking-budget exhaustion ────────────── - # When the model spends ALL output tokens on reasoning - # and has none left for the response, continuation - # retries are pointless. Detect this early and give a - # targeted error instead of wasting 3 API calls. - # A response is "thinking exhausted" only when the model - # actually produced reasoning blocks but no visible text after - # them. Models that do not use tags (e.g. GLM-4.7 on - # NVIDIA Build, minimax) may return content=None or an empty - # string for unrelated reasons β€” treat those as normal - # truncations that deserve continuation retries, not as - # thinking-budget exhaustion. - _has_think_tags = bool( - _trunc_content and re.search( - r'<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)[^>]*>', - _trunc_content, - re.IGNORECASE, - ) - ) - _thinking_exhausted = ( - not _trunc_has_tool_calls - and _has_think_tags - and ( - (_trunc_content is not None and not self._has_content_after_think_block(_trunc_content)) - or _trunc_content is None - ) - ) - - if _thinking_exhausted: - _exhaust_error = ( - "Model used all output tokens on reasoning with none left " - "for the response. Try lowering reasoning effort or " - "increasing max_tokens." - ) - self._vprint( - f"{self.log_prefix}πŸ’­ Reasoning exhausted the output token budget β€” " - f"no visible response was produced.", - force=True, - ) - # Return a user-friendly message as the response so - # CLI (response box) and gateway (chat message) both - # display it naturally instead of a suppressed error. - _exhaust_response = ( - "⚠️ **Thinking Budget Exhausted**\n\n" - "The model used all its output tokens on reasoning " - "and had none left for the actual response.\n\n" - "To fix this:\n" - "β†’ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n" - "β†’ Or switch to a larger/non-reasoning model with `/model`" - ) - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": _exhaust_response, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": _exhaust_error, - } - - if self.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: - assistant_message = _trunc_msg - if assistant_message is not None and not _trunc_has_tool_calls: - length_continue_retries += 1 - interim_msg = self._build_assistant_message(assistant_message, finish_reason) - messages.append(interim_msg) - if assistant_message.content: - truncated_response_prefix += assistant_message.content - - if length_continue_retries < 3: - self._vprint( - f"{self.log_prefix}↻ Requesting continuation " - f"({length_continue_retries}/3)..." - ) - continue_msg = { - "role": "user", - "content": ( - "[System: Your previous response was truncated by the output " - "length limit. Continue exactly where you left off. Do not " - "restart or repeat prior text. Finish the answer directly.]" - ), - } - messages.append(continue_msg) - self._session_messages = messages - self._save_session_log(messages) - restart_with_length_continuation = True - break - - partial_response = self._strip_think_blocks(truncated_response_prefix).strip() - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": partial_response or None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response remained truncated after 3 continuation attempts", - } - - if self.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: - assistant_message = _trunc_msg - if assistant_message is not None and _trunc_has_tool_calls: - if truncated_tool_call_retries < 1: - truncated_tool_call_retries += 1 - self._vprint( - f"{self.log_prefix}⚠️ Truncated tool call detected β€” retrying API call...", - force=True, - ) - # Don't append the broken response to messages; - # just re-run the same API call from the current - # message state, giving the model another chance. - continue - self._vprint( - f"{self.log_prefix}⚠️ Truncated tool call response detected again β€” refusing to execute incomplete tool arguments.", - force=True, - ) - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response truncated due to output length limit", - } - - # If we have prior messages, roll back to last complete state - if len(messages) > 1: - self._vprint(f"{self.log_prefix} βͺ Rolling back to last complete assistant turn") - rolled_back_messages = self._get_messages_up_to_last_assistant(messages) - - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - - return { - "final_response": None, - "messages": rolled_back_messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response truncated due to output length limit" - } - else: - # First message was truncated - mark as failed - self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True) - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": "First response truncated due to output length limit" - } - - # Track actual token usage from response for context management - if hasattr(response, 'usage') and response.usage: - canonical_usage = normalize_usage( - response.usage, - provider=self.provider, - api_mode=self.api_mode, - ) - prompt_tokens = canonical_usage.prompt_tokens - completion_tokens = canonical_usage.output_tokens - total_tokens = canonical_usage.total_tokens - usage_dict = { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": total_tokens, - } - self.context_compressor.update_from_response(usage_dict) - - # Cache discovered context length after successful call. - # Only persist limits confirmed by the provider (parsed - # from the error message), not guessed probe tiers. - if getattr(self.context_compressor, "_context_probed", False): - ctx = self.context_compressor.context_length - if getattr(self.context_compressor, "_context_probe_persistable", False): - save_context_length(self.model, self.base_url, ctx) - self._safe_print(f"{self.log_prefix}πŸ’Ύ Cached context length: {ctx:,} tokens for {self.model}") - self.context_compressor._context_probed = False - self.context_compressor._context_probe_persistable = False - - self.session_prompt_tokens += prompt_tokens - self.session_completion_tokens += completion_tokens - self.session_total_tokens += total_tokens - self.session_api_calls += 1 - self.session_input_tokens += canonical_usage.input_tokens - self.session_output_tokens += canonical_usage.output_tokens - self.session_cache_read_tokens += canonical_usage.cache_read_tokens - self.session_cache_write_tokens += canonical_usage.cache_write_tokens - self.session_reasoning_tokens += canonical_usage.reasoning_tokens - - # Log API call details for debugging/observability - _cache_pct = "" - if canonical_usage.cache_read_tokens and prompt_tokens: - _cache_pct = f" cache={canonical_usage.cache_read_tokens}/{prompt_tokens} ({100*canonical_usage.cache_read_tokens/prompt_tokens:.0f}%)" - logger.info( - "API call #%d: model=%s provider=%s in=%d out=%d total=%d latency=%.1fs%s", - self.session_api_calls, self.model, self.provider or "unknown", - prompt_tokens, completion_tokens, total_tokens, - api_duration, _cache_pct, - ) - - cost_result = estimate_usage_cost( - self.model, - canonical_usage, - provider=self.provider, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - ) - if cost_result.amount_usd is not None: - self.session_estimated_cost_usd += float(cost_result.amount_usd) - self.session_cost_status = cost_result.status - self.session_cost_source = cost_result.source - - # Persist token counts to session DB for /insights. - # Do this for every platform with a session_id so non-CLI - # sessions (gateway, cron, delegated runs) cannot lose - # token/accounting data if a higher-level persistence path - # is skipped or fails. Gateway/session-store writes use - # absolute totals, so they safely overwrite these per-call - # deltas instead of double-counting them. - if self._session_db and self.session_id: - try: - # Ensure the session row exists before attempting UPDATE. - # Under concurrent load (cron/kanban), the initial - # _ensure_db_session() may have failed due to SQLite - # locking. Retry here so per-call token deltas are - # not silently lost (UPDATE on a non-existent row - # affects 0 rows without error). - if not self._session_db_created: - self._ensure_db_session() - self._session_db.update_token_counts( - self.session_id, - input_tokens=canonical_usage.input_tokens, - output_tokens=canonical_usage.output_tokens, - cache_read_tokens=canonical_usage.cache_read_tokens, - cache_write_tokens=canonical_usage.cache_write_tokens, - reasoning_tokens=canonical_usage.reasoning_tokens, - estimated_cost_usd=float(cost_result.amount_usd) - if cost_result.amount_usd is not None else None, - cost_status=cost_result.status, - cost_source=cost_result.source, - billing_provider=self.provider, - billing_base_url=self.base_url, - billing_mode="subscription_included" - if cost_result.status == "included" else None, - model=self.model, - api_call_count=1, - ) - except Exception as e: - # Log token persistence failures so they're - # visible in agent.log β€” silent loss here is - # the root cause of undercounted analytics. - logger.debug( - "Token persistence failed (session=%s, tokens=%d): %s", - self.session_id, total_tokens, e, - ) - - if self.verbose_logging: - logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}") - - # Surface cache hit stats for any provider that reports - # them β€” not just those where we inject cache_control - # markers. OpenAI/Kimi/DeepSeek/Qwen all do automatic - # server-side prefix caching and return - # ``prompt_tokens_details.cached_tokens``; users - # previously could not see their cache % because this - # line was gated on ``_use_prompt_caching``, which is - # only True for Anthropic-style marker injection. - # ``canonical_usage`` is already normalised from all - # three API shapes (Anthropic / Codex / OpenAI-chat) - # so we can rely on its values directly. - cached = canonical_usage.cache_read_tokens - written = canonical_usage.cache_write_tokens - prompt = usage_dict["prompt_tokens"] - if (cached or written) and not self.quiet_mode: - hit_pct = (cached / prompt * 100) if prompt > 0 else 0 - self._vprint( - f"{self.log_prefix} πŸ’Ύ Cache: " - f"{cached:,}/{prompt:,} tokens " - f"({hit_pct:.0f}% hit, {written:,} written)" - ) - - has_retried_429 = False # Reset on success - # Clear Nous rate limit state on successful request β€” - # proves the limit has reset and other sessions can - # resume hitting Nous. - if self.provider == "nous": - try: - from agent.nous_rate_guard import clear_nous_rate_limit - clear_nous_rate_limit() - except Exception: - pass - self._touch_activity(f"API call #{api_call_count} completed") - break # Success, exit retry loop - - except InterruptedError: - if thinking_spinner: - thinking_spinner.stop("") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - api_elapsed = time.time() - api_start_time - self._vprint(f"{self.log_prefix}⚑ Interrupted during API call.", force=True) - self._persist_session(messages, conversation_history) - interrupted = True - final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)." - break - - except Exception as api_error: - # Stop spinner before printing error messages - if thinking_spinner: - thinking_spinner.stop("(β•₯_β•₯) error, retrying...") - thinking_spinner = None - if self.thinking_callback: - self.thinking_callback("") - - # ----------------------------------------------------------- - # UnicodeEncodeError recovery. Two common causes: - # 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste - # (Google Docs, rich-text editors) β€” sanitize and retry. - # 2. ASCII codec on systems with LANG=C or non-UTF-8 locale - # (e.g. Chromebooks) β€” any non-ASCII character fails. - # Detect via the error message mentioning 'ascii' codec. - # We sanitize messages in-place and may retry twice: - # first to strip surrogates, then once more for pure - # ASCII-only locale sanitization if needed. - # ----------------------------------------------------------- - if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2: - _err_str = str(api_error).lower() - _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str - # Detect surrogate errors β€” utf-8 codec refusing to - # encode U+D800..U+DFFF. The error text is: - # "'utf-8' codec can't encode characters in position - # N-M: surrogates not allowed" - _is_surrogate_error = ( - "surrogate" in _err_str - or ("'utf-8'" in _err_str and not _is_ascii_codec) - ) - # Sanitize surrogates from both the canonical `messages` - # list AND `api_messages` (the API-copy, which may carry - # `reasoning_content`/`reasoning_details` transformed - # from `reasoning` β€” fields the canonical list doesn't - # have directly). Also clean `api_kwargs` if built and - # `prefill_messages` if present. Mirrors the ASCII - # codec recovery below. - _surrogates_found = _sanitize_messages_surrogates(messages) - if isinstance(api_messages, list): - if _sanitize_messages_surrogates(api_messages): - _surrogates_found = True - if isinstance(api_kwargs, dict): - if _sanitize_structure_surrogates(api_kwargs): - _surrogates_found = True - if isinstance(getattr(self, "prefill_messages", None), list): - if _sanitize_messages_surrogates(self.prefill_messages): - _surrogates_found = True - # Gate the retry on the error type, not on whether we - # found anything β€” _force_ascii_payload / the extended - # surrogate walker above cover all known paths, but a - # new transformed field could still slip through. If - # the error was a surrogate encode failure, always let - # the retry run; the proactive sanitizer at line ~8781 - # runs again on the next iteration. Bounded by - # _unicode_sanitization_passes < 2 (outer guard). - if _surrogates_found or _is_surrogate_error: - self._unicode_sanitization_passes += 1 - if _surrogates_found: - self._vprint( - f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...", - force=True, - ) - else: - self._vprint( - f"{self.log_prefix}⚠️ Surrogate encoding error β€” retrying after full-payload sanitization...", - force=True, - ) - continue - if _is_ascii_codec: - self._force_ascii_payload = True - # ASCII codec: the system encoding can't handle - # non-ASCII characters at all. Sanitize all - # non-ASCII content from messages/tool schemas and retry. - # Sanitize both the canonical `messages` list and - # `api_messages` (the API-copy built before the retry - # loop, which may contain extra fields like - # reasoning_content that are not in `messages`). - _messages_sanitized = _sanitize_messages_non_ascii(messages) - if isinstance(api_messages, list): - _sanitize_messages_non_ascii(api_messages) - # Also sanitize the last api_kwargs if already built, - # so a leftover non-ASCII value in a transformed field - # (e.g. extra_body, reasoning_content) doesn't survive - # into the next attempt via _build_api_kwargs cache paths. - if isinstance(api_kwargs, dict): - _sanitize_structure_non_ascii(api_kwargs) - _prefill_sanitized = False - if isinstance(getattr(self, "prefill_messages", None), list): - _prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages) - - _tools_sanitized = False - if isinstance(getattr(self, "tools", None), list): - _tools_sanitized = _sanitize_tools_non_ascii(self.tools) - - _system_sanitized = False - if isinstance(active_system_prompt, str): - _sanitized_system = _strip_non_ascii(active_system_prompt) - if _sanitized_system != active_system_prompt: - active_system_prompt = _sanitized_system - self._cached_system_prompt = _sanitized_system - _system_sanitized = True - if isinstance(getattr(self, "ephemeral_system_prompt", None), str): - _sanitized_ephemeral = _strip_non_ascii(self.ephemeral_system_prompt) - if _sanitized_ephemeral != self.ephemeral_system_prompt: - self.ephemeral_system_prompt = _sanitized_ephemeral - _system_sanitized = True - - _headers_sanitized = False - _default_headers = ( - self._client_kwargs.get("default_headers") - if isinstance(getattr(self, "_client_kwargs", None), dict) - else None - ) - if isinstance(_default_headers, dict): - _headers_sanitized = _sanitize_structure_non_ascii(_default_headers) - - # Sanitize the API key β€” non-ASCII characters in - # credentials (e.g. Κ‹ instead of v from a bad - # copy-paste) cause httpx to fail when encoding - # the Authorization header as ASCII. This is the - # most common cause of persistent UnicodeEncodeError - # that survives message/tool sanitization (#6843). - _credential_sanitized = False - _raw_key = getattr(self, "api_key", None) or "" - if _raw_key: - _clean_key = _strip_non_ascii(_raw_key) - if _clean_key != _raw_key: - self.api_key = _clean_key - if isinstance(getattr(self, "_client_kwargs", None), dict): - self._client_kwargs["api_key"] = _clean_key - # Also update the live client β€” it holds its - # own copy of api_key which auth_headers reads - # dynamically on every request. - if getattr(self, "client", None) is not None and hasattr(self.client, "api_key"): - self.client.api_key = _clean_key - _credential_sanitized = True - self._vprint( - f"{self.log_prefix}⚠️ API key contained non-ASCII characters " - f"(bad copy-paste?) β€” stripped them. If auth fails, " - f"re-copy the key from your provider's dashboard.", - force=True, - ) - - # Always retry on ASCII codec detection β€” - # _force_ascii_payload guarantees the full - # api_kwargs payload is sanitized on the - # next iteration (line ~8475). Even when - # per-component checks above find nothing - # (e.g. non-ASCII only in api_messages' - # reasoning_content), the flag catches it. - # Bounded by _unicode_sanitization_passes < 2. - self._unicode_sanitization_passes += 1 - _any_sanitized = ( - _messages_sanitized - or _prefill_sanitized - or _tools_sanitized - or _system_sanitized - or _headers_sanitized - or _credential_sanitized - ) - if _any_sanitized: - self._vprint( - f"{self.log_prefix}⚠️ System encoding is ASCII β€” stripped non-ASCII characters from request payload. Retrying...", - force=True, - ) - else: - self._vprint( - f"{self.log_prefix}⚠️ System encoding is ASCII β€” enabling full-payload sanitization for retry...", - force=True, - ) - continue - - # ── Image-rejection recovery ────────────────────────────── - # Some providers (mlx-lm, text-only endpoints, text-only - # fallbacks on multimodal models) reject any message that - # contains image_url content with a 4xx error like - # "Only 'text' content type is supported." On first hit, - # strip all images from the message list, mark the session - # as vision-unsupported, and retry with text only. - # - # Detection is best-effort English phrase matching β€” a - # locale-translated or heavily-reworded upstream error - # will bypass this guard and fall through to the normal - # error handler. Expand the phrase list when new - # provider wordings are observed in the wild. - _err_body = "" - try: - _err_body = str(getattr(api_error, "body", None) or - getattr(api_error, "message", None) or - str(api_error)) - except Exception: - pass - _err_status = getattr(api_error, "status_code", None) - _IMAGE_REJECTION_PHRASES = ( - "only 'text' content type is supported", - "only text content type is supported", - "image_url is not supported", - "image content is not supported", - "multimodal is not supported", - "multimodal content is not supported", - "multimodal input is not supported", - "vision is not supported", - "vision input is not supported", - "does not support images", - "does not support image input", - "does not support multimodal", - "does not support vision", - "model does not support image", - # ChatGPT-account Codex backend - # (https://chatgpt.com/backend-api/codex) rejects - # data:image/...base64 URLs in input_image fields - # with HTTP 400 "Invalid 'input[N].content[K].image_url'. - # Expected a valid URL, but got a value with an - # invalid format." The OpenAI Responses API on the - # public endpoint accepts data URLs, but the - # ChatGPT-account variant does not. Without this - # phrase the agent cascaded into compression / - # context-too-large recovery instead of just - # stripping the images. Match is narrow on - # purpose β€” keyed on the field-path apostrophe so - # we don't false-trip on other URL validation - # errors. (issue #23570) - "image_url'. expected", - # DeepSeek's OpenAI-compatible API reports text-only - # request-body variants as: - # "unknown variant `image_url`, expected `text`". - "unknown variant `image_url`, expected `text`", - "unknown variant image_url, expected text", - ) - _err_lower = _err_body.lower() - _looks_like_image_rejection = any( - p in _err_lower for p in _IMAGE_REJECTION_PHRASES - ) - # 4xx-only gate: never interpret 5xx/timeout as "server - # said no to images" β€” those are transient and must - # route to the normal retry path. - _status_ok = _err_status is None or (400 <= int(_err_status) < 500) - if ( - getattr(self, "_vision_supported", True) - and _looks_like_image_rejection - and _status_ok - ): - self._vision_supported = False - _imgs_removed = _strip_images_from_messages(messages) - if isinstance(api_messages, list): - _strip_images_from_messages(api_messages) - self._vprint( - f"{self.log_prefix}⚠️ Server rejected image content β€” " - f"switching to text-only mode for this session" - + (". Stripped images from history and retrying." if _imgs_removed else "."), - force=True, - ) - continue - - status_code = getattr(api_error, "status_code", None) - error_context = self._extract_api_error_context(api_error) - - # ── Classify the error for structured recovery decisions ── - _compressor = getattr(self, "context_compressor", None) - _ctx_len = getattr(_compressor, "context_length", 200000) if _compressor else 200000 - classified = classify_api_error( - api_error, - provider=getattr(self, "provider", "") or "", - model=getattr(self, "model", "") or "", - approx_tokens=approx_tokens, - context_length=_ctx_len, - num_messages=len(api_messages) if api_messages else 0, - ) - logger.debug( - "Error classified: reason=%s status=%s retryable=%s compress=%s rotate=%s fallback=%s", - classified.reason.value, classified.status_code, - classified.retryable, classified.should_compress, - classified.should_rotate_credential, classified.should_fallback, - ) - - recovered_with_pool, has_retried_429 = self._recover_with_credential_pool( - status_code=status_code, - has_retried_429=has_retried_429, - classified_reason=classified.reason, - error_context=error_context, - ) - if recovered_with_pool: - continue - - # Image-too-large recovery: shrink oversized native image - # parts in-place and retry once. Triggered by Anthropic's - # per-image 5 MB ceiling (400 with "image exceeds 5 MB - # maximum") or any other provider that complains about - # image size. If shrink fails or a second attempt still - # fails, fall through to normal error handling. - if ( - classified.reason == FailoverReason.image_too_large - and not image_shrink_retry_attempted - ): - image_shrink_retry_attempted = True - if self._try_shrink_image_parts_in_messages(api_messages): - self._vprint( - f"{self.log_prefix}πŸ“ Image(s) exceeded provider size limit β€” " - f"shrank and retrying...", - force=True, - ) - continue - else: - logger.info( - "image-shrink recovery: no data-URL image parts found " - "or shrink didn't reduce size; surfacing original error." - ) - - # Anthropic OAuth subscription rejected the 1M-context beta - # header ("long context beta is not yet available for this - # subscription"). Disable the beta for the rest of this - # session, rebuild the client, and retry once. 1M-capable - # subscriptions never hit this branch β€” they accept the - # beta and keep full 1M context. See PR #17680 for the - # original report (we chose reactive recovery over the - # proposed unconditional omit so capable subscriptions - # don't silently lose the capability). - if ( - classified.reason == FailoverReason.oauth_long_context_beta_forbidden - and self.api_mode == "anthropic_messages" - and self._is_anthropic_oauth - and not oauth_1m_beta_retry_attempted - ): - oauth_1m_beta_retry_attempted = True - if not getattr(self, "_oauth_1m_beta_disabled", False): - self._oauth_1m_beta_disabled = True - try: - self._anthropic_client.close() - except Exception: - pass - self._rebuild_anthropic_client() - self._vprint( - f"{self.log_prefix}πŸ”• OAuth subscription doesn't support " - f"the 1M-context beta β€” disabled for this session and retrying...", - force=True, - ) - continue - - if ( - self.api_mode == "codex_responses" - and self.provider == "openai-codex" - and status_code == 401 - and not codex_auth_retry_attempted - ): - codex_auth_retry_attempted = True - if self._try_refresh_codex_client_credentials(force=True): - self._vprint(f"{self.log_prefix}πŸ” Codex auth refreshed after 401. Retrying request...") - continue - if ( - self.api_mode == "chat_completions" - and self.provider == "nous" - and status_code == 401 - and not nous_auth_retry_attempted - ): - nous_auth_retry_attempted = True - if self._try_refresh_nous_client_credentials(force=True): - print(f"{self.log_prefix}πŸ” Nous agent key refreshed after 401. Retrying request...") - continue - # Credential refresh didn't help β€” show diagnostic info. - # Most common causes: Portal OAuth expired/revoked, - # account out of credits, or agent key blocked. - from hermes_constants import display_hermes_home as _dhh_fn - _dhh = _dhh_fn() - _body_text = "" - try: - _body = getattr(api_error, "body", None) or getattr(api_error, "response", None) - if _body is not None: - _body_text = str(_body)[:200] - except Exception: - pass - print(f"{self.log_prefix}πŸ” Nous 401 β€” Portal authentication failed.") - if _body_text: - print(f"{self.log_prefix} Response: {_body_text}") - print(f"{self.log_prefix} Most likely: Portal OAuth expired, account out of credits, or agent key revoked.") - print(f"{self.log_prefix} Troubleshooting:") - print(f"{self.log_prefix} β€’ Re-authenticate: hermes login --provider nous") - print(f"{self.log_prefix} β€’ Check credits / billing: https://portal.nousresearch.com") - print(f"{self.log_prefix} β€’ Verify stored credentials: {_dhh}/auth.json") - print(f"{self.log_prefix} β€’ Switch providers temporarily: /model --provider openrouter") - if ( - self.provider == "copilot" - and status_code == 401 - and not copilot_auth_retry_attempted - ): - copilot_auth_retry_attempted = True - if self._try_refresh_copilot_client_credentials(): - self._vprint(f"{self.log_prefix}πŸ” Copilot credentials refreshed after 401. Retrying request...") - continue - if ( - self.api_mode == "anthropic_messages" - and status_code == 401 - and hasattr(self, '_anthropic_api_key') - and not anthropic_auth_retry_attempted - ): - anthropic_auth_retry_attempted = True - from agent.anthropic_adapter import _is_oauth_token - if self._try_refresh_anthropic_client_credentials(): - print(f"{self.log_prefix}πŸ” Anthropic credentials refreshed after 401. Retrying request...") - continue - # Credential refresh didn't help β€” show diagnostic info - key = self._anthropic_api_key - auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)" - print(f"{self.log_prefix}πŸ” Anthropic 401 β€” authentication failed.") - print(f"{self.log_prefix} Auth method: {auth_method}") - print(f"{self.log_prefix} Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{self.log_prefix} Token: (empty or short)") - print(f"{self.log_prefix} Troubleshooting:") - from hermes_constants import display_hermes_home as _dhh_fn - _dhh = _dhh_fn() - print(f"{self.log_prefix} β€’ Check ANTHROPIC_TOKEN in {_dhh}/.env for Hermes-managed OAuth/setup tokens") - print(f"{self.log_prefix} β€’ Check ANTHROPIC_API_KEY in {_dhh}/.env for API keys or legacy token values") - print(f"{self.log_prefix} β€’ For API keys: verify at https://platform.claude.com/settings/keys") - print(f"{self.log_prefix} β€’ For Claude Code: run 'claude /login' to refresh, then retry") - print(f"{self.log_prefix} β€’ Legacy cleanup: hermes config set ANTHROPIC_TOKEN \"\"") - print(f"{self.log_prefix} β€’ Clear stale keys: hermes config set ANTHROPIC_API_KEY \"\"") - - # ── Thinking block signature recovery ───────────────── - # Anthropic signs thinking blocks against the full turn - # content. Any upstream mutation (context compression, - # session truncation, message merging) invalidates the - # signature β†’ HTTP 400. Recovery: strip reasoning_details - # from all messages so the next retry sends no thinking - # blocks at all. One-shot β€” don't retry infinitely. - if ( - classified.reason == FailoverReason.thinking_signature - and not thinking_sig_retry_attempted - ): - thinking_sig_retry_attempted = True - for _m in messages: - if isinstance(_m, dict): - _m.pop("reasoning_details", None) - self._vprint( - f"{self.log_prefix}⚠️ Thinking block signature invalid β€” " - f"stripped all thinking blocks, retrying...", - force=True, - ) - logging.warning( - "%sThinking block signature recovery: stripped " - "reasoning_details from %d messages", - self.log_prefix, len(messages), - ) - continue - - # ── llama.cpp grammar-parse recovery ────────────────── - # llama.cpp's ``json-schema-to-grammar`` converter rejects - # regex escape classes (``\d``, ``\w``, ``\s``) and most - # ``format`` values in tool schemas. MCP servers emit - # these routinely for date/phone/email params. Recovery: - # strip ``pattern``/``format`` from ``self.tools`` and - # retry once. We keep the keywords by default so cloud - # providers get the full prompting hints; this branch - # fires only for users on llama.cpp's OAI server. - if ( - classified.reason == FailoverReason.llama_cpp_grammar_pattern - and not llama_cpp_grammar_retry_attempted - ): - llama_cpp_grammar_retry_attempted = True - try: - from tools.schema_sanitizer import strip_pattern_and_format - _, _stripped = strip_pattern_and_format(self.tools) - except Exception as _strip_exc: # pragma: no cover β€” defensive - logging.warning( - "%sllama.cpp grammar recovery: strip helper failed: %s", - self.log_prefix, _strip_exc, - ) - _stripped = 0 - if _stripped: - self._vprint( - f"{self.log_prefix}⚠️ llama.cpp rejected tool schema grammar β€” " - f"stripped {_stripped} pattern/format keyword(s), retrying...", - force=True, - ) - logging.warning( - "%sllama.cpp grammar recovery: stripped %d " - "pattern/format keyword(s) from tool schemas", - self.log_prefix, _stripped, - ) - continue - # No keywords found to strip β€” fall through to normal - # retry path rather than loop forever on the same error. - logging.warning( - "%sllama.cpp grammar error but no pattern/format " - "keywords to strip β€” falling through to normal retry", - self.log_prefix, - ) - - retry_count += 1 - elapsed_time = time.time() - api_start_time - self._touch_activity( - f"API error recovery (attempt {retry_count}/{max_retries})" - ) - - error_type = type(api_error).__name__ - error_msg = str(api_error).lower() - _error_summary = self._summarize_api_error(api_error) - logger.warning( - "API call failed (attempt %s/%s) error_type=%s %s summary=%s", - retry_count, - max_retries, - error_type, - self._client_log_context(), - _error_summary, - ) - - _provider = getattr(self, "provider", "unknown") - _base = getattr(self, "base_url", "unknown") - _model = getattr(self, "model", "unknown") - _status_code_str = f" [HTTP {status_code}]" if status_code else "" - self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True) - self._vprint(f"{self.log_prefix} πŸ”Œ Provider: {_provider} Model: {_model}", force=True) - self._vprint(f"{self.log_prefix} 🌐 Endpoint: {_base}", force=True) - self._vprint(f"{self.log_prefix} πŸ“ Error: {_error_summary}", force=True) - if status_code and status_code < 500: - _err_body = getattr(api_error, "body", None) - _err_body_str = str(_err_body)[:300] if _err_body else None - if _err_body_str: - self._vprint(f"{self.log_prefix} πŸ“‹ Details: {_err_body_str}", force=True) - self._vprint(f"{self.log_prefix} ⏱️ Elapsed: {elapsed_time:.2f}s Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens") - - # Actionable hint for OpenRouter "no tool endpoints" error. - # This fires regardless of whether fallback succeeds β€” the - # user needs to know WHY their model failed so they can fix - # their provider routing, not just silently fall back. - if ( - self._is_openrouter_url() - and "support tool use" in error_msg - ): - self._vprint( - f"{self.log_prefix} πŸ’‘ No OpenRouter providers for {_model} support tool calling with your current settings.", - force=True, - ) - if self.providers_allowed: - self._vprint( - f"{self.log_prefix} Your provider_routing.only restriction is filtering out tool-capable providers.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Try removing the restriction or adding providers that support tools for this model.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Check which providers support tools: https://openrouter.ai/models/{_model}", - force=True, - ) - - # Check for interrupt before deciding to retry - if self._interrupt_requested: - self._vprint(f"{self.log_prefix}⚑ Interrupt detected during error handling, aborting retries.", force=True) - self._persist_session(messages, conversation_history) - self.clear_interrupt() - return { - "final_response": f"Operation interrupted: handling API error ({error_type}: {self._clean_error_message(str(api_error))}).", - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "interrupted": True, - } - - # Check for 413 payload-too-large BEFORE generic 4xx handler. - # A 413 is a payload-size error β€” the correct response is to - # compress history and retry, not abort immediately. - status_code = getattr(api_error, "status_code", None) - - # ── Anthropic Sonnet long-context tier gate ─────────── - # Anthropic returns HTTP 429 "Extra usage is required for - # long context requests" when a Claude Max (or similar) - # subscription doesn't include the 1M-context tier. This - # is NOT a transient rate limit β€” retrying or switching - # credentials won't help. Reduce context to 200k (the - # standard tier) and compress. - if classified.reason == FailoverReason.long_context_tier: - _reduced_ctx = 200000 - compressor = self.context_compressor - old_ctx = compressor.context_length - if old_ctx > _reduced_ctx: - compressor.update_model( - model=self.model, - context_length=_reduced_ctx, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - ) - # Context probing flags β€” only set on built-in - # compressor (plugin engines manage their own). - if hasattr(compressor, "_context_probed"): - compressor._context_probed = True - # Don't persist β€” this is a subscription-tier - # limitation, not a model capability. If the - # user later enables extra usage the 1M limit - # should come back automatically. - compressor._context_probe_persistable = False - self._vprint( - f"{self.log_prefix}⚠️ Anthropic long-context tier " - f"requires extra usage β€” reducing context: " - f"{old_ctx:,} β†’ {_reduced_ctx:,} tokens", - force=True, - ) - - compression_attempts += 1 - if compression_attempts <= max_compression_attempts: - original_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, - approx_tokens=approx_tokens, - task_id=effective_task_id, - ) - # Compression created a new session β€” clear history - # so _flush_messages_to_session_db writes compressed - # messages to the new session, not skipping them. - conversation_history = None - if len(messages) < original_len or old_ctx > _reduced_ctx: - self._emit_status( - f"πŸ—œοΈ Context reduced to {_reduced_ctx:,} tokens " - f"(was {old_ctx:,}), retrying..." - ) - time.sleep(2) - restart_with_compressed_messages = True - break - # Fall through to normal error handling if compression - # is exhausted or didn't help. - - # Eager fallback for rate-limit errors (429 or quota exhaustion). - # When a fallback model is configured, switch immediately instead - # of burning through retries with exponential backoff -- the - # primary provider won't recover within the retry window. - is_rate_limited = classified.reason in { - FailoverReason.rate_limit, - FailoverReason.billing, - } - if is_rate_limited and self._fallback_index < len(self._fallback_chain): - # Don't eagerly fallback if credential pool rotation may - # still recover. See _pool_may_recover_from_rate_limit - # for the single-credential-pool and CloudCode-quota - # exceptions. Fixes #11314 and #13636. - pool_may_recover = _pool_may_recover_from_rate_limit( - self._credential_pool, - provider=self.provider, - base_url=getattr(self, "base_url", None), - ) - if not pool_may_recover: - self._emit_status("⚠️ Rate limited β€” switching to fallback provider...") - if self._try_activate_fallback(reason=classified.reason): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - - # ── Nous Portal: record rate limit & skip retries ───── - # When Nous returns a 429 that is a genuine account- - # level rate limit, record the reset time to a shared - # file so ALL sessions (cron, gateway, auxiliary) know - # not to pile on, then skip further retries -- each - # one burns another RPH request and deepens the hole. - # The retry loop's top-of-iteration guard will catch - # this on the next pass and try fallback or bail. - # - # IMPORTANT: Nous Portal multiplexes multiple upstream - # providers (DeepSeek, Kimi, MiMo, Hermes). A 429 can - # also mean an UPSTREAM provider is out of capacity - # for one specific model -- transient, clears in - # seconds, nothing to do with the caller's quota. - # Tripping the cross-session breaker on that would - # block every Nous model for minutes. We use - # ``is_genuine_nous_rate_limit`` to tell the two - # apart via the 429's own x-ratelimit-* headers and - # the last-known-good state captured on the previous - # successful response. - if ( - is_rate_limited - and self.provider == "nous" - and classified.reason == FailoverReason.rate_limit - and not recovered_with_pool - ): - _genuine_nous_rate_limit = False - try: - from agent.nous_rate_guard import ( - is_genuine_nous_rate_limit, - record_nous_rate_limit, - ) - _err_resp = getattr(api_error, "response", None) - _err_hdrs = ( - getattr(_err_resp, "headers", None) - if _err_resp else None - ) - _genuine_nous_rate_limit = is_genuine_nous_rate_limit( - headers=_err_hdrs, - last_known_state=self._rate_limit_state, - ) - if _genuine_nous_rate_limit: - record_nous_rate_limit( - headers=_err_hdrs, - error_context=error_context, - ) - else: - logging.info( - "Nous 429 looks like upstream capacity " - "(no exhausted bucket in headers or " - "last-known state) -- not tripping " - "cross-session breaker." - ) - except Exception: - pass - if _genuine_nous_rate_limit: - # Skip straight to max_retries -- the - # top-of-loop guard will handle fallback or - # bail cleanly. - retry_count = max_retries - continue - # Upstream capacity 429: fall through to normal - # retry logic. A different model (or the same - # model a moment later) will typically succeed. - - is_payload_too_large = ( - classified.reason == FailoverReason.payload_too_large - ) - - if is_payload_too_large: - compression_attempts += 1 - if compression_attempts > max_compression_attempts: - self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True) - self._vprint(f"{self.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - self._emit_status(f"⚠️ Request payload too large (413) β€” compression attempt {compression_attempts}/{max_compression_attempts}...") - - original_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, approx_tokens=approx_tokens, - task_id=effective_task_id, - ) - # Compression created a new session β€” clear history - # so _flush_messages_to_session_db writes compressed - # messages to the new session, not skipping them. - conversation_history = None - - if len(messages) < original_len: - self._emit_status(f"πŸ—œοΈ Compressed {original_len} β†’ {len(messages)} messages, retrying...") - time.sleep(2) # Brief pause between compression retries - restart_with_compressed_messages = True - break - else: - self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True) - self._vprint(f"{self.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": "Request payload too large (413). Cannot compress further.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - - # Check for context-length errors BEFORE generic 4xx handler. - # The classifier detects context overflow from: explicit error - # messages, generic 400 + large session heuristic (#1630), and - # server disconnect + large session pattern (#2153). - is_context_length_error = ( - classified.reason == FailoverReason.context_overflow - ) - - if is_context_length_error: - compressor = self.context_compressor - old_ctx = compressor.context_length - - # ── Distinguish two very different errors ─────────── - # 1. "Prompt too long": the INPUT exceeds the context window. - # Fix: reduce context_length + compress history. - # 2. "max_tokens too large": input is fine, but - # input_tokens + requested max_tokens > context_window. - # Fix: reduce max_tokens (the OUTPUT cap) for this call. - # Do NOT shrink context_length β€” the window is unchanged. - # - # Note: max_tokens = output token cap (one response). - # context_length = total window (input + output combined). - available_out = parse_available_output_tokens_from_error(error_msg) - if available_out is not None: - # Error is purely about the output cap being too large. - # Cap output to the available space and retry without - # touching context_length or triggering compression. - safe_out = max(1, available_out - 64) # small safety margin - self._ephemeral_max_output_tokens = safe_out - self._vprint( - f"{self.log_prefix}⚠️ Output cap too large for current prompt β€” " - f"retrying with max_tokens={safe_out:,} " - f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})", - force=True, - ) - # Still count against compression_attempts so we don't - # loop forever if the error keeps recurring. - compression_attempts += 1 - if compression_attempts > max_compression_attempts: - self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) - self._vprint(f"{self.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - restart_with_compressed_messages = True - break - - # Error is about the INPUT being too large β€” reduce context_length. - # Try to parse the actual limit from the error message - parsed_limit = parse_context_limit_from_error(error_msg) - _provider_lower = (getattr(self, "provider", "") or "").lower() - _base_lower = (getattr(self, "base_url", "") or "").rstrip("/").lower() - is_minimax_provider = ( - _provider_lower in {"minimax", "minimax-cn"} - or _base_lower.startswith(( - "https://api.minimax.io/anthropic", - "https://api.minimaxi.com/anthropic", - )) - ) - minimax_delta_only_overflow = ( - is_minimax_provider - and parsed_limit is None - and "context window exceeds limit (" in error_msg - ) - if parsed_limit and parsed_limit < old_ctx: - new_ctx = parsed_limit - self._vprint(f"{self.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True) - elif minimax_delta_only_overflow: - new_ctx = old_ctx - self._vprint( - f"{self.log_prefix}Provider reported overflow amount only; " - f"keeping context_length at {old_ctx:,} tokens and compressing.", - force=True, - ) - else: - # Step down to the next probe tier - new_ctx = get_next_probe_tier(old_ctx) - - if new_ctx and new_ctx < old_ctx: - compressor.update_model( - model=self.model, - context_length=new_ctx, - base_url=self.base_url, - api_key=getattr(self, "api_key", ""), - provider=self.provider, - ) - # Context probing flags β€” only set on built-in - # compressor (plugin engines manage their own). - if hasattr(compressor, "_context_probed"): - compressor._context_probed = True - # Only persist limits parsed from the provider's - # error message (a real number). Guessed fallback - # tiers from get_next_probe_tier() should stay - # in-memory only β€” persisting them pollutes the - # cache with wrong values. - compressor._context_probe_persistable = bool( - parsed_limit and parsed_limit == new_ctx - ) - self._vprint(f"{self.log_prefix}⚠️ Context length exceeded β€” stepping down: {old_ctx:,} β†’ {new_ctx:,} tokens", force=True) - else: - self._vprint(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier β€” attempting compression...", force=True) - - compression_attempts += 1 - if compression_attempts > max_compression_attempts: - self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) - self._vprint(f"{self.log_prefix} πŸ’‘ Try /new to start a fresh conversation, or /compress to retry compression.", force=True) - logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - self._emit_status(f"πŸ—œοΈ Context too large (~{approx_tokens:,} tokens) β€” compressing ({compression_attempts}/{max_compression_attempts})...") - - original_len = len(messages) - messages, active_system_prompt = self._compress_context( - messages, system_message, approx_tokens=approx_tokens, - task_id=effective_task_id, - ) - # Compression created a new session β€” clear history - # so _flush_messages_to_session_db writes compressed - # messages to the new session, not skipping them. - conversation_history = None - - if len(messages) < original_len or new_ctx and new_ctx < old_ctx: - if len(messages) < original_len: - self._emit_status(f"πŸ—œοΈ Compressed {original_len} β†’ {len(messages)} messages, retrying...") - time.sleep(2) # Brief pause between compression retries - restart_with_compressed_messages = True - break - else: - # Can't compress further and already at minimum tier - self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True) - self._vprint(f"{self.log_prefix} πŸ’‘ The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True) - logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.") - self._persist_session(messages, conversation_history) - return { - "messages": messages, - "completed": False, - "api_calls": api_call_count, - "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.", - "partial": True, - "failed": True, - "compression_exhausted": True, - } - - # Check for non-retryable client errors. The classifier - # already accounts for 413, 429, 529 (transient), context - # overflow, and generic-400 heuristics. Local validation - # errors (ValueError, TypeError) are programming bugs. - # Exclude UnicodeEncodeError β€” it's a ValueError subclass - # but is handled separately by the surrogate sanitization - # path above. Exclude json.JSONDecodeError β€” also a - # ValueError subclass, but it indicates a transient - # provider/network failure (malformed response body, - # truncated stream, routing layer corruption), not a - # local programming bug, and should be retried (#14782). - is_local_validation_error = ( - isinstance(api_error, (ValueError, TypeError)) - and not isinstance( - api_error, (UnicodeEncodeError, json.JSONDecodeError) - ) - # ssl.SSLError (and its subclass SSLCertVerificationError) - # inherits from OSError *and* ValueError via Python MRO, - # so the isinstance(ValueError) check above would - # misclassify a TLS transport failure as a local - # programming bug and abort without retrying. Exclude - # ssl.SSLError explicitly so the error classifier's - # retryable=True mapping takes effect instead. - and not isinstance(api_error, ssl.SSLError) - ) - is_client_error = ( - is_local_validation_error - or ( - not classified.retryable - and not classified.should_compress - and classified.reason not in { - FailoverReason.rate_limit, - FailoverReason.billing, - FailoverReason.overloaded, - FailoverReason.context_overflow, - FailoverReason.payload_too_large, - FailoverReason.long_context_tier, - FailoverReason.thinking_signature, - } - ) - ) and not is_context_length_error - - if is_client_error: - # Try fallback before aborting β€” a different provider - # may not have the same issue (rate limit, auth, etc.) - self._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) β€” trying fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - if api_kwargs is not None: - self._dump_api_request_debug( - api_kwargs, reason="non_retryable_client_error", error=api_error, - ) - self._emit_status( - f"❌ Non-retryable error (HTTP {status_code}): " - f"{self._summarize_api_error(api_error)}" - ) - self._vprint(f"{self.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True) - self._vprint(f"{self.log_prefix} πŸ”Œ Provider: {_provider} Model: {_model}", force=True) - self._vprint(f"{self.log_prefix} 🌐 Endpoint: {_base}", force=True) - # Actionable guidance for common auth errors - if classified.is_auth or classified.reason == FailoverReason.billing: - if _provider == "openai-codex" and status_code == 401: - self._vprint(f"{self.log_prefix} πŸ’‘ Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True) - self._vprint(f"{self.log_prefix} refreshed by another client (Codex CLI, VS Code). To fix:", force=True) - self._vprint(f"{self.log_prefix} 1. Run `codex` in your terminal to generate fresh tokens.", force=True) - self._vprint(f"{self.log_prefix} 2. Then run `hermes auth` to re-authenticate.", force=True) - else: - self._vprint(f"{self.log_prefix} πŸ’‘ Your API key was rejected by the provider. Check:", force=True) - self._vprint(f"{self.log_prefix} β€’ Is the key valid? Run: hermes setup", force=True) - self._vprint(f"{self.log_prefix} β€’ Does your account have access to {_model}?", force=True) - if base_url_host_matches(str(_base), "openrouter.ai"): - self._vprint(f"{self.log_prefix} β€’ Check credits: https://openrouter.ai/settings/credits", force=True) - else: - self._vprint(f"{self.log_prefix} πŸ’‘ This type of error won't be fixed by retrying.", force=True) - logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") - # Skip session persistence when the error is likely - # context-overflow related (status 400 + large session). - # Persisting the failed user message would make the - # session even larger, causing the same failure on the - # next attempt. (#1630) - if status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80): - self._vprint( - f"{self.log_prefix}⚠️ Skipping session persistence " - f"for large failed session to prevent growth loop.", - force=True, - ) - else: - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": str(api_error), - } - - if retry_count >= max_retries: - # Before falling back, try rebuilding the primary - # client once for transient transport errors (stale - # connection pool, TCP reset). Only attempted once - # per API call block. - if not primary_recovery_attempted and self._try_recover_primary_transport( - api_error, retry_count=retry_count, max_retries=max_retries, - ): - primary_recovery_attempted = True - retry_count = 0 - continue - # Try fallback before giving up entirely - self._emit_status(f"⚠️ Max retries ({max_retries}) exhausted β€” trying fallback...") - if self._try_activate_fallback(): - retry_count = 0 - compression_attempts = 0 - primary_recovery_attempted = False - continue - _final_summary = self._summarize_api_error(api_error) - if is_rate_limited: - self._emit_status(f"❌ Rate limited after {max_retries} retries β€” {_final_summary}") - else: - self._emit_status(f"❌ API failed after {max_retries} retries β€” {_final_summary}") - self._vprint(f"{self.log_prefix} πŸ’€ Final error: {_final_summary}", force=True) - - # Detect SSE stream-drop pattern (e.g. "Network - # connection lost") and surface actionable guidance. - # This typically happens when the model generates a - # very large tool call (write_file with huge content) - # and the proxy/CDN drops the stream mid-response. - _is_stream_drop = ( - not getattr(api_error, "status_code", None) - and any(p in error_msg for p in ( - "connection lost", "connection reset", - "connection closed", "network connection", - "network error", "terminated", - )) - ) - if _is_stream_drop: - self._vprint( - f"{self.log_prefix} πŸ’‘ The provider's stream " - f"connection keeps dropping. This often happens " - f"when the model tries to write a very large " - f"file in a single tool call.", - force=True, - ) - self._vprint( - f"{self.log_prefix} Try asking the model " - f"to use execute_code with Python's open() for " - f"large files, or to write the file in smaller " - f"sections.", - force=True, - ) - - logging.error( - "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s", - self.log_prefix, max_retries, _final_summary, - _provider, _model, len(api_messages), f"{approx_tokens:,}", - ) - if api_kwargs is not None: - self._dump_api_request_debug( - api_kwargs, reason="max_retries_exhausted", error=api_error, - ) - self._persist_session(messages, conversation_history) - _final_response = f"API call failed after {max_retries} retries: {_final_summary}" - if _is_stream_drop: - _final_response += ( - "\n\nThe provider's stream connection keeps " - "dropping β€” this often happens when generating " - "very large tool call responses (e.g. write_file " - "with long content). Try asking me to use " - "execute_code with Python's open() for large " - "files, or to write in smaller sections." - ) - return { - "final_response": _final_response, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": _final_summary, - } - - # For rate limits, respect the Retry-After header if present - _retry_after = None - if is_rate_limited: - _resp_headers = getattr(getattr(api_error, "response", None), "headers", None) - if _resp_headers and hasattr(_resp_headers, "get"): - _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After") - if _ra_raw: - try: - _retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes - except (TypeError, ValueError): - pass - wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0) - if is_rate_limited: - self._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...") - else: - self._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...") - logger.warning( - "Retrying API call in %ss (attempt %s/%s) %s error=%s", - wait_time, - retry_count, - max_retries, - self._client_log_context(), - api_error, - ) - # Sleep in small increments so we can respond to interrupts quickly - # instead of blocking the entire wait_time in one sleep() call - sleep_end = time.time() + wait_time - _backoff_touch_counter = 0 - while time.time() < sleep_end: - if self._interrupt_requested: - self._vprint(f"{self.log_prefix}⚑ Interrupt detected during retry wait, aborting.", force=True) - self._persist_session(messages, conversation_history) - self.clear_interrupt() - return { - "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).", - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "interrupted": True, - } - time.sleep(0.2) # Check interrupt every 200ms - # Touch activity every ~30s so the gateway's inactivity - # monitor knows we're alive during backoff waits. - _backoff_touch_counter += 1 - if _backoff_touch_counter % 150 == 0: # 150 Γ— 0.2s = 30s - self._touch_activity( - f"error retry backoff ({retry_count}/{max_retries}), " - f"{int(sleep_end - time.time())}s remaining" - ) - - # If the API call was interrupted, skip response processing - if interrupted: - _turn_exit_reason = "interrupted_during_api_call" - break - - if restart_with_compressed_messages: - api_call_count -= 1 - self.iteration_budget.refund() - # Count compression restarts toward the retry limit to prevent - # infinite loops when compression reduces messages but not enough - # to fit the context window. - retry_count += 1 - restart_with_compressed_messages = False - continue - - if restart_with_length_continuation: - # Progressively boost the output token budget on each retry. - # Retry 1 β†’ 2Γ— base, retry 2 β†’ 3Γ— base, capped at 32 768. - # Applies to all providers via _ephemeral_max_output_tokens. - _boost_base = self.max_tokens if self.max_tokens else 4096 - _boost = _boost_base * (length_continue_retries + 1) - self._ephemeral_max_output_tokens = min(_boost, 32768) - continue - - # Guard: if all retries exhausted without a successful response - # (e.g. repeated context-length errors that exhausted retry_count), - # the `response` variable is still None. Break out cleanly. - if response is None: - _turn_exit_reason = "all_retries_exhausted_no_response" - print(f"{self.log_prefix}❌ All API retries exhausted with no successful response.") - self._persist_session(messages, conversation_history) - break - - try: - _transport = self._get_transport() - _normalize_kwargs = {} - if self.api_mode == "anthropic_messages": - _normalize_kwargs["strip_tool_prefix"] = self._is_anthropic_oauth - normalized = _transport.normalize_response(response, **_normalize_kwargs) - assistant_message = normalized - finish_reason = normalized.finish_reason - - # Normalize content to string β€” some OpenAI-compatible servers - # (llama-server, etc.) return content as a dict or list instead - # of a plain string, which crashes downstream .strip() calls. - if assistant_message.content is not None and not isinstance(assistant_message.content, str): - raw = assistant_message.content - if isinstance(raw, dict): - assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw) - elif isinstance(raw, list): - # Multimodal content list β€” extract text parts - parts = [] - for part in raw: - if isinstance(part, str): - parts.append(part) - elif isinstance(part, dict) and part.get("type") == "text": - parts.append(part.get("text", "")) - elif isinstance(part, dict) and "text" in part: - parts.append(str(part["text"])) - assistant_message.content = "\n".join(parts) - else: - assistant_message.content = str(raw) - - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _assistant_tool_calls = getattr(assistant_message, "tool_calls", None) or [] - _assistant_text = assistant_message.content or "" - _invoke_hook( - "post_api_request", - task_id=effective_task_id, - session_id=self.session_id or "", - platform=self.platform or "", - model=self.model, - provider=self.provider, - base_url=self.base_url, - api_mode=self.api_mode, - api_call_count=api_call_count, - api_duration=api_duration, - finish_reason=finish_reason, - message_count=len(api_messages), - response_model=getattr(response, "model", None), - usage=self._usage_summary_for_api_request_hook(response), - assistant_content_chars=len(_assistant_text), - assistant_tool_call_count=len(_assistant_tool_calls), - ) - except Exception: - pass - - # Handle assistant response - if assistant_message.content and not self.quiet_mode: - if self.verbose_logging: - self._vprint(f"{self.log_prefix}πŸ€– Assistant: {assistant_message.content}") - else: - self._vprint(f"{self.log_prefix}πŸ€– Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}") - - # Notify progress callback of model's thinking (used by subagent - # delegation to relay the child's reasoning to the parent display). - if (assistant_message.content and self.tool_progress_callback): - _think_text = assistant_message.content.strip() - # Strip reasoning XML tags that shouldn't leak to parent display - _think_text = re.sub( - r'', '', _think_text - ).strip() - # For subagents: relay first line to parent display (existing behaviour). - # For all agents with a structured callback: emit reasoning.available event. - first_line = _think_text.split('\n')[0][:80] if _think_text else "" - if first_line and getattr(self, '_delegate_depth', 0) > 0: - try: - self.tool_progress_callback("_thinking", first_line) - except Exception: - pass - elif _think_text: - try: - self.tool_progress_callback("reasoning.available", "_thinking", _think_text[:500], None) - except Exception: - pass - - # Check for incomplete (opened but never closed) - # This means the model ran out of output tokens mid-reasoning β€” retry up to 2 times - if has_incomplete_scratchpad(assistant_message.content or ""): - self._incomplete_scratchpad_retries += 1 - - self._vprint(f"{self.log_prefix}⚠️ Incomplete detected (opened but never closed)") - - if self._incomplete_scratchpad_retries <= 2: - self._vprint(f"{self.log_prefix}πŸ”„ Retrying API call ({self._incomplete_scratchpad_retries}/2)...") - # Don't add the broken message, just retry - continue - else: - # Max retries - discard this turn and save as partial - self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True) - self._incomplete_scratchpad_retries = 0 - - rolled_back_messages = self._get_messages_up_to_last_assistant(messages) - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - - return { - "final_response": None, - "messages": rolled_back_messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Incomplete REASONING_SCRATCHPAD after 2 retries" - } - - # Reset incomplete scratchpad counter on clean response - self._incomplete_scratchpad_retries = 0 - - if self.api_mode == "codex_responses" and finish_reason == "incomplete": - self._codex_incomplete_retries += 1 - - interim_msg = self._build_assistant_message(assistant_message, finish_reason) - interim_has_content = bool((interim_msg.get("content") or "").strip()) - interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False - interim_has_codex_reasoning = bool(interim_msg.get("codex_reasoning_items")) - interim_has_codex_message_items = bool(interim_msg.get("codex_message_items")) - - if ( - interim_has_content - or interim_has_reasoning - or interim_has_codex_reasoning - or interim_has_codex_message_items - ): - last_msg = messages[-1] if messages else None - # Duplicate detection: two consecutive incomplete assistant - # messages with identical content AND reasoning are collapsed. - # For provider-state-only changes (encrypted reasoning - # items or replayable message ids/phases/statuses differ - # while visible content/reasoning are unchanged), compare - # those opaque payloads too so we don't silently drop the - # newer continuation state. - last_codex_items = last_msg.get("codex_reasoning_items") if isinstance(last_msg, dict) else None - interim_codex_items = interim_msg.get("codex_reasoning_items") - last_codex_message_items = last_msg.get("codex_message_items") if isinstance(last_msg, dict) else None - interim_codex_message_items = interim_msg.get("codex_message_items") - duplicate_interim = ( - isinstance(last_msg, dict) - and last_msg.get("role") == "assistant" - and last_msg.get("finish_reason") == "incomplete" - and (last_msg.get("content") or "") == (interim_msg.get("content") or "") - and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "") - and last_codex_items == interim_codex_items - and last_codex_message_items == interim_codex_message_items - ) - if not duplicate_interim: - messages.append(interim_msg) - self._emit_interim_assistant_message(interim_msg) - - if self._codex_incomplete_retries < 3: - if not self.quiet_mode: - self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)") - self._session_messages = messages - self._save_session_log(messages) - continue - - self._codex_incomplete_retries = 0 - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Codex response remained incomplete after 3 continuation attempts", - } - elif hasattr(self, "_codex_incomplete_retries"): - self._codex_incomplete_retries = 0 - - # Check for tool calls - if assistant_message.tool_calls: - if not self.quiet_mode: - self._vprint(f"{self.log_prefix}πŸ”§ Processing {len(assistant_message.tool_calls)} tool call(s)...") - - if self.verbose_logging: - for tc in assistant_message.tool_calls: - logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") - - # Validate tool call names - detect model hallucinations - # Repair mismatched tool names before validating - for tc in assistant_message.tool_calls: - if tc.function.name not in self.valid_tool_names: - repaired = self._repair_tool_call(tc.function.name) - if repaired: - print(f"{self.log_prefix}πŸ”§ Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'") - tc.function.name = repaired - invalid_tool_calls = [ - tc.function.name for tc in assistant_message.tool_calls - if tc.function.name not in self.valid_tool_names - ] - if invalid_tool_calls: - # Track retries for invalid tool calls - self._invalid_tool_retries += 1 - - # Return helpful error to model β€” model can self-correct next turn - available = ", ".join(sorted(self.valid_tool_names)) - invalid_name = invalid_tool_calls[0] - invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name - self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' β€” sending error to model for self-correction ({self._invalid_tool_retries}/3)") - - if self._invalid_tool_retries >= 3: - self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True) - self._invalid_tool_retries = 0 - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": f"Model generated invalid tool call: {invalid_preview}" - } - - assistant_msg = self._build_assistant_message(assistant_message, finish_reason) - messages.append(assistant_msg) - for tc in assistant_message.tool_calls: - if tc.function.name not in self.valid_tool_names: - content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}" - else: - content = "Skipped: another tool call in this turn used an invalid name. Please retry this tool call." - messages.append({ - "role": "tool", - "name": tc.function.name, - "tool_call_id": tc.id, - "content": content, - }) - continue - # Reset retry counter on successful tool call validation - self._invalid_tool_retries = 0 - - # Validate tool call arguments are valid JSON - # Handle empty strings as empty objects (common model quirk) - invalid_json_args = [] - for tc in assistant_message.tool_calls: - args = tc.function.arguments - if isinstance(args, (dict, list)): - tc.function.arguments = json.dumps(args) - continue - if args is not None and not isinstance(args, str): - tc.function.arguments = str(args) - args = tc.function.arguments - # Treat empty/whitespace strings as empty object - if not args or not args.strip(): - tc.function.arguments = "{}" - continue - try: - json.loads(args) - except json.JSONDecodeError as e: - invalid_json_args.append((tc.function.name, str(e))) - - if invalid_json_args: - # Check if the invalid JSON is due to truncation rather - # than a model formatting mistake. Routers sometimes - # rewrite finish_reason from "length" to "tool_calls", - # hiding the truncation from the length handler above. - # Detect truncation: args that don't end with } or ] - # (after stripping whitespace) are cut off mid-stream. - _truncated = any( - not (tc.function.arguments or "").rstrip().endswith(("}", "]")) - for tc in assistant_message.tool_calls - if tc.function.name in {n for n, _ in invalid_json_args} - ) - if _truncated: - self._vprint( - f"{self.log_prefix}⚠️ Truncated tool call arguments detected " - f"(finish_reason={finish_reason!r}) β€” refusing to execute.", - force=True, - ) - self._invalid_json_retries = 0 - self._cleanup_task_resources(effective_task_id) - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "partial": True, - "error": "Response truncated due to output length limit", - } - - # Track retries for invalid JSON arguments - self._invalid_json_retries += 1 - - tool_name, error_msg = invalid_json_args[0] - self._vprint(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") - - if self._invalid_json_retries < 3: - self._vprint(f"{self.log_prefix}πŸ”„ Retrying API call ({self._invalid_json_retries}/3)...") - # Don't add anything to messages, just retry the API call - continue - else: - # Instead of returning partial, inject tool error results so the model can recover. - # Using tool results (not user messages) preserves role alternation. - self._vprint(f"{self.log_prefix}⚠️ Injecting recovery tool results for invalid JSON...") - self._invalid_json_retries = 0 # Reset for next attempt - - # Append the assistant message with its (broken) tool_calls - recovery_assistant = self._build_assistant_message(assistant_message, finish_reason) - messages.append(recovery_assistant) - - # Respond with tool error results for each tool call - invalid_names = {name for name, _ in invalid_json_args} - for tc in assistant_message.tool_calls: - if tc.function.name in invalid_names: - err = next(e for n, e in invalid_json_args if n == tc.function.name) - tool_result = ( - f"Error: Invalid JSON arguments. {err}. " - f"For tools with no required parameters, use an empty object: {{}}. " - f"Please retry with valid JSON." - ) - else: - tool_result = "Skipped: other tool call in this response had invalid JSON." - messages.append({ - "role": "tool", - "name": tc.function.name, - "tool_call_id": tc.id, - "content": tool_result, - }) - continue - - # Reset retry counter on successful JSON validation - self._invalid_json_retries = 0 - - # ── Post-call guardrails ────────────────────────── - assistant_message.tool_calls = self._cap_delegate_task_calls( - assistant_message.tool_calls - ) - assistant_message.tool_calls = self._deduplicate_tool_calls( - assistant_message.tool_calls - ) - - assistant_msg = self._build_assistant_message(assistant_message, finish_reason) - - # If this turn has both content AND tool_calls, capture the content - # as a fallback final response. Common pattern: model delivers its - # answer and calls memory/skill tools as a side-effect in the same - # turn. If the follow-up turn after tools is empty, we use this. - turn_content = assistant_message.content or "" - if turn_content and self._has_content_after_think_block(turn_content): - self._last_content_with_tools = turn_content - # Only mute subsequent output when EVERY tool call in - # this turn is post-response housekeeping (memory, todo, - # skill_manage, etc.). If any substantive tool is present - # (search_files, read_file, write_file, terminal, ...), - # keep output visible so the user sees progress. - _HOUSEKEEPING_TOOLS = frozenset({ - "memory", "todo", "skill_manage", "session_search", - }) - _all_housekeeping = all( - tc.function.name in _HOUSEKEEPING_TOOLS - for tc in assistant_message.tool_calls - ) - self._last_content_tools_all_housekeeping = _all_housekeeping - if _all_housekeeping and self._has_stream_consumers(): - self._mute_post_response = True - elif self._should_emit_quiet_tool_messages(): - clean = self._strip_think_blocks(turn_content).strip() - if clean: - self._vprint(f" β”Š πŸ’¬ {clean}") - - # Pop thinking-only prefill message(s) before appending - # (tool-call path β€” same rationale as the final-response path). - _had_prefill = False - while ( - messages - and isinstance(messages[-1], dict) - and messages[-1].get("_thinking_prefill") - ): - messages.pop() - _had_prefill = True - - # Reset prefill counter when tool calls follow a prefill - # recovery. Without this, the counter accumulates across - # the whole conversation β€” a model that intermittently - # empties (empty β†’ prefill β†’ tools β†’ empty β†’ prefill β†’ - # tools) burns both prefill attempts and the third empty - # gets zero recovery. Resetting here treats each tool- - # call success as a fresh start. - if _had_prefill: - self._thinking_prefill_retries = 0 - self._empty_content_retries = 0 - # Successful tool execution β€” reset the post-tool nudge - # flag so it can fire again if the model goes empty on - # a LATER tool round. - self._post_tool_empty_retried = False - - messages.append(assistant_msg) - self._emit_interim_assistant_message(assistant_msg) - - # Close any open streaming display (response box, reasoning - # box) before tool execution begins. Intermediate turns may - # have streamed early content that opened the response box; - # flushing here prevents it from wrapping tool feed lines. - # Only signal the display callback β€” TTS (_stream_callback) - # should NOT receive None (it uses None as end-of-stream). - if self.stream_delta_callback: - try: - self.stream_delta_callback(None) - except Exception: - pass - - self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count) - - if self._tool_guardrail_halt_decision is not None: - decision = self._tool_guardrail_halt_decision - _turn_exit_reason = "guardrail_halt" - final_response = self._toolguard_controlled_halt_response(decision) - self._emit_status( - f"⚠️ Tool guardrail halted {decision.tool_name}: {decision.code}" - ) - messages.append({"role": "assistant", "content": final_response}) - break - - # Reset per-turn retry counters after successful tool - # execution so a single truncation doesn't poison the - # entire conversation. - truncated_tool_call_retries = 0 - - # Signal that a paragraph break is needed before the next - # streamed text. We don't emit it immediately because - # multiple consecutive tool iterations would stack up - # redundant blank lines. Instead, _fire_stream_delta() - # will prepend a single "\n\n" the next time real text - # arrives. - self._stream_needs_break = True - - # Refund the iteration if the ONLY tool(s) called were - # execute_code (programmatic tool calling). These are - # cheap RPC-style calls that shouldn't eat the budget. - _tc_names = {tc.function.name for tc in assistant_message.tool_calls} - if _tc_names == {"execute_code"}: - self.iteration_budget.refund() - - # Use real token counts from the API response to decide - # compression. prompt_tokens + completion_tokens is the - # actual context size the provider reported plus the - # assistant turn β€” a tight lower bound for the next prompt. - # Tool results appended above aren't counted yet, but the - # threshold (default 50%) leaves ample headroom; if tool - # results push past it, the next API call will report the - # real total and trigger compression then. - # - # If last_prompt_tokens is 0 (stale after API disconnect - # or provider returned no usage data), fall back to rough - # estimate to avoid missing compression. Without this, - # a session can grow unbounded after disconnects because - # should_compress(0) never fires. (#2153) - _compressor = self.context_compressor - if _compressor.last_prompt_tokens > 0: - # Only use prompt_tokens β€” completion/reasoning - # tokens don't consume context window space. - # Thinking models (GLM-5.1, QwQ, DeepSeek R1) - # inflate completion_tokens with reasoning, - # causing premature compression. (#12026) - _real_tokens = _compressor.last_prompt_tokens - else: - # Include tool schemas β€” with 50+ tools enabled - # these add 20-30K tokens the messages-only - # estimate misses, which can skip compression - # past the configured threshold (#14695). - _real_tokens = estimate_request_tokens_rough( - messages, tools=self.tools or None - ) - - if self.compression_enabled and _compressor.should_compress(_real_tokens): - self._safe_print(" ⟳ compacting context…") - messages, active_system_prompt = self._compress_context( - messages, system_message, - approx_tokens=self.context_compressor.last_prompt_tokens, - task_id=effective_task_id, - ) - # Compression created a new session β€” clear history so - # _flush_messages_to_session_db writes compressed messages - # to the new session (see preflight compression comment). - conversation_history = None - - # Save session log incrementally (so progress is visible even if interrupted) - self._session_messages = messages - self._save_session_log(messages) - - # Continue loop for next response - continue - - else: - # No tool calls - this is the final response - final_response = assistant_message.content or "" - - # Fix: unmute output when entering the no-tool-call branch - # so the user can see empty-response warnings and recovery - # status messages. _mute_post_response was set during a - # prior housekeeping tool turn and should not silence the - # final response path. - self._mute_post_response = False - - # Check if response only has think block with no actual content after it - if not self._has_content_after_think_block(final_response): - # ── Partial stream recovery ───────────────────── - # If content was already streamed to the user before - # the connection died, use it as the final response - # instead of falling through to prior-turn fallback - # or wasting API calls on retries. - _partial_streamed = ( - getattr(self, "_current_streamed_assistant_text", "") or "" - ) - if self._has_content_after_think_block(_partial_streamed): - _turn_exit_reason = "partial_stream_recovery" - _recovered = self._strip_think_blocks(_partial_streamed).strip() - logger.info( - "Partial stream content delivered (%d chars) " - "β€” using as final response", - len(_recovered), - ) - self._emit_status( - "↻ Stream interrupted β€” using delivered content " - "as final response" - ) - final_response = _recovered - self._response_was_previewed = True - break - - # If the previous turn already delivered real content alongside - # HOUSEKEEPING tool calls (e.g. "You're welcome!" + memory save), - # the model has nothing more to say. Use the earlier content - # immediately instead of wasting API calls on retries. - # NOTE: Only use this shortcut when ALL tools in that turn were - # housekeeping (memory, todo, etc.). When substantive tools - # were called (terminal, search_files, etc.), the content was - # likely mid-task narration ("I'll scan the directory...") and - # the empty follow-up means the model choked β€” let the - # post-tool nudge below handle that instead of exiting early. - fallback = getattr(self, '_last_content_with_tools', None) - if fallback and getattr(self, '_last_content_tools_all_housekeeping', False): - _turn_exit_reason = "fallback_prior_turn_content" - logger.info("Empty follow-up after tool calls β€” using prior turn content as final response") - self._emit_status("↻ Empty response after tool calls β€” using earlier content as final answer") - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - self._empty_content_retries = 0 - # Do NOT modify the assistant message content β€” the - # old code injected "Calling the X tools..." which - # poisoned the conversation history. Just use the - # fallback text as the final response and break. - final_response = self._strip_think_blocks(fallback).strip() - self._response_was_previewed = True - break - - # ── Post-tool-call empty response nudge ─────────── - # The model returned empty after executing tool calls. - # This covers two cases: - # (a) No prior-turn content at all β€” model went silent - # (b) Prior turn had content + SUBSTANTIVE tools (the - # fallback above was skipped because the content - # was mid-task narration, not a final answer) - # Instead of giving up, nudge the model to continue by - # appending a user-level hint. This is the #9400 case: - # weaker models (mimo-v2-pro, GLM-5, etc.) sometimes - # return empty after tool results instead of continuing - # to the next step. One retry with a nudge usually - # fixes it. - _prior_was_tool = any( - m.get("role") == "tool" - for m in messages[-5:] # check recent messages - ) - # Detect Qwen3/Ollama-style in-content thinking blocks. - # Ollama puts in the content field (not in - # reasoning_content), so _has_structured below would - # miss it. We check here so thinking-only responses - # after tool calls route to prefill instead of nudge. - _has_inline_thinking = bool( - re.search( - r'||', - final_response or "", - re.IGNORECASE, - ) - ) - if ( - _prior_was_tool - and not getattr(self, "_post_tool_empty_retried", False) - and not _has_inline_thinking # thinking model still working β€” let prefill handle - ): - self._post_tool_empty_retried = True - # Clear stale narration so it doesn't resurface - # on a later empty response after the nudge. - self._last_content_with_tools = None - self._last_content_tools_all_housekeeping = False - logger.info( - "Empty response after tool calls β€” nudging model " - "to continue processing" - ) - self._emit_status( - "⚠️ Model returned empty after tool calls β€” " - "nudging to continue" - ) - # Append the empty assistant message first so the - # message sequence stays valid: - # tool(result) β†’ assistant("(empty)") β†’ user(nudge) - # Without this, we'd have tool β†’ user which most - # APIs reject as an invalid sequence. - _nudge_msg = self._build_assistant_message(assistant_message, finish_reason) - _nudge_msg["content"] = "(empty)" - _nudge_msg["_empty_recovery_synthetic"] = True - messages.append(_nudge_msg) - messages.append({ - "role": "user", - "content": ( - "You just executed tool calls but returned an " - "empty response. Please process the tool " - "results above and continue with the task." - ), - "_empty_recovery_synthetic": True, - }) - continue - - # ── Thinking-only prefill continuation ────────── - # The model produced structured reasoning (via API - # fields) but no visible text content. Rather than - # giving up, append the assistant message as-is and - # continue β€” the model will see its own reasoning - # on the next turn and produce the text portion. - # Inspired by clawdbot's "incomplete-text" recovery. - # Also covers Qwen3/Ollama in-content blocks - # (detected above as _has_inline_thinking). - _has_structured = bool( - getattr(assistant_message, "reasoning", None) - or getattr(assistant_message, "reasoning_content", None) - or getattr(assistant_message, "reasoning_details", None) - or _has_inline_thinking - ) - if _has_structured and self._thinking_prefill_retries < 2: - self._thinking_prefill_retries += 1 - logger.info( - "Thinking-only response (no visible content) β€” " - "prefilling to continue (%d/2)", - self._thinking_prefill_retries, - ) - self._emit_status( - f"↻ Thinking-only response β€” prefilling to continue " - f"({self._thinking_prefill_retries}/2)" - ) - interim_msg = self._build_assistant_message( - assistant_message, "incomplete" - ) - interim_msg["_thinking_prefill"] = True - messages.append(interim_msg) - self._session_messages = messages - self._save_session_log(messages) - continue - - # ── Empty response retry ────────────────────── - # Model returned nothing usable. Retry up to 3 - # times before attempting fallback. This covers - # both truly empty responses (no content, no - # reasoning) AND reasoning-only responses after - # prefill exhaustion β€” models like mimo-v2-pro - # always populate reasoning fields via OpenRouter, - # so the old `not _has_structured` guard blocked - # retries for every reasoning model after prefill. - _truly_empty = not self._strip_think_blocks( - final_response - ).strip() - _prefill_exhausted = ( - _has_structured - and self._thinking_prefill_retries >= 2 - ) - if _truly_empty and (not _has_structured or _prefill_exhausted) and self._empty_content_retries < 3: - self._empty_content_retries += 1 - logger.warning( - "Empty response (no content or reasoning) β€” " - "retry %d/3 (model=%s)", - self._empty_content_retries, self.model, - ) - self._emit_status( - f"⚠️ Empty response from model β€” retrying " - f"({self._empty_content_retries}/3)" - ) - continue - - # ── Exhausted retries β€” try fallback provider ── - # Before giving up with "(empty)", attempt to - # switch to the next provider in the fallback - # chain. This covers the case where a model - # (e.g. GLM-4.5-Air) consistently returns empty - # due to context degradation or provider issues. - if _truly_empty and self._fallback_chain: - logger.warning( - "Empty response after %d retries β€” " - "attempting fallback (model=%s, provider=%s)", - self._empty_content_retries, self.model, - self.provider, - ) - self._emit_status( - "⚠️ Model returning empty responses β€” " - "switching to fallback provider..." - ) - if self._try_activate_fallback(): - self._empty_content_retries = 0 - self._emit_status( - f"↻ Switched to fallback: {self.model} " - f"({self.provider})" - ) - logger.info( - "Fallback activated after empty responses: " - "now using %s on %s", - self.model, self.provider, - ) - continue - - # Exhausted retries and fallback chain (or no - # fallback configured). Fall through to the - # "(empty)" terminal. - _turn_exit_reason = "empty_response_exhausted" - reasoning_text = self._extract_reasoning(assistant_message) - self._drop_trailing_empty_response_scaffolding(messages) - assistant_msg = self._build_assistant_message(assistant_message, finish_reason) - assistant_msg["content"] = "(empty)" - # This is a user-facing failure sentinel for the gateway, - # not real assistant content. Persisting it makes later - # "continue" turns replay assistant("(empty)") as if it - # were a meaningful model response, which can keep long - # tool-heavy sessions stuck in empty-response loops. - assistant_msg["_empty_terminal_sentinel"] = True - messages.append(assistant_msg) - - if reasoning_text: - reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text - logger.warning( - "Reasoning-only response (no visible content) " - "after exhausting retries and fallback. " - "Reasoning: %s", reasoning_preview, - ) - self._emit_status( - "⚠️ Model produced reasoning but no visible " - "response after all retries. Returning empty." - ) - else: - logger.warning( - "Empty response (no content or reasoning) " - "after %d retries. No fallback available. " - "model=%s provider=%s", - self._empty_content_retries, self.model, - self.provider, - ) - self._emit_status( - "❌ Model returned no content after all retries" - + (" and fallback attempts." if self._fallback_chain else - ". No fallback providers configured.") - ) - - final_response = "(empty)" - break - - # Reset retry counter/signature on successful content - self._empty_content_retries = 0 - self._thinking_prefill_retries = 0 - - if ( - self.api_mode == "codex_responses" - and self.valid_tool_names - and codex_ack_continuations < 2 - and self._looks_like_codex_intermediate_ack( - user_message=user_message, - assistant_content=final_response, - messages=messages, - ) - ): - codex_ack_continuations += 1 - interim_msg = self._build_assistant_message(assistant_message, "incomplete") - messages.append(interim_msg) - self._emit_interim_assistant_message(interim_msg) - - continue_msg = { - "role": "user", - "content": ( - "[System: Continue now. Execute the required tool calls and only " - "send your final answer after completing the task.]" - ), - } - messages.append(continue_msg) - self._session_messages = messages - self._save_session_log(messages) - continue - - codex_ack_continuations = 0 - - if truncated_response_prefix: - final_response = truncated_response_prefix + final_response - truncated_response_prefix = "" - length_continue_retries = 0 - - final_response = self._strip_think_blocks(final_response).strip() - - final_msg = self._build_assistant_message(assistant_message, finish_reason) - - # Pop thinking-only prefill and empty-response retry - # scaffolding before appending the final response. These - # internal turns are only for the next API retry and should - # not become durable transcript context. - while ( - messages - and isinstance(messages[-1], dict) - and ( - messages[-1].get("_thinking_prefill") - or messages[-1].get("_empty_recovery_synthetic") - or messages[-1].get("_empty_terminal_sentinel") - ) - ): - messages.pop() - - messages.append(final_msg) - - _turn_exit_reason = f"text_response(finish_reason={finish_reason})" - if not self.quiet_mode: - self._safe_print(f"πŸŽ‰ Conversation completed after {api_call_count} OpenAI-compatible API call(s)") - break - - except Exception as e: - error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}" - try: - print(f"❌ {error_msg}") - except (OSError, ValueError): - logger.error(error_msg) - - logger.debug("Outer loop error in API call #%d", api_call_count, exc_info=True) - - # If an assistant message with tool_calls was already appended, - # the API expects a role="tool" result for every tool_call_id. - # Fill in error results for any that weren't answered yet. - for idx in range(len(messages) - 1, -1, -1): - msg = messages[idx] - if not isinstance(msg, dict): - break - if msg.get("role") == "tool": - continue - if msg.get("role") == "assistant" and msg.get("tool_calls"): - answered_ids = { - m["tool_call_id"] - for m in messages[idx + 1:] - if isinstance(m, dict) and m.get("role") == "tool" - } - for tc in msg["tool_calls"]: - if not tc or not isinstance(tc, dict): continue - if tc["id"] not in answered_ids: - err_msg = { - "role": "tool", - "name": AIAgent._get_tool_call_name_static(tc), - "tool_call_id": tc["id"], - "content": f"Error executing tool: {error_msg}", - } - messages.append(err_msg) - break - - # Non-tool errors don't need a synthetic message injected. - # The error is already printed to the user (line above), and - # the retry loop continues. Injecting a fake user/assistant - # message pollutes history, burns tokens, and risks violating - # role-alternation invariants. - - # If we're near the limit, break to avoid infinite loops - if api_call_count >= self.max_iterations - 1: - _turn_exit_reason = f"error_near_max_iterations({error_msg[:80]})" - final_response = f"I apologize, but I encountered repeated errors: {error_msg}" - # Append as assistant so the history stays valid for - # session resume (avoids consecutive user messages). - messages.append({"role": "assistant", "content": final_response}) - break - - if final_response is None and ( - api_call_count >= self.max_iterations - or self.iteration_budget.remaining <= 0 - ): - # Budget exhausted β€” ask the model for a summary via one extra - # API call with tools stripped. _handle_max_iterations injects a - # user message and makes a single toolless request. - _turn_exit_reason = f"max_iterations_reached({api_call_count}/{self.max_iterations})" - self._emit_status( - f"⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) " - "β€” asking model to summarise" - ) - if not self.quiet_mode: - self._safe_print( - f"\n⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) " - "β€” requesting summary..." - ) - final_response = self._handle_max_iterations(messages, api_call_count) - - # If running as a kanban worker, block the task so the dispatcher - # knows the worker could not complete (rather than treating it as a - # protocol violation). The agent loop strips tools before calling - # _handle_max_iterations, so the model cannot call kanban_block - # itself β€” we must do it on its behalf. - _kanban_task = os.environ.get("HERMES_KANBAN_TASK") - if _kanban_task: - try: - handle_function_call( - "kanban_block", - { - "task_id": _kanban_task, - "reason": ( - f"Iteration budget exhausted " - f"({api_call_count}/{self.max_iterations}) β€” " - "task could not complete within the allowed " - "iterations" - ), - }, - task_id=effective_task_id, - ) - logger.info( - "kanban_block called for task %s after iteration " - "exhaustion (%d/%d)", - _kanban_task, api_call_count, self.max_iterations, - ) - except Exception: - logger.warning( - "Failed to call kanban_block after iteration " - "exhaustion for task %s", - _kanban_task, - exc_info=True, - ) - - # Determine if conversation completed successfully - completed = final_response is not None and api_call_count < self.max_iterations - - # Save trajectory if enabled. ``user_message`` may be a multimodal - # list of parts; the trajectory format wants a plain string. - self._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed) - - # Clean up VM and browser for this task after conversation completes - self._cleanup_task_resources(effective_task_id) - - # Persist session to both JSON log and SQLite only after private retry - # scaffolding has been removed. Otherwise a later user "continue" turn - # can replay assistant("(empty)") / recovery nudges and fall into the - # same empty-response loop again. - self._drop_trailing_empty_response_scaffolding(messages) - self._persist_session(messages, conversation_history) - - # ── Turn-exit diagnostic log ───────────────────────────────────── - # Always logged at INFO so agent.log captures WHY every turn ended. - # When the last message is a tool result (agent was mid-work), log - # at WARNING β€” this is the "just stops" scenario users report. - _last_msg_role = messages[-1].get("role") if messages else None - _last_tool_name = None - if _last_msg_role == "tool": - # Walk back to find the assistant message with the tool call - for _m in reversed(messages): - if _m.get("role") == "assistant" and _m.get("tool_calls"): - _tcs = _m["tool_calls"] - if _tcs and isinstance(_tcs[0], dict): - _last_tool_name = _tcs[-1].get("function", {}).get("name") - break - - _turn_tool_count = sum( - 1 for m in messages - if isinstance(m, dict) and m.get("role") == "assistant" and m.get("tool_calls") - ) - _resp_len = len(final_response) if final_response else 0 - _budget_used = self.iteration_budget.used if self.iteration_budget else 0 - _budget_max = self.iteration_budget.max_total if self.iteration_budget else 0 - - _diag_msg = ( - "Turn ended: reason=%s model=%s api_calls=%d/%d budget=%d/%d " - "tool_turns=%d last_msg_role=%s response_len=%d session=%s" - ) - _diag_args = ( - _turn_exit_reason, self.model, api_call_count, self.max_iterations, - _budget_used, _budget_max, - _turn_tool_count, _last_msg_role, _resp_len, - self.session_id or "none", - ) - - if _last_msg_role == "tool" and not interrupted: - # Agent was mid-work β€” this is the "just stops" case. - logger.warning( - "Turn ended with pending tool result (agent may appear stuck). " - + _diag_msg + " last_tool=%s", - *_diag_args, _last_tool_name, - ) - else: - logger.info(_diag_msg, *_diag_args) - - # File-mutation verifier footer. - # If one or more ``write_file`` / ``patch`` calls failed during this - # turn and were never superseded by a successful write to the same - # path, append an advisory footer to the assistant response. This - # catches the specific case β€” reported by Ben Eng (#15524-adjacent) - # β€” where a model issues a batch of parallel patches, half of them - # fail with "Could not find old_string", and the model summarises - # the turn claiming every file was edited. The user then has to - # manually run ``git status`` to catch the lie. With this footer - # the truth is surfaced on every turn, so over-claiming is - # structurally impossible past the model. - # - # Gate: only applied when a real text response exists for this - # turn and the user didn't interrupt. Empty/interrupted turns - # already have other surface text that shouldn't be augmented. - if final_response and not interrupted: - try: - _failed = getattr(self, "_turn_failed_file_mutations", None) or {} - if _failed and self._file_mutation_verifier_enabled(): - footer = self._format_file_mutation_failure_footer(_failed) - if footer: - final_response = final_response.rstrip() + "\n\n" + footer - except Exception as _ver_err: - logger.debug("file-mutation verifier footer failed: %s", _ver_err) - - # Plugin hook: transform_llm_output - # Fired once per turn after the tool-calling loop completes. - # Plugins can transform the LLM's output text before it's returned. - # First hook to return a string wins; None/empty return leaves text unchanged. - if final_response and not interrupted: - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _transform_results = _invoke_hook( - "transform_llm_output", - response_text=final_response, - session_id=self.session_id or "", - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - for _hook_result in _transform_results: - if isinstance(_hook_result, str) and _hook_result: - final_response = _hook_result - break # First non-empty string wins - except Exception as exc: - logger.warning("transform_llm_output hook failed: %s", exc) - - # Plugin hook: post_llm_call - # Fired once per turn after the tool-calling loop completes. - # Plugins can use this to persist conversation data (e.g. sync - # to an external memory system). - if final_response and not interrupted: - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "post_llm_call", - session_id=self.session_id, - user_message=original_user_message, - assistant_response=final_response, - conversation_history=list(messages), - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - except Exception as exc: - logger.warning("post_llm_call hook failed: %s", exc) - - # Extract reasoning from the CURRENT turn only. Walk backwards - # but stop at the user message that started this turn β€” anything - # earlier is from a prior turn and must not leak into the reasoning - # box (confusing stale display; #17055). Within the current turn - # we still want the *most recent* non-empty reasoning: many - # providers (Claude thinking, DeepSeek v4, Codex Responses) emit - # reasoning on the tool-call step and leave the final-answer step - # with reasoning=None, so picking only the last assistant would - # silently drop legitimate same-turn reasoning. - last_reasoning = None - for msg in reversed(messages): - if msg.get("role") == "user": - break # turn boundary β€” don't cross into prior turns - if msg.get("role") == "assistant" and msg.get("reasoning"): - last_reasoning = msg["reasoning"] - break - - # Build result with interrupt info if applicable - result = { - "final_response": final_response, - "last_reasoning": last_reasoning, - "messages": messages, - "api_calls": api_call_count, - "completed": completed, - "turn_exit_reason": _turn_exit_reason, - "partial": False, # True only when stopped due to invalid tool calls - "interrupted": interrupted, - "response_previewed": getattr(self, "_response_was_previewed", False), - "model": self.model, - "provider": self.provider, - "base_url": self.base_url, - "input_tokens": self.session_input_tokens, - "output_tokens": self.session_output_tokens, - "cache_read_tokens": self.session_cache_read_tokens, - "cache_write_tokens": self.session_cache_write_tokens, - "reasoning_tokens": self.session_reasoning_tokens, - "prompt_tokens": self.session_prompt_tokens, - "completion_tokens": self.session_completion_tokens, - "total_tokens": self.session_total_tokens, - "last_prompt_tokens": getattr(self.context_compressor, "last_prompt_tokens", 0) or 0, - "estimated_cost_usd": self.session_estimated_cost_usd, - "cost_status": self.session_cost_status, - "cost_source": self.session_cost_source, - } - if self._tool_guardrail_halt_decision is not None: - result["guardrail"] = self._tool_guardrail_halt_decision.to_metadata() - # If a /steer landed after the final assistant turn (no more tool - # batches to drain into), hand it back to the caller so it can be - # delivered as the next user turn instead of being silently lost. - _leftover_steer = self._drain_pending_steer() - if _leftover_steer: - result["pending_steer"] = _leftover_steer - self._response_was_previewed = False - - # Include interrupt message if one triggered the interrupt - if interrupted and self._interrupt_message: - result["interrupt_message"] = self._interrupt_message - - # Clear interrupt state after handling - self.clear_interrupt() - - # Clear stream callback so it doesn't leak into future calls - self._stream_callback = None - - # Check skill trigger NOW β€” based on how many tool iterations THIS turn used. - _should_review_skills = False - if (self._skill_nudge_interval > 0 - and self._iters_since_skill >= self._skill_nudge_interval - and "skill_manage" in self.valid_tool_names): - _should_review_skills = True - self._iters_since_skill = 0 - - # External memory provider: sync the completed turn + queue next prefetch. - self._sync_external_memory_for_turn( - original_user_message=original_user_message, - final_response=final_response, - interrupted=interrupted, - ) - - # Background memory/skill review β€” runs AFTER the response is delivered - # so it never competes with the user's task for model attention. - if final_response and not interrupted and (_should_review_memory or _should_review_skills): - try: - self._spawn_background_review( - messages_snapshot=list(messages), - review_memory=_should_review_memory, - review_skills=_should_review_skills, - ) - except Exception: - pass # Background review is best-effort - - # Note: Memory provider on_session_end() + shutdown_all() are NOT - # called here β€” run_conversation() is called once per user message in - # multi-turn sessions. Shutting down after every turn would kill the - # provider before the second message. Actual session-end cleanup is - # handled by the CLI (atexit / /reset) and gateway (session expiry / - # _reset_session). - - # Plugin hook: on_session_end - # Fired at the very end of every run_conversation call. - # Plugins can use this for cleanup, flushing buffers, etc. - try: - from hermes_cli.plugins import invoke_hook as _invoke_hook - _invoke_hook( - "on_session_end", - session_id=self.session_id, - completed=completed, - interrupted=interrupted, - model=self.model, - platform=getattr(self, "platform", None) or "", - ) - except Exception as exc: - logger.warning("on_session_end hook failed: %s", exc) - - return result + """Forwarder β€” see ``agent.conversation_loop.run_conversation``.""" + from agent.conversation_loop import run_conversation + return run_conversation(self, user_message, system_message, conversation_history, task_id, stream_callback, persist_user_message) def chat(self, message: str, stream_callback: Optional[callable] = None) -> str: """ diff --git a/tests/run_agent/test_jsondecodeerror_retryable.py b/tests/run_agent/test_jsondecodeerror_retryable.py index 201521ddb22..e810092613e 100644 --- a/tests/run_agent/test_jsondecodeerror_retryable.py +++ b/tests/run_agent/test_jsondecodeerror_retryable.py @@ -75,7 +75,9 @@ class TestAgentLoopSourceStillHasCarveOut: def test_run_agent_excludes_jsondecodeerror_from_local_validation(self): import run_agent import inspect - src = inspect.getsource(run_agent) + from agent import conversation_loop + # The body moved into agent/conversation_loop.py; scan both for safety. + src = inspect.getsource(run_agent) + inspect.getsource(conversation_loop) # The predicate we care about must reference json.JSONDecodeError # in its exclusion tuple. We check for the specific co-occurrence # rather than the literal string so harmless reformatting doesn't diff --git a/tests/run_agent/test_memory_nudge_counter_hydration.py b/tests/run_agent/test_memory_nudge_counter_hydration.py index abf97d265a6..f3923f83442 100644 --- a/tests/run_agent/test_memory_nudge_counter_hydration.py +++ b/tests/run_agent/test_memory_nudge_counter_hydration.py @@ -120,10 +120,20 @@ def test_production_code_contains_hydration_block(): """Smoke test: confirm the hydration code is actually wired into run_conversation(). If someone deletes it, tests above still pass against the inline replica β€” this fails them awake. + + The body now lives in agent/conversation_loop.py after the + run_agent.py refactor; check both files for safety. """ from pathlib import Path - src = Path(__file__).resolve().parents[2] / "run_agent.py" - content = src.read_text(encoding="utf-8") + repo = Path(__file__).resolve().parents[2] + src_ra = (repo / "run_agent.py").read_text(encoding="utf-8") + src_cl = (repo / "agent" / "conversation_loop.py").read_text(encoding="utf-8") + content = src_ra + src_cl # Anchor on the unique comment + the modulo line. assert "Hydrate per-session nudge counters from persisted history" in content - assert "self._turns_since_memory = prior_user_turns % self._memory_nudge_interval" in content + # The line uses ``self.`` in run_agent.py form and ``agent.`` in the + # extracted module, accept either. + assert ( + "self._turns_since_memory = prior_user_turns % self._memory_nudge_interval" in content + or "agent._turns_since_memory = prior_user_turns % agent._memory_nudge_interval" in content + ) diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index eb5efcafca7..76254d4eda5 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -5205,14 +5205,19 @@ class TestMemoryNudgeCounterPersistence: def test_counters_not_reset_in_preamble(self): """The run_conversation preamble must not zero the nudge counters.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) # The preamble resets many fields (retry counts, budget, etc.) # before the main loop. Find that reset block and verify our # counters aren't in it. The reset block ends at iteration_budget. - preamble_end = src.index("self.iteration_budget = IterationBudget") + # After the run_agent.py refactor the body uses ``agent.X`` instead + # of ``self.X``, so accept either form. + preamble_end = src.index("iteration_budget = IterationBudget") preamble = src[:preamble_end] assert "self._turns_since_memory = 0" not in preamble assert "self._iters_since_skill = 0" not in preamble + assert "agent._turns_since_memory = 0" not in preamble + assert "agent._iters_since_skill = 0" not in preamble class TestDeadRetryCode: @@ -5220,7 +5225,8 @@ class TestDeadRetryCode: def test_no_unreachable_max_retries_after_backoff(self): import inspect - source = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + source = inspect.getsource(_rc) occurrences = source.count("if retry_count >= max_retries:") assert occurrences == 2, ( f"Expected 2 occurrences of 'if retry_count >= max_retries:' " @@ -5258,7 +5264,8 @@ class TestMemoryContextSanitization: a literal tag we don't silently delete their text. The streaming scrubber + plugin-side scrub cover real leak paths.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) assert "sanitize_context(user_message)" not in src assert "sanitize_context(persist_user_message)" not in src @@ -5294,7 +5301,8 @@ class TestMemoryProviderTurnStart: def test_on_turn_start_called_before_prefetch(self): """Source-level check: on_turn_start appears before prefetch_all in run_conversation.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) # Find the actual method calls, not comments idx_turn_start = src.index(".on_turn_start(") idx_prefetch = src.index(".prefetch_all(") @@ -5304,7 +5312,13 @@ class TestMemoryProviderTurnStart: ) def test_on_turn_start_uses_user_turn_count(self): - """Source-level check: on_turn_start receives self._user_turn_count.""" + """Source-level check: on_turn_start receives the user_turn_count.""" import inspect - src = inspect.getsource(AIAgent.run_conversation) - assert "on_turn_start(self._user_turn_count" in src + from agent.conversation_loop import run_conversation as _rc + src = inspect.getsource(_rc) + # After the run_agent.py refactor the body uses ``agent.X`` instead + # of ``self.X``. Accept either spelling. + assert ( + "on_turn_start(self._user_turn_count" in src + or "on_turn_start(agent._user_turn_count" in src + )